萌新笔记——C++里创建 Trie字典树(中文词典)(三)(联想)

编程爱好者联盟 2016-12-05

萌新做词典第三篇,做得不好,还请指正,谢谢大佬!

今天把词典的联想做好了,也是比较low的,还改了之前的查询、遍历等代码。  Orz

一样地先放上运行结果:

1 test1
 2 ID : 2    char : 件    word : 编程软件
 3 ID : 3    char : 习    word : 编程学习
 4 ID : 4    char : 站    word : 编程学习网站
 5 ID : 1    char : 门    word : 编程入门
 6 
 7 test2
 8 ID : 5    char : 练    word : 编程训练
 9 ID : 1    char : 门    word : 编程入门
10 ID : 3    char : 习    word : 编程学习
11 ID : 4    char : 站    word : 编程学习网站
12 ID : 2    char : 件    word : 编程软件
13 find ID : 3    word : 编程学习
14 
15 associate "编程" : 
16 find!
17 训练
18 入门
19 学习
20 学习网站
21 软件

测试用的test.cc

p_fNEwJfKixA9iCGnAN4qSyaLTu_fXlYHhniP3fF0Xn5jc_r_IdEypbuMnj7PTfTJfRBj5H9TxjBbe_p7Vd8y9TLEkmN7jOS8KrelW_PWzo.gifp_fNEwJfKixA9iCGnAN4qSyaLTu_fXlYHhniP3fF0Xn45HP0qh8qr3QeuHh8e_XueXSW9RC8R6KAcgRGUQByNv1esdxXsFwKC6-jJAvn4mQ.gif

1 #include "Dictionary.h"
 2 #include <iostream>
 3 #include <string>
 4 #include <vector>
 5 using std::cout;
 6 using std::endl;
 7 using std::string;
 8 using std::vector;
 9 
10 int test1()
11 {
12     ccx::Dictionary words;
13     string word1 = "编程入门";    
14     string word2 = "编程软件";    
15     string word3 = "编程学习";    
16     string word4 = "编程学习网站";    
17     
18     words.push(word1);    
19     words.push(word2);    
20     words.push(word3);    
21     words.push(word4);    
22 
23     words.resetIt();
24     
25     while(!words.isEnd())
26     {
27         cout << "ID : " << words.getCurWordId() 
28             << "\tchar : " << words.getCurChar() 
29             << "\tword : " << words.getCurWord() << endl;
30         words.next();
31     }
32     
33     words.leading_out();
34     return 0;
35 }
36 
37 
38 int test2()
39 {
40     ccx::Dictionary words;
41     words.leading_in();
42 
43 
44     string word("编程训练");
45     words.push(word);
46     words.resetIt();
47 
48     while(!words.isEnd())
49     {
50         cout << "ID : " << words.getCurWordId() 
51             << "\tchar : " << words.getCurChar() 
52             << "\tword : " << words.getCurWord() << endl;
53         words.next();
54     }
55     string tmp = "编程学习";    
56     int id = words.search(tmp);
57     if(-1 == id)
58     {
59         cout << "no such word like \"" << tmp << "\"" << endl;
60     }else{
61         cout << "find ID : " << id
62             << "\tword : " << tmp << endl;
63     }
64 
65     cout << endl;
66     cout << "associate \"编程\" : " << endl;
67 
68     vector<string> data;    
69     string temp = "编程";
70     
71     if(words.associate(temp, data))
72     {
73         cout << "find!" << endl;
74         for(auto & elem : data)
75         {
76             cout << elem << endl;
77         }
78     }else{
79         cout << "can't find" << endl;
80     }
81 
82 
83     return 0;
84 }
85 
86 int main()
87 {
88     cout << "test1" << endl;
89     test1();
90     cout << endl;
91     cout << "test2" << endl;
92     test2();
93     cout << endl;
94 }
View Code

test1不变,test2 在导入后再插入一个词“编程训练”,发现ID是正常的。

然后在test2最后调用联想函数,传入“编程”,能够正常传出所有的字符串。

在做这个的时候,一开始想的很简单,就是拿传入的词去树中查找,找到最后一人字对应的节点,然后以那个节点为根进行遍历。然后就开开心心地去写了,结果写一部分就要对之前的代码进行更改,于是,这个接口越来越“肥”了:

Dictionary.h

#ifndef __DICTIONARY_H__
 #define __DICTIONARY_H__
 
 #include "DictionaryData.h"
 #include "DictionaryConf.h"
 
 #include <memory>
 #include <vector>
 #include <list>
 
 namespace ccx{
 
 using std::shared_ptr;
 using std::vector;
 using std::list;
 
 class Dictionary
 {
     typedef unordered_map<string, pDictElem>::iterator WordIt;
     public:
         Dictionary();
         void push(const string & word);
         void push(vector<string> & words);
         int search(const string & word);
         bool associate(const string & word, vector<string> & data);
     private:
         void AddWord(const string & word, int wordId);
         void splitWord(const string & word, vector<string> & characters);//把词拆成字
         int search(vector<string> & data, pDictElem & pcur);
         pDictElem _dictionary;
         DictionaryConf _conf;    
 
 //遍历
     public:
         string getCurChar();
         string getCurWord();
         int getCurWordId();
         bool isEnd();
         void resetIt();
         void next();
     private:
         void resetPoint(pDictElem pcur);
         void next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
         void nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict);
         string getCurWord(list<WordIt> & stackWord);
         
         pDictElem _pcur;
         WordIt _itcur;
         
 //用list实现栈,遍历时方便
         list<WordIt> _stackWord;
         list<pDictElem> _stackDict;
 
 //导入导出
     public:
         void leading_in();
         void leading_out();
 };
 
 }
 
 #endif

对几个原有的函数进行了重载,主要是为了能够复用一些代码,但是又想不到合适的新的函数名(英语不太好Orz)。

首先,是要能够查找并返回新的根结点,于是对search进行修改:

1 int Dictionary::search(vector<string> & characters, pDictElem & root)
 2 {
 3     vector<string>::iterator it_char;
 4     it_char = characters.begin();    
 5     root = _dictionary;
 6     int i = 0;
 7     for(; it_char != characters.end(); ++it_char, ++i)
 8     {
 9         WordIt it_word;
10         it_word = root->_words.find(*it_char);
11         
12         if(it_word == root->_words.end())
13         {
14             break;
15         }else{
16             root = it_word->second;
17         }
18     }
19     return i;
20 }

形参第一项是分解后的字集,第二项是一个智能指针,指向某个节点。这里返回值改为了字集的第几项,有两个目的:

1、插入函数中可以方便地知道下一个要插入的是哪个字符

2、联想函数中可以判断字集中的字是否都存在于词典中

3、好吧,我没想到其它好办法,而且当时是想到上面两点就这么做了,后来发现,插入部分的代码根本就不用改

然后是重载search:

1 int Dictionary::search(const string & word)
 2 {
 3     pDictElem root = _dictionary;
 4     vector<string> temp;
 5     splitWord(word, temp);
 6     
 7     int ret = search(temp, root);
 8     int size = temp.size();
 9     if(ret != size)
10     {
11         return -1;
12     }
13     return root->_wordId;
14 }

在这里对字进行分解,并定义一个临时的根结点,这样做的目的是为了保护private中的根结点,并且可以在多线程环境中互不干扰。

能够找到“新的根”后,就要对它进行遍历了。如果只有单一线程或进程来使用它,这里可以直接把resetPoint(原来的)修改一下,设置指定结点就可以了:

void Dictionary::resetPoint(pDictElem pcur)
 {
     _pcur = pcur;
     if(_stackDict.size())
     {
         _stackDict.clear();
     }
     if(_stackWord.size())
     {
         _stackWord.clear();
     }
     next();
 }

如果是这样,那前面也完全不用修改。由于这个词典最后是要应用到miniSearchEngin中,于是我对遍历部分的函数进行了修改:

void Dictionary::next()
 {
     next(_pcur, _stackWord, _stackDict);
 }
 
 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 {
     while(pcur)
     {
         nextWord(pcur, stackWord, stackDict);
         if(!pcur || pcur->_wordId)
         {
             break;
         }
     }
 }
 
 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
 {
     if(pcur)
     {
         if(pcur->_words.size())
         {
             stackDict.push_back(pcur);
             stackWord.push_back(pcur->_words.begin());
             pcur = stackWord.back()->second;
         }else{
             ++(stackWord.back());
         }
         while(stackWord.back() == stackDict.back()->_words.end())
         {
             stackDict.pop_back();
             stackWord.pop_back();
             if(!stackDict.size())
             {
                 pcur = NULL;
             }
             ++(stackWord.back());
         }
         if(pcur)
         {
             pcur = stackWord.back()->second;
         }    
     }
 }

next部分,改为传入参数,这样就可以在associate里定义临时的栈和智能指针等,遍历的时候与其它工作并没有任何关系。

同样地,getWord也要做相同的更改:

string Dictionary::getCurWord()
 {
     return getCurWord(_stackWord);
 }
 
 string Dictionary::getCurWord(list<WordIt> & stackWord)
 {
     string temp;
     list<WordIt>::iterator it_word;    
     it_word = stackWord.begin();    
 
     for(; it_word != stackWord.end(); ++it_word)
     {
         temp += (*it_word)->first;
     }
     return temp;
 }

当然了,对外提供的接口都是不要传参的,其它的只能在内部使用,于是放入了private区。

终于可以开始写联想了0.0

bool Dictionary::associate(const string & word, vector<string> & data)
 {
     pDictElem root = _dictionary;
     vector<string> temp;
     splitWord(word, temp);
     
     int ret = search(temp, root);
     int size = temp.size();
     if(ret != size)
     {
         return false;
     }
     
     list<WordIt> stackWord;
     list<pDictElem> stackDict;
     next(root, stackWord, stackDict);
     while(root)
     {
         string temp = getCurWord(stackWord);
         data.push_back(temp);    
         next(root, stackWord, stackDict);
     }
     
     if(!data.size())
     {
         return false;
     }
     return true;
 }

返回bool类型,可以方便地判断是否联想成功,即以传入的词做为前缀,能否找到剩余部分(词典里有存)。于是乎,一个渣渣型号的词典就做好啦~~~

Dictionary.cc

p_fNEwJfKixA9iCGnAN4qSyaLTu_fXlYHhniP3fF0Xn5jc_r_IdEypbuMnj7PTfTJfRBj5H9TxjBbe_p7Vd8y9TLEkmN7jOS8KrelW_PWzo.gifp_fNEwJfKixA9iCGnAN4qSyaLTu_fXlYHhniP3fF0Xn45HP0qh8qr3QeuHh8e_XueXSW9RC8R6KAcgRGUQByNv1esdxXsFwKC6-jJAvn4mQ.gif

1 #include "Dictionary.h"
  2 #include <iostream>
  3 #include <fstream>
  4 #include <string>
  5 #include <json/json.h>
  6 
  7 namespace ccx{
  8 
  9 using std::endl;
 10 using std::cout;
 11 using std::pair;
 12 using std::ofstream;
 13 using std::ifstream;
 14 
 15 Dictionary::Dictionary()
 16 : _dictionary(new DictElem)
 17 , _conf()
 18 {
 19     _dictionary->_wordId = 0;
 20     _pcur = _dictionary;
 21 }
 22 
 23 void Dictionary::splitWord(const string & word, vector<string> & characters)
 24 {
 25     int num = word.size();
 26     int i = 0;
 27     while(i < num)
 28     {
 29         int size = 1;
 30         if(word[i] & 0x80)
 31         {
 32             char temp = word[i];
 33             temp <<= 1;
 34             do{
 35                 temp <<= 1;
 36                 ++size;
 37             }while(temp & 0x80);
 38         }
 39         string subWord;
 40         subWord = word.substr(i, size);
 41         characters.push_back(subWord);
 42         i += size;
 43     }
 44 }
 45 
 46 void Dictionary::AddWord(const string & word, int wordId)
 47 {
 48     vector<string> characters;
 49     splitWord(word, characters);
 50     
 51     vector<string>::iterator it_char;
 52     it_char = characters.begin();    
 53     pDictElem root;
 54     root = _dictionary;
 55     for(; it_char != characters.end(); ++it_char)
 56     {
 57         WordIt it_word;
 58         it_word = root->_words.find(*it_char);
 59         
 60         if(it_word == root->_words.end())
 61         {
 62             pair<string, pDictElem> temp;
 63             temp.first = *it_char;
 64             pDictElem dictemp(new DictElem);
 65             dictemp->_word = *it_char;
 66             dictemp->_wordId = 0;
 67             temp.second = dictemp;
 68             root->_words.insert(temp);
 69             root = dictemp;
 70         }else{
 71             root = it_word->second;
 72         }
 73     }
 74     if(!root->_wordId)
 75     {
 76         root->_wordId = wordId;
 77     }
 78 }
 79 
 80 void Dictionary::push(const string & word)
 81 {
 82     ++(_dictionary->_wordId);
 83     AddWord(word, _dictionary->_wordId);
 84 }
 85 
 86 void Dictionary::push(vector<string> & words)
 87 {
 88     int size = words.size();
 89     for(int i = 0; i < size; ++i)
 90     {
 91         push(words[i]);
 92     }
 93 }
 94 
 95 int Dictionary::search(const string & word)
 96 {
 97     pDictElem root = _dictionary;
 98     vector<string> temp;
 99     splitWord(word, temp);
100     
101     int ret = search(temp, root);
102     int size = temp.size();
103     if(ret != size)
104     {
105         return -1;
106     }
107     return root->_wordId;
108 }
109 
110 int Dictionary::search(vector<string> & characters, pDictElem & root)
111 {
112     vector<string>::iterator it_char;
113     it_char = characters.begin();    
114     root = _dictionary;
115     int i = 0;
116     for(; it_char != characters.end(); ++it_char, ++i)
117     {
118         WordIt it_word;
119         it_word = root->_words.find(*it_char);
120         
121         if(it_word == root->_words.end())
122         {
123             break;
124         }else{
125             root = it_word->second;
126         }
127     }
128     return i;
129 }
130 
131 bool Dictionary::associate(const string & word, vector<string> & data)
132 {
133     pDictElem root = _dictionary;
134     vector<string> temp;
135     splitWord(word, temp);
136     
137     int ret = search(temp, root);
138     int size = temp.size();
139     if(ret != size)
140     {
141         return false;
142     }
143     
144     list<WordIt> stackWord;
145     list<pDictElem> stackDict;
146     next(root, stackWord, stackDict);
147     while(root)
148     {
149         string temp = getCurWord(stackWord);
150         data.push_back(temp);    
151         next(root, stackWord, stackDict);
152     }
153     
154     if(!data.size())
155     {
156         return false;
157     }
158     return true;
159 }
160 
161 //遍历用
162 
163 void Dictionary::resetPoint(pDictElem pcur)
164 {
165     _pcur = pcur;
166     if(_stackDict.size())
167     {
168         _stackDict.clear();
169     }
170     if(_stackWord.size())
171     {
172         _stackWord.clear();
173     }
174     next();
175 }
176 
177 void Dictionary::resetIt()
178 {
179     resetPoint(_dictionary);
180 }
181 
182 void Dictionary::next()
183 {
184     next(_pcur, _stackWord, _stackDict);
185 }
186 
187 void Dictionary::next(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
188 {
189     while(pcur)
190     {
191         nextWord(pcur, stackWord, stackDict);
192         if(!pcur || pcur->_wordId)
193         {
194             break;
195         }
196     }
197 }
198 
199 void Dictionary::nextWord(pDictElem & pcur, list<WordIt> & stackWord, list<pDictElem> & stackDict)
200 {
201     if(pcur)
202     {
203         if(pcur->_words.size())
204         {
205             stackDict.push_back(pcur);
206             stackWord.push_back(pcur->_words.begin());
207             pcur = stackWord.back()->second;
208         }else{
209             ++(stackWord.back());
210         }
211         while(stackWord.back() == stackDict.back()->_words.end())
212         {
213             stackDict.pop_back();
214             stackWord.pop_back();
215             if(!stackDict.size())
216             {
217                 pcur = NULL;
218             }
219             ++(stackWord.back());
220         }
221         if(pcur)
222         {
223             pcur = stackWord.back()->second;
224         }    
225     }
226 }
227 
228 string Dictionary::getCurChar()
229 {
230     return _pcur->_word;
231 }
232 
233 int Dictionary::getCurWordId()
234 {
235     return _pcur->_wordId;
236 }
237 
238 string Dictionary::getCurWord()
239 {
240     return getCurWord(_stackWord);
241 }
242 
243 string Dictionary::getCurWord(list<WordIt> & stackWord)
244 {
245     string temp;
246     list<WordIt>::iterator it_word;    
247     it_word = stackWord.begin();    
248 
249     for(; it_word != stackWord.end(); ++it_word)
250     {
251         temp += (*it_word)->first;
252     }
253     return temp;
254 }
255 
256 bool Dictionary::isEnd()
257 {
258     return _pcur == NULL;
259 }
260 
261 void Dictionary::leading_in()//导入,失败没必要退出程序
262 {
263     ifstream ifs;
264     const char * path = _conf.getDictionaryPath().c_str();
265     ifs.open(path);
266     if(!ifs.good())
267     {
268         cout << "open Dictionary.json error(leading_in)" << endl;
269     }else{
270         Json::Value root;
271         Json::Reader reader;
272         
273         if(!reader.parse(ifs, root, false))
274         {
275             cout << "json read Dictionary.json error" << endl;
276         }else{
277             int size = root.size();
278             for(int i = 0; i < size; ++i)
279             {
280                 string word = root[i]["Word"].asString();
281                 int wordId = root[i]["WordId"].asInt();
282                 AddWord(word, wordId);
283                 ++(_dictionary->_wordId);
284             }
285         }
286     }
287 }
288 
289 void Dictionary::leading_out()
290 {
291     Json::Value root;
292     Json::FastWriter writer;
293 
294     resetIt();
295     
296     while(!isEnd())
297     {
298         Json::Value elem;
299         elem["Word"] = getCurWord();
300         elem["WordId"] = getCurWordId();
301         root.append(elem);
302         next();
303     }
304 
305     string words;
306     words = writer.write(root);
307     
308     ofstream ofs;
309     const char * path = _conf.getDictionaryPath().c_str();
310     ofs.open(path);
311     if(!ofs.good())
312     {
313         cout << "open Dictionary.json error(leading_out)" << endl;
314         ofs.open("Dictionary.tmp");
315         if(!ofs.good())
316         {
317             exit(EXIT_FAILURE);
318         }
319     }
320     
321     ofs << words;
322     ofs.close();
323 }
324 
325 }
View Code

相关推荐