xxx

 1 #ifndef ALGORITHM_WUMANBER_H
 2 #define ALGORITHM_WUMANBER_H
 3 
#include <cstdint>
#include <set>
#include <string>
#include <utility>
#include <vector>
 7 
 8 typedef std::set<std::string> ResultSetType;
 9 typedef std::vector<unsigned int> MatchPosVector;
10 typedef std::pair<unsigned int, int> PrefixIdPairType;
11 typedef std::vector<PrefixIdPairType> PrefixTableType;
12 
13 class WuManber
14 {
15     public:
16         WuManber();
17         ~WuManber();
18         /**
19          * Init Function
20          * 
21          * @param patterns      pattern list to be matched
22          */
23         bool Init(const std::vector<std::string>& patterns);
24 
25         /** 
26          * @param text           raw text
27          * @param textLength     length of text
28          * @param res            string set containing matched patterns
29          * 
30          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
31          */
32         int Search( const char* text, const int textLength, ResultSetType& res);
33 
34         /**
35          * @param  str           raw text
36          * @param  res           string set containing matched patterns
37          *
38          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
39          */
40          int Search(const std::string& str, ResultSetType& res);
41 
42         /**
43          * @brief Search text 
44          *
45          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
46          */
47         int Search(const char* text, const int textLength);
48         
49         /**
50          * @param  str           raw text
51          * param  matchPosVector        vector containing matched patterns postion
52          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
53          */
54         int Search(const char* text, const int textLength, MatchPosVector &matchPosVector);
55         
56         /**
57          * param  matchPosVector        vector containing matched patterns postion
58          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
59          */
60         int Search(const std::string& str, MatchPosVector &matchPosVector);
61 
62         /**
63          * @brief Search text
64          *
65          * @return value 0: no pattern matchs, n: n patterns matched(n>0)
66          */
67         int Search(const std::string& str);
68 
69     private:
70         // minmum length of patterns
71         int32_t mMin;
72         // SHIFT table
73         std::vector<int32_t> mShiftTable;
74         // a combination of HASH and PREFIX table 
75         std::vector<PrefixTableType> mHashTable;
76         // patterns
77         std::vector<std::string> mPatterns;
78         // size of SHIFT and HASH table
79         int32_t mTableSize;
80         // size of block
81         int32_t mBlock;
82 };
83 
84 #endif
  1 #include <cmath>
  2 #include <iostream>
  3 #include "wumanber.h"
  4 
  5 using namespace std;
  6 
  7 /** 
  8  * @brief   String hash function.
  9  * 
 10  * @param str   the string needed to be hashed
 11  * @param len   length of the substr should be hashed
 12  * 
 13  * @return hash code
 14  */
 15 unsigned int HashCode(const char* str, int len)
 16 {
 17     unsigned int hash = 0;
 18     while (*str && len>0)
 19     {
 20         hash = (*str++) + (hash << 6) + (hash << 16) - hash;
 21         --len;
 22     }
 23     return (hash & 0x7FFFFFFF);
 24 }
 25 
 26 /** 
 27  * @brief constructor 
 28  */
 29 WuManber::WuManber():mMin(0), mTableSize(0), mBlock(3)
 30 {
 31     //VOID
 32 }
 33 
 34 /**
 35  * @brief Init
 36  */
 37 bool WuManber::Init(const vector<string>& patterns)
 38 {
 39     int patternSize = patterns.size();
 40 
 41     //check if no pattern specified
 42     if (patternSize == 0)
 43     {
 44         //cerr << "Error: wumanber init failed because no pattern specified." << endl;
 45         return false;
 46     }
 47     
 48     //caculate the minmum pattern length
 49     mMin = patterns[0].length();
 50     int32_t lenPattern = 0;
 51     for (int i = 0; i < patternSize; ++i) 
 52     {
 53         lenPattern = patterns[i].length();
 54         if (lenPattern < mMin)
 55         {
 56             mMin = lenPattern;
 57         }
 58     }
 59 
 60     //check if mBlock larger than mMin
 61     if (mBlock > mMin)
 62     {
 63         //cerr << "Warning: mBlock is larger than minmum pattern length, reset mBlock to minmum, but it will seriously affect the effiency." << endl;
 64         mBlock = mMin;
 65     }
 66 
 67     //choose a suitable mTableSize for SHIFT, HASH table
 68     int32_t primes[6] = {1003, 10007, 100003, 1000003, 10000019, 100000007};
 69     vector<int32_t> primeList(&primes[0], &primes[6]);
 70 
 71     int32_t threshold = 10 * mMin;
 72     for (size_t i = 0; i < primeList.size(); ++i)
 73     {
 74         if (primeList[i] > patternSize && primeList[i] / patternSize > threshold)
 75         {
 76             mTableSize = primeList[i];
 77             break;
 78         }
 79     }
 80     cout << mTableSize << " " << mBlock << " " << mMin << endl;
 81     //if size of patternList is huge.
 82     if (0 == mTableSize)
 83     {
 84         //cerr << "Warning: amount of pattern is very large, will cost a great amount of memory." << endl;
 85         mTableSize = primeList[5];
 86     }
 87 
 88     //construct ShiftTable and HashTable, and set default value for SHIFT table
 89     mPatterns = patterns;
 90     mHashTable.resize(mTableSize);
 91     // default value is m-mBlock+1 for shift
 92     int32_t defaultValue = mMin - mBlock + 1;
 93     mShiftTable.resize(mTableSize, defaultValue);
 94 
 95     //loop through patterns
 96     for (int id = 0; id < patternSize; ++id) 
 97     { 
 98         // loop through each pattern from right to left
 99         for (int index = mMin; index >= mBlock; --index)
100         {
101             unsigned int hash = HashCode(patterns[id].c_str() + index - mBlock, mBlock) % mTableSize;
102             if (mShiftTable[hash] > (mMin - index))
103             {
104                 mShiftTable[hash]  = mMin - index;
105             }
106             if (index == mMin)
107             {
108                 unsigned int prefixHash = HashCode(patterns[id].c_str(), mBlock);
109                 mHashTable[hash].push_back(make_pair(prefixHash, id));
110             }
111         }
112     }
113     cout << "Term number : " <<  mPatterns.size() << endl;
114     return true;
115 }
116 
117 /** 
118  * @brief destructor
119  */
120 WuManber::~WuManber()
121 {
122     //VOID
123 }
124 
125 
126 /**
127  * @public
128  * @brief search multiple pattern in text at one time
129  */
130 int WuManber::Search(const char* text, const int textLength, ResultSetType& res)
131 {
132     //hit count: value to be returned
133     int hits = 0;
134     int32_t index = mMin - 1; // start off by matching end of largest common pattern
135     
136     int32_t blockMaxIndex = mBlock - 1;
137     int32_t windowMaxIndex = mMin - 1;
138     
139     while (index < textLength)
140     {
141         unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
142         blockHash = blockHash % mTableSize;
143         int shift = mShiftTable[blockHash];
144         if (shift > 0)
145         {
146             index += shift;
147         }
148         else
149         {  
150             // we have a potential match when shift is 0
151             unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
152             PrefixTableType &element = mHashTable[blockHash];
153             PrefixTableType::iterator iter = element.begin();
154 
155             while (element.end() != iter)
156             {
157                 if (prefixHash == iter->first)
158                 {   
159                     // since prefindex matches, compare target substring with pattern
160                     // we know first two characters already match
161                     const char* indexTarget = text + index - windowMaxIndex;    //+mBlock
162                     const char* indexPattern = mPatterns[iter->second].c_str(); //+mBlock
163                     
164                     while (('\0' != *indexTarget) && ('\0' != *indexPattern))
165                     {
166                         // match until we reach end of either string
167                         if (*indexTarget == *indexPattern)
168                         {
169                             // match against chosen case sensitivity
170                             ++indexTarget;
171                             ++indexPattern;
172                         }
173                         else
174                             break;
175                     }
176                     // match succeed since we reach the end of the pattern.
177                     if ('\0' == *indexPattern)
178                     {
179                         res.insert(string(mPatterns[iter->second]));
180                         ++hits;
181                     }
182                 }//end if
183                 ++iter;
184             }//end while
185             ++index;
186         }//end else
187     }//end while
188 
189     return hits;
190 }
191 
192 /**
193  * Search
194  */
195 int WuManber::Search(const string& str, ResultSetType& res)
196 {
197     return Search(str.c_str(), str.length(), res);
198 }
199 
200 /**
201  * Search
202  */
203 int WuManber::Search(const char* text, const int textLength)
204 {
205     //hit count: value to be returned
206     int hits = 0;
207     int index = mMin - 1; // start off by matching end of largest common pattern
208 
209     uint32_t blockMaxIndex = mBlock - 1;
210     uint32_t windowMaxIndex = mMin - 1;
211 
212     while (index < textLength)
213     {
214         unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
215         blockHash = blockHash % mTableSize;
216         int shift = mShiftTable[blockHash];
217         if (shift > 0)
218         {
219             index += shift;
220         }
221         else
222         {
223             // we have a potential match when shift is 0
224             unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
225             //prefixHash = prefixHash % mTableSize;
226             PrefixTableType &element = mHashTable[blockHash];
227             PrefixTableType::iterator iter = element.begin();
228 
229             while (element.end() != iter)
230             {
231                 if (prefixHash == iter->first)
232                 {
233                     // since prefindex matches, compare target substring with pattern
234                     // we know first two characters already match
235                     const char* indexTarget = text + index - windowMaxIndex;    //+mBlock
236                     const char* indexPattern = mPatterns[iter->second].c_str();  //+mBlock
237 
238                     while (('\0' != *indexTarget) && ('\0' != *indexPattern))
239                     {
240                         // match until we reach end of either string
241                         if (*indexTarget == *indexPattern)
242                         {
243                             // match against chosen case sensitivity
244                             ++indexTarget;
245                             ++indexPattern;
246                         }
247                         else
248                             break;
249                     }
250                     // match succeed since we reach the end of the pattern.
251                     if ('\0' == *indexPattern)
252                     {
253                         ++hits;
254                     }
255                 }//end if
256                 ++iter;
257             }//end while
258             ++index;
259         }//end else
260     }//end while
261 
262     return hits;
263 }
264 
265 int WuManber::Search(const char* text, const int textLength, MatchPosVector &matchPosVector)
266 {
267     //hit count: value to be returned
268     int hits = 0;
269     int index = mMin - 1; // start off by matching end of largest common pattern
270 
271     uint32_t blockMaxIndex = mBlock - 1;
272     uint32_t windowMaxIndex = mMin - 1;
273 
274     while (index < textLength)
275     {
276         unsigned int blockHash = HashCode(text + index - blockMaxIndex, mBlock);
277         blockHash = blockHash % mTableSize;
278         int shift = mShiftTable[blockHash];
279         if (shift > 0)
280         {
281             index += shift;
282         }
283         else
284         {
285             // we have a potential match when shift is 0
286             unsigned int prefixHash = HashCode(text + index - windowMaxIndex, mBlock);
287             //prefixHash = prefixHash % mTableSize;
288             PrefixTableType &element = mHashTable[blockHash];
289             PrefixTableType::iterator iter = element.begin();
290 
291             while (element.end() != iter)
292             {
293                 if (prefixHash == iter->first)
294                 {
295                     // since prefindex matches, compare target substring with pattern
296                     // we know first two characters already match
297                     const char* indexTarget = text + index - windowMaxIndex;    //+mBlock
298                     const char* indexPattern = mPatterns[iter->second].c_str();  //+mBlock
299 
300                     while (('\0' != *indexTarget) && ('\0' != *indexPattern))
301                     {
302                         // match until we reach end of either string
303                         if (*indexTarget == *indexPattern)
304                         {
305                             // match against chosen case sensitivity
306                             ++indexTarget;
307                             ++indexPattern;
308                         }
309                         else
310                             break;
311                     }
312                     // match succeed since we reach the end of the pattern.
313                     if ('\0' == *indexPattern)
314                     {
315                         ++hits;
316                         matchPosVector.push_back(index);
317                     }
318                 }//end if
319                 ++iter;
320             }//end while
321             ++index;
322         }//end else
323     }//end while
324 
325     return hits;
326 }
327 
328 int WuManber::Search(const string& str, MatchPosVector &matchPosVector)
329 {
330     return Search(str.c_str(), str.length(), matchPosVector);
331 }
332 
333 int WuManber::Search(const string& str)
334 {
335     return Search(str.c_str(), str.length());
336 }

 

 1 #include <iostream>
 2 #include <fstream>
 3 #include <string.h>
 4 #include <vector>
 5 #include <algorithm>
 6 //#include "wumanber.h"
 7 
 8 using namespace std;
 9 
10 
11 //WuManber search;
12 
13 
/**
 * Test driver: reads test_wumanber.dat, keeps the lines that carry the
 * pattern marker in their first byte, strips that marker and prints the
 * resulting patterns one per line.
 *
 * @return 0 on success, 1 when the data file cannot be opened
 */
int main()
{
    std::ifstream readfile;
    readfile.open("test_wumanber.dat", std::ios::in);
    if (!readfile)
    {
        std::cerr << "Error: cannot open test_wumanber.dat" << std::endl;
        return 1;
    }

    std::vector<std::string> pattern;
    std::string line;
    while (std::getline(readfile, line)) {
        // NOTE(review): the marker is the byte value 1, not the character
        // '1' -- confirm against the data file format.
        if (!line.empty() && line[0] == 1) {
            line.erase(0,1);
            pattern.push_back(line);
        }
    }

    for (std::vector<std::string>::const_iterator it = pattern.begin();
            it != pattern.end(); ++it)
        std::cout << *it << std::endl;

    // Matcher wiring, disabled in the original listing:
    /*search.Init(pattern);*/
    //ResultSetType res;
    //cout << search.Search(target, strlen(target), pos) << endl;
    /*cout << endl;*/
    return 0;
}

 

posted on 2013-03-21 19:39  brainworm  阅读(477)  评论(0编辑  收藏  举报

导航