Boyer-Moore算法的C++实现
以上给出了通俗易懂的算法讲解,下面给出代码实现,使用的宽字符,这样就不限于英文字母了:(stdafx.h编译不过去就屏蔽掉)
// StringSearch_BoyerMoore.cpp : 定义控制台应用程序的入口点。 // /* @Date: 2016/03/21 18:31; @Where: Tongzhou District of Beijing; @Author: Alex Hong; @Mail: worksdata@163.com; @Reference: http://www.ruanyifeng.com/blog/2013/05/boyer-moore_string_search_algorithm.html BM算法-阮一峰的网络日志; */ #include "stdafx.h" #include <sstream> #include <iostream> #include <string> #include <vector> #include <map> using namespace std; typedef std::vector<int> TArrayInt; TArrayInt g_arSearchResult;//store the index of searched result; typedef std::map<std::wstring, int> TMapWszToInt; //E.g. "我,4":7; typedef std::map<int, int> TMapIntToInt; void StringSearch_BM(wchar_t* wszStrMain, int iLenMain, wchar_t* wszSearch, int iLenSearch, bool bOnlyFirst) { //reset variables; g_arSearchResult.clear(); //handle exceptions.. if( NULL == wszStrMain || NULL == wszSearch) return; if(iLenSearch > iLenMain) return; /* 未知的字符有很多, 但是已知的搜索字符是有限的. 我们只初始化已知的字符的跳跃值(Skip), 数组中不存在的就是-1; Shift数组的初始化同样基于这种想法. 初始化这两个数组的值后, 对于长主串/耗时的任务是有利的; */ int i = 0, j = 0; //Initialize the Skip array data for Bad-Character-Rules.. TMapWszToInt mapSkip; mapSkip.clear(); TMapWszToInt::iterator it1; std::wostringstream oStream; for(i = iLenSearch - 1; i > 0; --i)//0-->1; { for(j = i - 1; j >= 0; --j) { if(wszSearch[j] != wszSearch[i]) //Skip is for bad character which is different with wszSearch; { oStream.clear();//clear bad bits; oStream.str(L"");//clear data; oStream<<wszSearch[j]<<','<<i;//format to key string; //wcout<<oStream.str().c_str()<<endl;//test.. it1 = mapSkip.find(oStream.str()); if(mapSkip.end() == it1) //only set the pos of nearest same bad character; mapSkip.insert(TMapWszToInt::value_type(oStream.str(), i-j)); } } } //好后缀的前提是: 至少有一个字符是相同的!!!!!! //Initialize the Shift array data for Good-Suffix-Rules.. //Shift数组的键值是好后缀的最左字符的下标; //1. 首先判断模式串中是否含有完全一致的好后缀;有就直接计算位移, EXIT; // 其实这本身就是一个字符串搜索的过程,为了简化计算, 这里只使用Bad-Character-Rule; //2. 如果没有, 就在串首寻找好后缀的子串; TMapIntToInt mapShift; mapShift.clear(); mapShift.insert(TMapIntToInt::value_type(0, iLenSearch));//0==i的话就匹配了, Shift[]=iLenSearch; //1. try to find the same suffix.. for(int iSuffixPosL = iLenSearch - 1; iSuffixPosL > 0; --iSuffixPosL)//iSuffixPosL: left position of good suffix; { int iSuffixLen = iLenSearch - iSuffixPosL; wchar_t* wszSuffix = wszSearch + iSuffixPosL; for(int iPosMainL = iSuffixPosL - 1; iPosMainL >= 0;)//不断向左移动suffix串比较; { //compare.. for(i = 0; i < iSuffixLen; ++i) { if(wszSearch[iPosMainL+i] != wszSuffix[i])//bad character break; } //check result.. if(iSuffixLen == i) //find it! { mapShift[iLenSearch-i] = iSuffixPosL - iPosMainL; break; //结束为当前suffix找Shift值; } else //calc the skip distance; { for(j = i + 1; j < iSuffixLen; ++j)//在suffix中寻找最近的坏字符 { if(wszSuffix[j] == wszSearch[iPosMainL+i]) break; } iPosMainL = iPosMainL - (j - i);//iSuffixLen==j的情况也适用; } } } //2. try to find the child suffix at head.. TMapIntToInt::iterator it2; for(int iSuffixPosL = iLenSearch - 1; iSuffixPosL > 0; --iSuffixPosL)//iSuffixPosL: left position of good suffix; { it2 = mapShift.find(iSuffixPosL); if(mapShift.end() == it2) { wchar_t* wszSuffix = NULL; int iSuffixLen = 0; for(i = 0; i < iLenSearch - iSuffixPosL; ++i)//form sub suffix big to small. { wszSuffix = wszSearch + iSuffixPosL + i; iSuffixLen = iLenSearch - iSuffixPosL - i; //compare if match.. for(j = 0; j < iSuffixLen; ++j) { if(wszSearch[j] != wszSuffix[j]) break; } if(iSuffixLen == j) //find it! { mapShift[iSuffixPosL] = iLenSearch - iSuffixLen; //以好后缀的最后一个字符为准; break; } } //if no sub suffix at all.. it2 = mapShift.find(iSuffixPosL); if(mapShift.end() == it2) mapShift[iSuffixPosL] = iLenSearch; } } //================ //start to search! //================ int iPosMainL = 0;//left iterator of main string; //int iPosSearch = 0;//iterator of search string; no use! while( iPosMainL <= iLenMain - iLenSearch + 1) { //compare with aligned sub-string.. int i = iLenSearch - 1; for(; i >= 0; --i)//right to left comparison; { if(wszSearch[i] != wszStrMain[iPosMainL+i])//bad character! break; } //----calc how many steps to move rightwards..---- int iSkip = 0; if(i < 0) { g_arSearchResult.push_back(iPosMainL); if(bOnlyFirst) { mapSkip.clear(); mapShift.clear(); return; } iSkip = iLenSearch; } else { //1. calc max skip steps for bad characters.. oStream.clear();//clear bits; oStream.str(L"");//clear data; oStream<<wszStrMain[iPosMainL+i]<<','<<i; it1 = mapSkip.find((wchar_t*)oStream.str().c_str()); if(mapSkip.end() == it1) iSkip = i + 1; else iSkip = it1->second; } //--2. calc max shift steps for good suffix..-- int iShift = 0; if(i < iLenSearch - 1)//至少有一个相同的字符才可以使用"好后缀"规则! { it2 = mapShift.find(i-1); if(mapShift.end() != it2) iShift = it2->second; } iPosMainL += ((iSkip>iShift) ? iSkip : iShift); } //clear.. mapSkip.clear(); mapShift.clear(); } int _tmain(int argc, _TCHAR* argv[]) { wchar_t wszMainStr[] = L"THIS IS A 我你她我不是的LE EXA我们我们MPLEEXAMPLE我们"; wchar_t wszSearchStr[] = L"我们"; StringSearch_BM(wszMainStr, wcslen(wszMainStr), wszSearchStr, wcslen(wszSearchStr), false); for(TArrayInt::iterator it = g_arSearchResult.begin(); it != g_arSearchResult.end(); ++it) cout<<*it<<endl; //clear.. g_arSearchResult.clear(); getchar(); return 0; }
一些例子分析过程:
主字符串: THIS IS A SIMPLE EXAMPLE (长度:24)
搜索字符串: EXAMPLE (长度:7)
mapSkip的初始化值:
主串中的坏字符的下面字符是:'E'
mapSkip['L,6'] = 1
mapSkip['P,6'] = 2
mapSkip['M,6'] = 3
mapSkip['A,6'] = 4
mapSkip['X,6'] = 5
mapSkip['E,6'] = 不是坏字符, 舍去
其他: 7
-----------------------------------------------------------
搜索字符串: EXAMPLE (长度:7)
主串中的坏字符的下面字符是:
'L'
mapSkip['P,5'] = 1
mapSkip['M,5'] = 2
mapSkip['A,5'] = 3
mapSkip['X,5'] = 4
mapSkip['E,5'] = 5
其他:6
---------------------------------------------------------------
...
主字符串: THIS IS A SIMPLE EXAMPLE (长度:24)
搜索字符串: EXAMPLE (长度:7)
主串中的坏字符的下面字符是:
'X'
mapSkip['E,1'] = 1
其他:2
---------------------------------------------------------------
mapSkip['[任何],0'] = 1
============好后缀数组初始化的流程============================
搜索字符串: SIMPLE EXAMPLE (长度:14)
0 1 2 3 4 5 6 7 8 9 10 11 12 13
S I M P L E E X A M P L E
好后缀的第一个字符为:'E' --> Shift[13] = 6; //以最后一个字符的下标为基准计算要移动的距离;
好后缀的第一个字符为:'L' --> Shift[12] = 8; //"SIMPLE"中有全匹配的, 取全匹配的;
好后缀的第一个字符为:'P' --> Shift[11] = 8;
好后缀的第一个字符为:'M' --> Shift[10] = 8;
好后缀的第一个字符为:'A' --> Shift[9] = 14; //没有全匹配的, 在串头部找子串匹配的, 这里没有, 认为在-1位置(不妨虚拟出来);
...
好后缀的第一个字符为:'I' --> Shift[1] = 14;
好后缀的第一个字符为:'S' --> Shift[0] = 14;//可能需要特殊处理;
E.g. 2
0 1 2 3 4 5 6 7
L E X A M P L E
好后缀的第一个字符为:'E' --> Shift[7] = 6;
好后缀的第一个字符为:'L' --> Shift[6] = 6;
好后缀的第一个字符为:'P' --> Shift[6] = 6;