AC多模匹配+完整实现源码
前段时间同事讲了一下AC多模匹配的原理,就试着写了下代码,现在再看代码的时候有些地方连自己都快看不懂了,所以想做一个笔记
查找10M的日志文件,即使最后一行,用时不到0.2毫秒
能正确查找出ssss串中的s,ss,sss,ssss个数和位置,notepad无法实现
1、构建树
构建模式串(需要搜索的串)为{"hee","he", "h","she", "his" ,"her","hers"}的树。注:下划线序号表示层高
构建过程也是查找的过程,举例讲解构建:hee过程,其他类似
构建hee过程:
1、构建h节点:输入串为h,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,不存在,插入
2、构建第一个e节点:输入串为he,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,存在,以h节点作为SNode,查找是否存在值为e,层高为strlen(he)的节点,不存在,插入
3、构建第二个e节点:输入串为hee,以根节点作为起始节点SNode,查找SNode下是否存在值为h,层高为strlen(h)=1的节点,存在,以h节点作为SNode,查找是否存在值为e,层高为strlen(he)的节点,存在,以e节点作为SNode节点,查找是否存在值为e,层高为strlen(hee)的节点,不存在,插入。构建字符e为构建串的最后一个字符时,该节点为输出节点
构建一个串中某个字符的节点时,会将该字符的前面所有字符+该字符作为一个串输入,搜索树并插入。因为每个字符必须要知道前面的路径
2、构建失败节点
- 根节点的失败节点为根节点,第一层节点的失败节点为根节点
- 某个节点的失败节点为:以该节点的父节点的失败节点为起点,查找值与该节点相同的子节点,如果能找到,找到的节点即为失败节点;如果找不到,则沿着失败节点继续向上查找(即起点的失败节点、再上一级的失败节点……),直到在根节点下也找不到时,失败节点为根节点
已知: f(root) = root;f(h_1)=root;f(s_1)=root
解: f(e_2) = g(f(h_1),e) = g(root,e) = root
f(e_3) = g(f(e_2),e) = g(root,e) = root
f(r_3) = g(f(e_2),r) = g(root,r) = root
f(s_4) = g(f(r_3),s) = g(root,s) = s_1
f(i_2) = g(f(h_1),i) = g(root,i) = root
f(s_3) = g(f(i_2),s) = g(root,s) = s_1
f(h_2) = g(f(s_1),h) = g(root,h) = h_1
f(e_3) = g(f(h_2),e)=g(h_1,e)=e_2
3、查找"ushers"中的模式串
1、遍历"ushers",从根节点作为起始节点SNode,查找字符为u的节点,查找失败,以SNode的失败节点为起始节点查找下一个字符s
2、以SNode作为起始节点,查找字符为s的节点,查找成功,判断节点是否为匹配节点,输出匹配值,以s节点作为起始节点查找h节点,直到遍历完"ushers"
大家可以手动试一下构建模式串为{"s","ss","sss","ssss"}的树,构建每个节点的失败节点,并在内容"ssss"中查找模式串
文字只讲了大致流程,还有很多细节没有描述出来,语言水平有限啊,细节大家看代码吧...
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
AC.h
#ifndef AC_H #define AC_H #define MAX_CHILD_LEN 128 #define MAX_OUTPUT 128 #include <string.h> #include <stdio.h> class Node { public: Node(char ele){ element = ele; parent = NULL; failNode = NULL; isMatchNode=false; memset(nodeList,0,MAX_CHILD_LEN); childNum=0; high = 0; memset(outPut,0,MAX_OUTPUT); outPutNum = 0; } ~Node(){ printf("node release,high:%d value:%c\n",high,element); for(int i=0;i<outPutNum;i++){ char * ele = outPut[i]; printf("release output str:%s\n",ele); delete ele; ele = NULL; } } char element; Node * parent; Node * nodeList[MAX_CHILD_LEN]; Node * failNode; bool isMatchNode; int childNum; int high; char * outPut[MAX_OUTPUT]; int outPutNum; }; typedef void(*Func)(char * matchedStr,int post); class AC { public: AC() {root = new Node(NULL);root->failNode = root;} ~AC(){ delete root; root = NULL; } void initTree(char * patterns[],int patLen){ for(int i=0;i<patLen;i++){ char * element = patterns[i]; int eleLen = strlen(element); for(int j=0;j<eleLen;j++){ bool isMatch = false; if(j == eleLen-1) isMatch = true; char * p = new char[j+2]; memset(p,'\0',j+2); strncpy(p,element,j+1); insert(p,isMatch); } } } void buildFailNode(){ traceAllNodes(root); } void match(char * srcTxt,int txtLen,Func f){ Node * startNode = root; for(int i=0;i<txtLen;i++){ char e = srcTxt[i]; bool isOk = false; for(int j=0;j<startNode->childNum;j++){ Node * node = startNode->nodeList[j]; if(node->element == e){ isOk = true; startNode = node; Node * failNode = node->failNode; while(failNode!=root){ if(failNode->isMatchNode){ for(int k=0;k<failNode->outPutNum;k++) f(failNode->outPut[k],i); } failNode = failNode->failNode; } if(node->isMatchNode){ for(int k=0;k<node->outPutNum;k++) f(node->outPut[k],i); } break; } } if(!isOk){ startNode = startNode->failNode; if(startNode!=root) i--; } } } void deleteTree(){ printf("delete tree--------------------------------\n"); traceDelNodes(root); } void printACTree(){ printf("tree structure-----------------------------\n"); printf("high value 
match failNode childNum children outPutStr \n"); tracePrintNodes(root); } private: void insert(char * ele,bool isMatch){ int eleLen = strlen(ele); //搜索ele最后一个元素节点是否存在,不存在则返回父节点 int startH = 1; Node * pnode = NULL; if(!search(root,startH,ele,pnode)){ Node * cnode = new Node(ele[eleLen-1]); cnode->high = eleLen; cnode->parent = pnode; cnode->isMatchNode = isMatch; pnode->nodeList[pnode->childNum]=cnode; pnode->childNum++; if(isMatch) cnode->outPut[cnode->outPutNum++]=ele; } } bool search(Node * pnode,int & index,char * ele,Node * &retNode){ for(int i=0;i<pnode->childNum;i++){ Node * node = pnode->nodeList[i]; if(node->element == ele[index-1]){ if(index == strlen(ele)) return true; index++; return search(node,index,ele,retNode); } } retNode = pnode; return false; } void initFailNode(Node * node){ if(node->high == 1){ node->failNode = root; }else{ //以父节点的失败函数作为起点,node的element作为触发边得到node的失败函数 Node * failNode = NULL; searchFailNode(node->parent,node,failNode); node->failNode = failNode; } } void traceAllNodes(Node * node){ for(int i=0;i<node->childNum;i++){ Node * cnode = node->nodeList[i]; initFailNode(cnode); traceAllNodes(cnode); } } void searchFailNode(Node * pnode,Node * cnode,Node * & retNode){ Node * failNode = pnode->failNode; for(int i=0;i<failNode->childNum;i++){ Node * node = failNode->nodeList[i]; if(node->element == cnode->element){ retNode = node; return; } } //循环已经走完,说明没找到节点,如果已经搜索了根节点没找到,则返回根节点作为失败节点,否则继续搜索 if(pnode->failNode == root) retNode = root; else searchFailNode(pnode->parent,pnode,retNode); } void tracePrintNodes(Node *node){ for(int i=0;i<node->childNum;i++){ Node * cnode = node->nodeList[i]; printf(" %d %c %d %d %d ",cnode->high,cnode->element,cnode->isMatchNode,cnode->failNode->high,cnode->childNum); for(int j=0;j<cnode->childNum;j++){ Node * lnode = cnode->nodeList[j]; printf("%c ",lnode->element); } printf(" "); for(int j=0;j<cnode->outPutNum;j++) printf("%s ",cnode->outPut[j]); printf("\n"); tracePrintNodes(cnode); } } void 
traceDelNodes(Node * node){ for(int i=0;i<node->childNum;i++){ Node * cnode = node->nodeList[i]; traceDelNodes(cnode); } if(node!=root){ delete node; node = NULL; } } private: Node * root; }; #endif // AC_H ----------------------------------------------------------------------------------- main.cpp #include "AC.h" #include <time.h> #include <sys/time.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <errno.h> #include <unistd.h> #include <map> #define MAX_FILE_CONTENT 1024*1024*20 void currentTime(char * timeOutPut,int timeLen){ time_t t = time(NULL); struct tm * timeinfo = localtime(&t); strftime (timeOutPut,timeLen,"%Y-%m-%d %H:%M:%S:",timeinfo); struct timeval tval; gettimeofday(&tval,NULL); sprintf(timeOutPut+strlen(timeOutPut),"%d",tval.tv_usec/1000); } //回调函数在外面进行统计,找到一个回调一次 void findCallBack(char * matchedStr,int startPos){ char curTime[32]={0}; currentTime(curTime,sizeof(curTime)); printf("end time:%s matchedStr:%s matchedPos:%d\n",curTime,matchedStr,startPos); } bool readFile(char * fileName,char * fileContent){ int fd = open(fileName,O_RDONLY); if(-1 == fd){ printf("open file error:%d\n",errno); return false; } int len = read(fd,fileContent,MAX_FILE_CONTENT); if(-1 == len){ printf("read file error:%d\n",errno); close(fd); return false; } close(fd); return true; } void ac_func(char ** pattern,int patLen,char *txt,int txtLen){ AC ac; ac.initTree(pattern,patLen); ac.buildFailNode(); ac.printACTree(); char curTime[32]={0}; currentTime(curTime,sizeof(curTime)); printf("searching tree-----------------------------\n"); printf("start time of search:%s\n",curTime); ac.match(txt,txtLen,findCallBack); ac.deleteTree(); } void test1(){ char *pattern[]={"hee","he", "h","she", "his" ,"her","hers"}; char *txt = "ushers"; int patternLen = sizeof(pattern)/sizeof(char*); int txtLen = strlen(txt); ac_func(pattern,patternLen,txt,txtLen); } void test2(){ //模式串出现的顺序在文件中刚好相反,里面存在一个关键字在日志文件的最后一行 char * pattern[] = 
{"39_347990193541512029","39_347739859976612007","13_1002D375","2017-09-29 17:43:37:517"}; //文件内容太大,使用堆内存 char * txt = new char[MAX_FILE_CONTENT]; bzero(txt,MAX_FILE_CONTENT); //test.log为一个10M的日志 if(!readFile("../testfile/test.log",txt)) return; int patternLen = sizeof(pattern)/sizeof(char*); int txtLen = strlen(txt); ac_func(pattern,patternLen,txt,txtLen); delete txt; txt = NULL; } void test3(){ //模式串和内容出现自包含,notepad遇到这样的搜索有bug char *pattern[]={"s", "ss","sss","ssss"}; char *txt = "ssss"; int patternLen = sizeof(pattern)/sizeof(char*); int txtLen = strlen(txt); ac_func(pattern,patternLen,txt,txtLen); } int main() { test1(); test2(); test3(); return 0; }
----------------------------------------------------------------------------------------------------
运行结果部分截图