参考链接:http://www.cnblogs.com/zzqcn/p/3525636.html
感谢原文作者。
花了两天半时间实现并测试了算法。
按照上文的思路实现了一遍,可能是原文中有些地方描述得不是特别清楚,导致一开始测试的时候发现了各种匹配遗漏的情况,后来经过各种努力终于解决了这些遗漏。
同时在实现过程中也遇到了各种小问题,最后都解决了,总结起来主要有四个大坑,自己实现的时候需要注意,四个坑都在代码的注释里面了。
这里的实现虽然不会有遗漏的情况,但会有同一模式串在相同的偏移多次被命中的情况,但无伤大雅,至少没有遗漏不是吗。实际应用中只需对结果做去重就好了。
测试结论:对一个101.3MB的PE文件,从中随机抽取长度在[16, 116)字节范围内的模式串16个,分别用memcmp方式和AC自动机方式进行匹配,memcmp方式耗时33秒,AC方式耗时12秒,可见优势还是比较明显的。
代码中如有哪里不对,欢迎一起讨论。
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <stdint.h>
#include <vector>
#include <map>
#include <queue>
#include <ctime>

// Calling convention used for the match callback. __stdcall only exists on
// Windows toolchains; everywhere else the default convention is used.
#if defined(_WIN32)
#define AC_CALLCONV __stdcall
#else
#define AC_CALLCONV
#endif

/// One state of the Aho-Corasick automaton (a node of the pattern trie).
typedef struct ACNode
{
    uint64_t u64Depth;                                      // number of bytes on the path from the root
    struct ACNode *pFail;                                   // failure link, filled in by ACFail()
    std::map<unsigned char, struct ACNode *> *pmpGotoTab;   // goto table: byte -> next state
    struct ACParrent
    {
        struct ACNode *pParent;                             // state this one was reached from
        unsigned char ucCondition;                          // byte that leads from pParent to this state
    } Parent;
    bool bIsMathed;                                         // true when a pattern ends at this state
} AC_NODE, *P_AC_NODE;

/// Match callback: In_u64EndPos is the offset one past the last matched byte,
/// In_u64Len the pattern length, so the match starts at In_u64EndPos - In_u64Len.
typedef void (AC_CALLCONV *P_AC_FOUND_CALLBACK)(const unsigned char *In_pucBuf, uint64_t In_u64EndPos, uint64_t In_u64Len);

/// Builds the goto function (the trie) for the given patterns.
///
/// The root is stored at Out_vctACNodes[0] and maps every one of the 256 byte
/// values to itself until a pattern claims that byte, so a lookup in the root
/// never fails. Every allocated node is appended to Out_vctACNodes; release
/// them with ReleaseACNodes(), including after a failure.
///
/// @param In_vctPattern   patterns to insert; null or empty entries are skipped
/// @param Out_vctACNodes  receives the nodes; must be empty on entry
/// @return 0 on success, -1 if there are no patterns, -2 / -3 if allocating the
///         root / an inner node failed, -4 if Out_vctACNodes is not empty
int InitACGoto(const std::vector<const std::vector<unsigned char> *> &In_vctPattern,
               std::vector<P_AC_NODE> &Out_vctACNodes)
{
    if (In_vctPattern.empty())
        return -1;
    // ACFail() and the callers treat element 0 as the root, so appending to an
    // already populated vector would silently produce a broken automaton.
    if (!Out_vctACNodes.empty())
        return -4;

    P_AC_NODE pRoot = static_cast<P_AC_NODE>(calloc(1, sizeof(AC_NODE)));
    if (pRoot == NULL)
        return -2;

    pRoot->pmpGotoTab = new std::map<unsigned char, struct ACNode *>();
    for (unsigned int uiByte = 0; uiByte <= 0xff; ++uiByte)
        (*pRoot->pmpGotoTab)[static_cast<unsigned char>(uiByte)] = pRoot;
    Out_vctACNodes.push_back(pRoot);

    for (size_t uiPattIdx = 0; uiPattIdx < In_vctPattern.size(); ++uiPattIdx)
    {
        const std::vector<unsigned char> *pPattern = In_vctPattern[uiPattIdx];
        if (pPattern == NULL || pPattern->empty())
            continue;

        P_AC_NODE pCurNode = pRoot;
        for (size_t uiUCharIdx = 0; uiUCharIdx < pPattern->size(); ++uiUCharIdx)
        {
            const unsigned char ucCurUChar = (*pPattern)[uiUCharIdx];
            std::map<unsigned char, struct ACNode *>::iterator itGoto = pCurNode->pmpGotoTab->find(ucCurUChar);

            // An entry that points at the root is only the root's self-loop
            // placeholder, not a real child, so it is replaced as well.
            if (itGoto != pCurNode->pmpGotoTab->end() && itGoto->second != pRoot)
            {
                pCurNode = itGoto->second;
                continue;
            }

            P_AC_NODE pNode = static_cast<P_AC_NODE>(calloc(1, sizeof(AC_NODE)));
            if (pNode == NULL)
                return -3;

            pNode->u64Depth = uiUCharIdx + 1;
            pNode->Parent.pParent = pCurNode;
            pNode->Parent.ucCondition = ucCurUChar;
            pNode->pmpGotoTab = new std::map<unsigned char, struct ACNode *>();
            Out_vctACNodes.push_back(pNode);

            (*pCurNode->pmpGotoTab)[ucCurUChar] = pNode;
            pCurNode = pNode;
        }
        // The state reached after the last byte accepts this pattern.
        pCurNode->bIsMathed = true;
    }

    return 0;
}

/// Computes the failure link of every state with a breadth-first walk.
///
/// Breadth-first order guarantees that a parent's failure link is known before
/// its children are processed.
///
/// @param Out_vctACNodes  nodes produced by InitACGoto(); element 0 is the root
/// @return 0 on success, -1 if the vector is empty or has no root
int ACFail(std::vector<P_AC_NODE> &Out_vctACNodes)
{
    if (Out_vctACNodes.empty() || Out_vctACNodes[0] == NULL)
        return -1;

    const P_AC_NODE pRoot = Out_vctACNodes[0];
    std::queue<P_AC_NODE> quNodes;
    quNodes.push(pRoot);

    while (!quNodes.empty())
    {
        P_AC_NODE pNode = quNodes.front();
        quNodes.pop();

        if (pNode->u64Depth <= 1)
        {
            // The root and its direct children always fall back to the root.
            pNode->pFail = pRoot;
        }
        else
        {
            // Follow the parent's failure chain until a state has a transition
            // on the byte that led to this node. The root has a transition for
            // every byte, so the walk always terminates.
            const unsigned char ucCondition = pNode->Parent.ucCondition;
            P_AC_NODE pCandidate = pNode->Parent.pParent->pFail;
            std::map<unsigned char, struct ACNode *>::iterator itGoto = pCandidate->pmpGotoTab->find(ucCondition);
            while (itGoto == pCandidate->pmpGotoTab->end())
            {
                pCandidate = pCandidate->pFail;
                itGoto = pCandidate->pmpGotoTab->find(ucCondition);
            }
            pNode->pFail = itGoto->second;
        }

        std::map<unsigned char, struct ACNode *>::iterator itChild;
        for (itChild = pNode->pmpGotoTab->begin(); itChild != pNode->pmpGotoTab->end(); ++itChild)
        {
            // Skip the root's self-loops, otherwise the root is queued forever.
            if (itChild->second != pRoot)
                quNodes.push(itChild->second);
        }
    }

    return 0;
}

/// Sample callback: prints the start offset of a match in hexadecimal.
void AC_CALLCONV ACFoundCallBack(const unsigned char *In_pucBuf, uint64_t In_u64EndPos, uint64_t In_u64Len)
{
    if (In_pucBuf == NULL || In_u64Len == 0)
        return;

    // uint64_t must not be passed to %x; widen explicitly and use %llx.
    printf("<<<<<<<<<<FUCKOFF:%llx\n", static_cast<unsigned long long>(In_u64EndPos - In_u64Len));
}

/// Scans a buffer and invokes the callback once for every pattern occurrence.
///
/// For each input byte the automaton first follows failure links until a
/// transition exists, then consumes the byte and reports every pattern that
/// ends at the new position. Matches are reported only after a byte has been
/// consumed, so the same (pattern, offset) pair is never reported twice.
///
/// @param In_pRoot       root of an automaton prepared by InitACGoto() + ACFail()
/// @param In_pucBuf      buffer to scan
/// @param In_u64BufLen   number of bytes in In_pucBuf
/// @param In_pfCallBack  invoked with (buffer, end offset, pattern length)
/// @return 0 on success, -1 on an invalid argument, -2 if the automaton has no
///         goto table or no failure links (ACFail() was not run)
int ACSearch(const P_AC_NODE In_pRoot, const unsigned char *In_pucBuf, uint64_t In_u64BufLen, P_AC_FOUND_CALLBACK In_pfCallBack)
{
    if (In_pRoot == NULL || In_pucBuf == NULL || In_u64BufLen == 0 || In_pfCallBack == NULL)
        return -1;
    if (In_pRoot->pmpGotoTab == NULL || In_pRoot->pFail == NULL)
        return -2;

    P_AC_NODE pCurrent = In_pRoot;
    for (uint64_t u64Idx = 0; u64Idx < In_u64BufLen; ++u64Idx)
    {
        const unsigned char ucByte = In_pucBuf[u64Idx];

        // Pitfall 1: on a mismatch do not consume the byte; fall back along the
        // failure links until some state accepts it (the root accepts any byte).
        std::map<unsigned char, struct ACNode *>::iterator itGoto = pCurrent->pmpGotoTab->find(ucByte);
        while (itGoto == pCurrent->pmpGotoTab->end())
        {
            pCurrent = pCurrent->pFail;
            itGoto = pCurrent->pmpGotoTab->find(ucByte);
        }
        pCurrent = itGoto->second;

        const uint64_t u64EndPos = u64Idx + 1;

        // Pitfalls 3 and 4: every state on the failure chain, all the way to the
        // root, is a suffix of the current state and may itself be a complete
        // pattern (e.g. "bc" inside "abcd"), so the whole chain is inspected.
        for (P_AC_NODE pSuffix = pCurrent->pFail; pSuffix != In_pRoot; pSuffix = pSuffix->pFail)
        {
            if (pSuffix->bIsMathed)
                In_pfCallBack(In_pucBuf, u64EndPos, pSuffix->u64Depth);
        }

        // Pitfall 2: the state itself is checked no matter how it was reached,
        // because a transition taken after a fallback can land on an accepting
        // state too.
        if (pCurrent->bIsMathed)
            In_pfCallBack(In_pucBuf, u64EndPos, pCurrent->u64Depth);
    }

    return 0;
}

/// Frees every node (and its goto table) and empties the vector.
void ReleaseACNodes(std::vector<P_AC_NODE> &Out_vctACNodes)
{
    for (size_t uiIdx = 0; uiIdx < Out_vctACNodes.size(); ++uiIdx)
    {
        P_AC_NODE pNode = Out_vctACNodes[uiIdx];
        if (pNode == NULL)
            continue;
        delete pNode->pmpGotoTab;
        free(pNode);
    }
    Out_vctACNodes.clear();
}

// Define AC_NO_DEMO_MAIN to compile the automaton without the benchmark
// driver, e.g. when embedding it into another program or a test harness.
#ifndef AC_NO_DEMO_MAIN
/// Benchmark driver: loads the file named by argv[1], samples patterns from
/// it, and compares a brute-force memcmp scan with the automaton.
int main(int argc, char **argv)
{
    const int kPatternCount = 1600;
    const long kTailMargin = 128;   // patterns start at least this far before the end of the file

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "ac_demo");
        return 1;
    }

    FILE *pf = fopen(argv[1], "rb");
    if (pf == NULL)
    {
        perror(argv[1]);
        return 1;
    }

    long lFileSize = -1;
    if (fseek(pf, 0, SEEK_END) == 0)
        lFileSize = ftell(pf);
    // The pattern sampler computes rand() % (lFileSize - kTailMargin), so the
    // file has to be strictly larger than the margin.
    if (lFileSize <= kTailMargin || fseek(pf, 0, SEEK_SET) != 0)
    {
        fprintf(stderr, "%s: cannot size the file, or it is not larger than %ld bytes\n", argv[1], kTailMargin);
        fclose(pf);
        return 1;
    }

    std::vector<unsigned char> vctBuf(static_cast<size_t>(lFileSize));
    const size_t uiRead = fread(&vctBuf[0], 1, vctBuf.size(), pf);
    fclose(pf);
    if (uiRead != vctBuf.size())
    {
        fprintf(stderr, "%s: short read\n", argv[1]);
        return 1;
    }
    const unsigned char *pucBuf = &vctBuf[0];

    // The storage is sized up front, so the element addresses handed to
    // vctPatterns stay valid for the whole run.
    std::vector<std::vector<unsigned char> > vctStorage(kPatternCount);
    std::vector<const std::vector<unsigned char> *> vctPatterns;
    vctPatterns.reserve(kPatternCount);

    double dMemSec = 0.0;
    for (int i = 0; i < kPatternCount; ++i)
    {
        const long lBegin = rand() % (lFileSize - kTailMargin);
        const long lLen = rand() % 100 + 16;
        vctStorage[i].assign(pucBuf + lBegin, pucBuf + lBegin + lLen);
        vctPatterns.push_back(&vctStorage[i]);
        printf("%lx:%lu\n", static_cast<unsigned long>(lBegin), static_cast<unsigned long>(lLen));

        // Time the whole scan at once; sampling the clock around every single
        // memcmp would mostly measure the clock calls themselves.
        const clock_t tMemBegin = clock();
        for (long j = 0; j + lLen <= lFileSize; ++j)
        {
            if (memcmp(pucBuf + lBegin, pucBuf + j, static_cast<size_t>(lLen)) == 0)
                printf(">>>>>>>>>>Off:%lx\n", static_cast<unsigned long>(j));
        }
        dMemSec += static_cast<double>(clock() - tMemBegin) / CLOCKS_PER_SEC;
    }

    std::vector<P_AC_NODE> vctNodes;
    int iRetVal = InitACGoto(vctPatterns, vctNodes);
    if (iRetVal == 0)
        iRetVal = ACFail(vctNodes);
    if (iRetVal != 0)
    {
        fprintf(stderr, "failed to build the automaton (%d)\n", iRetVal);
        ReleaseACNodes(vctNodes);
        return 1;
    }

    const clock_t tACBegin = clock();
    ACSearch(vctNodes[0], pucBuf, static_cast<uint64_t>(lFileSize), ACFoundCallBack);
    const double dACSec = static_cast<double>(clock() - tACBegin) / CLOCKS_PER_SEC;

    printf("MemTime::%f\nACTime::%f\n", dMemSec, dACSec);
    ReleaseACNodes(vctNodes);
    return 0;
}
#endif // AC_NO_DEMO_MAIN
本文首发于博客园,任何其他站点均为爬虫或转载,爬虫最无耻。