C++ 简单中文敏感词检测工具类
具体思路:
1->敏感词库,可从数据库读取,也可以从文件加载.
2->将敏感词转化为GBK编码,因为GBK严格按照英文字符一个字节、汉字两个字节的格式编码,便于按字符切分文本.
3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.
4->检测一段文字内容时,也事先转化为GBK,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.
代码.h
#ifndef SENSITIVE_WORDS_CHECKER_
#define SENSITIVE_WORDS_CHECKER_
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <memory.h>
#include <map>
#include <vector>

enum {
    enmMaxWordLength = 32,               // maximum length of one sensitive word, in bytes
    enmMaxWordsFileLength = 1024 * 128,  // maximum size of the sensitive-word file (128 KB)
    enmMaxContentLength = 1024,          // maximum length of the content checked in one call
};

// One sensitive word stored as a fixed-size, zero-initialised character array.
struct SensitiveWord
{
    char szWord[enmMaxWordLength];
    SensitiveWord()
    {
        memset(szWord, 0, enmMaxWordLength);
    }
};

// List of pointers to the words that share the same first character.
typedef std::vector<SensitiveWord*> WordList;
// Index: integer key of a word's first character -> list of words starting with it.
typedef std::map<uint32_t, WordList*> WordMap;

// Sensitive-word checker: loads a word list from a file and indexes the words
// by their first character.
class SensitiveWordsChecker
{
public:
    SensitiveWordsChecker() : arrSensitiveWord(NULL), nSensitiveWordCnt(0) {}

    // Releases the per-key word lists held by the index as well as the word
    // array itself; the lists only point into the array, so they go first.
    ~SensitiveWordsChecker()
    {
        for (WordMap::iterator it = mapWords.begin(); it != mapWords.end(); ++it)
        {
            delete it->second;
        }
        delete[] arrSensitiveWord;
    }

public:
    void LoadWordsFromUTF8File(const char *file_name);
    void LoadWordsFromGBKFile(const char *file_name);

    // Intended contract, taken from the original comment:
    //   returns 0 when in_utf8_buf contains no sensitive word;
    //   returns 1 when in_utf8_buf contains sensitive words, which are
    //   replaced by '*' in the text written to out_utf8_buf.
    // Declared public so that code outside the class can run a check.
    int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);

protected:
    int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
    void DumpWordMap();
    void GenTestData();
    void Test();
    void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);

private:
    int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
    int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    uint32_t GetWordsCount(char buf[], const uint32_t buf_size, char separator);
    char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
    int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
    void BuildWordMap();
    uint32_t GetFirstCharFromGBK(char gbk_buf[]);
    // NOTE: "TUF8" is a typo for "UTF8"; the name is kept because the
    // implementation file defines the method under this spelling.
    uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
    uint32_t GetFirstChar(char buf[]);
    const SensitiveWord* FindSensitiveWord(uint32_t code, const char *pos);

private:
    // The object owns raw pointers, so copying would free them twice.
    // Declared but intentionally not defined to forbid copies.
    SensitiveWordsChecker(const SensitiveWordsChecker &);
    SensitiveWordsChecker &operator=(const SensitiveWordsChecker &);

private:
    SensitiveWord *arrSensitiveWord;  // array of loaded words, released with delete[]
    uint32_t nSensitiveWordCnt;       // number of words loaded into arrSensitiveWord
    WordMap mapWords;                 // first-character index over arrSensitiveWord
};

#endif
.cpp
1 #include "SenditiveWordsChecker.h" 2 #include "stdio.h" 3 #include "string.h" 4 #include "iconv.h" 5 #include <stdarg.h> 6 #include <new> 7 8 void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name) 9 { 10 char utf8_buf[enmMaxWordsFileLength] , gbk_buf[enmMaxWordsFileLength]; 11 LoadFile(utf8_buf, enmMaxWordsFileLength, file_name); 12 UTF8_To_GBK(utf8_buf, strlen(utf8_buf), gbk_buf, enmMaxWordsFileLength); 13 GetWords(gbk_buf, enmMaxWordsFileLength, ','); 14 } 15 16 void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name) 17 { 18 char gbk_buf[enmMaxWordsFileLength]; 19 LoadFile(gbk_buf, enmMaxWordsFileLength, file_name); 20 GetWords(gbk_buf, enmMaxWordsFileLength,','); 21 } 22 23 int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name) 24 { 25 FILE * pFile; 26 size_t lSize = 0, result = 0; 27 fopen_s(&pFile, file_name, "rb"); 28 if (pFile == NULL) { fputs("File error\n", stderr); return -1; } 29 // obtain file size: 30 fseek(pFile, 0, SEEK_END); 31 lSize = ftell(pFile); 32 rewind(pFile); 33 if (lSize >= buf_size){ fputs("file too large\n", stderr); return -1; } 34 result = fread(buf, 1, lSize, pFile); 35 if (result != lSize) { fputs("Reading error\n", stderr); return -1; } 36 buf[lSize] = '\0'; 37 return fclose(pFile); 38 } 39 40 int32_t SensitiveWordsChecker::CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen) 41 { 42 iconv_t cd; 43 char **pin = &inbuf; 44 char **pout = &outbuf; 45 46 cd = iconv_open(to_charset, from_charset); 47 if (cd == 0) 48 return -1; 49 memset(outbuf, 0, outlen); 50 if (iconv(cd, pin, &inlen, pout, &outlen) == -1) 51 return -1; 52 iconv_close(cd); 53 *pout = '\0'; 54 return 0; 55 } 56 57 int32_t SensitiveWordsChecker::UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen) 58 { 59 return CodeConvert("utf-8", "gbk", inbuf, inlen, outbuf, outlen); 60 } 61 62 int32_t 
SensitiveWordsChecker::GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen) 63 { 64 return CodeConvert("gbk", "utf-8", inbuf, inlen, outbuf, outlen); 65 } 66 67 uint32_t SensitiveWordsChecker::GetWordsCount(char buf[], const uint32_t buf_size, char separator) 68 { 69 const char *p = buf - 1; 70 uint32_t i = 0; 71 while ((p = strchr(p + 1, separator)) != NULL) 72 { 73 ++i; 74 } 75 return i; 76 } 77 78 int32_t SensitiveWordsChecker::WriteToFile(const char buf[], const int32_t buf_size, const char *file_name) 79 { 80 FILE * pFile; 81 size_t result; 82 fopen_s(&pFile, file_name, "wb"); 83 if (pFile == NULL) { fputs("File error\n", stderr); return -1; } 84 result = fwrite(buf, 1, buf_size, pFile); 85 if (result != buf_size) { fputs("Writing error\n", stderr); return -1; } 86 return fclose(pFile); 87 } 88 89 int32_t SensitiveWordsChecker::GetWords(char gbk_buf[], const uint32_t buf_size, char separator) 90 { 91 char buf[enmMaxWordsFileLength]; 92 StrcpyExcludeChar(buf, enmMaxWordsFileLength, gbk_buf, "\n"); //排除换行符 93 uint32_t nWordsCount = GetWordsCount(buf, buf_size,','); 94 printf("words_count=%d\n", nWordsCount); 95 arrSensitiveWord = new SensitiveWord[nWordsCount]; 96 if (arrSensitiveWord == NULL){return -1;} 97 nSensitiveWordCnt = 0; 98 const char *p = NULL,*q = buf; 99 while ((p = strchr(q, separator)) != NULL) 100 { 101 memcpy(arrSensitiveWord[nSensitiveWordCnt].szWord, q, p - q); 102 //printf("%s\n", arrSensitiveWord[nSensitiveWordCnt].szWord); 103 q = p + 1; 104 ++nSensitiveWordCnt; 105 } 106 BuildWordMap(); 107 return 0; 108 } 109 110 char * SensitiveWordsChecker::StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list) 111 { 112 uint32_t i = 0, j = 0, flag = 0; 113 const char *p = NULL; 114 if (dst == NULL && src == NULL)return NULL; 115 if (dst == src)return dst; 116 for (; j < dst_len && src[i] != '\0'; ++i) 117 { 118 flag = 0; 119 p = exclude_list; 120 while (p && *p != '\0') 121 { 122 if (*p == 
src[i]){ flag = 1; break; } 123 p++; 124 } 125 if (flag == 0)dst[j++] = src[i]; 126 } 127 dst[j] = '\0'; 128 return dst; 129 } 130 131 uint32_t SensitiveWordsChecker::GetFirstCharFromGBK(char gbk_buf[]) 132 { 133 int32_t code = 0; 134 int32_t len = strlen(gbk_buf); 135 if (len == 0)return 0; 136 if (gbk_buf[0] >= 0 || len == 1) 137 { 138 //printf("%c\n", gbk_buf[0]); 139 return uint32_t(gbk_buf[0]); //ASCII 字符 140 } 141 else 142 { 143 short high = (short)gbk_buf[0] + 256; 144 short low = (short)gbk_buf[1] + 256; 145 code = high * 256 + low; 146 char cstr[3]; 147 cstr[0] = gbk_buf[0]; // GBK严格按照两个字节表示一个中文字符 148 cstr[1] = gbk_buf[1]; 149 cstr[2] = 0; 150 //printf("%s %x\n", cstr, code); 151 return code; 152 } 153 } 154 155 uint32_t SensitiveWordsChecker::GetFirstCharFromTUF8(char utf8_buf[]) 156 { 157 uint32_t code = 0; 158 int32_t len = strlen(utf8_buf); 159 if (len == 0)return 0; 160 if (utf8_buf[0] >= 0 || len == 1) 161 { 162 printf("%c\n", utf8_buf[0]); 163 return int32_t(utf8_buf[0]); //ASCII 字符 164 } 165 else 166 { 167 short high = (short)utf8_buf[0]; 168 short mid = (short)utf8_buf[1]; 169 short low = (short)utf8_buf[2]; 170 code = high * 256 * 256 + mid * 256 + low; 171 char cstr[4]; 172 cstr[0] = utf8_buf[0]; // UTF8大多数情况下三个字节表示一个中文字符 173 cstr[1] = utf8_buf[1]; 174 cstr[2] = utf8_buf[2]; 175 cstr[3] = 0; 176 printf("%s\n", cstr); 177 return code; 178 } 179 } 180 181 uint32_t SensitiveWordsChecker::GetFirstChar(char buf[]) 182 { 183 uint32_t code = 0; 184 int32_t len = strlen(buf); 185 if (len == 0)return 0; 186 return (uint32_t)buf[0]; 187 } 188 189 void SensitiveWordsChecker::BuildWordMap() 190 { 191 WordList *wordList = NULL; 192 for (uint32_t i = 0; i < nSensitiveWordCnt; ++i) 193 { 194 uint32_t code = GetFirstCharFromGBK(arrSensitiveWord[i].szWord); 195 WordMap::iterator it = mapWords.find(code); 196 if (it == mapWords.end()) 197 { 198 wordList = new WordList(); 199 mapWords[code] = wordList; 200 } 201 else 202 { 203 wordList = it->second; 204 } 205 
wordList->push_back(&arrSensitiveWord[i]); 206 } 207 DumpWordMap(); 208 GenTestData(); 209 Test(); 210 } 211 212 void SensitiveWordsChecker::DumpWordMap() 213 { 214 uint32_t word_cnt = 0,i = 0; 215 WordMap::const_iterator it = mapWords.begin(); 216 for (; it != mapWords.end(); ++it) 217 { 218 //printf("%u : %u\n", i++, it->second->size()); 219 word_cnt += it->second->size(); 220 } 221 printf("word_cnt = %u\n", word_cnt); 222 } 223 224 int32_t SensitiveWordsChecker::CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]) 225 { 226 // 先把被检测字符串转换为GBK编码 227 char gbk_buf[enmMaxContentLength],out_gbk_buf[enmMaxContentLength]; 228 UTF8_To_GBK(in_utf8_buf, strlen(in_utf8_buf), gbk_buf, enmMaxContentLength); 229 // 提取GBK字串里面的每一个字符,去map里面查找以该字符为首的关键词列表 230 int32_t gbk_buf_len = strlen(gbk_buf); 231 uint32_t code = 0, flag = 0, out_gbk_buf_len = 0; 232 char c = 0, cstr[3] = { 0 }; 233 for (int32_t i = 0; i < gbk_buf_len;) 234 { 235 flag = 0; 236 if (gbk_buf[i] >= 0 || i == gbk_buf_len - 1) 237 { 238 c = gbk_buf[i]; 239 //printf("%c\n", c); //ASCII字符 240 code = (uint32_t)c; 241 flag = 1; 242 out_gbk_buf[out_gbk_buf_len] = c; 243 } 244 else 245 { 246 flag = 2; 247 short high = (short)gbk_buf[i] + 256; 248 short low = (short)gbk_buf[i + 1] + 256; 249 code = high * 256 + low; 250 251 cstr[0] = gbk_buf[i]; 252 cstr[1] = gbk_buf[i + 1]; 253 cstr[2] = 0; 254 255 out_gbk_buf[out_gbk_buf_len] = cstr[0]; 256 out_gbk_buf[out_gbk_buf_len + 1] = cstr[1]; 257 //printf("%s\n", cstr); 258 } 259 // 检查敏感词 260 const SensitiveWord *sensitiveWord = FindSensitiveWord(code, &gbk_buf[i]); 261 int32_t word_len = 0; 262 if (NULL != sensitiveWord) 263 { 264 flag = 0; 265 //printf("%s\n", sensitiveWord->szWord); 266 word_len = strlen(sensitiveWord->szWord); 267 memset(&out_gbk_buf[out_gbk_buf_len],'*', word_len); 268 } 269 int32_t step = word_len + flag; 270 i += step; 271 out_gbk_buf_len += step; 272 } 273 out_gbk_buf[out_gbk_buf_len] = '\0'; 274 //printf("out_gbk_buf = %s\n", out_gbk_buf); 275 
GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxContentLength); 276 return 0; 277 } 278 279 const SensitiveWord* SensitiveWordsChecker::FindSensitiveWord(uint32_t code, const char *pos) 280 { 281 int32_t word_len = 0; 282 WordMap::const_iterator it = mapWords.find(code); 283 if (it == mapWords.end()){ return NULL; } 284 WordList *wordList = it->second; 285 for (uint32_t i = 0; i < wordList->size(); i++) 286 { 287 const SensitiveWord *sensitiveWord = (*wordList)[i]; 288 word_len = strlen(sensitiveWord->szWord); 289 // 如果内容一样,就说明是敏感词 290 if (memcmp(sensitiveWord->szWord, pos, word_len) == 0) 291 { 292 return sensitiveWord; 293 } 294 } 295 return NULL; 296 } 297 298 void SensitiveWordsChecker::GenTestData() 299 { 300 char in_gbk_buf[enmMaxWordsFileLength], out_gbk_buf[enmMaxWordsFileLength]; 301 LoadFile(in_gbk_buf, enmMaxWordsFileLength, "poem.txt"); 302 int32_t len = strlen(in_gbk_buf); 303 uint32_t n = 0; 304 for (int32_t i = 0; i < len && n < enmMaxWordsFileLength;++i) 305 { 306 if (i % 4 == 0 && short(in_gbk_buf[i]) > 0) 307 { 308 int32_t nRandIndex = rand() % nSensitiveWordCnt; 309 SensitiveWord sensitiveWord = arrSensitiveWord[nRandIndex]; 310 int32_t word_len = strlen(sensitiveWord.szWord); 311 for (int32_t j = 0; j < word_len && n < enmMaxWordsFileLength; ++j) 312 { 313 out_gbk_buf[n++] = sensitiveWord.szWord[j]; 314 } 315 } 316 out_gbk_buf[n++] = in_gbk_buf[i]; 317 } 318 out_gbk_buf[n] = '\0'; 319 char out_utf8_buf[enmMaxWordsFileLength]; 320 GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxWordsFileLength); 321 WriteToFile(out_utf8_buf, strlen(out_utf8_buf), "test_data.txt"); 322 } 323 324 void SensitiveWordsChecker::Test() 325 { 326 const int32_t max_line_len = 1024; 327 char utf8_buf[enmMaxWordsFileLength]; 328 char out_utf8_buf[enmMaxWordsFileLength]; 329 LoadFile(utf8_buf, enmMaxWordsFileLength, "test_data.txt"); 330 const char *p = NULL, *q = utf8_buf; 331 uint32_t offset = 0; 332 while ((p = strchr(q, '\n')) != NULL) 
333 { 334 char in_uft8_line[max_line_len] = { 0 }; 335 char out_uft8_line[max_line_len] = { 0 }; 336 char out_gbk_line[max_line_len] = { 0 }; 337 memcpy(in_uft8_line, q, p - q); 338 UTF8_To_GBK(in_uft8_line, strlen(in_uft8_line), out_gbk_line, max_line_len); 339 printf("%s\n", out_gbk_line); 340 CheckSensitiveWord(out_uft8_line, in_uft8_line); 341 q = p + 1; 342 char gbk[enmMaxContentLength]; 343 UTF8_To_GBK(out_uft8_line, strlen(out_uft8_line), gbk, enmMaxContentLength); 344 printf("%s\n", gbk); 345 StrAppend(out_utf8_buf, enmMaxWordsFileLength, offset, "%s", out_uft8_line); 346 } 347 WriteToFile(out_utf8_buf, offset, "test_data_ret.txt"); 348 } 349 350 void SensitiveWordsChecker::StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...) 351 { 352 va_list argptr; 353 va_start(argptr, fmt); 354 if (offset < bufLen) 355 { 356 offset += vsprintf_s(buf + offset, bufLen - offset, fmt, argptr); 357 } 358 va_end(argptr); 359 }
测试效果:
完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997