C++ 简单中文敏感词检测工具类

具体思路:

1->敏感词库,可从数据库读取,也可以从文件加载.

2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节,汉字两个字节的格式编码,便于切分文字.

3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.

4->检测一段文字内容时,也事先转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.

代码.h

 1 #ifndef SENSITIVE_WORDS_CHECKER_
 2 #define SENSITIVE_WORDS_CHECKER_
 3 #include <stdint.h>
 4 #include <stdio.h>
 5 #include <memory.h>
 6 #include <map>
 7 #include <vector>
 8 
 9 enum {
10     enmMaxWordLength = 32,    //每个敏感词最大长度
11     enmMaxWordsFileLength = 1024 * 128,    //敏感词文件最大长度128k
12     enmMaxContentLength = 1024,    // 单次检测内容测最大长度
13 };
14 
15 struct SensitiveWord
16 {
17     char szWord[enmMaxWordLength];
18     SensitiveWord()
19     {
20         memset(szWord, 0, enmMaxWordLength);
21     }
22 };
23 
24 typedef std::vector<SensitiveWord*> WordList;
25 typedef std::map<uint32_t, WordList*> WordMap;
26 
27 class SensitiveWordsChecker
28 {
29 public:
30     SensitiveWordsChecker() :arrSensitiveWord(NULL), nSensitiveWordCnt(0){}
31     ~SensitiveWordsChecker(){ delete[] arrSensitiveWord; }
32 public:
33     void LoadWordsFromUTF8File(const char *file_name);
34     void LoadWordsFromGBKFile(const char *file_name);
35 protected:
36     int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
37     void DumpWordMap();
38     void GenTestData();
39     void Test();
40     void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);
41 private:
42     int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
43     int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
44     int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
45     int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
46     uint32_t GetWordsCount(char buf[],const uint32_t buf_size,char separator);
47     char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
48     int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
49     void BuildWordMap();
50     uint32_t GetFirstCharFromGBK(char gbk_buf[]);
51     uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
52     uint32_t GetFirstChar(char buf[]);
53     // 返回 0 表示in_utf8_buf里面没有敏感词
54     // 返回 1 表示in_utf8_buf里面含有关键词,并将关键词替换为*输出到out_utf8_buf
55     int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);
56     const SensitiveWord* FindSensitiveWord(uint32_t code,const char *pos);
57 private:
58     SensitiveWord *arrSensitiveWord;
59     uint32_t nSensitiveWordCnt;
60     WordMap mapWords;
61 };
62 
63 #endif
View Code

.cpp

  1 #include "SenditiveWordsChecker.h"
  2 #include "stdio.h"
  3 #include "string.h"
  4 #include "iconv.h"
  5 #include <stdarg.h>
  6 #include <new>
  7 
  8 void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name)
  9 {
 10     char utf8_buf[enmMaxWordsFileLength] , gbk_buf[enmMaxWordsFileLength];
 11     LoadFile(utf8_buf, enmMaxWordsFileLength, file_name);
 12     UTF8_To_GBK(utf8_buf, strlen(utf8_buf), gbk_buf, enmMaxWordsFileLength);
 13     GetWords(gbk_buf, enmMaxWordsFileLength, ',');
 14 }
 15 
 16 void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name)
 17 {
 18     char gbk_buf[enmMaxWordsFileLength];
 19     LoadFile(gbk_buf, enmMaxWordsFileLength, file_name);
 20     GetWords(gbk_buf, enmMaxWordsFileLength,',');
 21 }
 22 
 23 int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name)
 24 {
 25     FILE * pFile;
 26     size_t lSize = 0, result = 0;
 27     fopen_s(&pFile, file_name, "rb");
 28     if (pFile == NULL) { fputs("File error\n", stderr); return -1; }
 29     // obtain file size:
 30     fseek(pFile, 0, SEEK_END);
 31     lSize = ftell(pFile);
 32     rewind(pFile);
 33     if (lSize >= buf_size){ fputs("file too large\n", stderr); return -1; }
 34     result = fread(buf, 1, lSize, pFile);
 35     if (result != lSize) { fputs("Reading error\n", stderr); return -1; }
 36     buf[lSize] = '\0';
 37     return fclose(pFile);
 38 }
 39 
 40 int32_t SensitiveWordsChecker::CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen)
 41 {
 42     iconv_t cd;
 43     char **pin = &inbuf;
 44     char **pout = &outbuf;
 45 
 46     cd = iconv_open(to_charset, from_charset);
 47     if (cd == 0)
 48         return -1;
 49     memset(outbuf, 0, outlen);
 50     if (iconv(cd, pin, &inlen, pout, &outlen) == -1)
 51         return -1;
 52     iconv_close(cd);
 53     *pout = '\0';
 54     return 0;
 55 }
 56 
 57 int32_t SensitiveWordsChecker::UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
 58 {
 59     return CodeConvert("utf-8", "gbk", inbuf, inlen, outbuf, outlen);
 60 }
 61 
 62 int32_t SensitiveWordsChecker::GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
 63 {
 64     return CodeConvert("gbk", "utf-8", inbuf, inlen, outbuf, outlen);
 65 }
 66 
 67 uint32_t SensitiveWordsChecker::GetWordsCount(char buf[], const uint32_t buf_size, char separator)
 68 {
 69     const char *p = buf - 1;
 70     uint32_t i = 0;
 71     while ((p = strchr(p + 1, separator)) != NULL)
 72     {
 73         ++i;
 74     }
 75     return i;
 76 }
 77 
 78 int32_t SensitiveWordsChecker::WriteToFile(const char buf[], const int32_t buf_size, const char *file_name)
 79 {
 80     FILE * pFile;
 81     size_t result;
 82     fopen_s(&pFile, file_name, "wb");
 83     if (pFile == NULL) { fputs("File error\n", stderr); return -1; }
 84     result = fwrite(buf, 1, buf_size, pFile);
 85     if (result != buf_size) { fputs("Writing error\n", stderr); return -1; }
 86     return fclose(pFile);
 87 }
 88 
 89 int32_t SensitiveWordsChecker::GetWords(char gbk_buf[], const uint32_t buf_size, char separator)
 90 {
 91     char buf[enmMaxWordsFileLength];
 92     StrcpyExcludeChar(buf, enmMaxWordsFileLength, gbk_buf, "\n");    //排除换行符
 93     uint32_t nWordsCount = GetWordsCount(buf, buf_size,',');
 94     printf("words_count=%d\n", nWordsCount);
 95     arrSensitiveWord = new SensitiveWord[nWordsCount];
 96     if (arrSensitiveWord == NULL){return -1;}
 97     nSensitiveWordCnt = 0;
 98     const char *p = NULL,*q = buf;
 99     while ((p = strchr(q, separator)) != NULL)
100     {
101         memcpy(arrSensitiveWord[nSensitiveWordCnt].szWord, q, p - q);
102         //printf("%s\n", arrSensitiveWord[nSensitiveWordCnt].szWord);
103         q = p + 1;
104         ++nSensitiveWordCnt;
105     }
106     BuildWordMap();
107     return 0;
108 }
109 
110 char * SensitiveWordsChecker::StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list)
111 {
112     uint32_t i = 0, j = 0, flag = 0;
113     const char *p = NULL;
114     if (dst == NULL && src == NULL)return NULL;
115     if (dst == src)return dst;
116     for (; j < dst_len && src[i] != '\0'; ++i)
117     {
118         flag = 0;
119         p = exclude_list;
120         while (p && *p != '\0')
121         {
122             if (*p == src[i]){ flag = 1; break; }
123             p++;
124         }
125         if (flag == 0)dst[j++] = src[i];
126     }
127     dst[j] = '\0';
128     return dst;
129 }
130 
131 uint32_t SensitiveWordsChecker::GetFirstCharFromGBK(char gbk_buf[])
132 {
133     int32_t code = 0;
134     int32_t len = strlen(gbk_buf);
135     if (len == 0)return 0;
136     if (gbk_buf[0] >= 0 || len == 1)
137     {
138         //printf("%c\n", gbk_buf[0]);
139         return uint32_t(gbk_buf[0]);    //ASCII 字符
140     }
141     else
142     {
143         short high = (short)gbk_buf[0] + 256;
144         short low = (short)gbk_buf[1] + 256;
145         code = high * 256 + low;
146         char cstr[3];
147         cstr[0] = gbk_buf[0];    // GBK严格按照两个字节表示一个中文字符
148         cstr[1] = gbk_buf[1];
149         cstr[2] = 0;
150         //printf("%s %x\n", cstr, code);
151         return code;
152     }
153 }
154 
155 uint32_t SensitiveWordsChecker::GetFirstCharFromTUF8(char utf8_buf[])
156 {
157     uint32_t code = 0;
158     int32_t len = strlen(utf8_buf);
159     if (len == 0)return 0;
160     if (utf8_buf[0] >= 0 || len == 1)
161     {
162         printf("%c\n", utf8_buf[0]);
163         return int32_t(utf8_buf[0]);    //ASCII 字符
164     }
165     else
166     {
167         short high = (short)utf8_buf[0];
168         short mid = (short)utf8_buf[1];
169         short low = (short)utf8_buf[2];
170         code = high * 256 * 256 + mid * 256 + low;
171         char cstr[4];
172         cstr[0] = utf8_buf[0];    // UTF8大多数情况下三个字节表示一个中文字符
173         cstr[1] = utf8_buf[1];
174         cstr[2] = utf8_buf[2];
175         cstr[3] = 0;
176         printf("%s\n", cstr);
177         return code;
178     }
179 }
180 
181 uint32_t SensitiveWordsChecker::GetFirstChar(char buf[])
182 {
183     uint32_t code = 0;
184     int32_t len = strlen(buf);
185     if (len == 0)return 0;
186     return (uint32_t)buf[0];
187 }
188 
189 void SensitiveWordsChecker::BuildWordMap()
190 {
191     WordList *wordList = NULL;
192     for (uint32_t i = 0; i < nSensitiveWordCnt; ++i)
193     {
194         uint32_t code = GetFirstCharFromGBK(arrSensitiveWord[i].szWord);
195         WordMap::iterator it = mapWords.find(code);
196         if (it == mapWords.end())
197         {
198             wordList = new WordList();
199             mapWords[code] = wordList;
200         }
201         else
202         {
203             wordList = it->second;
204         }
205         wordList->push_back(&arrSensitiveWord[i]);
206     }
207     DumpWordMap();
208     GenTestData();
209     Test();
210 }
211 
212 void SensitiveWordsChecker::DumpWordMap()
213 {
214     uint32_t word_cnt = 0,i = 0;
215     WordMap::const_iterator it = mapWords.begin();
216     for (; it != mapWords.end(); ++it)
217     {
218         //printf("%u : %u\n", i++, it->second->size());
219         word_cnt += it->second->size();
220     }
221     printf("word_cnt = %u\n", word_cnt);
222 }
223 
224 int32_t SensitiveWordsChecker::CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[])
225 {
226     // 先把被检测字符串转换为GBK编码
227     char gbk_buf[enmMaxContentLength],out_gbk_buf[enmMaxContentLength];
228     UTF8_To_GBK(in_utf8_buf, strlen(in_utf8_buf), gbk_buf, enmMaxContentLength);
229     // 提取GBK字串里面的每一个字符,去map里面查找以该字符为首的关键词列表
230     int32_t gbk_buf_len = strlen(gbk_buf);
231     uint32_t code = 0, flag = 0, out_gbk_buf_len = 0;
232     char c = 0, cstr[3] = { 0 };
233     for (int32_t i = 0; i < gbk_buf_len;)
234     {
235         flag = 0;
236         if (gbk_buf[i] >= 0 || i == gbk_buf_len - 1)
237         {
238             c = gbk_buf[i];
239             //printf("%c\n", c);   //ASCII字符
240             code = (uint32_t)c;
241             flag = 1;
242             out_gbk_buf[out_gbk_buf_len] = c;
243         }
244         else
245         {
246             flag = 2;
247             short high = (short)gbk_buf[i] + 256;
248             short low = (short)gbk_buf[i + 1] + 256;
249             code = high * 256 + low;
250 
251             cstr[0] = gbk_buf[i];
252             cstr[1] = gbk_buf[i + 1];
253             cstr[2] = 0;
254 
255             out_gbk_buf[out_gbk_buf_len] = cstr[0];
256             out_gbk_buf[out_gbk_buf_len + 1] = cstr[1];
257             //printf("%s\n", cstr);
258         }
259         // 检查敏感词
260         const SensitiveWord *sensitiveWord = FindSensitiveWord(code, &gbk_buf[i]);
261         int32_t word_len = 0;
262         if (NULL != sensitiveWord)
263         {
264             flag = 0;
265             //printf("%s\n", sensitiveWord->szWord);
266             word_len = strlen(sensitiveWord->szWord);
267             memset(&out_gbk_buf[out_gbk_buf_len],'*', word_len);
268         }
269         int32_t step = word_len + flag;
270         i += step;
271         out_gbk_buf_len += step;
272     }
273     out_gbk_buf[out_gbk_buf_len] = '\0';
274     //printf("out_gbk_buf = %s\n", out_gbk_buf);
275     GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxContentLength);
276     return 0;
277 }
278 
279 const SensitiveWord* SensitiveWordsChecker::FindSensitiveWord(uint32_t code, const char *pos)
280 {
281     int32_t word_len = 0;
282     WordMap::const_iterator it = mapWords.find(code);
283     if (it == mapWords.end()){ return NULL; }
284     WordList *wordList = it->second;
285     for (uint32_t i = 0; i < wordList->size(); i++)
286     {
287         const SensitiveWord *sensitiveWord = (*wordList)[i];
288         word_len = strlen(sensitiveWord->szWord);
289         // 如果内容一样,就说明是敏感词
290         if (memcmp(sensitiveWord->szWord, pos, word_len) == 0)
291         {
292             return sensitiveWord;
293         }
294     }
295     return NULL;
296 }
297 
298 void SensitiveWordsChecker::GenTestData()
299 {
300     char in_gbk_buf[enmMaxWordsFileLength], out_gbk_buf[enmMaxWordsFileLength];
301     LoadFile(in_gbk_buf, enmMaxWordsFileLength, "poem.txt");
302     int32_t len = strlen(in_gbk_buf);
303     uint32_t n = 0;
304     for (int32_t i = 0; i < len && n < enmMaxWordsFileLength;++i)
305     {
306         if (i % 4 == 0 && short(in_gbk_buf[i]) > 0)
307         {
308             int32_t nRandIndex = rand() % nSensitiveWordCnt;
309             SensitiveWord sensitiveWord = arrSensitiveWord[nRandIndex];
310             int32_t word_len = strlen(sensitiveWord.szWord);
311             for (int32_t j = 0; j < word_len && n < enmMaxWordsFileLength; ++j)
312             {
313                 out_gbk_buf[n++] = sensitiveWord.szWord[j];
314             }
315         }
316         out_gbk_buf[n++] = in_gbk_buf[i];
317     }
318     out_gbk_buf[n] = '\0';
319     char out_utf8_buf[enmMaxWordsFileLength];
320     GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxWordsFileLength);
321     WriteToFile(out_utf8_buf, strlen(out_utf8_buf), "test_data.txt");
322 }
323 
324 void SensitiveWordsChecker::Test()
325 {
326     const int32_t max_line_len = 1024;
327     char utf8_buf[enmMaxWordsFileLength];
328     char out_utf8_buf[enmMaxWordsFileLength];
329     LoadFile(utf8_buf, enmMaxWordsFileLength, "test_data.txt");
330     const char *p = NULL, *q = utf8_buf;
331     uint32_t offset = 0;
332     while ((p = strchr(q, '\n')) != NULL)
333     {
334         char in_uft8_line[max_line_len] = { 0 };
335         char out_uft8_line[max_line_len] = { 0 };
336         char out_gbk_line[max_line_len] = { 0 };
337         memcpy(in_uft8_line, q, p - q);
338         UTF8_To_GBK(in_uft8_line, strlen(in_uft8_line), out_gbk_line, max_line_len);
339         printf("%s\n", out_gbk_line);
340         CheckSensitiveWord(out_uft8_line, in_uft8_line);
341         q = p + 1;
342         char gbk[enmMaxContentLength];
343         UTF8_To_GBK(out_uft8_line, strlen(out_uft8_line), gbk, enmMaxContentLength);
344         printf("%s\n", gbk);
345         StrAppend(out_utf8_buf, enmMaxWordsFileLength, offset, "%s", out_uft8_line);
346     }
347     WriteToFile(out_utf8_buf, offset, "test_data_ret.txt");
348 }
349 
350 void SensitiveWordsChecker::StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...)
351 {
352     va_list argptr;
353     va_start(argptr, fmt);
354     if (offset < bufLen)
355     {
356         offset += vsprintf_s(buf + offset, bufLen - offset, fmt, argptr);
357     }
358     va_end(argptr);
359 }
View Code

测试效果:

完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997

posted @ 2016-06-24 23:33  你好阿汤哥  Views(2479)  Comments(0Edit  收藏  举报