参考链接:
https://zhuanlan.zhihu.com/p/84685657
https://www.cnblogs.com/luguoshuai/p/9254190.html
一开始打算使用树结构来实现,但考虑到敏感字数量过多,其构建的树会十分庞大,占用过多内存,所以这里使用的是上面链接里面提到的“滑动窗口”方法
例如,要检测的字符串为“123”,则实际上只需处理以下6种情况:1,12,123,2,23,3;即最多进行6次敏感字的判断(一般地,长度为n的字符串最多需要判断 n(n+1)/2 次)。
-- Sample sensitive words (for testing only).
local sensitiveWordList = {
    "早啊",
    "a啊",
}
-- Lookup set: key = sensitive word, value = true.
local sensitiveWordDic = {}

--- Count the characters (not bytes) of a UTF-8 string.
-- Bytes in the range \128-\193 (UTF-8 continuation bytes plus the two invalid
-- lead bytes) are not counted, so each character is counted once.
-- @param str UTF-8 encoded string
-- @return number of characters in str
function GetWordCount(str)
    local _, count = string.gsub(str, "[^\128-\193]", "")
    return count
end

--- Split a UTF-8 string into an array of single characters.
-- Each match is one lead byte followed by any number of continuation bytes.
-- @param str UTF-8 encoded string
-- @return array-like table with one character (as a string) per element
function GetWordTable(str)
    local chars = {}
    for uchar in string.gmatch(str, "[%z\1-\127\194-\244][\128-\191]*") do
        chars[#chars + 1] = uchar
    end
    return chars
end

--- Join a slice of a character table back into a string.
-- @param wordTable array of strings, as produced by GetWordTable
-- @param startPos first index to include (defaults to 1)
-- @param endPos last index to include (defaults to #wordTable)
-- @return the concatenation of wordTable[startPos..endPos]; "" when the
--         range is empty
function GetWordStr(wordTable, startPos, endPos)
    return table.concat(wordTable, "", startPos or 1, endPos or #wordTable)
end

--- Detect sensitive words in a string and optionally mask them.
-- Uses a sliding window: for every start position the candidate substring is
-- extended one character at a time and looked up in sensitiveWordDic.
-- @param str UTF-8 encoded string to inspect
-- @param isOnlyCheck when truthy, only report whether a sensitive word exists
-- @return when isOnlyCheck is truthy: a boolean;
--         otherwise: a boolean followed by str with every character of each
--         matched word replaced by "*"
function HandleSensitiveWord(str, isOnlyCheck)
    local wordTable = GetWordTable(str)
    local wordCount = #wordTable
    local recordPos = {}            -- {startIndex, endIndex} of every match
    local hasSensitiveWord = false

    for i = 1, wordCount do         -- window start
        -- Grow the candidate incrementally instead of re-joining
        -- wordTable[i..j] from scratch for every j.
        local candidate = ""
        for j = i, wordCount do     -- window end
            candidate = candidate .. wordTable[j]
            if sensitiveWordDic[candidate] then
                if isOnlyCheck then
                    -- A single hit is enough in check-only mode.
                    return true
                end
                hasSensitiveWord = true
                recordPos[#recordPos + 1] = { i, j }
                -- NOTE: the shortest match for this start position wins, so a
                -- longer word that shares this prefix is not matched from here.
                break
            end
        end
        -- Scanning resumes at i + 1 rather than after the match, so words that
        -- overlap an earlier match are still found and masked.
    end

    if isOnlyCheck then
        return hasSensitiveWord
    end

    -- Mask after scanning so that the lookups above always see the original
    -- characters.
    for _, range in ipairs(recordPos) do
        for k = range[1], range[2] do
            wordTable[k] = "*"
        end
    end
    return hasSensitiveWord, GetWordStr(wordTable)
end

--- Build the lookup set from sensitiveWordList and run a small smoke test.
function Init()
    for i = 1, #sensitiveWordList do
        sensitiveWordDic[sensitiveWordList[i]] = true
    end

    -- Smoke test.
    local hasSensitiveWord, resultStr = HandleSensitiveWord("a早啊s")
    print(hasSensitiveWord, resultStr)
end

Init()
结果如下: