C# 敏感词过滤

    public class BadWordFilter

    {

        #region 变量

        private HashSet<string> hash = new HashSet<string>();

        private byte[] fastCheck = new byte[char.MaxValue];

        private byte[] fastLength = new byte[char.MaxValue];

        private BitArray charCheck = new BitArray(char.MaxValue);

        private BitArray endCheck = new BitArray(char.MaxValue);

        private int maxWordLength = 0;

        private int minWordLength = int.MaxValue;

        private string _replaceString = "*";

        private string _newWord;

        #endregion

 

        #region 单例模式创建实例

        private static BadWordFilter badWordFilter = null;

 

        /// <summary>

        /// 构造函数

        /// </summary>

        private BadWordFilter() { }

 

        /// <summary>

        /// 单例

        /// </summary>

        /// <returns></returns>

        public static BadWordFilter CreateBadWordsFilter()

        {

            if (badWordFilter == null)

            {

                badWordFilter = new BadWordFilter();

            }

            return badWordFilter;

        }

        #endregion

 

        #region 初始化数据,将List集合类型敏感词放入HashSet中

        /// <summary>

        /// 初始化数据,将敏感词放入HashSet中

        /// </summary>

        /// <param name="badwords"></param>

        public void Init(List<BadWordEntity> badwords)

        {

            foreach (BadWordEntity word in badwords)

            {

                maxWordLength = Math.Max(maxWordLength, word.BadWord.Length);

                minWordLength = Math.Min(minWordLength, word.BadWord.Length);

                for (int i = 0; i < 7 && i < word.BadWord.Length; i++)

                {

                    fastCheck[word.BadWord[i]] |= (byte)(1 << i);

                }

                for (int i = 7; i < word.BadWord.Length; i++)

                {

                    fastCheck[word.BadWord[i]] |= 0x80;

                }

                if (word.BadWord.Length == 1)

                {

                    charCheck[word.BadWord[0]] = true;

                }

                else

                {

                    fastLength[word.BadWord[0]] |= (byte)(1 << (Math.Min(7, word.BadWord.Length - 2)));

                    endCheck[word.BadWord[word.BadWord.Length - 1]] = true;

 

                    hash.Add(word.BadWord);

                }

            }

        }

        #endregion

 

        #region 初始化数据,将String[]类型敏感词放入HashSet中

        /// <summary>

        /// 初始化数据,将敏感词放入HashSet中

        /// </summary>

        /// <param name="badwords"></param>

        private void Init(string[] badwords)

        {

            foreach (string word in badwords)

            {

                maxWordLength = Math.Max(maxWordLength, word.Length);

                minWordLength = Math.Min(minWordLength, word.Length);

                for (int i = 0; i < 7 && i < word.Length; i++)

                {

                    fastCheck[word[i]] |= (byte)(1 << i);

                }

                for (int i = 7; i < word.Length; i++)

                {

                    fastCheck[word[i]] |= 0x80;

                }

                if (word.Length == 1)

                {

                    charCheck[word[0]] = true;

                }

                else

                {

                    fastLength[word[0]] |= (byte)(1 << (Math.Min(7, word.Length - 2)));

                    endCheck[word[word.Length - 1]] = true;

 

                    hash.Add(word);

                }

            }

        }

        #endregion

 

        #region 检查是否有敏感词

        /// <summary>

        /// 检查是否有敏感词

        /// </summary>

        /// <param name="text"></param>

        /// <returns></returns>

        public bool HasBadWord(string text)

        {

            int index = 0;

            while (index < text.Length)

            {

                int count = 1;

                if (index > 0 || (fastCheck[text[index]] & 1) == 0)

                {

                    while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;

                }

 

                char begin = text[index];

                if (minWordLength == 1 && charCheck[begin])

                {

                    return true;

                }

 

                for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)

                {

                    char current = text[index + j];

 

                    if ((fastCheck[current] & 1) == 0)

                    {

                        ++count;

                    }

 

                    if ((fastCheck[current] & (1 << Math.Min(j, 7))) == 0)

                    {

                        break;

                    }

 

                    if (j + 1 >= minWordLength)

                    {

                        if ((fastLength[begin] & (1 << Math.Min(j - 1, 7))) > 0 && endCheck[current])

                        {

                            string sub = text.Substring(index, j + 1);

 

                            if (hash.Contains(sub))

                            {

                                return true;

                            }

                        }

                    }

                }

 

                index += count;

            }

 

            return false;

        }

        #endregion

 

        #region 替换敏感词

        /// <summary>

        /// 替换敏感词

        /// </summary>

        /// <param name="text"></param>

        /// <returns></returns>

        public string ReplaceBadWord(string text)

        {

            int index = 0;

 

            for (index = 0; index < text.Length; index++)

            {

                if ((fastCheck[text[index]] & 1) == 0)

                {

                    while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;

                }

 

                //单字节检测

                if (minWordLength == 1 && charCheck[text[index]])

                {

                    text = text.Replace(text[index], _replaceString[0]);

                    continue;

                }

                //多字节检测

                for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)

                {

                    //快速排除

                    if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)

                    {

                        break;

                    }

 

                    if (j + 1 >= minWordLength)

                    {

                        string sub = text.Substring(index, j + 1);

 

                        if (hash.Contains(sub))

                        {

 

                            //替换字符操作

                            char cc = _replaceString[0];

                            string rp = _replaceString.PadRight((j + 1), cc);

                            text = text.Replace(sub, rp);

                            //记录新位置

                            index += j;

                            break;

                        }

                    }

                }

            }

            _newWord = text;

            return text;

        }

        #endregion

    }

 

    #region 敏感词实体类

    /// <summary>

    /// 敏感词实体

    /// </summary>

    public class BadWordEntity

    {

        /// <summary>

        /// 敏感词

        /// </summary>

        public string BadWord { get; set; }

    }

    #endregion

posted @ 2015-02-06 10:04  余粮  阅读(1252)  评论(1)  编辑  收藏  举报