// [Repost] Keyword filtering algorithm

 

using System;
using System.Collections.Generic;
using System.Text;
using System.Data;
using System.Collections;

namespace BLL.Common
{
    #region 操作类
    /// <summary>
    /// Convenience entry point that runs a string through <see cref="KeywordsFilterClass"/>.
    /// </summary>
    public class KeywordsFilter
    {
        #region Keyword filtering
        /// <summary>
        /// Filters blocked words out of <paramref name="keywords"/>.
        /// </summary>
        /// <remarks>
        /// The blocked-word list built inside this method is empty, so the input
        /// comes back unchanged until that list is populated.
        /// </remarks>
        /// <param name="keywords">Text to filter. May be null or empty.</param>
        /// <returns>
        /// The filtered text; null or empty input is returned as-is.
        /// </returns>
        public static string Filter(string keywords)
        {
            // Null or empty input has nothing to filter; return it as-is
            // instead of passing it on to the matcher.
            if (string.IsNullOrEmpty(keywords))
            {
                return keywords;
            }

            // Collection of words to filter out (left empty here).
            List<string> badwords = new List<string>();

            KeywordsFilterClass kf = new KeywordsFilterClass();
            return kf.BadwordInKeywords(keywords, badwords);
        }
        #endregion

    }
    #endregion

    #region 关键字过滤类
    /// <summary>
    /// Replaces blocked words found in a text with a fixed mask ("**").
    /// </summary>
    /// <remarks>
    /// Blocked words are kept in a dictionary for exact lookup, plus two per-character
    /// bit tables that let the scanner reject most positions without building substrings.
    /// Matching is exact (case-sensitive). Words registered on an instance are never
    /// removed, so they accumulate across calls to <see cref="BadwordInKeywords"/>.
    /// </remarks>
    public class KeywordsFilterClass
    {
        // Text written in place of every matched blocked word.
        private const string Mask = "**";

        // One slot per possible char value, 0..char.MaxValue inclusive.
        // Sizing the tables with char.MaxValue alone leaves '\uFFFF' out of range.
        private const int CharTableSize = char.MaxValue + 1;

        // Registered blocked words; only the keys are used.
        private Dictionary<string, object> hash = new Dictionary<string, object>();
        // Set for every character that starts at least one blocked word.
        private BitArray firstCharCheck = new BitArray(CharTableSize);
        // Set for every character that occurs anywhere in a blocked word.
        private BitArray allCharCheck = new BitArray(CharTableSize);
        // Length of the longest registered blocked word; bounds the match window.
        private int maxLength = 0;

        /// <summary>
        /// Registers blocked words and updates the lookup tables.
        /// </summary>
        /// <param name="badwords">
        /// Words to register. A null list, and null or empty entries, are ignored.
        /// </param>
        private void InitHash(List<string> badwords)
        {
            if (badwords == null)
            {
                return;
            }

            foreach (string word in badwords)
            {
                // Null/empty entries have no first character to index and can never match.
                if (string.IsNullOrEmpty(word) || hash.ContainsKey(word))
                {
                    continue;
                }

                hash.Add(word, null);
                this.maxLength = Math.Max(this.maxLength, word.Length);
                firstCharCheck[word[0]] = true;
                foreach (char c in word)
                {
                    allCharCheck[c] = true;
                }
            }
        }

        /// <summary>
        /// Replaces every blocked word in <paramref name="text"/> with "**".
        /// </summary>
        /// <remarks>
        /// The text is scanned once from left to right. At each position the shortest
        /// blocked word starting there is masked and scanning resumes right after it,
        /// so adjacent blocked words are all masked and inserted masks are never rescanned.
        /// </remarks>
        /// <param name="text">Text to filter. May be null or empty.</param>
        /// <param name="badwords">
        /// Blocked words to add to this instance before filtering. May be null.
        /// </param>
        /// <returns>
        /// The filtered text; null or empty input is returned unchanged.
        /// </returns>
        public string BadwordInKeywords(string text, List<string> badwords)
        {
            this.InitHash(badwords);

            if (string.IsNullOrEmpty(text) || maxLength == 0)
            {
                return text;
            }

            // Build the output separately so positions in the input stay valid
            // no matter how the mask length compares to the matched word length.
            StringBuilder result = new StringBuilder(text.Length);
            int index = 0;

            while (index < text.Length)
            {
                int matched = firstCharCheck[text[index]] ? MatchLengthAt(text, index) : 0;

                if (matched > 0)
                {
                    result.Append(Mask);
                    index += matched;
                }
                else
                {
                    result.Append(text[index]);
                    index++;
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Returns the length of the shortest blocked word that starts at
        /// <paramref name="start"/>, or 0 when none does.
        /// </summary>
        private int MatchLengthAt(string text, int start)
        {
            int limit = Math.Min(maxLength, text.Length - start);

            for (int length = 1; length <= limit; length++)
            {
                // A character that appears in no blocked word ends every candidate here.
                if (!allCharCheck[text[start + length - 1]])
                {
                    break;
                }

                if (hash.ContainsKey(text.Substring(start, length)))
                {
                    return length;
                }
            }

            return 0;
        }
    }
    #endregion
}

 

// Posted 2012-09-15 12:05 by TiestoRay (blog page footer: 134 views, 0 comments)