脏字处理类,很快

引用地址:http://www.cnblogs.com/xingd/archive/2008/02/01/1061800.html

主要是参考上面的代码改的,bool的那个方法基本上没有改,string的那个方法是我参照bool的方法改的.

原楼主只写了一些片段(核心代码),楼主主要是实现了检测是不是含有脏字,并没有替换,相信大家都会改,但即使简单,也是需要人做的,再说了,这么简单的事情,大鸟,大侠们当然不屑了,我是菜鸟我不怕,copy后修改是我自身的属性,OK,你们不要笑我...

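我试了多次,很爽,速度很快……有什么不对的地方请大家指教。另外,下面这一行我不太明白是什么意思:
while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;
(注:这一行的作用是快速跳过所有不可能作为任何脏字首字符的字符——fastCheck 的最低位为 0 表示该字符不是任何脏字的第一个字,循环不断前移 index,直到遇到可能的首字符或到达最后一个字符为止。)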

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

namespace CommonUnit
{
    public class BadWordParse
    {


        private HashSet<string> hash = new HashSet<string>();
        private byte[] fastCheck = new byte[char.MaxValue];
        private BitArray charCheck = new BitArray(char.MaxValue);
        private int maxWordLength = 0;
        private int minWordLength = int.MaxValue;
        private bool _isHave = false;
        private string _replaceString = "*";
        private char _splitString = '|';
        private string _newWord;
        private string _badWordFilePath;


        /// <summary>
        /// 是否含有脏字
        /// </summary>
        public bool IsHave
        {
            get { return _isHave; }
        }

        /// <summary>
        /// 替换后字符串
        /// </summary>
        public string ReplaceString
        {
            set { _replaceString = value; }
        }
        /// <summary>
        /// 脏字字典切割符
        /// </summary>
        public char SplitString
        {
            set { _splitString = value; }
        }

        /// <summary>
        /// 更新后的字符串
        /// </summary>
        public string NewWord
        {
            get { return _newWord; }
        }

        /// <summary>
        /// 脏字字典文档路径
        /// </summary>
        public string BadWordFilePath
        {
            get { return _badWordFilePath; }
            set { _badWordFilePath = value; }
        }

        public BadWordParse(string filePath)
        {
            _badWordFilePath = filePath;
            string srList = string.Empty;
            if (File.Exists(_badWordFilePath))
            {
                StreamReader sr = new StreamReader(_badWordFilePath, Encoding.GetEncoding("gb2312"));
                srList = sr.ReadToEnd();
                sr.Close();
                sr.Dispose();
            }
            string[] badwords = srList.Split('|');
            foreach (string word in badwords)
            {
                maxWordLength = Math.Max(maxWordLength, word.Length);
                minWordLength = Math.Min(minWordLength, word.Length);
                for (int i = 0; i < 7 && i < word.Length; i++)
                {
                    fastCheck[word[i]] |= (byte)(1 << i);
                }

                for (int i = 7; i < word.Length; i++)
                {
                    fastCheck[word[i]] |= 0x80;
                }

                if (word.Length == 1)
                {
                    charCheck[word[0]] = true;
                }
                else
                {
                    hash.Add(word);
                }
            }
        }
        public bool HasBadWord(string text)
        {
            int index = 0;

            while (index < text.Length)
            {


                if ((fastCheck[text[index]] & 1) == 0)
                {
                    while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;
                }

                //单字节检测
                if (minWordLength == 1 && charCheck[text[index]])
                {
                    return true;
                }


                //多字节检测
                for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)
                {
                    //快速排除
                    if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)
                    {
                        break;
                    }

                    if (j + 1 >= minWordLength)
                    {
                        string sub = text.Substring(index, j + 1);

                        if (hash.Contains(sub))
                        {
                            return true;
                        }
                    }
                }
                index++;
            }
            return false;
        }

        public string ReplaceBadWord(string text)
        {
            int index = 0;

            for (index = 0; index < text.Length; index++)
            {
                if ((fastCheck[text[index]] & 1) == 0)
                {
                    while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ;
                }

                //单字节检测
                if (minWordLength == 1 && charCheck[text[index]])
                {
                    //return true;
                    _isHave = true;
                    text = text.Replace(text[index], _replaceString[0]);
                    continue;
                }
                //多字节检测
                for (int j = 1; j <= Math.Min(maxWordLength, text.Length - index - 1); j++)
                {

                    //快速排除
                    if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)
                    {
                        break;
                    }

                    if (j + 1 >= minWordLength)
                    {
                        string sub = text.Substring(index, j + 1);

                        if (hash.Contains(sub))
                        {

                            //替换字符操作
                            _isHave = true;
                            char cc = _replaceString[0];
                            string rp = _replaceString.PadRight((j + 1), cc);
                            text = text.Replace(sub, rp);
                            //记录新位置
                            index += j;
                            break;
                        }
                    }
                }
            }
            _newWord = text;
            return text;
        }
    }
}

脏字字典文件保存在 E://Text/badword.txt,字典内容的引用地址:http://www.cnblogs.com/goody9807/archive/2006/09/12/502094.html

以下是测试代码:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using CommonLibrary;
using NUnit.Framework;

namespace MyWebTest.CommonLibraryTest
{

    [TestFixture]
    public class BadWordParseTest
    {
       [System.Runtime.InteropServices.DllImport("kernel32.dll")]
       public static extern uint GetTickCount();
       [Test]
       public void Test() {

           string filePath = "E://Text/badword.txt";
           string testString = string.Empty;
           System.IO.StreamReader sr = new System.IO.StreamReader(filePath,System.Text.Encoding.GetEncoding("gb2312"));
           testString = sr.ReadToEnd();
           sr.Close();
           sr.Dispose();
           uint t = GetTickCount();
           BadWordParse bwp = new BadWordParse(filePath);
           string parsedString = bwp.ReplaceBadWord(testString);
           uint time = GetTickCount() - t;
           Console.Write("使用时间:"+time.ToString());
           Console.Write("\r\n");
           Console.Write("原始字符串" + parsedString);
           Console.Write("\r\n");
           Console.Write("替换后字符串"+parsedString);
       }
    }
}

测试结果图片:

posted @ 2008-12-10 17:31  极品菜鸟  阅读(1948)  评论(8)  编辑  收藏  举报