.net2.0 非法关键字过滤算法

偶然在网上看到这篇文章,以后可能会用到,先记下来,等用得上时再研究。

.NET 2.0 不支持 HashSet,需要自行实现此类;它比 Hashtable 速度快一些。

using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.Serialization;

/// <summary>
/// A minimal set collection for runtimes that lack a built-in hash set.
/// Items are stored as the keys of an internal dictionary whose values are always null,
/// so each distinct item is held at most once. Null items are never stored.
/// </summary>
/// <typeparam name="T">The type of the items in the set.</typeparam>
public class HashSet<T> : ICollection<T>, ISerializable, IDeserializationCallback
{
    // Backing store: only the keys matter, every value is null.
    private readonly Dictionary<T, object> dict;

    /// <summary>
    /// Creates an empty set.
    /// </summary>
    public HashSet()
    {
        dict = new Dictionary<T, object>();
    }

    /// <summary>
    /// Creates a set containing the distinct items of <paramref name="items"/>.
    /// </summary>
    /// <param name="items">The items to add; a null sequence yields an empty set.</param>
    /// <exception cref="T:System.ArgumentNullException">An element of <paramref name="items"/> is null.</exception>
    public HashSet(IEnumerable<T> items)
        : this()
    {
        if (items == null)
        {
            return;
        }

        foreach (T item in items)
        {
            Add(item);
        }
    }

    /// <summary>
    /// Gets a new, empty set on every access.
    /// </summary>
    public HashSet<T> NullSet
    {
        get { return new HashSet<T>(); }
    }

    #region ICollection<T> Members

    /// <summary>
    /// Adds an item to the set. Adding an item that is already present has no effect.
    /// </summary>
    /// <param name="item">The item to add.</param>
    /// <exception cref="T:System.ArgumentNullException"><paramref name="item"/> is null.</exception>
    public void Add(T item)
    {
        if (null == item)
        {
            throw new ArgumentNullException("item");
        }

        // The indexer overwrites silently, which gives set semantics for duplicates.
        dict[item] = null;
    }

    /// <summary>
    /// Removes all items from the set.
    /// </summary>
    public void Clear()
    {
        dict.Clear();
    }

    /// <summary>
    /// Determines whether the set contains <paramref name="item"/>.
    /// </summary>
    /// <param name="item">The item to look for.</param>
    /// <returns>true if the item is in the set; otherwise false. Null is never in the set.</returns>
    public bool Contains(T item)
    {
        // The dictionary throws on a null key; null can never have been added, so answer directly.
        if (null == item)
        {
            return false;
        }

        return dict.ContainsKey(item);
    }

    /// <summary>
    /// Copies the items of the set into <paramref name="array"/>, starting at <paramref name="arrayIndex"/>.
    /// </summary>
    /// <param name="array">The destination array.</param>
    /// <param name="arrayIndex">The zero-based index in <paramref name="array"/> at which copying begins.</param>
    /// <exception cref="T:System.ArgumentNullException"><paramref name="array"/> is null.</exception>
    /// <exception cref="T:System.ArgumentOutOfRangeException"><paramref name="arrayIndex"/> is negative or greater than the array length.</exception>
    /// <exception cref="T:System.ArgumentException">The set does not fit in the space from <paramref name="arrayIndex"/> to the end of <paramref name="array"/>.</exception>
    public void CopyTo(T[] array, int arrayIndex)
    {
        if (array == null)
        {
            throw new ArgumentNullException("array");
        }

        if (arrayIndex < 0 || arrayIndex > array.Length)
        {
            throw new ArgumentOutOfRangeException("arrayIndex");
        }

        // What matters is the space remaining after arrayIndex, not how arrayIndex compares to Count.
        if (array.Length - arrayIndex < dict.Count)
        {
            throw new ArgumentException("The destination array is too small to hold the set starting at arrayIndex.", "array");
        }

        dict.Keys.CopyTo(array, arrayIndex);
    }

    /// <summary>
    /// Removes <paramref name="item"/> from the set.
    /// </summary>
    /// <param name="item">The item to remove.</param>
    /// <returns>true if the item was present and has been removed; otherwise false.</returns>
    public bool Remove(T item)
    {
        // Null is never stored, and the dictionary would throw on a null key.
        if (null == item)
        {
            return false;
        }

        return dict.Remove(item);
    }

    /// <summary>
    /// Gets the number of items in the set.
    /// </summary>
    public int Count
    {
        get { return dict.Count; }
    }

    /// <summary>
    /// Gets a value indicating whether the set is read-only; always false.
    /// </summary>
    public bool IsReadOnly
    {
        get { return false; }
    }

    #endregion

    /// <summary>
    /// Returns a new set holding every item that is in this set or in <paramref name="set"/>.
    /// </summary>
    /// <param name="set">The other set; null is treated as empty.</param>
    /// <returns>A new set; neither operand is modified.</returns>
    public HashSet<T> Union(HashSet<T> set)
    {
        HashSet<T> unionSet = new HashSet<T>(this);

        if (set != null)
        {
            foreach (T item in set)
            {
                // Add ignores duplicates, so no membership test is needed first.
                unionSet.Add(item);
            }
        }

        return unionSet;
    }

    /// <summary>
    /// Returns a new set holding the items of this set that are not in <paramref name="set"/>.
    /// </summary>
    /// <param name="set">The set of items to exclude; null is treated as empty.</param>
    /// <returns>A new set; neither operand is modified.</returns>
    public HashSet<T> Subtract(HashSet<T> set)
    {
        HashSet<T> subtractSet = new HashSet<T>(this);

        if (set != null)
        {
            foreach (T item in set)
            {
                // Removing an absent key is a no-op.
                subtractSet.dict.Remove(item);
            }
        }

        return subtractSet;
    }

    /// <summary>
    /// Determines whether every item of this set is also in <paramref name="set"/>.
    /// </summary>
    /// <param name="set">The candidate superset; null is treated as empty.</param>
    /// <returns>true if this set is a subset of <paramref name="set"/>; otherwise false.</returns>
    public bool IsSubsetOf(HashSet<T> set)
    {
        if (set == null)
        {
            // Only the empty set is a subset of the empty set.
            return Count == 0;
        }

        foreach (T item in this)
        {
            if (!set.Contains(item))
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns a new set holding the items that are in both this set and <paramref name="set"/>.
    /// </summary>
    /// <param name="set">The other set; null is treated as empty.</param>
    /// <returns>A new set; neither operand is modified.</returns>
    public HashSet<T> Intersection(HashSet<T> set)
    {
        HashSet<T> intersectionSet = new HashSet<T>();

        if (set == null)
        {
            return intersectionSet;
        }

        // One pass over this set finds every common item; a second pass over the other set adds nothing.
        foreach (T item in this)
        {
            if (set.Contains(item))
            {
                intersectionSet.Add(item);
            }
        }

        return intersectionSet;
    }

    /// <summary>
    /// Determines whether this set is a subset of <paramref name="set"/> and the two sets differ.
    /// </summary>
    /// <param name="set">The candidate superset; null is treated as empty.</param>
    /// <returns>true if this set is a proper subset of <paramref name="set"/>; otherwise false.</returns>
    public bool IsProperSubsetOf(HashSet<T> set)
    {
        int otherCount = (set == null) ? 0 : set.Count;

        // A is a proper subset of B when A is a subset of B and B has at least one extra item.
        return IsSubsetOf(set) && Count < otherCount;
    }

    /// <summary>
    /// Determines whether every item of <paramref name="set"/> is also in this set.
    /// </summary>
    /// <param name="set">The candidate subset; null is treated as empty.</param>
    /// <returns>true if this set is a superset of <paramref name="set"/>; otherwise false.</returns>
    public bool IsSupersetOf(HashSet<T> set)
    {
        if (set == null)
        {
            // Every set is a superset of the empty set.
            return true;
        }

        foreach (T item in set)
        {
            if (!Contains(item))
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Determines whether this set is a superset of <paramref name="set"/> and the two sets differ.
    /// </summary>
    /// <param name="set">The candidate subset; null is treated as empty.</param>
    /// <returns>true if this set is a proper superset of <paramref name="set"/>; otherwise false.</returns>
    public bool IsProperSupersetOf(HashSet<T> set)
    {
        int otherCount = (set == null) ? 0 : set.Count;

        // B is a proper superset of A when B is a superset of A and B has at least one extra item.
        return IsSupersetOf(set) && otherCount < Count;
    }

    /// <summary>
    /// Copies the items of the set into a new list.
    /// </summary>
    /// <returns>A list containing every item of the set.</returns>
    public List<T> ToList()
    {
        return new List<T>(this);
    }

    #region Implementation of ISerializable

    /// <summary>
    /// Populates <paramref name="info"/> by delegating to the backing dictionary.
    /// </summary>
    /// <param name="info">The serialization info to populate.</param>
    /// <param name="context">The destination for this serialization.</param>
    /// <exception cref="T:System.ArgumentNullException"><paramref name="info"/> is null.</exception>
    public void GetObjectData(SerializationInfo info, StreamingContext context)
    {
        // NOTE(review): no [Serializable] attribute or (SerializationInfo, StreamingContext)
        // constructor is visible on this class, so a full round trip looks unsupported - confirm before relying on it.
        if (info == null)
        {
            throw new ArgumentNullException("info");
        }

        dict.GetObjectData(info, context);
    }

    #endregion

    #region Implementation of IDeserializationCallback

    /// <summary>
    /// Forwards the deserialization callback to the backing dictionary.
    /// </summary>
    /// <param name="sender">The object that initiated the callback.</param>
    public void OnDeserialization(object sender)
    {
        dict.OnDeserialization(sender);
    }

    #endregion

    #region Implementation of IEnumerable

    /// <summary>
    /// Returns an enumerator over the items of the set.
    /// </summary>
    /// <returns>An enumerator over the keys of the backing dictionary.</returns>
    public IEnumerator<T> GetEnumerator()
    {
        return dict.Keys.GetEnumerator();
    }

    /// <summary>
    /// Returns a non-generic enumerator over the items of the set.
    /// </summary>
    /// <returns>The same enumerator as the generic overload.</returns>
    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    #endregion
}

以下为过滤算法

/// <summary>
/// Detects whether a text contains any word from a configured list of bad words.
/// Per-character lookup tables built by <see cref="Init"/> are used to reject most
/// candidate substrings cheaply before the exact word set is consulted.
/// </summary>
public class BadWordsFilter
{
    // One slot per possible char value, including char.MaxValue itself.
    private const int TableSize = char.MaxValue + 1;

    // Exact multi-character words (single-character words live in charCheck).
    private readonly HashSet<string> hash = new HashSet<string>();

    // fastCheck[c]: bit i (0..6) is set when c occurs at position i of some word;
    // bit 7 is set when c occurs at position 7 or later.
    private readonly byte[] fastCheck = new byte[TableSize];

    // fastLength[c]: for words starting with c, bit min(7, length - 2) is set.
    private readonly byte[] fastLength = new byte[TableSize];

    // charCheck[c]: c on its own is a one-character word.
    private readonly BitArray charCheck = new BitArray(TableSize);

    // endCheck[c]: some multi-character word ends with c.
    private readonly BitArray endCheck = new BitArray(TableSize);

    private int maxWordLength = 0;
    private int minWordLength = int.MaxValue;

    /// <summary>
    /// Creates a filter with no words loaded; call <see cref="Init"/> to load words.
    /// </summary>
    public BadWordsFilter()
    {
    }

    /// <summary>
    /// Adds the given words to the filter and updates the lookup tables.
    /// Null and empty entries are ignored. Calling this again adds to the words already loaded.
    /// </summary>
    /// <param name="badwords">The words to detect.</param>
    /// <exception cref="T:System.ArgumentNullException"><paramref name="badwords"/> is null.</exception>
    public void Init(string[] badwords)
    {
        if (badwords == null)
        {
            throw new ArgumentNullException("badwords");
        }

        foreach (string word in badwords)
        {
            // An empty entry has no first character and would force minWordLength to 0.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            maxWordLength = Math.Max(maxWordLength, word.Length);
            minWordLength = Math.Min(minWordLength, word.Length);

            // Positions 0..6 each get their own bit.
            for (int i = 0; i < 7 && i < word.Length; i++)
            {
                fastCheck[word[i]] |= (byte)(1 << i);
            }

            // Positions 7 and beyond share the top bit.
            for (int i = 7; i < word.Length; i++)
            {
                fastCheck[word[i]] |= 0x80;
            }

            if (word.Length == 1)
            {
                charCheck[word[0]] = true;
            }
            else
            {
                fastLength[word[0]] |= (byte)(1 << Math.Min(7, word.Length - 2));
                endCheck[word[word.Length - 1]] = true;
                hash.Add(word);
            }
        }
    }

    /// <summary>
    /// Not implemented.
    /// </summary>
    /// <param name="text">Unused.</param>
    /// <param name="mask">Unused.</param>
    /// <returns>Never returns.</returns>
    /// <exception cref="T:System.NotImplementedException">Always thrown.</exception>
    public string Filter(string text, string mask)
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Determines whether <paramref name="text"/> contains any loaded word as a substring.
    /// Matching is exact (case-sensitive), because words are compared as stored.
    /// </summary>
    /// <param name="text">The text to scan; null or empty text contains no word.</param>
    /// <returns>true if at least one loaded word occurs in <paramref name="text"/>; otherwise false.</returns>
    public bool HasBadWord(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        int index = 0;

        while (index < text.Length)
        {
            // Skip characters that never start a word. The current position is tested
            // before advancing, so no candidate start is ever jumped over.
            while (index < text.Length && (fastCheck[text[index]] & 1) == 0)
            {
                index++;
            }

            if (index >= text.Length)
            {
                break;
            }

            char begin = text[index];

            // charCheck is only set for one-character words, so a hit is a match by itself.
            if (charCheck[begin])
            {
                return true;
            }

            // j is the offset of the last character of the candidate, so its length is j + 1.
            int lastOffset = Math.Min(maxWordLength - 1, text.Length - index - 1);

            for (int j = 1; j <= lastOffset; j++)
            {
                char current = text[index + j];

                // No word has this character at offset j: no longer candidate can match either.
                if ((fastCheck[current] & (1 << Math.Min(j, 7))) == 0)
                {
                    break;
                }

                // Only pay for Substring and the set lookup when the length and last-character tables agree.
                if (j + 1 >= minWordLength
                    && (fastLength[begin] & (1 << Math.Min(j - 1, 7))) != 0
                    && endCheck[current]
                    && hash.Contains(text.Substring(index, j + 1)))
                {
                    return true;
                }
            }

            // Advance by exactly one so overlapping candidates such as "aab" against "ab" are still examined.
            index++;
        }

        return false;
    }
}
}

posted @ 2011-03-08 17:47  [静水流深]  阅读(835)  评论(0编辑  收藏  举报