c#-SimHash匹配相似-算法
使用场景:基于 Google 的 SimHash 算法判断两段文本是否相似。
// 经过大量测试,SimHash 适合比较较长的文本(例如 500 字以上),效果相当不错:距离小于 3 的基本都是相似文本,误判率也比较低。
// 根据我的经验,如果假定 N 是每个块的大小,M 是相邻块之间重叠的字符数,那么 N = 4、M = 3 是最好的选择。
/// <summary>
/// Estimates how alike two strings are by comparing 32-bit SimHash fingerprints
/// built from overlapping character chunks of each string.
/// </summary>
public class SimHashAnalyser : IAnalyser
{
    private const int HashSize = 32;

    // Chunk length and overlap used to tokenise the input before hashing.
    private const int ChunkSize = 4;
    private const int OverlapSize = 3;

    // The tokeniser holds no per-call state, so a single shared instance is enough.
    private static readonly ITokeniser DefaultTokeniser = new OverlappingStringTokeniser(ChunkSize, OverlapSize);

    /// <summary>
    /// Computes the similarity of two strings as the fraction of fingerprint bits that agree.
    /// </summary>
    /// <param name="needle">First text to compare.</param>
    /// <param name="haystack">Second text to compare.</param>
    /// <returns>
    /// A value between 0 and 1: 1 when both fingerprints are identical,
    /// 0 when all 32 bits differ.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public float GetLikenessValue(string needle, string haystack)
    {
        if (needle == null)
        {
            throw new ArgumentNullException("needle");
        }

        if (haystack == null)
        {
            throw new ArgumentNullException("haystack");
        }

        int distance = GetHammingDistance(DoCalculateSimHash(needle), DoCalculateSimHash(haystack));
        return (HashSize - distance) / (float)HashSize;
    }

    /// <summary>Maps every token to a 32-bit hash value.</summary>
    private static IEnumerable<int> DoHashTokens(IEnumerable<string> tokens)
    {
        var hashedTokens = new List<int>();
        foreach (string token in tokens)
        {
            hashedTokens.Add(ComputeStableHash(token));
        }

        return hashedTokens;
    }

    /// <summary>
    /// 32-bit FNV-1a hash over the UTF-16 code units of <paramref name="token"/>.
    /// </summary>
    /// <remarks>
    /// string.GetHashCode() is not guaranteed to return the same value across
    /// processes or runtime versions, so it cannot produce reproducible
    /// fingerprints; this hash depends only on the characters of the token.
    /// </remarks>
    private static int ComputeStableHash(string token)
    {
        unchecked
        {
            const uint OffsetBasis = 2166136261;
            const uint Prime = 16777619;

            uint hash = OffsetBasis;
            foreach (char c in token)
            {
                // Feed the low byte, then the high byte of each UTF-16 code unit.
                hash ^= (uint)(c & 0xFF);
                hash *= Prime;
                hash ^= (uint)(c >> 8);
                hash *= Prime;
            }

            return (int)hash;
        }
    }

    /// <summary>Counts the bit positions in which the two values differ.</summary>
    private static int GetHammingDistance(int firstValue, int secondValue)
    {
        uint differing = unchecked((uint)(firstValue ^ secondValue));
        int distance = 0;
        while (differing != 0)
        {
            // Clears the lowest set bit, so the loop runs once per differing bit.
            differing &= differing - 1;
            distance++;
        }

        return distance;
    }

    /// <summary>Returns true when bit <paramref name="pos"/> of <paramref name="b"/> is set.</summary>
    private static bool IsBitSet(int b, int pos)
    {
        return (b & (1 << pos)) != 0;
    }

    /// <summary>
    /// Builds the SimHash fingerprint of <paramref name="input"/>: each token hash votes
    /// +1 or -1 on every bit position, and positions with a positive total are set.
    /// </summary>
    private static int DoCalculateSimHash(string input)
    {
        // A new int[] is already zero-filled, so no explicit reset is needed.
        var vector = new int[HashSize];

        foreach (int value in DoHashTokens(DefaultTokeniser.Tokenise(input)))
        {
            for (int bit = 0; bit < HashSize; bit++)
            {
                vector[bit] += IsBitSet(value, bit) ? 1 : -1;
            }
        }

        int fingerprint = 0;
        for (int bit = 0; bit < HashSize; bit++)
        {
            if (vector[bit] > 0)
            {
                fingerprint |= 1 << bit;
            }
        }

        return fingerprint;
    }
}

/// <summary>Compares two strings and reports how alike they are.</summary>
public interface IAnalyser
{
    /// <summary>Returns a likeness score for the two given strings.</summary>
    float GetLikenessValue(string needle, string haystack);
}

/// <summary>Splits a string into tokens.</summary>
public interface ITokeniser
{
    /// <summary>Returns the tokens of <paramref name="input"/> in order of appearance.</summary>
    IEnumerable<string> Tokenise(string input);
}

/// <summary>
/// Splits a string into consecutive, non-overlapping chunks of a fixed size;
/// the final chunk may be shorter.
/// </summary>
public class FixedSizeStringTokeniser : ITokeniser
{
    private readonly ushort tokensize;

    /// <param name="tokenSize">Chunk length in characters; must be between 2 and 127.</param>
    /// <exception cref="ArgumentException"><paramref name="tokenSize"/> is outside 2..127.</exception>
    public FixedSizeStringTokeniser(ushort tokenSize)
    {
        if (tokenSize < 2 || tokenSize > 127)
        {
            throw new ArgumentException("Token 不能超出范围");
        }

        this.tokensize = tokenSize;
    }

    /// <summary>Cuts <paramref name="input"/> into chunks of the configured size.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public IEnumerable<string> Tokenise(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        var chunks = new List<string>();
        for (int offset = 0; offset < input.Length; offset += this.tokensize)
        {
            // The last chunk takes whatever characters remain.
            int length = Math.Min(this.tokensize, input.Length - offset);
            chunks.Add(input.Substring(offset, length));
        }

        return chunks;
    }
}

/// <summary>
/// Splits a string into fixed-size chunks where each chunk shares its leading
/// characters with the tail of the previous one.
/// </summary>
public class OverlappingStringTokeniser : ITokeniser
{
    private readonly ushort chunkSize;
    private readonly ushort overlapSize;

    /// <param name="chunkSize">Length of every chunk in characters.</param>
    /// <param name="overlapSize">Number of characters shared by neighbouring chunks.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="chunkSize"/> is not greater than <paramref name="overlapSize"/>.
    /// </exception>
    public OverlappingStringTokeniser(ushort chunkSize, ushort overlapSize)
    {
        if (chunkSize <= overlapSize)
        {
            throw new ArgumentException("Chunck 必须大于 overlap");
        }

        this.overlapSize = overlapSize;
        this.chunkSize = chunkSize;
    }

    /// <summary>
    /// Returns every full-length chunk of <paramref name="input"/>, advancing by
    /// (chunkSize - overlapSize) characters each step. Inputs shorter than the
    /// chunk size produce no tokens.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public IEnumerable<string> Tokenise(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        var result = new List<string>();
        int step = this.chunkSize - this.overlapSize;

        // "<=" so the chunk that ends exactly at the end of the input is included.
        for (int position = 0; position <= input.Length - this.chunkSize; position += step)
        {
            result.Add(input.Substring(position, this.chunkSize));
        }

        return result;
    }
}
使用:
// Compare two sample texts and print their similarity as a percentage.
const string ReferenceText = "中国香港………………";
const string CandidateText = "中国香港 2013………………";

IAnalyser simHash = new SimHashAnalyser();
float similarity = simHash.GetLikenessValue(CandidateText, ReferenceText);

Console.Clear();
Console.WriteLine("Likeness: {0}%", similarity * 100);

// Wait for a key press before continuing.
Console.ReadKey();
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· 展开说说关于C#中ORM框架的用法!
· SQL Server 2025 AI相关能力初探
· Pantheons:用 TypeScript 打造主流大模型对话的一站式集成库