#!/usr/bin/env python
# -*- coding=utf-8 -*-
"""Implementation of Charikar simhashes in Python.

See: http://dsrg.mff.cuni.cz/~holub/sw/shash/#a1
"""


class simhash(object):
    """Fixed-width similarity fingerprint computed from an iterable of tokens.

    Attributes:
        hashbits: Number of bits in the fingerprint.
        hash: The fingerprint itself, a non-negative integer.
    """

    def __init__(self, tokens='', hashbits=128):
        """Compute and store the fingerprint of *tokens*.

        Args:
            tokens: Iterable of strings. A plain string is iterated character
                by character, so the default ``''`` means "no tokens".
            hashbits: Width of the fingerprint in bits.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def __int__(self):
        """Return the fingerprint as an integer."""
        return int(self.hash)

    # Python 3 needs __index__ for "%x" % obj, hex(obj) and friends.
    __index__ = __int__

    # Python 2 spelling of the integer conversion; `long` does not exist on
    # Python 3, so delegate to the portable implementation.
    __long__ = __int__

    def __float__(self):
        """Return the fingerprint as a float (lossy for wide fingerprints)."""
        return float(self.hash)

    def simhash(self, tokens):
        """Return a Charikar simhash of *tokens* with ``self.hashbits`` bits.

        Every token is hashed with ``_string_hash``.  For each bit position
        a counter is incremented when the token hash has that bit set and
        decremented otherwise.  The fingerprint has a bit set wherever the
        final counter is greater than or equal to zero, so an empty token
        sequence yields a fingerprint with all bits set.
        """
        counters = [0] * self.hashbits
        for token in tokens:
            token_hash = self._string_hash(token)
            for position in range(self.hashbits):
                if token_hash & (1 << position):
                    counters[position] += 1
                else:
                    counters[position] -= 1

        fingerprint = 0
        for position, count in enumerate(counters):
            if count >= 0:
                fingerprint |= 1 << position
        return fingerprint

    def _string_hash(self, v):
        """Hash the string *v*: a variable-length version of Python's hash.

        Returns 0 for the empty string.  The running value is masked to
        ``self.hashbits`` bits on every step; the length of *v* is XORed in
        after the last masking step.
        """
        if v == "":
            return 0
        multiplier = 1000003
        mask = 2 ** self.hashbits - 1
        x = ord(v[0]) << 7
        for c in v:
            x = ((x * multiplier) ^ ord(c)) & mask
        # x is non-negative here, so no "-1" sentinel remapping is needed.
        x ^= len(v)
        return x

    def hamming_distance(self, other_hash):
        """Return how many of the low ``self.hashbits`` bits differ.

        Args:
            other_hash: Another object exposing a ``hash`` integer attribute.
        """
        diff = (self.hash ^ other_hash.hash) & ((1 << self.hashbits) - 1)
        return bin(diff).count("1")

    def similarity(self, other_hash):
        """Return the ratio smaller/larger of the two hashes as floats.

        Args:
            other_hash: Anything accepted by ``float()``, e.g. another
                ``simhash`` instance.

        Returns:
            A float; 1.0 when both values are equal.  Equal values are
            handled up front so that two zero fingerprints compare as
            identical instead of dividing by zero.
        """
        a = float(self.hash)
        b = float(other_hash)
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b


if __name__ == '__main__':
    # Which things does Google value most? Punctuation?
    s = '看看哪些东西google最看重?标点?'
    hash1 = simhash(s.split())

    s = '看看哪些东西google最看重!标点!'
    hash2 = simhash(s.split())

    # Single-argument print calls behave the same on Python 2 and 3.
    print('%f%% percent similarity on hash' % (100 * hash1.similarity(hash2)))
    print('%d bits differ out of %d'
          % (hash1.hamming_distance(hash2), hash1.hashbits))