中文文本相似度的计算建立在分词的基础上。
余弦相似度:
提取两段文本中所有词(事先需用空格分割好)
计算每个词在两段文本中分别出现的次数(用BSD tree.h里的RBTREE保存)
用词频做为A、B的分量
使用余弦公式计算AB夹角的余弦值
Jaccard 相似度:
流程与上面的余弦方法大体相同(同样先统计词频),只是最后改用 |A∩B| / |A∪B|(交集大小除以并集大小)作为相似度。
代码如下,分词用的是mmseg
1 #!/usr/bin/env python
2 #coding=utf-8
# psyco is only used as an accelerator; nothing below calls into it.
# Treat it as optional so the module still imports on interpreters where
# psyco is not installed.
try:
    import psyco
    psyco.full()
except ImportError:
    pass
5 import math
6 from mmseg.search import seg_txt_2_dict
7 # --- top-level functions ---
def measure_similarity(file_a, file_b, sim_func=None):
    '''
    Returns the textual similarity of file_a and file_b under the chosen
    similarity metric.

    file_a, file_b -- inputs passed unchanged to term_vec() to build the two
                      term vectors
                      (NOTE(review): presumably whatever mmseg's
                      seg_txt_2_dict accepts -- confirm)
    sim_func       -- callable taking the two term vectors and returning the
                      score; cosine_sim is used when this is None
    '''
    # Identity test rather than '==': None is a singleton, and a callable
    # object that overrides __eq__ could otherwise be mistaken for "not given".
    if sim_func is None:
        sim_func = cosine_sim

    u = term_vec(file_a)
    v = term_vec(file_b)

    return sim_func(u, v)
20
21
def cosine_sim(u, v):
    '''
    Returns the cosine similarity of u,v: <u,v>/(|u||v|)
    where |u| is the L2 norm.

    u, v -- term vectors (mappings of term -> weight)

    Returns 0 when the product of the norms is zero (for example when either
    vector is empty), which avoids a division by zero.
    '''
    # Compute the denominator once and reuse it; each l2_norm() call walks the
    # whole vector.
    norm_product = l2_norm(u) * l2_norm(v)
    if norm_product == 0:
        return 0
    return dot_product(u, v) / norm_product
31
def jaccard_sim(A, B):
    r'''
    Returns the Jaccard similarity of A,B: |A \cap B| / |A \cup B|
    We treat A and B as multi-sets (the Jaccard coefficient is technically
    defined over sets).

    A, B -- term vectors (mappings of term -> count)

    The denominator comes from mag_union(); with mag_union() as defined in
    this file (sum of all counts in A and B) two identical inputs score 0.5,
    not 1.

    Returns 0 when the union magnitude is zero (both inputs empty).
    '''
    div = mag_union(A, B)
    if div == 0:
        return 0
    # Force floating-point division: with integer counts a plain '/' floors
    # under Python 2 and would collapse the score to 0 or 1.
    return float(mag_intersect(A, B)) / div
42
43 # --- Term-vector operations ---
44
def dot_product(v1, v2):
    '''
    Returns the dot product of two term vectors.

    Only terms present in both v1 and v2 contribute; the result is always a
    float (0.0 when there are no shared terms).
    '''
    total = 0.0
    for term, weight in v1.items():
        if term not in v2:
            continue
        total += weight * v2[term]
    return total
51
def l2_norm(v):
    '''
    Returns the L2 (Euclidean) norm of term vector v: the square root of the
    sum of the squared weights. An empty vector gives 0.0.
    '''
    squared_total = 0.0
    for weight in v.values():
        squared_total += weight ** 2
    return math.sqrt(squared_total)
59
def mag_union(A, B):
    '''
    Returns the magnitude used as the multiset "union" of A and B.

    This is the total of every count in A plus every count in B, so a term
    present in both inputs contributes both of its counts.
    '''
    total = 0
    for bag in (A, B):
        for count in bag.values():
            total += count
    return total
68
def mag_intersect(A, B):
    '''
    Returns the magnitude of the multiset intersection of A and B: for every
    term found in both, the smaller of its two counts is added to the total.
    '''
    total = 0
    for term, count in A.items():
        if term not in B:
            continue
        total += min(count, B[term])
    return total
77
78 # another name for l2_norm()
79
80 # --- Utilities for creating term vectors from data ---
def term_vec(f):
    '''
    Returns a term vector for f, represented as a dictionary mapping
    {term->frequency}.

    The work is delegated entirely to mmseg's seg_txt_2_dict; f is passed
    through untouched.
    (NOTE(review): presumably f is the text to segment -- confirm against
    mmseg.)
    '''
    vector = seg_txt_2_dict(f)
    return vector
85
86 # --- Exceptions ---
class Error(Exception):
    '''Base class for Exception types used in this module'''


class FileFormatException(Error):
    '''
    Module-specific exception. Derives from Error so that a single
    `except Error` catches it; it is still an Exception subclass, so existing
    handlers keep working.

    NOTE(review): no raise site is visible in this file; judging by the name
    it is meant for badly formatted input files -- confirm.
    '''