语句相似度

参考链接:https://zhuanlan.zhihu.com/p/88938220
https://blog.csdn.net/yjw123456/article/details/107923566
https://blog.csdn.net/betterzl/article/details/109983541
列表的交并差:https://blog.csdn.net/qdPython/article/details/118802922
jieba的使用:https://blog.csdn.net/lukabruce/article/details/82351742


import random,jieba

class Similarity():
    """Compare two sentences with several bag-of-words measures.

    Both sentences are tokenized with ``jieba.lcut``. Each one is then
    represented as a binary presence vector over the shared vocabulary
    (``word_bag``): 1 when the vocabulary term occurs in the sentence,
    0 otherwise. Every measure prints its result and also returns it.
    """

    def __init__(self, a, b):
        """Tokenize both sentences and build their presence vectors.

        Args:
            a: First sentence, passed to ``jieba.lcut``.
            b: Second sentence, passed to ``jieba.lcut``.
        """
        # Word segmentation.
        self.a = jieba.lcut(a)
        self.b = jieba.lcut(b)
        # Vocabulary: the union of both token lists. ``dict.fromkeys``
        # de-duplicates while keeping first-seen order, so the vocabulary
        # (and the printed vectors) are identical from run to run, unlike
        # ``list(set(...))`` whose order depends on string hashing.
        self.word_bag = list(dict.fromkeys(self.a + self.b))
        print('词袋:', self.word_bag)
        # Presence vectors aligned with ``word_bag``.
        self.wf1 = self.word_frequency(self.a)
        self.wf2 = self.word_frequency(self.b)
        print('a词频:', self.wf1)
        print('b词频:', self.wf2)

    def word_frequency(self, word):
        """Return the presence vector of ``word`` over ``self.word_bag``.

        Despite the name, occurrences are not counted: each entry is 1 when
        the vocabulary term is contained in ``word`` and 0 otherwise.

        Args:
            word: Container of tokens tested with ``in``.

        Returns:
            A list of 0/1 ints with one entry per term of ``self.word_bag``.
        """
        return [1 if term in word else 0 for term in self.word_bag]

    def euclidean_distance(self):
        """Return the reciprocal of the Euclidean distance between vectors.

        The reciprocal turns the distance into a similarity: the closer the
        vectors, the larger the value.

        Returns:
            ``1 / distance`` as a float, or ``float('inf')`` when the two
            vectors are identical (distance 0), which previously raised
            ``ZeroDivisionError``.
        """
        squared = sum((x - y) ** 2 for x, y in zip(self.wf1, self.wf2))
        if squared == 0:
            result = float('inf')
        else:
            result = 1 / (squared ** 0.5)
        print('欧氏距离:', result)
        return result

    def cosine_distance(self):
        """Return the cosine similarity of the two presence vectors.

        Computed as ``dot(wf1, wf2) / (|wf1| * |wf2|)``.

        Returns:
            A float in ``[0, 1]``; ``0.0`` when either vector has zero norm
            (an empty sentence), where the ratio is undefined.
        """
        dot = sum(x * y for x, y in zip(self.wf1, self.wf2))
        norm1 = sum(x * x for x in self.wf1) ** 0.5
        norm2 = sum(y * y for y in self.wf2) ** 0.5
        denominator = norm1 * norm2
        result = dot / denominator if denominator else 0.0
        print('余弦距离:', result)
        return result

    def Jacard_distance(self):
        """Return the Jaccard similarity of the two token sets.

        Computed as ``|A & B| / |A | B|`` over the distinct tokens of each
        sentence.

        Returns:
            A float in ``[0, 1]``; ``0.0`` when both sentences are empty,
            where the ratio is undefined.
        """
        tokens_a, tokens_b = set(self.a), set(self.b)
        union = tokens_a | tokens_b
        result = len(tokens_a & tokens_b) / len(union) if union else 0.0
        print('Jacard:', result)
        return result

    def hamming_distance(self):
        """Return the Hamming distance between the two presence vectors.

        Returns:
            The number of vocabulary positions at which the vectors differ
            (an int); 0 means both sentences use the same set of tokens.
        """
        result = sum(1 for x, y in zip(self.wf1, self.wf2) if x != y)
        print('海明距离:', result)
        return result

posted @   记录——去繁就简  阅读(30)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
点击右上角即可分享
微信分享提示