参考链接:https://zhuanlan.zhihu.com/p/88938220
https://blog.csdn.net/yjw123456/article/details/107923566
https://blog.csdn.net/betterzl/article/details/109983541
列表的交并差:https://blog.csdn.net/qdPython/article/details/118802922
jieba的使用:https://blog.csdn.net/lukabruce/article/details/82351742
import random,jieba
class Similarity():
    """Bag-of-words similarity scores between two texts tokenised with jieba.

    Both texts are segmented with ``jieba.lcut``; the vocabulary
    (``word_bag``) is the union of their tokens, and each text is encoded as
    a binary presence vector over that vocabulary (``wf1`` / ``wf2``).

    NOTE: the scoring methods keep their historical ``*_distance`` names,
    but each one returns a similarity-style score (larger means more
    alike); see the individual docstrings for the exact definition.
    """

    def __init__(self, a, b):
        """Tokenise ``a`` and ``b`` and build the vocabulary and both vectors.

        Args:
            a: First text, passed to ``jieba.lcut``.
            b: Second text, passed to ``jieba.lcut``.
        """
        # Word segmentation.
        self.a = jieba.lcut(a)
        self.b = jieba.lcut(b)
        # Vocabulary: union of both token lists.  dict.fromkeys keeps the
        # first-seen order, so the vocabulary (and the printed vectors) are
        # reproducible between runs, unlike iterating over a set of strings.
        self.word_bag = list(dict.fromkeys([*self.a, *self.b]))
        print('词袋:', self.word_bag)
        # Presence vectors of each text over the vocabulary.
        self.wf1 = self.word_frequency(self.a)
        self.wf2 = self.word_frequency(self.b)
        print('a词频:', self.wf1)
        print('b词频:', self.wf2)

    def word_frequency(self, word):
        """Encode a token collection as a 0/1 vector over ``self.word_bag``.

        Despite the name this records presence, not counts: position ``i``
        is 1 when ``self.word_bag[i]`` occurs in ``word`` and 0 otherwise.

        Args:
            word: Iterable of tokens (e.g. the list returned by jieba).

        Returns:
            A list of ints (0 or 1) with the same length as ``self.word_bag``.
        """
        # Build the lookup set once instead of scanning the list per word.
        present = set(word)
        return [1 if token in present else 0 for token in self.word_bag]

    def euclidean_distance(self):
        """Return the reciprocal of the Euclidean distance between the vectors.

        Returns:
            ``1 / d`` where ``d`` is the Euclidean distance between ``wf1``
            and ``wf2``; ``float('inf')`` when the vectors are identical
            (``d == 0``), which previously raised ``ZeroDivisionError``.
        """
        squared = sum((x - y) ** 2 for x, y in zip(self.wf1, self.wf2))
        score = 1 / (squared ** 0.5) if squared else float('inf')
        print('欧氏距离:', score)
        return score

    def cosine_distance(self):
        """Return the cosine similarity of the two presence vectors.

        Returns:
            ``dot(wf1, wf2) / (|wf1| * |wf2|)``, or ``0.0`` when either
            vector has zero length (the cosine is undefined there).
        """
        dot = sum(x * y for x, y in zip(self.wf1, self.wf2))
        norm_a = sum(x * x for x in self.wf1) ** 0.5
        norm_b = sum(y * y for y in self.wf2) ** 0.5
        # The denominator is the product of BOTH vector norms; it must not
        # reuse the dot product in place of the first norm.
        denominator = norm_a * norm_b
        score = dot / denominator if denominator else 0.0
        print('余弦距离:', score)
        return score

    def Jacard_distance(self):
        """Return the Jaccard similarity of the two token sets.

        Returns:
            ``|A & B| / |A | B|`` over the distinct tokens of each text,
            always within ``[0, 1]``.  Two empty token lists are treated as
            identical and yield ``1.0``.
        """
        tokens_a, tokens_b = set(self.a), set(self.b)
        union = tokens_a | tokens_b
        score = len(tokens_a & tokens_b) / len(union) if union else 1.0
        print('Jacard:', score)
        return score

    def hamming_distance(self):
        """Count the vocabulary positions on which both vectors agree.

        NOTE: this is the number of *matching* positions (a similarity
        count), not the number of differing ones; that return value is kept
        for existing callers.

        The printed figure is that count divided by the token count of the
        longer text (``0.0`` when both texts are empty).

        Returns:
            The number of indices ``i`` with ``wf1[i] == wf2[i]``.
        """
        c = sum(1 for x, y in zip(self.wf1, self.wf2) if x == y)
        # Normalise by the longer token list; max() on the lists themselves
        # would compare them lexicographically rather than by length.
        longest = max(len(self.a), len(self.b))
        ratio = c / longest if longest else 0.0
        print('海明距离:', ratio)
        return c
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?