[Cosine Similarity] Computing Chinese Text Similarity
Similarity computation
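The approach: segment each sentence with jieba, turn the tokens into a bag-of-words frequency vector, and measure the angle between the two vectors. For vectors A and B the cosine similarity is cos θ = (A · B) / (‖A‖ · ‖B‖), which is 1 when the two texts use words in identical proportions and 0 when they share no words at all.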
# Input two sentences A and B and judge their similarity
import jieba
from collections import Counter

def preprocess_data(text):
    """Preprocess the text: segment it and remove stopwords."""
    # Segment the text with jieba
    words = jieba.cut(text)
    # Remove stopwords; this is only a small sample list, and a real
    # application should use a fuller stopword list for its domain
    stopwords = ['的', '了', '和', '是', '就', '而', '及', '与', '或']
    filtered_words = [word for word in words if word not in stopwords]
    return filtered_words
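# For example, preprocess_data("我是你的情人") should yield roughly
# ['我', '你', '情人'], since '是' and '的' are filtered as stopwords
# (the exact segmentation depends on jieba's dictionary)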
def extract_features(words):
    """Extract features using a bag-of-words model."""
    # A Counter maps each word to its frequency, i.e. a sparse term vector
    features = Counter(words)
    return features

def cosine_similarity(features1, features2):
    """Compute the cosine similarity of two bag-of-words vectors."""
    # Dot product over the words the two vectors share
    numerator = sum(features1[word] * features2[word] for word in set(features1) & set(features2))
    # Product of the two vector norms
    denominator = ((sum(features1[word] ** 2 for word in features1) ** 0.5) *
                   (sum(features2[word] ** 2 for word in features2) ** 0.5))
    if not denominator:
        return 0.0
    return round(numerator / denominator, 3)
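# Worked example: cosine_similarity(Counter('abb'), Counter('ab'))
#   numerator   = 1*1 + 2*1 = 3
#   denominator = sqrt(1^2 + 2^2) * sqrt(1^2 + 1^2) = sqrt(10) ≈ 3.162
#   similarity  = 3 / 3.162 ≈ 0.949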
def check_duplicate(content, input_text, threshold=0.7):
    """Duplicate check: decide whether the current text repeats an existing one."""
    # Preprocess the current text and extract its features
    features = extract_features(preprocess_data(content))
    # Here input_text stands in for an existing text on file
    existing_features = extract_features(preprocess_data(input_text))
    similarity = cosine_similarity(features, existing_features)
    # Judge duplication against the configured similarity threshold
    return similarity, similarity >= threshold

similarity, is_duplicate = check_duplicate("我是你的人", "我是你的情人")
print('similarity:', similarity, 'duplicate:', is_duplicate)
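As a cross-check, the same pipeline can be built on scikit-learn. The sketch below is an illustration under the assumption that scikit-learn is installed (it is not used above); sklearn_similarity is a hypothetical helper name, and TfidfVectorizer applies TF-IDF weighting, so its score will differ somewhat from the raw-count version.

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

def sklearn_similarity(text1, text2):
    """Tokenize with jieba and compare TF-IDF vectors of the two texts."""
    vectorizer = TfidfVectorizer(tokenizer=jieba.cut)
    tfidf = vectorizer.fit_transform([text1, text2])
    # fit_transform returns one row per text; compare the two rows
    return sk_cosine(tfidf[0], tfidf[1])[0][0]

print('sklearn similarity:', sklearn_similarity("我是你的人", "我是你的情人"))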