[Cosine Similarity] Computing Chinese Text Similarity

Compute the similarity between two Chinese sentences using a bag-of-words model and cosine similarity.

# Given two sentences A and B, compute their similarity

import jieba
from collections import Counter

def preprocess_data(text):
    """Preprocess the text: segment it into words and remove stopwords."""
    # Segment the text with jieba
    words = jieba.cut(text)
    # Remove stopwords; this is only a small sample list, and a real
    # application should extend it to suit its own needs
    stopwords = ['的', '了', '和', '是', '就', '而', '及', '与', '或']
    filtered_words = [word for word in words if word not in stopwords]
    return filtered_words
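
# A quick sanity check of the preprocessing step (the exact segmentation
# can vary with jieba's version and dictionary, so this output is
# indicative rather than guaranteed):
#
#   >>> preprocess_data("我是你的情人")
#   ['我', '你', '情人']   # '是' and '的' were removed as stopwords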

def extract_features(words):
    """Extract features with a bag-of-words model (term frequencies)."""
    # Return the Counter itself so callers can index it like a dict
    features = Counter(words)
    return features
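
# Counter maps each word to its frequency, e.g.
#   Counter(['我', '你', '人']) == Counter({'我': 1, '你': 1, '人': 1})
# A Counter also behaves like a dict whose missing keys default to 0,
# which is exactly what the cosine computation below relies on.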

def cosine_similarity(features1, features2):
    """Compute the cosine similarity between two term-frequency vectors."""
    # Dot product over the words the two texts share
    numerator = sum(features1[word] * features2[word] for word in set(features1) & set(features2))
    # Product of the two vectors' Euclidean norms
    denominator = ((sum(features1[word] ** 2 for word in features1) ** 0.5) * (
            sum(features2[word] ** 2 for word in features2) ** 0.5))
    if not denominator:
        return 0.0
    return round(numerator / denominator, 3)
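
# For reference, the formula implemented above, where A and B are the
# term-frequency vectors of the two texts:
#
#   cos(A, B) = (A · B) / (‖A‖ * ‖B‖)
#             = Σ a_i·b_i / ( sqrt(Σ a_i²) * sqrt(Σ b_i²) )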


def check_duplicate(content, input_text, threshold=0.7):
    """Duplicate check: decide whether the current text duplicates an existing one."""
    # Preprocess the current text and extract its features
    features = extract_features(preprocess_data(content))

    # The existing text's features are simulated here
    existing_features = extract_features(preprocess_data(input_text))

    similarity = cosine_similarity(features, existing_features)

    # Judge duplication against the configured similarity threshold
    return similarity, similarity >= threshold


similarity, is_duplicate = check_duplicate("我是你的人", "我是你的情人")
print('similarity:', similarity, 'is_duplicate:', is_duplicate)
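
As a cross-check, the same score can be computed with scikit-learn. This is only a sketch, assuming scikit-learn is installed (pip install scikit-learn); it reuses the jieba-based preprocess_data above as the tokenizer and is not part of the original implementation.

# Cross-check with scikit-learn: CountVectorizer builds the same
# bag-of-words vectors, and cosine_similarity computes the cosine score.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

def sklearn_similarity(text1, text2):
    # token_pattern=None silences the warning about the unused default pattern
    vectorizer = CountVectorizer(tokenizer=preprocess_data, token_pattern=None)
    vectors = vectorizer.fit_transform([text1, text2])
    return round(float(sk_cosine(vectors[0], vectors[1])[0][0]), 3)

print('sklearn similarity:', sklearn_similarity("我是你的人", "我是你的情人"))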