Computing text similarity in Python with the cosine formula
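The idea: segment each sentence (or article) into words, turn it into a word-frequency vector over the shared vocabulary, and measure similarity as the cosine of the angle between the two vectors:

    cos(θ) = (A · B) / (|A| × |B|)

where A · B is the dot product and |A|, |B| are the vector magnitudes. For instance, the vectors [1, 1, 0] and [1, 0, 1] give cos(θ) = 1 / (√2 × √2) = 0.5. The code below walks through both the sentence case and the article case.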

# -*- coding: utf-8 -*-
import jieba
import jieba.analyse
import math

def sentence_resemble():
    '''
    Compute the similarity of two sentences:
        1. Segment both input sentences into words
        2. Take the union (deduplicated) of the two word lists
        3. Count each sentence's word frequencies
        4. Build the word-frequency vectors
        5. Apply the cosine formula to get the similarity
    The closer the cosine value is to 1, the closer the angle is to 0 degrees
    and the more similar the two vectors are; this is known as "cosine similarity".
    '''
    str1="我喜欢看电视,不喜欢看电影"
    str2="我不喜欢看电视,也不喜欢看电影"


    # Segment with jieba into token lists, dropping punctuation
    str1_array = [w for w in jieba.cut(str1) if w not in (",", ",")]
    str2_array = [w for w in jieba.cut(str2) if w not in (",", ",")]

    # Union (deduplicated) of the two token lists, sorted for a stable order
    all_words = sorted(set(str1_array + str2_array))

    # Word frequency of each sentence
    str1_num_dic = num_count(str1_array)
    str2_num_dic = num_count(str2_array)
    # Apply the cosine formula to get the similarity
    cos = resemble_cal(all_words, str1_num_dic, str2_num_dic)
    print(cos)

def num_count(a):
    # Map each distinct word to its occurrence count
    d = {k: a.count(k) for k in set(a)}
    return d
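# A quick illustration of num_count:
#   num_count(["我", "看", "我"])  ->  {"我": 2, "看": 1}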

def article_resemble():
    all_key = set()
    # Read both articles and extract their keyword -> frequency maps
    with open("article_1.txt", "r", encoding="utf-8") as f:
        lines = f.read()
    article1_dic = analyse_word(lines)
    all_key.update(article1_dic.keys())

    with open("article_2.txt", "r", encoding="utf-8") as f:
        article2_lines = f.read()
    article2_dic = analyse_word(article2_lines)
    all_key.update(article2_dic.keys())

    cos = resemble_cal(all_key, article1_dic, article2_dic)
    print(cos)

def resemble_cal(all_key,article1_dic,article2_dic):
    str1_vector = []
    str2_vector = []
    # Build the word-frequency vectors over the shared vocabulary
    for i in all_key:
        str1_vector.append(article1_dic.get(i, 0))
        str2_vector.append(article2_dic.get(i, 0))

    # Vector magnitudes: square root of the sum of squares
    str1_mod = math.sqrt(sum(x * x for x in str1_vector))
    str2_mod = math.sqrt(sum(x * x for x in str2_vector))

    # Dot product of the two vectors
    vector_multi = sum(x * y for x, y in zip(str1_vector, str2_vector))

    # Cosine of the angle between the two vectors
    cos = float(vector_multi) / (str1_mod * str2_mod)
    return cos
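# Worked example (illustrative values): with all_key = {"a", "b", "c"},
# article1_dic = {"a": 1, "b": 1} and article2_dic = {"a": 1, "c": 1},
# the vectors are [1, 1, 0] and [1, 0, 1] (in a, b, c order), the dot
# product is 1, both magnitudes are sqrt(2), so resemble_cal returns
# 1 / 2 = 0.5.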

'''
Extract article keywords
'''
def analyse_word(content):
    zidian = {}
    return_dic = {}
    # Segment the content (search-engine mode, finer-grained tokens)
    fenci = jieba.cut_for_search(content)
    for fc in fenci:
        if fc in zidian:
            zidian[fc] += 1
        else:
            zidian[fc] = 1
    topK = 30
    # Top-K keywords with their TF-IDF weights
    tfidf = jieba.analyse.extract_tags(content, topK=topK, withWeight=True)
    with open('stop.txt', encoding="utf-8") as f:
        stopkeyword = [line.strip() for line in f]
    for word_weight in tfidf:
        # word_weight is a (keyword, weight) tuple; filter on the keyword
        if word_weight[0] in stopkeyword:
            continue
        frequence = zidian.get(word_weight[0], 0)
        return_dic[word_weight[0]] = frequence
    return return_dic
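# analyse_word returns {keyword: raw frequency} for the top TF-IDF
# keywords, e.g. {"电影": 5, "导演": 3} (values illustrative).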

if __name__=="__main__":
    # Compare the similarity of two sentences
    sentence_resemble()
    # Compare the similarity of two articles
    article_resemble()
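As an aside, the word counting above can also be done with the standard library's collections.Counter. Here is a minimal sketch of the same sentence pipeline (the helper name sentence_resemble_counter is just for illustration); it reuses resemble_cal from the script:

from collections import Counter

def sentence_resemble_counter(s1, s2):
    # Same pipeline as sentence_resemble, with Counter doing the word counts
    drop = {",", ","}
    c1 = Counter(w for w in jieba.cut(s1) if w not in drop)
    c2 = Counter(w for w in jieba.cut(s2) if w not in drop)
    vocab = sorted(set(c1) | set(c2))
    return resemble_cal(vocab, c1, c2)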



For the sample articles, see http://pan.baidu.com/s/1qXRIGUS

posted on 2017-08-18 17:10 在他乡123