计算两篇文章相似度代码

# -*- coding:gb2312 -*-
from gensim import corpora, models, similarities
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
# Load the corpus from the file 'aaa'.
# Records are runs of consecutive non-blank lines; a blank line ends a record.
# Each record is stored as one string whose stripped lines are each followed
# by a tab, e.g. "first line\tsecond line\t".
courses = []
record_lines = []
with open('aaa') as corpus_file:  # the context manager closes the handle
    for line in corpus_file:
        stripped = line.strip()
        if stripped:
            record_lines.append(stripped)
        else:
            # A blank (or whitespace-only / CRLF) line terminates the record.
            courses.append("".join(part + "\t" for part in record_lines))
            record_lines = []
# Keep the final record even when the file does not end with a blank line.
if record_lines:
    courses.append("".join(part + "\t" for part in record_lines))

# The first tab-separated field of each record, with any leading/trailing
# '#' and '*' characters removed, is used as the record's display name.
courses_name = [course.strip().split('\t')[0].strip('#*') for course in courses]
print(courses_name[0:3])
# Scratch check of the decode + lowercase step on a hard-coded sample:
# the first sample string is decoded from UTF-8 bytes, lowercased and printed.
sample_titles = ['#*AD', 'ADdd']
sample_lowered = sample_titles[0].decode('utf-8').lower()
print(sample_lowered)

# Tokenize every record with NLTK's word_tokenize (after decoding it from
# UTF-8) and lowercase each token, producing one token list per record.
texts_tokenized = []
for raw_course in courses:
    course_tokens = word_tokenize(raw_course.decode('utf-8'))
    texts_tokenized.append([token.lower() for token in course_tokens])
print(texts_tokenized[0])
from nltk.corpus import stopwords
# NLTK's English stop-word list. The list itself is kept under its original
# name; a set is built from it once so that each membership test in the
# filter below is a hash lookup instead of a scan over the whole list.
english_stopwords = stopwords.words('english')
english_stopword_set = set(english_stopwords)

# Drop every stop word from every tokenized document.
texts_filtered_stopwords = [
    [word for word in document if word not in english_stopword_set]
    for document in texts_tokenized
]
print(texts_filtered_stopwords[0])

# Tokens consisting of one of these punctuation characters are discarded.
english_punctuations = [
    ',', '.', ':', ';', '?', '(', ')', '[', ']',
    '&', '!', '*', '@', '#', '$', '%',
]
from nltk.stem.lancaster import LancasterStemmer

# Remove punctuation tokens from every stop-word-filtered document.
texts_filtered = []
for doc_tokens in texts_filtered_stopwords:
    kept_tokens = [token for token in doc_tokens if token not in english_punctuations]
    texts_filtered.append(kept_tokens)
print(texts_filtered[0])

# Replace each remaining token with its Lancaster stem.
st = LancasterStemmer()
texts_stemmed = [list(map(st.stem, doc_tokens)) for doc_tokens in texts_filtered]
print(texts_stemmed[0])
from collections import Counter

# Flatten all stemmed documents into one list and count every stem in a
# single pass (instead of repeated list concatenation and list.count calls).
all_stems = [stem for text in texts_stemmed for stem in text]
stem_counts = Counter(all_stems)

# Stems that occur exactly twice in the whole corpus are treated as
# low-frequency vocabulary and removed from every document.
# NOTE(review): the name `stems_once` suggests a threshold of 1, but the
# original comment explicitly says "count of 2", so 2 is kept -- confirm.
stems_once = set(stem for stem, count in stem_counts.items() if count == 2)
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
print(texts)
from gensim import corpora, models, similarities
import logging
# Send gensim's INFO-level log messages to the console with a timestamp.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Build the stem -> id dictionary and convert every document into a sparse
# bag-of-words vector.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Re-weight the bag-of-words vectors with TF-IDF and fit a 10-topic LSI
# model on the weighted corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

# The LSI model was trained on TF-IDF vectors, so the indexed documents and
# the query must both go through TF-IDF before LSI; feeding raw bag-of-words
# counts would project vectors from a different feature space.
index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

# Position (in `texts`) of the document used as the similarity query.
query_doc_id = 15
if query_doc_id >= len(texts):
    raise IndexError(
        "query document %d does not exist: corpus has only %d documents"
        % (query_doc_id, len(texts))
    )
ml_course = texts[query_doc_id]
ml_bow = dictionary.doc2bow(ml_course)
ml_lsi = lsi[tfidf[ml_bow]]
print(ml_lsi)

# Score the query against every indexed document and rank by descending
# similarity.
sims = index[ml_lsi]
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Report the ten best matches, excluding the query document itself by id
# rather than assuming it is always ranked first.
top_matches = [(doc_id, score) for doc_id, score in sort_sims if doc_id != query_doc_id][:10]
print(top_matches)
#print courses_name[23]

  

posted @ 2016-07-29 12:26  奋斗中的菲比  阅读(750)  评论(0)  编辑  收藏  举报