计算两篇文章相似度代码
# -*- coding:gb2312 -*-
"""Rank the documents of a corpus file by LSI similarity to one of them.

Pipeline:

1. Read blank-line-separated records from ``CORPUS_PATH``.
2. Tokenize and lower-case every record with NLTK.
3. Remove English stop words, then a fixed list of punctuation tokens.
4. Stem the remaining tokens with the Lancaster stemmer.
5. Drop every stem whose corpus-wide count equals ``DROPPED_STEM_COUNT``.
6. Build a gensim dictionary, bag-of-words corpus, TF-IDF model and LSI
   model, index the corpus, and print the ``TOP_N`` documents most similar
   to document ``QUERY_INDEX``.

An intermediate result is printed after each stage.  Every ``print(...)``
call takes a single argument so the script behaves the same under
Python 2 and Python 3.
"""
import io
import logging
from collections import Counter

from gensim import corpora, models, similarities
from nltk.corpus import brown  # not used below; kept from the original import list
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import word_tokenize

CORPUS_PATH = 'aaa'      # input file: records separated by blank lines
QUERY_INDEX = 15         # position (0-based) of the document used as the query
NUM_TOPICS = 10          # number of LSI topics
DROPPED_STEM_COUNT = 2   # stems with exactly this total count are removed
TOP_N = 10               # how many similar documents to print


def _read_records(path):
    """Return the records stored in *path* as a list of strings.

    Each non-blank line is stripped and appended to the current record,
    followed by a tab.  A line that is exactly ``"\\n"`` ends the current
    record; one record is emitted per such line, even when nothing is
    pending, so document positions match the original parsing.  A record
    still pending at end of file (no trailing blank line) is emitted too
    instead of being lost.
    """
    records = []
    pending = []
    # io.open decodes to unicode on both Python 2 and 3 and the ``with``
    # block guarantees the handle is closed.
    with io.open(path, encoding='utf-8') as corpus_file:
        for line in corpus_file:
            if line != "\n":
                pending.append(line.strip())
            else:
                records.append("".join(field + "\t" for field in pending))
                pending = []
    if pending:
        records.append("".join(field + "\t" for field in pending))
    return records


courses = _read_records(CORPUS_PATH)

# The first tab-separated field of a record is its name; surrounding
# '#' and '*' characters are stripped from it.
courses_name = [
    course.strip().split('\t')[0].strip('#*') for course in courses
]
print(courses_name[0:3])

# --- Tokenize -------------------------------------------------------------
texts_tokenized = [
    [word.lower() for word in word_tokenize(document)]
    for document in courses
]
print(texts_tokenized[0])

# --- Remove stop words ----------------------------------------------------
# A set gives O(1) membership tests instead of scanning the list per token.
english_stopwords = set(stopwords.words('english'))
texts_filtered_stopwords = [
    [word for word in document if word not in english_stopwords]
    for document in texts_tokenized
]
print(texts_filtered_stopwords[0])

# --- Remove punctuation tokens --------------------------------------------
english_punctuations = set([
    ',', '.', ':', ';', '?', '(', ')', '[', ']',
    '&', '!', '*', '@', '#', '$', '%',
])
texts_filtered = [
    [word for word in document if word not in english_punctuations]
    for document in texts_filtered_stopwords
]
print(texts_filtered[0])

# --- Stem -----------------------------------------------------------------
st = LancasterStemmer()
texts_stemmed = [
    [st.stem(word) for word in document] for document in texts_filtered
]
print(texts_stemmed[0])

# --- Drop stems by corpus-wide frequency ----------------------------------
# One flattening pass plus a Counter replaces the quadratic
# ``sum(lists, [])`` / ``list.count`` combination.
all_stems = [stem for text in texts_stemmed for stem in text]
stem_counts = Counter(all_stems)
# NOTE(review): despite its name, ``stems_once`` holds the stems that occur
# exactly DROPPED_STEM_COUNT (2) times; stems that occur only once are kept.
# Confirm whether ``== 1`` or ``<= 2`` was intended before changing it.
stems_once = {
    stem for stem, count in stem_counts.items()
    if count == DROPPED_STEM_COUNT
}
texts = [
    [stem for stem in text if stem not in stems_once]
    for text in texts_stemmed
]
print(texts)

# --- Build the models and the similarity index ----------------------------
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS)
# NOTE(review): the LSI model is trained on TF-IDF vectors, but both the
# index here and the query below are projected from the raw bag-of-words
# vectors.  Index and query are consistent with each other; confirm whether
# ``lsi[corpus_tfidf]`` / ``lsi[tfidf[ml_bow]]`` was intended.
index = similarities.MatrixSimilarity(lsi[corpus])

# --- Query ----------------------------------------------------------------
ml_course = texts[QUERY_INDEX]
ml_bow = dictionary.doc2bow(ml_course)
ml_lsi = lsi[ml_bow]
print(ml_lsi)

sims = index[ml_lsi]
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
# Skip position 0 of the ranking -- presumably the query document itself,
# since it is part of the indexed corpus -- and show the next TOP_N entries
# as (document index, similarity) pairs.
print(sort_sims[1:1 + TOP_N])