python应用:主题分类(gensim lda)
安装第三方包:gensim、jieba(例如:pip install gensim jieba)
首先,执行去停用词操作(先用 jieba 分词,再去除与主题无关的词)
# -*- coding: utf-8 -*-
"""Segment a corpus with jieba and drop stop words, one document per line.

Reads ``input/copurs.txt`` and writes the space-separated tokens of every
line to ``input/copurs_out.txt``.
"""
from functools import lru_cache

import jieba

# Text encoding of the stop-word list and the corpus files.
# NOTE(review): assumed to be UTF-8 -- change this if your files differ.
ENCODING = 'utf-8'
STOPWORDS_PATH = 'stopWords/stopwords.txt'


def stopwordslist(filepath):
    """Return the stop words stored one per line in *filepath*.

    Surrounding whitespace is stripped from every line. The file is closed
    before the function returns.
    """
    with open(filepath, 'r', encoding=ENCODING) as stopword_file:
        return [line.strip() for line in stopword_file]


@lru_cache(maxsize=None)
def _stopword_set(filepath):
    """Load *filepath* once and return its stop words as a frozenset."""
    return frozenset(stopwordslist(filepath))


def seg_sentence(sentence, stopwords=None):
    """Segment *sentence* with jieba and remove stop words.

    Args:
        sentence: Raw text of one document.
        stopwords: Optional container of words to drop. When omitted, the
            list at ``STOPWORDS_PATH`` is used; it is read from disk only
            on the first call instead of once per sentence.

    Returns:
        The lower-cased tokens that are neither stop words nor a tab
        character, each followed by a single space.
    """
    if stopwords is None:
        stopwords = _stopword_set(STOPWORDS_PATH)
    kept = []
    for word in jieba.cut(sentence.strip()):
        word = word.lower()
        if word != '\t' and word not in stopwords:
            kept.append(word)
    # Every token keeps its trailing space so the output format is unchanged.
    return ''.join(word + ' ' for word in kept)


def main():
    """Write the segmented, stop-word-free form of every corpus line."""
    with open('input/copurs.txt', 'r', encoding=ENCODING) as inputs, \
            open('input/copurs_out.txt', 'w', encoding=ENCODING) as outputs:
        for line in inputs:
            outputs.write(seg_sentence(line) + '\n')


if __name__ == '__main__':
    main()
然后,执行主题分类操作
import codecs

from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Number of topics to fit; also used when dumping the topics below.
NUM_TOPICS = 50
# NOTE(review): the corpus is assumed to be UTF-8 encoded -- adjust if needed.
ENCODING = 'utf-8'

# Each line of the pre-segmented corpus is one document of
# whitespace-separated tokens.
with codecs.open('input/copurs_out.txt', 'r', encoding=ENCODING) as fp:
    te = [line.split() for line in fp]
print(len(te))

dictionary = corpora.Dictionary(te)
corpus = [dictionary.doc2bow(text) for text in te]

# Uncomment the next two lines to build the tf-idf weighted corpus.
#tfidf = models.TfidfModel(corpus)
#corpus_tfidf = tfidf[corpus]

#########Run the LDA model for XX topics ###############################
lda = LdaModel(corpus=corpus, id2word=dictionary,
               num_topics=NUM_TOPICS, passes=2000)
# One list of (topic_id, probability) pairs per document.
doc_topic = list(lda[corpus])

####### write the topics in file topics_result.txt ####################
topics_r = lda.print_topics(num_topics=NUM_TOPICS, num_words=10)
with codecs.open('output/topics_result.txt', 'w',
                 encoding=ENCODING) as topic_name:
    for v in topics_r:
        topic_name.write(str(v) + '\n')

###################### write the class results to file ###################
###################### each document belongs to which topic ##############
with codecs.open('output/documents_result.txt', 'w',
                 encoding=ENCODING) as fp2:
    for t in doc_topic:
        if not t:
            # No topic was reported for this document; emit a placeholder
            # so output line i still corresponds to document i.
            fp2.write('-1 0.0\n')
            continue
        # Compare on the probability only. Testing `m in pair` would also
        # match a topic id that happens to equal the maximum probability.
        topic_id, probability = max(t, key=lambda pair: pair[1])
        fp2.write(str(topic_id) + ' ' + str(probability) + '\n')
################################ OVER ####################################
注意:上述主题分类,仅使用lda模型(根据频数计算)
也可混合使用 tf-idf 模型:将 “Run the LDA model for XX topics” 注释下的代码改为如下即可(其中 corpus_tfidf 即上文被注释掉的 TfidfModel 两行所生成的 tf-idf 加权语料):
方式一
#########Run the LDA model for XX topics ###############################
# Variant 1: train AND infer on the tf-idf weighted corpus.
# corpus_tfidf is built here so the snippet works even when the tf-idf
# lines of the main script are still commented out.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50, passes=2000)
doc_topic = list(lda[corpus_tfidf])
或
方式二
#########Run the LDA model for XX topics ###############################
# Variant 2: train on the raw bag-of-words counts, infer on tf-idf vectors.
# corpus_tfidf is built here so the snippet works even when the tf-idf
# lines of the main script are still commented out.
# NOTE(review): the model is fitted on `corpus` (term counts) but queried
# with `corpus_tfidf` (tf-idf weights), so inference inputs are weighted
# differently from the training inputs -- confirm this is intended.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=2000)
doc_topic = list(lda[corpus_tfidf])
常用方式为方式一,作者暂时未弄清楚这两种方式的区别,后期将会继续完善