import xlrd
# Source workbook; edit this path to point at the data set to process.
# NOTE(review): xlrd 2.0 and later only read legacy .xls files, so opening
# this .xlsx needs xlrd < 2.0 -- confirm the installed version.
path = r"D:\02-1python\2020.08.11-lda\data\2010-2011\usa\us1.xlsx"
data = xlrd.open_workbook(path)
sheet_1_by_index = data.sheet_by_index(0)
title = sheet_1_by_index.col_values(0)
abstract = sheet_1_by_index.col_values(1)
n_of_rows = sheet_1_by_index.nrows

# One document per row, formatted "<title>. <abstract>".  Row 0 is skipped
# (presumably the header row -- confirm against the workbook).
doc_set = [
    row_title + '. ' + row_abstract
    for row_title, row_abstract in zip(title[1:n_of_rows], abstract[1:n_of_rows])
]

# NOTE(review): the input is read from .../usa/ but the dump is written
# under .../china/ -- confirm this destination is intended.
file_path = 'D:/02-1python/2020.08.11-lda/data/2010-2011/china/2695.txt'
# Append mode creates the file when it does not exist yet.  The encoding is
# fixed to UTF-8 so non-ASCII characters in the text cannot fail on a
# platform whose default encoding is narrower.
with open(file_path, 'a', encoding='utf-8') as file_handle:
    # The whole list is written as a single line (its Python repr); the
    # trailing newline keeps successive runs on separate lines.
    file_handle.write(str(doc_set))
    file_handle.write('\n')


import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
english_stopwords = stopwords.words("english")
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*', "''"]
# Corpus-specific stop words (publication years etc.); adjust per data set.
english_stopwords2 = ['c', 'also', '2009', '2010', '2011', "'s"]

# Everything that must be dropped, merged into one set so each token needs a
# single constant-time lookup instead of three list scans.
removable_tokens = set(english_stopwords) | set(english_stopwords2) | set(english_punctuations)

# Built once and reused for every token rather than re-created per word.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# texts[k] holds the fully processed token list of doc_set[k].
texts = []
for doc in doc_set:
    text_list = nltk.word_tokenize(doc)
    # Compare case-insensitively: the filter lists above are lowercase, so a
    # plain comparison would let capitalised stop words ("The", "C") through.
    kept_words = [word for word in text_list if word.lower() not in removable_tokens]
    # Lemmatise first, then stem, in the same order as before.
    processed_words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in kept_words]
    # Store the final result for this document.
    texts.append(processed_words)
#利用 gensim 库构建文档-词频矩阵
import gensim
from gensim import corpora
# Vocabulary built from the token lists in `texts`.
dictionary = corpora.Dictionary(texts)

# Bag-of-words representation of every document, in the order of `texts`.
corpus = list(map(dictionary.doc2bow, texts))

from gensim.matutils import corpus2dense

# Dense matrix form of the bag-of-words corpus, sized by the vocabulary.
corpus_matrix = corpus2dense(corpus, num_terms=len(dictionary))

# Alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the bag-of-words corpus.
num_topics = 10  # number of topics; tunable
# Hyper-parameters to tune: number of topics and number of training passes.
ldamodel = Lda(corpus, num_topics=num_topics, id2word=dictionary, passes=100)

# Result of applying the trained model to each document of the corpus.
doc_topic = list(ldamodel[corpus])

# NOTE(review): the body of this loop was missing in the original file.
# Printing the top terms of each topic is the presumed intent -- confirm.
for topic_id in range(num_topics):
    print(topic_id, ldamodel.print_topic(topic_id))

# Coherence model (c_v measure) for the trained topics.  Only the model object
# is built here; get_coherence() is not called in this part of the script.
coherence_model_lda = gensim.models.CoherenceModel(
    model=ldamodel,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
