1. dictionary = gensim.corpora.Dictionary(clean_content) # 对输入的分词列表构建"词 -> 数字编号"的映射字典
2. corpus = [dictionary.doc2bow(cl_content) for cl_content in clean_content] # 输出clean_content每一个元素根据dictionary做数字映射后的结果(词袋表示)
3. lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20) # corpus表示映射后的文本列表, id2word表示根据哪个数字映射字典展开, num_topics表示主题的个数
4. print(lda.print_topic(1, topn=5)) # 打印编号为1的主题的前5个词
第一步: 载入语料库数据
第二步:进行分词操作
第三步:载入停用词表,去除语料库中的停用词
第四步:
构建数字映射字典
对文本做逐个映射
构建LDA主题模型
打印主题模型的主题和前5个主题词
import gensim
import jieba
import numpy as np
import pandas as pd
from gensim import corpora


def drop_stops(Jie_content, stopwords):
    """Remove stop words from tokenized documents.

    Args:
        Jie_content: List of documents, each a list of token strings.
        stopwords: Iterable of stop-word strings to drop.

    Returns:
        A ``(clean_content, all_words)`` tuple: ``clean_content`` holds one
        filtered token list per input document (same order), and
        ``all_words`` is the flat list of every kept token across documents.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the whole stop-word list for every token.
    stop_set = set(stopwords)
    clean_content = []
    all_words = []
    for j_content in Jie_content:
        line_clean = [word for word in j_content if word not in stop_set]
        clean_content.append(line_clean)
        all_words.extend(line_clean)
    return clean_content, all_words


# 1. Load the news corpus. The file has no header row, so column names are
#    supplied explicitly; read_table splits on tabs by default.
df_data = pd.read_table(
    'data/val.txt',
    names=['category', 'theme', 'URL', 'content'],
    encoding='utf-8',
)

# 2. Tokenize every document with jieba, producing a list of token lists.
#    Rows with a missing `content` cell are dropped first: jieba cannot
#    tokenize NaN.
df_contents = df_data.content.dropna().values.tolist()
Jie_content = []
for df_content in df_contents:
    # Skip whitespace-only documents. The check is made on the raw text:
    # comparing the token *list* with a string is always unequal and would
    # never filter anything.
    if not df_content.strip():
        continue
    split_content = jieba.lcut(df_content)
    if len(split_content) > 1:
        Jie_content.append(split_content)

# 3. Load the stop-word list: one word per row, no header. quoting=3
#    (csv.QUOTE_NONE) keeps quote characters as literal text, and
#    index_col=False stops pandas from using the first column as the index.
stopwords = pd.read_csv(
    'stopwords.txt',
    sep='\t',
    quoting=3,
    names=['stopwords'],
    index_col=False,
    encoding='utf-8',
)
print(stopwords.head())

# Convert the DataFrame column to a plain list, then strip the stop words.
stopwords = stopwords.stopwords.values.tolist()
clean_content, all_words = drop_stops(Jie_content, stopwords)
print(clean_content[0])

# 4. Fit the LDA topic model.
# Build the token -> integer id mapping over the cleaned corpus.
dictionary = corpora.Dictionary(clean_content)
# NOTE(review): len(dictionary) would report the vocabulary size more
# directly; np.shape is kept so the printed output stays as before.
print(np.shape(dictionary))

# Convert each document to its bag-of-words form: (token_id, count) pairs.
corpus = [dictionary.doc2bow(clean_c) for clean_c in clean_content]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

# Show the 5 highest-weighted words of the topic with id 1.
print(lda.print_topic(1, topn=5))