Building a term-document matrix in Python
from pprint import pprint
import xlrd   # read Excel data (note: xlrd 2.x dropped .xlsx support, so use xlrd<2.0 or openpyxl for .xlsx files)
import re
import jieba  # Chinese word segmentation with jieba

path = r"*\lda.xlsx"  # adjust this path
data = xlrd.open_workbook(path)
sheet_1_by_index = data.sheet_by_index(0)  # read the first sheet
title = sheet_1_by_index.col_values(1)     # second column
n_of_rows = sheet_1_by_index.nrows
doc_set = []  # one entry per document
for i in range(1, n_of_rows):  # read row by row, skipping the header row
    doc_set.append(title[i])

# load the stopword list from a file
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

stopwords = stopwordslist(r"D:\01研\01大四\2020.3.13-国家突发卫生事件\20201008\stopwords.txt")

texts = []  # the keyword list of each document
# a second, hand-picked stopword list: extra words I felt should also be removed
stpwrdlst2 = ['和', '等', '对', '的', '不', '与', '一', '化']
for doc in doc_set:
    # keep Chinese characters only
    cleaned_doc = ''.join(re.findall(r'[\u4e00-\u9fa5]', doc))
    # segment with jieba
    doc_cut = jieba.lcut(cleaned_doc)
    # drop stopwords and single-character tokens
    text_list0 = [word for word in doc_cut if word not in stopwords and len(word) > 1]
    text_list1 = [word for word in text_list0 if word not in stpwrdlst2]
    # the fully processed result is accumulated in texts
    texts.append(text_list1)
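To make the preprocessing concrete, here is a minimal sanity check of the cleaning and segmentation steps on a made-up title (the sample sentence below is illustrative only, not taken from the dataset):

import re
import jieba

sample = "新冠疫情防控政策研究(2020)"
# non-Chinese characters such as "(2020)" are dropped by the regex
cleaned = ''.join(re.findall(r'[\u4e00-\u9fa5]', sample))
print(jieba.lcut(cleaned))
# e.g. ['新冠', '疫情', '防控', '政策', '研究'] (the exact split depends on jieba's dictionary and version)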
# build the term-document matrix with gensim
import gensim
from gensim import corpora

# build a dictionary from the processed token lists
dictionary = corpora.Dictionary(texts)
# build the document-term-frequency corpus (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in texts]
# pprint(corpus)  # uncomment to inspect the bag-of-words representation

# convert to a dense terms-by-documents matrix
from gensim.matutils import corpus2dense
corpus_matrix = corpus2dense(corpus, len(dictionary))
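If the vocabulary and document count are large, a genuinely sparse representation saves memory. A minimal sketch using gensim's corpus2csc helper, reusing the corpus and dictionary built above:

from gensim.matutils import corpus2csc

# returns a scipy.sparse CSC matrix with the same terms-by-documents layout
sparse_matrix = corpus2csc(corpus, num_terms=len(dictionary))
print(sparse_matrix.shape)  # (number of unique terms, number of documents)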
print(corpus_matrix)
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
corpus_matrix.shape
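corpus2dense lays the matrix out terms-by-documents: each row is one dictionary entry and each column is one document, so the shape equals (len(dictionary), len(texts)). A quick check, plus a transpose for the documents-by-terms layout that tools such as scikit-learn expect:

print(corpus_matrix.shape)  # (len(dictionary), len(texts))
assert corpus_matrix.shape == (len(dictionary), len(texts))
doc_term_matrix = corpus_matrix.T  # documents as rows, terms as columns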