Building a word-document (term-document) matrix in Python

from pprint import pprint
import xlrd   # read the Excel data
import re
import jieba  # Chinese word segmentation with jieba

path = r"*\lda.xlsx"  # adjust the path
data = xlrd.open_workbook(path)

sheet_1_by_index = data.sheet_by_index(0)  # read the first sheet
title = sheet_1_by_index.col_values(1)     # the second column
n_of_rows = sheet_1_by_index.nrows
doc_set = []  # list holding one title per document
for i in range(1, n_of_rows):  # read row by row, skipping the header row
    doc_set.append(title[i])
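Note: xlrd 2.0 and later no longer reads .xlsx files, so on a newer environment the same doc_set could be built with pandas instead of the xlrd block above. A minimal sketch, assuming pandas and openpyxl are installed and the titles sit in the second column of the first sheet:

import pandas as pd

df = pd.read_excel(path, sheet_name=0)              # reads the first sheet; .xlsx needs the openpyxl engine
doc_set = [str(v) for v in df.iloc[:, 1].dropna()]  # second column; the header row is already consumed by read_excel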

# Load the stopword list from a file (one stopword per line)
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f]
    return stopwords
stopwords = stopwordslist(r"D:\01研\01大四\2020.3.13-国家突发卫生事件\20201008\stopwords.txt")

texts = []  # tokenized keywords for each document
stpwrdlst2 = ['', '', '', '', '', '', '', '']  # second, hand-picked stopword list: words I decided to drop myself
for doc in doc_set:
    # keep Chinese characters only
    cleaned_doc = ''.join(re.findall(r'[\u4e00-\u9fa5]', doc))
    # word segmentation
    doc_cut = jieba.lcut(cleaned_doc)
    # remove stopwords and single-character tokens
    text_list0 = [word for word in doc_cut if word not in stopwords and len(word) > 1]
    text_list1 = [word for word in text_list0 if word not in stpwrdlst2]
    # the fully cleaned result for every document is collected in texts
    texts.append(text_list1)
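As a quick sanity check, the structure of texts can be inspected; the tokens shown below are only illustrative, since the real output depends on the titles and the stopword lists:

print(len(texts))   # number of titles kept
print(texts[0])     # tokens of the first title, e.g. ['突发', '卫生', '事件', ...]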

 

# Build the word-document matrix with gensim
import gensim
from gensim import corpora
# Build a dictionary that stores all of the processed tokens
dictionary = corpora.Dictionary(texts)
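The Dictionary assigns every token an integer id; its size and a few mappings can be printed, and very rare or very common tokens can optionally be pruned before building the corpus (the thresholds below are illustrative):

print(len(dictionary))                        # vocabulary size
print(list(dictionary.token2id.items())[:5])  # a few (token, id) pairs
# dictionary.filter_extremes(no_below=2, no_above=0.8)  # optional pruning before doc2bow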

# Build the document-term frequency (bag-of-words) corpus
corpus = [dictionary.doc2bow(text) for text in texts]
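Each entry of corpus is a sparse bag-of-words vector: a list of (token_id, count) pairs, where the ids refer back to dictionary. For example:

print(corpus[0])  # e.g. [(0, 1), (1, 1), (2, 2), ...] -- token 2 appears twice in the first title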
#print('\nDocument-term frequency matrix:')
#pprint(corpus)
#pprint(corpus[0:19])
#for c in corpus:
    #print(c)

#Convert the bag-of-words corpus into a dense term-document matrix
from gensim.matutils import corpus2dense
corpus_matrix = corpus2dense(corpus, len(dictionary))
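corpus2dense returns a NumPy array of shape (num_terms, num_docs), i.e. one row per token and one column per document; transpose it if a document-term layout is needed. If memory is a concern, gensim also provides corpus2csc, which yields a true SciPy sparse matrix:

doc_term_matrix = corpus_matrix.T  # shape (num_docs, num_terms): one row per document

from gensim.matutils import corpus2csc
sparse_matrix = corpus2csc(corpus, num_terms=len(dictionary))  # scipy.sparse.csc_matrix, same (num_terms, num_docs) layout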

 

print(corpus_matrix)
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
corpus_matrix.shape
(1342, 15)
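To make the matrix easier to inspect, it can be labelled with the dictionary tokens. A small sketch, assuming pandas is available (the column names are just placeholders):

import pandas as pd
term_doc_df = pd.DataFrame(
    corpus_matrix,
    index=[dictionary[i] for i in range(len(dictionary))],         # row labels: the tokens themselves
    columns=['doc_%d' % j for j in range(corpus_matrix.shape[1])]  # column labels: placeholder document names
)
print(term_doc_df.head())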