Python大数据:jieba 中文分词,词频统计
# -*- coding: UTF-8 -*-
"""Extract jieba TextRank keywords from a CSV column and count word frequency.

Reads ``datas1.csv``, extracts up to ``TOP_K`` keywords from the ``content``
column of every row, and writes two files:

* ``keywords.csv`` -- each keyword with the number of rows' keyword lists it
  appeared in, summed over the whole input (global word frequency).
* ``result.csv``   -- one line per input row with the original text and its
  space-separated keywords (kept per row for later association analysis).

Run as a script; importing the module only defines the helpers.
"""
import codecs
import sys
from collections import Counter

import jieba
import jieba.analyse
import numpy as np
import pandas as pd

INPUT_CSV = 'datas1.csv'
STOP_WORDS_FILE = 'stoped.txt'
KEYWORDS_CSV = 'keywords.csv'
RESULT_CSV = 'result.csv'

# Maximum number of keywords extracted per row.
TOP_K = 20
# TextRank only keeps words tagged with one of these part-of-speech codes.
ALLOWED_POS = ('ns', 'n', 'vn', 'v')


def extract_keywords(content):
    """Return up to ``TOP_K`` TextRank keywords for one piece of text.

    TextRank is used because it can restrict results to ``ALLOWED_POS``.
    Alternatives are ``jieba.cut(content)`` for plain segmentation, or
    ``jieba.analyse.extract_tags(content, topK=TOP_K)`` for TF-IDF keywords,
    which cannot filter out punctuation and digits.

    Args:
        content: The text of a single row; may be an empty string.

    Returns:
        A list of keyword strings, empty when ``content`` is empty.
    """
    if not content:
        return []
    return jieba.analyse.textrank(
        content, topK=TOP_K, withWeight=False, allowPOS=ALLOWED_POS)


def count_words(keyword_lists):
    """Count how often each keyword occurs across all rows.

    Args:
        keyword_lists: An iterable of per-row keyword lists.

    Returns:
        A ``pandas.Series`` named ``count``, indexed by ``word`` and sorted by
        word. It is empty (rather than an error) when there are no keywords.
    """
    counts = Counter(word for words in keyword_lists for word in words)
    frequencies = pd.Series(dict(counts), dtype='int64', name='count')
    frequencies = frequencies.sort_index()
    frequencies.index.name = 'word'
    return frequencies


def main():
    """Run the extraction over ``INPUT_CSV`` and write both output files."""
    # Only affects how pandas prints frames (e.g. when inspecting them
    # interactively); the extraction below does not depend on it.
    pd.set_option('display.max_colwidth', 500)

    rows = pd.read_csv(INPUT_CSV, header=0, encoding='utf-8', dtype=str)
    jieba.analyse.set_stop_words(STOP_WORDS_FILE)

    keyword_lists = []  # keywords of every row, used for the global count
    results = []        # per-row text and keywords, used for association

    # Iterate over the raw cell values. Rendering a row slice with
    # ``to_string()`` would prepend the index label and padding to the text.
    # Missing cells are treated as empty text instead of the literal "NaN".
    for content in rows['content'].fillna(''):
        words = extract_keywords(content)
        keyword_lists.append(words)
        # Every keyword is followed by one space (including the last one),
        # which keeps the ``words`` column format of earlier exports.
        results.append({
            'text': content,
            'words': ''.join(word + ' ' for word in words),
        })

    count_words(keyword_lists).to_csv(KEYWORDS_CSV, encoding='utf-8')

    # Explicit columns keep the header stable even when the input has no rows.
    result_frame = pd.DataFrame(results, columns=['text', 'words'])
    result_frame.to_csv(RESULT_CSV, encoding='utf-8')


if __name__ == '__main__':
    main()