Python大数据:jieba 中文分词,词频统计

# -*- coding: UTF-8 -*-
import sys
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import codecs

# Set pandas' maximum displayed column width to 500 characters.
pd.set_option('max_colwidth', 500)

# Load the input data: the first line of the CSV is the header row and every
# column is read as a string.
rows = pd.read_csv(
    'datas1.csv',
    header=0,
    encoding='utf-8',
    dtype=str,
)

# Load the stop-word list used by jieba's keyword extraction.
jieba.analyse.set_stop_words('stoped.txt')

# Every extracted keyword occurrence, stored as {'word': ..., 'count': 1};
# these records are aggregated afterwards for the word-frequency statistics.
segments = []
# One record per input row (the row's text plus its space-separated keywords),
# kept for later association analysis.
results = []

# Iterate over the raw cell values of the 'content' column.
# Series.to_string() must not be used here: it returns pandas' *display* text,
# which prefixes the index label (e.g. "0    ...") and can shorten long values
# according to the max_colwidth display option, so the analysed and exported
# text would not match the real cell content.
# Empty CSV cells are read as NaN (not a string); replace them with '' so the
# keyword extractor always receives text.
for content in rows['content'].fillna(''):
    # TextRank keyword extraction: keep at most 20 keywords per row and only
    # words whose part-of-speech tag is one of 'ns', 'n', 'vn', 'v'.
    # Alternatives noted by the original author: jieba.cut(content) for plain
    # segmentation, or jieba.analyse.extract_tags(content, topK=20) for TF-IDF
    # extraction (which, per that note, cannot filter punctuation and digits).
    words = jieba.analyse.textrank(
        content,
        topK=20,
        withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v'),
    )
    # Record each keyword occurrence globally.
    segments.extend({'word': word, 'count': 1} for word in words)
    # Record the per-row result; keywords are joined with single spaces and
    # no trailing separator.
    results.append({'text': content, 'words': ' '.join(words)})

# Convert the keyword-occurrence records into a DataFrame. The columns are
# named explicitly so the frame still has a 'word' column when no keywords
# were extracted; without them an empty list yields a column-less frame and
# the groupby below raises KeyError.
dfSg = pd.DataFrame(segments, columns=['word', 'count'])

# Word frequency: total the per-occurrence counts for each distinct word.
dfWord = dfSg.groupby('word')['count'].sum()

# Export the word frequencies as CSV.
dfWord.to_csv('keywords.csv', encoding='utf-8')

# Export the per-row text and keywords as CSV (columns named explicitly for
# the same empty-input reason as above).
dfRet = pd.DataFrame(results, columns=['text', 'words'])
dfRet.to_csv('result.csv', encoding='utf-8')

 

posted @ 2018-03-21 00:27  黑冰.org  阅读(1814)  评论(0)  编辑  收藏  举报