# jieba text segmentation: removing stop words and adding user-dictionary words

import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import jieba.analyse
from pyquery import PyQuery

# Read the whole local corpus into memory. The context manager closes the
# file handle as soon as the text has been read instead of leaking it.
with open('./santi.txt', 'r', encoding='utf-8') as corpus_file:
    santi_text = corpus_file.read()

# Turn on jieba's parallel segmentation mode; the argument is the number of
# worker processes.
# NOTE(review): jieba documents parallel mode as unsupported on Windows --
# confirm the target platform before relying on this call.
jieba.enable_parallel(4)

# Load the external user dictionary of custom words.
jieba.load_userdict('./userdict.txt')

def stopwordslist(filepath):
    """Build the stop-word list from a UTF-8 text file, one word per line.

    Args:
        filepath: Path of the stop-word file to read.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of leading and trailing whitespace. Blank lines yield
        empty strings.
    """
    # The context manager closes the handle deterministically rather than
    # leaving it open until the garbage collector gets to it.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]

def movestopwords(sentence, stopwords_path='./stop_words.txt'):
    """Drop stop words and one-character tokens from a token sequence.

    Args:
        sentence: Iterable of string tokens (for example the output of
            ``jieba.cut``).
        stopwords_path: Path of the stop-word file, one word per line.
            Defaults to the path this script has always used.

    Returns:
        A list of the tokens that are longer than one character and are not
        stop words, in their original order.
    """
    # A set gives constant-time membership tests; with a list every token
    # would trigger a linear scan over the whole stop-word file.
    stopwords = set(stopwordslist(stopwords_path))
    return [
        token for token in sentence
        if len(token) > 1 and token not in stopwords
    ]

def main(top_k=100, output_path='zzz.png'):
    """Extract keywords from the corpus and render them as a word cloud.

    The module-level ``santi_text`` is stripped of HTML tags, segmented with
    jieba and filtered through ``movestopwords``. The top keywords are then
    printed three ways (TF-IDF, TextRank, raw frequency) and a word cloud is
    generated, written to disk and displayed.

    Args:
        top_k: Number of entries printed for each of the three rankings.
        output_path: File the word-cloud image is written to.
    """
    def print_ranked(pairs):
        # Print one "<word> <score>" line per (word, score) pair.
        for word, score in pairs:
            print(word, score)

    # Part-of-speech filter shared by both extractors (jieba tag set:
    # place names, nouns, verbal nouns, verbs).
    allowed_pos = ('ns', 'n', 'vn', 'v')

    words = jieba.cut(PyQuery(santi_text).text())  # strip HTML tags, then segment
    word_list = movestopwords(words)  # drop stop words and single-character tokens
    words_split = " ".join(word_list)  # space-separated string for the extractors

    # NOTE: the banner below spells the algorithm "tf-tdf"; it is runtime
    # output and is deliberately left unchanged here.
    print('以下是tf-tdf算法-------------------------------------------------')
    keywords_tf = jieba.analyse.extract_tags(
        words_split, topK=top_k, withWeight=True, allowPOS=allowed_pos)  # TF-IDF
    print_ranked(keywords_tf)

    print('以下是textrank算法-------------------------------------------------')
    keywords_rank = jieba.analyse.textrank(
        words_split, topK=top_k, withWeight=True, allowPOS=allowed_pos)  # TextRank
    print_ranked(keywords_rank)

    print('以下是纯词频统计-------------------------------------------------')
    mycount = Counter(word_list)  # raw word frequencies
    print_ranked(mycount.most_common(top_k))  # the top_k most frequent words, in order

    # A mask image (e.g. np.array(Image.open(...))) can be passed as ``mask=``
    # to shape the cloud; none is used here.
    wc = WordCloud(
        background_color="#000000",  # background colour
        max_words=50,  # maximum number of words drawn (library default: 200)
        max_font_size=400,  # largest font size
        min_font_size=10,  # smallest font size (library default: 4)
        random_state=42,  # fixed seed so the generated layout is reproducible
        font_path='./SimHei.ttf',  # font file used to draw the words
    )

    my_wordcloud = wc.generate(words_split)  # build the cloud from the text

    # Save before showing: plt.show() can block until the window is closed,
    # and the image should not be lost if the viewer is interrupted.
    wc.to_file(output_path)

    plt.imshow(my_wordcloud)  # display the cloud
    plt.axis("off")  # hide both axes
    plt.show()

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

 

# posted @ 2018-07-31 14:21  Erick-LONG  (views: 9500, comments: 0)