Word frequency statistics with jieba

#!/usr/bin/env python
# _*_ coding: utf-8 _*_
# @Time     : 2017/4/18 15:22
# @Author   : otfsenter
# @File     : strip_extr.py
import pprint

import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import PIL
from wordcloud import WordCloud


def wordcloudplot(txt, font_path=r'C:\Windows\Fonts\verdana.ttf',
                  mask_path='she.jpg', out_path='she2.jpg'):
    """Render ``txt`` as a masked word cloud, save it, and display it.

    Args:
        txt: Text whose words are drawn (``main`` passes words joined by
            single spaces).
        font_path: Font file handed to ``WordCloud``.
        mask_path: Image whose pixel array is used as the cloud mask.
        out_path: File the rendered cloud is written to.
    """
    # Import the submodule explicitly: a bare ``import PIL`` does not load
    # ``PIL.Image``, so ``PIL.Image.open`` only works when some other module
    # happens to have imported it first.
    from PIL import Image

    # The font path is passed through unchanged.  The previous
    # ``unicode(path, 'utf8').encode('gb18030')`` round-trip produced the same
    # bytes for this ASCII path on Python 2 and raises NameError on Python 3.
    # NOTE(review): Verdana probably has no CJK glyphs; if ``txt`` is Chinese,
    # pass a CJK-capable font via ``font_path`` -- confirm on the target box.
    mask = np.array(Image.open(mask_path))

    # NOTE(review): the wordcloud docs say width/height are ignored when a
    # mask is given; they are kept only to leave the call unchanged.
    cloud = WordCloud(font_path=font_path,
                      background_color='white',
                      margin=1, width=10, height=2, mask=mask,
                      max_words=200, max_font_size=1000, random_state=42)
    cloud = cloud.generate(txt)
    cloud.to_file(out_path)

    plt.imshow(cloud)
    plt.axis('off')
    plt.show()


def main():
    """Segment ``nms.txt`` with jieba and plot its multi-character words.

    Reads the whole file, cuts it into words, keeps only words longer than
    one character, joins them with spaces, and hands the result to
    ``wordcloudplot``.
    """
    # Read raw bytes inside ``with`` so the handle is closed promptly, and so
    # decoding does not depend on the platform's locale codec.  jieba's README
    # documents that the input may be unicode, UTF-8 or GBK encoded.
    with open('nms.txt', 'rb') as fh:
        raw = fh.read()

    # Binary mode leaves '\r\n' untouched, so strip before measuring to make
    # sure a line break is never counted as a two-character word.
    # Words stay as text (no per-word ``encode``): joining bytes with a str
    # separator raises TypeError on Python 3.
    words = [word for word in jieba.cut(raw) if len(word.strip()) > 1]

    # No explicit frequency table is built here; ``WordCloud.generate``
    # derives word frequencies from the text it is given.
    wordcloudplot(' '.join(words))


# Run the word-cloud pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()

posted @ 2017-04-21 14:26 idlewith 阅读(370) 评论(0) 收藏举报

刷新页面返回顶部

idlewith

个人博客：https://idlewith.com/，欢迎访问

jieba Word frequency statistics

公告