Python统计excel表格中文本的词频,生成词云图片

Posted on 2019-03-07 18:11  Mr.ning  阅读(4293)  评论(3)  编辑  收藏  举报
import xlrd
import jieba
import pymysql
import matplotlib.pylab as plt
from wordcloud import WordCloud
from collections import Counter
import numpy as np

def getExcelData(excel, txt):
    """Count word frequencies in one column of an Excel sheet.

    Opens the workbook at ``excel``, collects the text cells found in
    column index 2 of the sheet at index 2, segments the text with jieba
    and counts every token longer than one character.

    Args:
        excel: Path of the workbook, handed to ``xlrd.open_workbook``.
        txt: Seed text that is segmented together with the cell text
            (pass ``''`` when there is none).

    Returns:
        A dict mapping token -> occurrence count, ordered from the most
        to the least frequent token. It is meant to be used as the
        frequency source for the word cloud.
    """
    workbook = xlrd.open_workbook(excel)
    # Indices are 0-based, so index 2 is the THIRD sheet.
    # NOTE(review): the original comment said "second sheet" -- confirm
    # which sheet is actually intended.
    sheet = workbook.sheet_by_index(2)

    pieces = [txt] if txt else []
    for row in range(sheet.nrows):
        value = sheet.cell(row, 2).value  # column index 2 = third column
        # Keep non-empty text cells only. Numeric/date/boolean cells are
        # returned by xlrd as non-str values, which used to raise
        # TypeError when concatenated onto the text.
        if isinstance(value, str) and value:
            pieces.append(value)

    # Join with a newline so the end of one cell cannot fuse with the
    # start of the next into a bogus token; the one-character separator
    # is dropped by the length filter below. A single join also avoids
    # the quadratic cost of repeated "+=".
    text = '\n'.join(pieces)

    counts = Counter(
        word for word in jieba.cut(text)
        if len(word) > 1 and word != '\r\n'
    )
    # most_common() yields (token, count) pairs sorted by count.
    return dict(counts.most_common())

def makeWordCloud(txt, output_path='abc.png',
                  font_path=r"C:\Windows\Fonts\STXINGKA.TTF"):
    """Render a word cloud from word frequencies, save it and show it.

    Args:
        txt: Mapping of word -> frequency (despite the name, this is not
            raw text); it is passed to ``generate_from_frequencies``.
        output_path: File the rendered image is written to. Defaults to
            the previously hard-coded ``'abc.png'``.
        font_path: Font file used to draw the words. Defaults to the
            previously hard-coded Windows STXINGKA font; pass another
            font on systems where that file does not exist.

    Returns:
        None. The image is written to ``output_path`` and displayed in a
        matplotlib window.
    """
    # Build the mask on a 300x500 grid: cells outside the circle of
    # radius 150 centred at (150, 150) get 255, cells inside get 0.
    x, y = np.ogrid[:300, :500]
    outside = (x - 150) ** 2 + (y - 150) ** 2 > 150 ** 2
    mask = 255 * outside.astype(int)

    wc = WordCloud(
        background_color="white",
        max_words=500,
        mask=mask,
        repeat=True,
        # NOTE(review): the wordcloud docs state that width/height are
        # ignored when a mask is supplied; kept to leave the call as is.
        width=1000,
        height=1000,
        scale=4,  # larger value -> higher resolution, crisper glyphs
        # The default is a raw string: the old plain literal relied on
        # "\W", "\F" and "\S" being invalid escapes that Python happens
        # to keep verbatim (it now warns about them).
        font_path=font_path,
    )
    wc.generate_from_frequencies(txt)
    wc.to_file(output_path)

    plt.axis("off")
    plt.imshow(wc, interpolation="bilinear")
    plt.show()


if __name__ == '__main__':
    # Script entry point: count the words in getdata.xlsx (starting from
    # an empty seed text) and render the result as a word cloud.
    txt = ''
    frequencies = getExcelData('getdata.xlsx', txt)
    makeWordCloud(frequencies)