# jieba 分词及词频统计小项目 (jieba word segmentation and word-frequency counting mini-project)

import pandas as pd
import jieba
import jieba.analyse
from collections import Counter,OrderedDict
jieba.load_userdict('./userdict.txt')  # 加载外部 用户词典


def stopwordslist(filepath):
    """Load a stop-word list from a text file.

    Args:
        filepath: Path to a UTF-8 text file with one stop word per line.

    Returns:
        list[str]: Each line with surrounding whitespace stripped
        (blank lines become empty strings, matching the original behavior).
    """
    # FIX: use a context manager so the file handle is always closed —
    # the original `open(...).readlines()` leaked the handle.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def text_cut(text1):
    """Extract top keywords from a text, filtered against the stop-word list.

    Args:
        text1: Raw Chinese text to run TF-IDF keyword extraction on.

    Returns:
        str: Comma-joined keywords that are longer than one character
        and not stop words.
    """
    # FIX: hold stop words in a set for O(1) membership tests instead of
    # scanning a list once per candidate word.
    # NOTE(review): the stop-word file is still re-read on every call;
    # consider caching it at module level when applying over many rows.
    stopwords = set(stopwordslist('./stop_words.txt'))  # 这里加载停用词的路径
    # TF-IDF keyword extraction limited to the listed POS tags.
    words = jieba.analyse.extract_tags(
        text1, topK=6, withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v', 'm', 'q'))
    santi_words = [x for x in words if len(x) > 1 and x not in stopwords]
    return ','.join(santi_words)


def cut_term():
    """Read the source sheet, derive a keyword column, and save a new workbook.

    Side effects: reads './xxx.xlsx' (Sheet3) and writes './Q2-xxxx_new2.xlsx'.
    """
    frame = pd.read_excel('./xxx.xlsx', sheet_name='Sheet3')
    # Run keyword extraction on every row of the '合并' (merged text) column.
    frame['term'] = frame['合并'].apply(text_cut)
    print(frame.head())
    frame.to_excel('./Q2-xxxx_new2.xlsx', index=False)


def make_count(data):
    """Write one worksheet per industry with descending term frequencies.

    Args:
        data: DataFrame with a '一xxx' (industry) column and a 'term'
              column of comma-joined keywords (as produced by cut_term).

    Side effect: writes './Q2分行业分词结果11.xlsx', one sheet per industry.
    """
    # FIX: context manager guarantees the workbook is finalized/closed
    # even if an exception occurs mid-loop.
    with pd.ExcelWriter('./Q2分行业分词结果11.xlsx', engine='xlsxwriter') as writer:
        # FIX: original line was `['xxxx]` — an unterminated string literal
        # (syntax error).  The placeholder value is preserved.
        all_industry = ['xxxx']
        for industry in all_industry:
            cut = data[data['一xxx'] == industry]['term'].tolist()
            terms = []
            for row in cut:
                terms.extend(row.split(','))
            print(len(terms))
            # Counter.most_common() already yields (term, count) pairs sorted
            # by descending count, replacing the dict/sorted round trip.
            ranked = Counter(terms).most_common()
            df = pd.DataFrame(ranked, columns=[f'{industry}-词', '频率'])
            df.to_excel(writer, sheet_name=industry, index=False)


# Script entry: load the already-segmented data and write the per-industry
# term-frequency workbook.
# NOTE(review): this runs on import — consider wrapping in an
# `if __name__ == "__main__":` guard.
data = pd.read_excel('./xxxxxx.xlsx', sheet_name='Sheet1')

make_count(data)

 

# posted @ 2019-07-12 16:55  Erick-LONG  阅读(447)  评论(0)  (blog footer retained as a comment so the file parses)