# jieba分词及词频统计小项目 — jieba word-segmentation and word-frequency statistics mini-project
import functools
from collections import Counter

import pandas as pd
import jieba
import jieba.analyse

# Load the external user dictionary so domain-specific terms are kept as
# single tokens during segmentation.
jieba.load_userdict('./userdict.txt')


def stopwordslist(filepath):
    """Read a stop-word file (one word per line, UTF-8) and return the words as a list.

    The original version never closed the file handle; `with` guarantees closure.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


@functools.lru_cache(maxsize=None)
def _stopwords_set(filepath):
    """Cached, hashable set of stop words.

    text_cut() is applied once per DataFrame row; caching avoids re-reading
    and re-parsing the stop-word file on every call, and a frozenset gives
    O(1) membership tests instead of the original O(n) list scan.
    """
    return frozenset(stopwordslist(filepath))


def text_cut(text1):
    """Extract up to 6 TF-IDF keywords from *text1* and return them comma-joined.

    Only tokens with the allowed POS tags are considered; single-character
    tokens and stop words are filtered out.
    """
    stopwords = _stopwords_set('./stop_words.txt')  # stop-word file path
    words = jieba.analyse.extract_tags(
        text1,
        topK=6,
        withWeight=False,
        allowPOS=('ns', 'n', 'vn', 'v', 'm', 'q'),
    )
    santi_words = [x for x in words if len(x) > 1 and x not in stopwords]
    return ','.join(santi_words)


def cut_term():
    """Read the source workbook, add a 'term' keyword column, and save a new workbook."""
    data = pd.read_excel('./xxx.xlsx', sheet_name='Sheet3')
    data['term'] = data['合并'].apply(text_cut)
    print(data.head())
    data.to_excel('./Q2-xxxx_new2.xlsx', index=False)


def make_count(data):
    """Count term frequencies per industry and write one sheet per industry.

    Expects *data* to have a 'term' column of comma-joined keywords and a
    '一xxx' column holding the industry label.
    """
    # NOTE(review): the original literal was unclosed (`['xxxx]`) — a syntax
    # error. Reconstructed as a single placeholder entry; fill in the real
    # industry names.
    all_industry = ['xxxx']
    # Context manager guarantees the workbook is finalized even on error.
    with pd.ExcelWriter('./Q2分行业分词结果11.xlsx', engine='xlsxwriter') as writer:
        for industry in all_industry:
            cells = data[data['一xxx'] == industry]['term'].tolist()
            terms = []
            for cell in cells:
                terms.extend(cell.split(','))
            print(len(terms))  # progress: number of tokens for this industry
            # most_common() yields (term, count) pairs sorted by count, descending —
            # equivalent to the original dict(sorted(...)) round-trip.
            df = pd.DataFrame(
                Counter(terms).most_common(),
                columns=[f'{industry}-词', '频率'],
            )
            df.to_excel(writer, sheet_name=industry, index=False)


if __name__ == '__main__':
    data = pd.read_excel('./xxxxxx.xlsx', sheet_name='Sheet1')
    make_count(data)