作业4
# Blog post metadata: 2018-09-27 14:33, cqchenqin (252 views, 0 comments).
# 2018-09-27: complete English word-frequency count.
from collections import Counter

# Punctuation treated as word separators; each character becomes a space.
_SEPARATOR_TABLE = str.maketrans({ch: ' ' for ch in '.,;:?!-_'})

# Function words (articles, pronouns, conjunctions, ...) left out of the count.
EXCLUDED_WORDS = frozenset({'a', 'the', 'and', 'i', 'you', 'in'})


def split_english_words(text):
    """Return the lower-cased words of *text* with separators stripped.

    Lower-casing makes "The" and "the" count as the same word and lets the
    all-lowercase ``EXCLUDED_WORDS`` entries match capitalised occurrences.
    """
    return text.translate(_SEPARATOR_TABLE).lower().split()


def count_english_words(words, exclude=EXCLUDED_WORDS):
    """Return a ``{word: occurrences}`` dict for *words*, skipping *exclude*.

    A single pass with ``Counter`` replaces calling ``list.count`` once per
    distinct word, which was quadratic in the length of the text.
    """
    return dict(Counter(word for word in words if word not in exclude))


def rank_english_words(counts):
    """Return ``(word, count)`` pairs ordered from most to least frequent."""
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def run_english_word_count(path='Nothingtolose.txt', top_n=20):
    """Print the word-frequency analysis of the UTF-8 text file at *path*.

    Prints the raw text, the word list, the counted vocabulary, the counts
    (unsorted, then sorted) and finally the *top_n* most frequent words.
    Returns the full ranked list of ``(word, count)`` pairs.
    """
    with open(path, 'r', encoding='utf-8') as fo:
        text = fo.read()
    print(text)

    # Pre-processing: case folding, punctuation removal, split into words.
    words = split_english_words(text)
    print(len(words), words)

    # Vocabulary without the excluded function words.
    vocabulary = set(words) - EXCLUDED_WORDS
    print(len(vocabulary), vocabulary)

    counts = count_english_words(words)
    print(len(counts), counts)

    # Sort by frequency, highest first.
    print(list(counts.items()))
    ranked = rank_english_words(counts)
    print(ranked)

    # Slicing (rather than indexing 0..top_n-1) copes with short texts that
    # contain fewer than top_n distinct words.
    for entry in ranked[:top_n]:
        print(entry)
    return ranked


if __name__ == '__main__':
    run_english_word_count()
import jieba


def show_jieba_segmentation_modes(path='我们的少年时代.txt'):
    """Print the UTF-8 text at *path* and three jieba segmentations of it.

    The segmentations are printed as lists, in this order: the default
    ``cut``, ``cut`` with ``cut_all=True``, and ``cut_for_search``.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as fo:
        text = fo.read()
    print(text)

    # jieba.lcut / lcut_for_search return lists directly, equivalent to
    # wrapping the generator-returning cut / cut_for_search in list().
    print(jieba.lcut(text))
    print(jieba.lcut(text, cut_all=True))
    print(jieba.lcut_for_search(text))


if __name__ == '__main__':
    show_jieba_segmentation_modes()
# Chinese word-frequency count.
from collections import Counter


def count_chinese_words(tokens):
    """Return ``(word, count)`` pairs for *tokens*, most frequent first.

    Single-character tokens (mostly particles and punctuation) are skipped.
    Words with equal counts keep the order in which they first appeared.
    """
    counts = Counter(token for token in tokens if len(token) != 1)
    # Optional post-processing (disabled in the original script): fold the
    # tokens '陶西说' and '班小松说' into '陶西' and '班小松' respectively.
    return counts.most_common()


def run_chinese_word_count(path='我们的少年时代.txt',
                           output_path='我们的少年时代_top.txt',
                           top_n=25):
    """Segment the UTF-8 text at *path* with jieba and print its top words.

    Writes the second-ranked word to *output_path*, prints the *top_n* most
    frequent ``(word, count)`` pairs, and returns the full ranked list.
    """
    # Imported here so count_chinese_words stays usable without jieba.
    import jieba

    with open(path, 'r', encoding='utf-8') as fo:
        text = fo.read()
    ranked = count_chinese_words(jieba.lcut(text))

    # NOTE(review): the original wrote this word back onto the input file,
    # destroying the corpus (a second run then failed) and using the platform
    # default encoding. It now goes to a separate UTF-8 file; confirm the
    # intended destination and content.
    if len(ranked) > 1:
        with open(output_path, 'w', encoding='utf-8') as out:
            out.write(ranked[1][0])

    # Slicing copes with texts that yield fewer than top_n distinct words.
    for entry in ranked[:top_n]:
        print(entry)
    return ranked


if __name__ == '__main__':
    run_chinese_word_count()