代码改变世界

作业4

2018-09-27 14:33  cqchenqin  阅读(252)  评论(0)  编辑  收藏  举报
# 2018年9月27日完整的英文词频统计
# strYee = ''''''.lower()
# 准备utf-8编码的文本文件file
fo = open('Nothingtolose.txt','r',encoding='utf-8')
# 通过文本读取字符串str
strYee = fo.read()
fo.close()
print(strYee)
# 对文本进行预处理
# 字符串预处理
# 大小写
# 标点符号
# 特殊符号
# 分解提取单词list
sep = '''.,;:?!-_'''
for ch in sep:
    strYee = strYee.replace(ch,' ')
strList = strYee.split()
print(len(strList),strList)
# 单词计数字典set,dict
# 排除语法型词汇,代词、冠词、连词等无语义词
strSet = set(strList)
exclude = {'a','the','and','i','you','in'}
strSet = strSet-exclude
print(len(strSet),strSet)

strDic = {}
for word in strSet:
    strDic[word] = strList.count(word)
print(len(strDic),strDic)
# 按词频排序list。sort(key=)
wcList = list(strDic.items())
print(wcList)
wcList.sort(key=lambda x:x[1],reverse=True)
print(wcList)
# print(strDic.items())
# 输出TOP(20)
for i in range(20):
    print(wcList[i])

import jieba
fo = open('我们的少年时代.txt','r',encoding='utf-8')
strYee = fo.read()
fo.close()
print(strYee)
print(list(jieba.cut(strYee)))
print(list(jieba.cut(strYee,cut_all=True)))
print(list(jieba.cut_for_search(strYee)))

 

# 中文词频统计
import jieba
fo = open('我们的少年时代.txt','r',encoding='utf-8').read()
wordsls = jieba.lcut(fo)
wcdict = {}
for word in wordsls:
    if len(word)==1:
        continue
    else:
        wcdict[word]=wcdict.get(word,0)+1
# wcdict['陶西']=wcdict['陶西']+wcdict['陶西说']
# del(wcdict['陶西说'])
# wcdict['班小松']=wcdict['班小松']+wcdict['班小松说']
# del(wcdict['班小松说'])
wcls = list(wcdict.items())
wcls.sort(key=lambda x:x[1],reverse=True)
open('我们的少年时代.txt','w').write(wcls[1][0])
for i in range(25):
    print(wcls[i])