综合练习:词频统计

from collections import Counter

# Punctuation marks that are blanked out before the text is split into words.
sep = '''.,'?!:"'''
# Stop words that are left out of the frequency ranking.
exclude = {'the','and','to','a','of','was','on','with','i','s','is','were','that','back','at','little','have'}


def count_words(text, separators=sep, stop_words=exclude):
    """Count the words of ``text``, ignoring case, punctuation and stop words.

    Args:
        text: The raw text to analyse.
        separators: Characters replaced by a space before the text is split.
        stop_words: Lower-case words that must not be counted.

    Returns:
        A ``Counter`` mapping every remaining lower-case word to the number
        of times it occurs.
    """
    for mark in separators:
        text = text.replace(mark, ' ')
    # A single pass over the words; calling list.count() once per distinct
    # word would rescan the whole list every time.
    return Counter(
        word for word in text.lower().split() if word not in stop_words
    )


def rank_words(text, limit=None):
    """Return ``(word, count)`` pairs of ``text``, most frequent first.

    Args:
        text: The raw text to analyse.
        limit: Maximum number of pairs to return; ``None`` returns all.

    Returns:
        A list of ``(word, count)`` tuples. It is simply shorter than
        ``limit`` when the text has fewer distinct words.
    """
    return count_words(text).most_common(limit)


def main():
    """Print the 20 most frequent words of test.txt and log the top 25."""
    with open('test.txt', 'r') as f:
        news = f.read()

    dictList = rank_words(news)

    # Slicing instead of indexing with range(20): a short text must not
    # raise IndexError.
    for entry in dictList[:20]:
        print(entry)

    # Append mode is kept from the original script; the with-block makes
    # sure the file is flushed and closed.
    with open('newscount.txt', 'a') as f:
        for word, count in dictList[:25]:
            f.write(word + ' ' + str(count) + '\n')


if __name__ == '__main__':
    main()

运行结果:

中文统计

import jieba

from collections import Counter

# Punctuation that is blanked out before the text is segmented.
sep=''',。‘’“”:;()!?、《》 '''
# Stop words (mostly single characters and particles) left out of the ranking.
exclude={'我', '在', '不', '一', '了', '那', '是', '来', '他', '个', '行', '你', '的',
     '者','有','\n','-','出','这','时','没','她','到','上','们','会','着','说','要'
    , '为','过','看','得','里','克','去','想','好','天','小','后','地','么','都'
    , '还','以','对','能','大','也','很','而','然','下','但','吕','把','开','从'
    , '让','就','一个','可','点','跟','样','向','事','起','中','面'}


def count_tokens(tokens, stop_words=exclude):
    """Count how often each token occurs, skipping stop words and whitespace.

    Args:
        tokens: An iterable of already segmented words.
        stop_words: Tokens that must not be counted.

    Returns:
        A ``Counter`` mapping each remaining token to its number of
        occurrences in ``tokens``.
    """
    # Counting the token stream itself (rather than str.count on the raw
    # text) keeps a short token from also being counted inside longer words.
    # Whitespace-only tokens are dropped because every separator was turned
    # into a space and would otherwise show up as a "word".
    return Counter(
        token for token in tokens
        if token.strip() and token not in stop_words
    )


def top_tokens(tokens, limit=20, stop_words=exclude):
    """Return the ``limit`` most frequent ``(token, count)`` pairs.

    Args:
        tokens: An iterable of already segmented words.
        limit: Maximum number of pairs to return.
        stop_words: Tokens that must not be counted.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first. It is
        shorter than ``limit`` when there are fewer distinct tokens.
    """
    return count_tokens(tokens, stop_words).most_common(limit)


def main():
    """Segment text.txt with jieba and append the top 20 words to new.txt."""
    with open('text.txt', 'r', encoding='utf-8') as f:
        news = f.read()

    for c in sep:
        news = news.replace(c, ' ')

    dictList = top_tokens(jieba.cut(news))

    # Append mode is kept from the original script.
    with open('new.txt', 'a', encoding="utf-8") as f:
        for word, count in dictList:
            f.write(word + ':' + str(count) + '\n')


if __name__ == '__main__':
    main()

 结果:

 

posted @ 2018-03-27 17:11  203陈冠权  阅读(117)  评论(0)  编辑  收藏  举报