综合练习:词频统计
1.英文词频统计
代码如下:
from collections import Counter

# Characters removed from the lyric before it is split into words.
ENGLISH_PUNCTUATION = ''',.?/:;'"'''

# Function words left out of the ranking.  'dont' and 'ill' are listed without
# apostrophes because punctuation is stripped before the words are compared.
ENGLISH_STOP_WORDS = {
    'in', 'on', 'with', 'by', 'for', 'at', 'about', 'under', 'of', 'i',
    'a', 'is', 'its', 'so', 'and', 'dont', 'it', 'to', 'ill', 'the',
}

# Number of top-ranked words appended to the report file.
ENGLISH_TOP_N = 20


def normalize_english(text, punctuation=ENGLISH_PUNCTUATION):
    """Return *text* with *punctuation* removed, lower-cased and stripped."""
    return text.translate(str.maketrans('', '', punctuation)).lower().strip()


def count_english_words(text, stop_words=ENGLISH_STOP_WORDS,
                        punctuation=ENGLISH_PUNCTUATION):
    """Count whole-word occurrences in *text*, ignoring *stop_words*.

    The text is normalised with ``normalize_english`` and split on
    whitespace; each resulting token is counted once per occurrence.

    Counting tokens (rather than calling ``str.count`` on the full text)
    matters: a substring search would also count 'he' inside 'the' or 'she'.

    Args:
        text: Raw text to analyse.
        stop_words: Collection of lower-case words to exclude.
        punctuation: Characters deleted before splitting.

    Returns:
        A ``collections.Counter`` mapping each remaining word to its count.
        Empty input yields an empty counter.
    """
    words = normalize_english(text, punctuation).split()
    return Counter(word for word in words if word not in stop_words)


def main_english():
    """Analyse ``lyric.txt`` and append the top words to ``lyricCount.txt``.

    Prints the same intermediate values as the original script (token list,
    distinct words, cleaned text, per-word counts, ranked list).
    """
    with open('lyric.txt', 'r') as f:
        lyric = f.read()

    result = normalize_english(lyric)
    tempwords = result.split()
    print(tempwords)

    count = count_english_words(lyric)
    print(list(count))
    print(result)

    for word, occurrences in count.items():
        print(f'单词 {word} 的出现次数为:{occurrences}')
    for word, occurrences in count.items():
        print(word)
        print(occurrences)

    # most_common() sorts by count, highest first.
    count_list = count.most_common()
    print(count_list)

    # Append mode is kept from the original script: every run adds another
    # block of lines to the report instead of overwriting it.
    with open('lyricCount.txt', 'a') as f:
        # Slicing (instead of indexing 0..19) tolerates lyrics with fewer
        # than ENGLISH_TOP_N distinct words.
        f.writelines(
            f'{word}:{occurrences}\n'
            for word, occurrences in count_list[:ENGLISH_TOP_N]
        )


if __name__ == '__main__':
    main_english()
运行结果图:
2.中文词频统计
代码如下:
from collections import Counter

# Characters removed from the novel text before segmentation.  The first
# literal is the script's original set; the second adds the fullwidth forms
# of , : ; ( ) ! ? so that Chinese-style punctuation is stripped as well.
CHINESE_PUNCTUATION = ''',。‘’“”:;()!?、 ''' + ',:;()!?'

# Tokens left out of the ranking: whitespace tokens and very common
# single-character words.
CHINESE_STOP_WORDS = {
    '的', '\n', '\u3000', '曰', '之', '不', '人', '军', '操', '一', '将',
    '大', '马', '来', '德', '有', '于', '下', '兵', '此',
    '玄', '公', '见', '为', '何', '中', '而', '可', '吾',
    '出', '也', '以', '与', '上', '后', '今', '其', '去',
    '日', '明', '言',
}

# Names registered with jieba so they are kept as single tokens.
CHINESE_CUSTOM_WORDS = ('曹操', '诸葛亮', '孔明')

# Number of top-ranked tokens appended to the report file.
CHINESE_TOP_N = 20


def strip_chinese_punctuation(text, punctuation=CHINESE_PUNCTUATION):
    """Return *text* with every character in *punctuation* deleted."""
    return text.translate(str.maketrans('', '', punctuation))


def count_chinese_tokens(tokens, stop_words=CHINESE_STOP_WORDS):
    """Count how often each token occurs, ignoring *stop_words*.

    Counting the segmenter's tokens (rather than calling ``str.count`` on the
    full text) matters: a substring search would count a one-character token
    inside every longer word that contains it.

    Args:
        tokens: Iterable of already-segmented strings.
        stop_words: Collection of tokens to exclude.

    Returns:
        A ``collections.Counter`` mapping each remaining token to its count.
        Whitespace-only tokens are skipped.
    """
    return Counter(
        token for token in tokens
        if token not in stop_words and token.strip()
    )


def main_chinese():
    """Analyse ``sanguoyanyi.txt`` and append the top tokens to ``zzzCount.txt``.

    Prints the token list, the distinct tokens and the ranked list, as the
    original script did.
    """
    # Third-party dependency; imported here so the pure helpers above can be
    # imported and tested without jieba installed.
    import jieba

    with open('sanguoyanyi.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    for name in CHINESE_CUSTOM_WORDS:
        jieba.add_word(name)

    text = strip_chinese_punctuation(text)

    # Segment once and reuse the list; the original ran jieba.cut twice just
    # to print the same tokens two times.  Both prints are kept so the console
    # output is unchanged.
    tempwords = list(jieba.cut(text))
    print(tempwords)
    print(tempwords)

    count = count_chinese_tokens(tempwords)
    print(list(count))

    # most_common() sorts by count, highest first.
    count_list = count.most_common()
    print(count_list)

    # Append mode is kept from the original script.  The encoding is now
    # explicit so the Chinese output does not depend on the platform locale
    # and matches the UTF-8 input.
    with open('zzzCount.txt', 'a', encoding='utf-8') as f:
        # Slicing tolerates texts with fewer than CHINESE_TOP_N distinct tokens.
        f.writelines(
            f'{word}:{occurrences}\n'
            for word, occurrences in count_list[:CHINESE_TOP_N]
        )


if __name__ == '__main__':
    main_chinese()
运行结果图: