一、英文词频统计
# English word-frequency count: read text.txt, print every (word, count)
# pair from most to least frequent, and save the top 20 to result.txt.
from collections import Counter

# Punctuation that is replaced by a space before the text is split into words.
EN_SEPARATORS = ''',.?—!"'''

# Words that are left out of the statistics.
EN_STOPWORDS = frozenset({
    'the', 'and', 'i', 'in', "i'm", 'a', 'of', 'an', 'on', 'to', 'with',
})

# How many of the most frequent words are written to the result file.
EN_TOP_N = 20


def count_english_words(text, exclude=EN_STOPWORDS, separators=EN_SEPARATORS):
    """Return ``(word, count)`` pairs for *text*, most frequent first.

    Every character of *separators* is replaced by a space, the text is
    lower-cased and split on whitespace, and words found in *exclude* are
    dropped. Words with equal counts keep the order of their first
    occurrence, because the sort is stable.
    """
    blank_out = str.maketrans(dict.fromkeys(separators, ' '))
    words = text.translate(blank_out).lower().split()
    # One counting pass instead of calling list.count() once per distinct word.
    counts = Counter(word for word in words if word not in exclude)
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def english_top_lines(ranked, limit=EN_TOP_N):
    """Format the first *limit* pairs of *ranked* as ``"word count\\n"`` lines."""
    # Slicing copes with fewer than *limit* entries; indexing range(limit)
    # would raise IndexError on a short text.
    return [f"{word} {count}\n" for word, count in ranked[:limit]]


def english_word_report(source="text.txt", target="result.txt", top_n=EN_TOP_N):
    """Print all word counts from *source* and write the top *top_n* to *target*.

    No encoding is passed to ``open``, so the platform default is used for
    both files.
    """
    with open(source, "r") as infile:
        text = infile.read()

    ranked = count_english_words(text)
    for pair in ranked:
        print(pair)

    with open(target, "w") as outfile:
        outfile.writelines(english_top_lines(ranked, top_n))


if __name__ == "__main__":
    english_word_report()
二、中文词频统计
# Chinese word-frequency count: read zgsjtl.txt, segment it with jieba,
# print the counts, and append the 20 most frequent tokens to top20.txt.
from collections import Counter

# Punctuation that is replaced by a space before segmentation.
ZH_SEPARATORS = ''':。,?!;∶ ...“”'''

# Tokens that are left out of the statistics (whitespace and common words).
ZH_STOPWORDS = frozenset({
    ' ', '\n', '你', '嗯', '他', '和', '但', '啊', '的', '来', '是', '去',
    '在', '上', '走',
})

# How many of the most frequent tokens are printed and written out.
ZH_TOP_N = 20


def segment_chinese(text, separators=ZH_SEPARATORS):
    """Replace each separator character with a space and segment with jieba.

    Returns the tokens produced by ``jieba.cut`` as a list.
    """
    # jieba is needed only for segmentation; importing it here keeps the
    # counting helpers usable where jieba is not installed.
    import jieba

    blank_out = str.maketrans(dict.fromkeys(separators, ' '))
    return list(jieba.cut(text.translate(blank_out)))


def count_chinese_tokens(tokens, exclude=ZH_STOPWORDS):
    """Count *tokens*, skipping those in *exclude*.

    Filtering while counting avoids deleting stop words afterwards, which
    raised KeyError for any stop word that never occurred in the text.
    Keys are kept in order of first occurrence.
    """
    return Counter(token for token in tokens if token not in exclude)


def rank_by_count(counts):
    """Return the ``(token, count)`` pairs of *counts*, highest count first.

    The sort is stable, so equal counts keep the mapping's own order.
    """
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def chinese_top_lines(ranked, limit=ZH_TOP_N):
    """Format the first *limit* pairs of *ranked* as ``"token count\\n"`` lines."""
    # Slicing copes with fewer than *limit* entries; indexing range(limit)
    # would raise IndexError on a short text.
    return [f"{token} {count}\n" for token, count in ranked[:limit]]


def chinese_word_report(source="zgsjtl.txt", target="top20.txt", top_n=ZH_TOP_N):
    """Print token counts from *source* and append the top *top_n* to *target*."""
    with open(source, 'r', encoding="utf-8") as infile:
        notes = infile.read()

    counts = count_chinese_tokens(segment_chinese(notes))

    # Every remaining token with its count, in order of first occurrence.
    for token, count in counts.items():
        print(token, count)

    ranked = rank_by_count(counts)
    print(ranked)

    for pair in ranked[:top_n]:
        print(pair)

    # Append mode: repeated runs accumulate results in the file.
    # UTF-8 is set explicitly so the output does not depend on the platform
    # default encoding, matching how the input is read.
    with open(target, "a", encoding="utf-8") as outfile:
        outfile.writelines(chinese_top_lines(ranked, top_n))


if __name__ == "__main__":
    chinese_word_report()