综合练习：词频统计

一、英文词频统计


 1 f=open("text.txt","r")
 2 song=f.read()
 3 f.close()
 4  
 5 sep=''',.?—!"'''
 6  
 7 exclude={'the','and','i','in',"i'm",'a','of','an','on','to','with'}
 8  
 9 for c in sep:
10     song=song.replace(c,' ')
11  
12 swl=song.lower().split()
13  
14 swd={}
15  
16 sws=set(swl)-exclude
17  
18 for w in sws:
19     swd[w]=swl.count(w)
20  
21 fl=list(swd.items())
22  
23 fl.sort(key = lambda x:x[1],reverse = True)
24  
25 for i in fl:
26     print(i)
27  
28 f=open("result.txt","w")
29 for i in range(20):
30     f.write(fl[i][0]+"  "+str(fl[i][1])+"\n")
31 f.close()

二、中文词频统计

 1 import jieba
 2  
 3 f = open('xiyouji.txt','r', encoding='utf-8')
 4 text = f.read()
 5 f.close()
 6  
 7  
 8 import jieba
 9   
10 #打开文件
11 file = open("zgsjtl.txt",'r',encoding="utf-8")
12 notes = file.read();
13 file.close();
14   
15 #替换标点符号
16 sep = '''：。，？！；∶ ．．．“”'''
17 for i in sep:
18     notes = notes.replace(i,' ');
19   
20 notes_list = list(jieba.cut(notes));
21   
22   
23  
24 exclude =[' ','\n','你','嗯','他','和','但','啊','的','来','是','去','在','上','走']
25   
26 notes_dict={}
27 for w in notes_list:
28     notes_dict[w] = notes_dict.get(w,0)+1
29   
30 for w in exclude:
31     del (notes_dict[w]);
32   
33 for w in notes_dict:
34     print(w,notes_dict[w])
35   
36   
37  
38 dictList = list(notes_dict.items())
39 dictList.sort(key=lambda x:x[1],reverse=True);
40 print(dictList)
41   
42  
43 for i in range(20):
44     print(dictList[i])
45  
46 outfile = open("top20.txt","a")
47 for i in range(20):
48     outfile.write(dictList[i][0]+" "+str(dictList[i][1])+"\n")
49 outfile.close();

posted on 2018-03-28 21:59 范楚广 阅读(215) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

导航

公告

综合练习：词频统计