用 Python 统计英文词频

比较简单的功能,需求只到了这里,所以也就没有继续下去了。

 1 # -*- encoding: utf-8 -*-   
 2 # by sorcerdu
 3 #基本功能和用法在提示中
 4 #原理是利用分隔符分词存入列表,然后从列表读出存入字典,键为词,值存放词的数量
 5 #中文统计词频的话,得先分词后再进行。
 6 import os,string,codecs
 7 import sys,time
 8 
 9 def readfile():
10     wordlist=[]
11     base=open('base.txt','r')
12     baseinfo=base.readlines()
13     tagf=open('tag.txt','r')
14     tagfinfo=tagf.readlines()
15     for i in tagfinfo:
16         tags=i.split(' ')
17     for i in baseinfo:
18         words=i.split(' ')
19         for word in words:
20             if word != '\t'and word != '\n' and word!=' ' and word != '' and word>=2:
21                word=word.replace('\t','')
22                word=word.replace('\n','')
23                word=word.replace(' ','')
24                word=word.replace('.\n','')
25                if word!='':
26                    wordlist.append(word)
27 ##        tags=['.','"',',','!','?','(',')']
28         for x in range(len(tags)):
29             tag=tags[x]
30             for k in range(len(wordlist)):
31                 if tag in wordlist[k]: #用符号分割
32                     words=wordlist[k].split(tag)
33                     del wordlist[k]
34                     for  j in range(len(words)): #去掉判断后的空字符
35                         if words[j]!='':
36                             wordlist.append(words[j])
37 
38 
39     
40     base.close()
41     tagf.close()
42     return wordlist
43 
44 
45 
def getstr(word,count,allwordnum):
    """Build one report line: ``word--------count--------total``.

    ``word`` must already be a string; ``count`` and ``allwordnum`` are
    converted with ``str()`` before being joined.
    """
    fields = [word, str(count), str(allwordnum)]
    return '--------'.join(fields)
49 
if __name__ == "__main__":
    # Print the usage hints before touching any file, so they are still
    # visible when base.txt or tag.txt is missing and readfile() raises.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('******************************************')
    print(u'提示:')
    print(u'     1、要统计的文章放置于本程序路径下的base.txt中')
    print(u'     2、单词分割符存放在本程序路径下的tag.txt中,以空格为分隔符,默认已对换码符,换行符,空格,句号(英文)处理')
    print(u'     3、统计的结果保存在本程序路径下的count.txt中')
    print('******************************************')
    print(u"开始统计咯......")

    wordlist = readfile()
    allwordnum = len(wordlist)

    # Map each word to its number of occurrences.
    wordcnt = {}
    for word in wordlist:
        wordcnt[word] = wordcnt.get(word, 0) + 1

    # Most frequent words first, ties in alphabetical order, so the report
    # is deterministic instead of following dict iteration order.
    ranked = sorted(wordcnt.items(), key=lambda item: (-item[1], item[0]))

    print('------------------------------------------------------------------------')
    # The with-block closes count.txt even if writing fails part-way.
    with open('count.txt', 'w') as outdata:
        for word, cnt in ranked:
            line = getstr(word, cnt, allwordnum)
            print(line)
            outdata.write(line + '\n')
    print('------------------------------------------------------------------------')
    print(u"完成")

    # "pause" is a Windows shell built-in that keeps a double-clicked console
    # window open; on other systems it does not exist, so skip it there.
    if os.name == 'nt':
        print(u'按任意键退出')
        os.system("pause")

 

posted @ 2013-01-13 21:32  sorcerdu  阅读(3939)  评论(0)  编辑  收藏  举报