Python 统计英文词频
比较简单的功能,需求只到了这里,所以也就没有继续下去了。
# -*- encoding: utf-8 -*-
# by sorcerdu
# Basic features and usage are described in the hints printed at start-up.
# How it works: the text is split into words on delimiters and collected in a
# list; the list is then tallied into a mapping whose keys are the words and
# whose values are their occurrence counts.
# To count Chinese word frequencies the text has to be segmented into words
# first.
import os
import string
import codecs
import sys
import time
import io
from collections import Counter


def _split_on_tags(words, tags):
    """Split every word on each delimiter in *tags*, dropping empty pieces."""
    for tag in tags:
        pieces = []
        for word in words:
            # str.split returns [word] unchanged when the tag is absent, so no
            # membership test is needed; empty strings appear where the tag
            # sat at an edge of the word (or the word was nothing but tags).
            pieces.extend(piece for piece in word.split(tag) if piece)
        # Build a fresh list per pass instead of deleting from the list being
        # indexed, which would skip neighbours and can run past the end.
        words = pieces
    return words


def readfile(base_path='base.txt', tag_path='tag.txt', encoding='utf-8-sig'):
    """Read the article and return its words as a list, in document order.

    The article is split on whitespace (spaces, tabs, newlines). A period
    immediately before a line break is dropped. Every resulting word is then
    split further on each delimiter listed in the tag file.

    Args:
        base_path: Path of the text to analyse.
        tag_path: Path of the delimiter file. Delimiters are separated by
            whitespace and may be spread over several lines, for example
            ``. " , ! ? ( )``. An empty file means "no extra delimiters".
        encoding: Encoding used to decode both files. The default,
            ``utf-8-sig``, reads UTF-8 with or without a byte-order mark.

    Returns:
        A list of non-empty word strings.

    Raises:
        IOError/OSError: If either file cannot be opened.
    """
    words = []
    with io.open(base_path, 'r', encoding=encoding) as base:
        for line in base:
            # Sentence-final period at the end of a line is handled by
            # default, as the start-up hints state.
            if line.endswith('.\n'):
                line = line[:-2]
            words.extend(line.split())

    with io.open(tag_path, 'r', encoding=encoding) as tagf:
        # split() with no argument discards newlines and never yields an
        # empty delimiter, which str.split would reject.
        tags = tagf.read().split()

    return _split_on_tags(words, tags)


def getstr(word, count, allwordnum):
    """Format one result line: word, its count, and the total word count."""
    return word + '--------' + str(count) + '--------' + str(allwordnum)


def main():
    """Print the usage hints, count the words and write count.txt."""
    print('******************************************')
    print(u'提示:')
    print(u' 1、要统计的文章放置于本程序路径下的base.txt中')
    print(u' 2、单词分割符存放在本程序路径下的tag.txt中,以空格为分隔符,默认已对换码符,换行符,空格,句号(英文)处理')
    print(u' 3、统计的结果保存在本程序路径下的count.txt中')
    print('******************************************')
    print(u"开始统计咯......")

    # Read the input before opening the output, so that a missing input file
    # does not leave a truncated count.txt behind.
    wordlist = readfile()
    allwordnum = len(wordlist)
    wordcnt = Counter(wordlist)

    print('------------------------------------------------------------------------')
    with io.open('count.txt', 'w', encoding='utf-8') as outdata:
        # most_common() lists the most frequent words first.
        for word, cnt in wordcnt.most_common():
            line = getstr(word, cnt, allwordnum)
            print(line)
            outdata.write(line + '\n')

    print('------------------------------------------------------------------------')
    print(u"完成")
    print(u'按任意键退出')
    if os.name == 'nt':
        # `pause` is a Windows shell built-in; it does not exist elsewhere.
        os.system("pause")
    else:
        sys.stdin.readline()


if __name__ == "__main__":
    main()