python统计文档中词频

python统计文档中词频的小程序

python版本2.7

效果如下:

程序如下，测试文件与完整程序在我的 GitHub 中。

 1 #统计空格数与单词数 本函数只返回了空格数 需要的可以自己返回多个值
 2 def count_space(path):
 3     number_counts = 0
 4     space_counts = 0
 5     number_list = []
 6 
 7     with open(path, 'r') as f:
 8         for line in f:
 9             line = line.strip()
10             space_split_list = line.split(' ')
11             space_counts += len(space_split_list) - 1
12             for word in space_split_list:
13                     if word.isdigit():
14                         number_list.append(word)
15             number_counts = len(number_list)
16 
17     return space_counts
def count_word(path):
    """Build a case-insensitive word-frequency table for a text file.

    The whole file is lower-cased, the characters ``"``, ``,`` and ``.`` are
    removed, and the remainder is split on whitespace.

    Args:
        path: Path of the text file to read.

    Returns:
        A dict mapping each word to the number of times it occurs.

    Raises:
        IOError: If the file cannot be opened for reading.
    """
    # Imported locally so the function does not depend on module-level
    # imports that this listing does not show.
    import collections
    import re

    with open(path) as fileread:
        alltext = fileread.read().lower()

    # Raw string: the original "\." relied on an invalid escape sequence.
    alltext = re.sub(r'[",.]', "", alltext)

    # Return a plain dict so callers get the same type as before.
    return dict(collections.Counter(alltext.split()))
34 
35 
def sort_by_count(d):
    """Return the entries of *d* ordered by descending count.

    Args:
        d: Mapping of word to occurrence count.

    Returns:
        A ``collections.OrderedDict`` with the same items as *d*, highest
        count first. Items with equal counts keep the order in which *d*
        yields them, because the sort is stable.
    """
    # Imported locally so the function does not depend on module-level
    # imports that this listing does not show.
    import collections

    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)
40 
41 
if __name__ == '__main__':
    # Bound before the try so the error handler can always reference it.
    filename = 'read.txt'
    try:
        # Both passes over the file happen before anything is printed.
        dword = sort_by_count(count_word(filename))
        countspace = count_space(filename)
    except IOError:
        print('cannot open file %s for read' % filename)
    else:
        # Single-argument print(...) gives the same output on Python 2.7
        # and Python 3.
        print("space_counts %d" % countspace)
        for key, value in dword.items():
            print(key + ":%d" % value)

 

posted @ 2016-04-24 13:36  ryan255  阅读(3178)  评论(0编辑  收藏  举报