统计文件夹下面的文本文件中频率最高的单词

通过该练习,需要熟悉lambda在字典排序中的使用,可参考这篇内容http://www.cnblogs.com/kaituorensheng/archive/2012/08/07/2627386.html

 1 # coding:utf-8
 2 import re
 3 import os
 4 
 5 
 6 def get_myfiles(path):
 7 
 8     filepath = os.listdir(path)
 9     files = []
10     for fp in filepath:
11         # 将path路径下面的文件进行拼接
12         fppath = path + '/' + fp
13         fppattern = fppath.split('.')
14         if os.path.isfile(fppath) and fppattern[2] == 'txt':
15             files.append(fppath)
16         elif os.path.isdir(fppath):
17             files += get_myfiles(fppath)
18     return files
19 
20 
def get_word(files):
    """Count how often each word occurs across *files*.

    A word is a maximal run of ASCII letters and digits; matching is
    case-sensitive, so ``Foo`` and ``foo`` are counted separately.

    :param files: paths of the files to read.
    :return: list of ``(word, count)`` tuples sorted by count, highest first.
    """
    # The original class was written 'A-z', a range that also covers
    # '[', '\\', ']', '^', '_' and '`'; 'A-Z' is what was meant.
    pattern = re.compile(r'[a-zA-Z0-9]+')
    worddict = {}
    for filename in files:
        # 'with' closes the file even if reading raises.
        with open(filename, 'rb') as f:
            data = f.read()
        if not isinstance(data, str):
            # Python 3 returns bytes here. Only ASCII characters can match the
            # pattern, so a one-byte-per-character decode that never fails
            # yields the same words as matching on the raw bytes.
            data = data.decode('latin-1')
        for word in pattern.findall(data):
            worddict[word] = worddict.get(word, 0) + 1
    return sorted(worddict.items(), key=lambda e: e[1], reverse=True)
33 
if __name__ == '__main__':
    files = get_myfiles('.')
    wordsort = get_word(files)
    # wordsort is ordered by count, highest first, so every word that shares
    # the top count sits at the front of the list. Guard against an empty
    # result: with no words at all there is nothing to index or print.
    if wordsort:
        top_count = wordsort[0][1]
        for item in wordsort:
            if item[1] != top_count:
                break
            # Parenthesised single-argument print runs on Python 2 and 3 alike.
            print(item)

下面通过定义一个类来实现该功能(统计单个文件中出现次数最多的前 n 个单词,统计时不区分大小写):

import re
import io


class Counter:
    """Word-frequency table built from a single UTF-8 text file."""

    def __init__(self, path):
        """
        Read the file at *path* and tally its words.

        Words are the ``\\w+`` matches of the file content, lower-cased
        before counting. The tally is stored in ``self.mapping`` as
        ``{word: count}``.

        :param path: path of the text file to read
        """
        with io.open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()

        tally = {}
        for token in re.findall(r'\w+', text):
            key = token.lower()
            if key in tally:
                tally[key] += 1
            else:
                tally[key] = 1
        self.mapping = tally

    def most_num(self, n):
        """
        Return the *n* most frequent words.

        :param n: how many entries to return; must be greater than 0
        :return: list of at most *n* ``(word, count)`` tuples, highest
            count first
        """
        assert n > 0, "n should be large than 0"
        ranked = list(self.mapping.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:n]

if __name__ == '__main__':
    # Show the six most frequent words of yes.txt as (word, count) tuples.
    top_words = Counter('yes.txt').most_num(6)
    for item in top_words:
        # Parenthesised single-argument print runs on Python 2 and 3 alike.
        print(item)

 

posted @ 2017-10-25 15:04  milian0711  阅读(262)  评论(0)  编辑  收藏  举报