统计文件夹下面的文本文件中频率最高的单词
通过该练习，需要熟悉 lambda 在字典排序中的用法，可参考这篇文章：http://www.cnblogs.com/kaituorensheng/archive/2012/08/07/2627386.html
# coding:utf-8
"""Find the most frequent word(s) in the .txt files under a directory."""
import io
import os
import re

# A word is a run of ASCII letters and digits.  The range is spelled
# A-Z (not A-z) so that the punctuation characters between 'Z' and 'a'
# in ASCII ([ \ ] ^ _ `) are not treated as part of a word.
_WORD_RE = re.compile(r'[a-zA-Z0-9]+')


def get_myfiles(path):
    """Recursively collect the ``.txt`` files located under *path*.

    :param path: directory to search
    :return: list of file paths, each one prefixed with *path*
    """
    files = []
    for name in os.listdir(path):
        # Join the entry name onto the directory being scanned.
        fppath = os.path.join(path, name)
        if os.path.isdir(fppath):
            files += get_myfiles(fppath)
        # splitext looks only at the final extension, so dots elsewhere in
        # the path (or a missing extension) cannot break the check.
        elif os.path.isfile(fppath) and os.path.splitext(name)[1] == '.txt':
            files.append(fppath)
    return files


def get_word(files):
    """Count how often each word occurs across *files*.

    :param files: iterable of text file paths
    :return: list of ``(word, count)`` tuples sorted by count, highest first
    """
    worddict = {}
    for filename in files:
        # Decode as text so the str pattern works on Python 2 and 3 alike;
        # undecodable bytes are replaced and can never be part of a word.
        with io.open(filename, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
        for word in _WORD_RE.findall(content):
            worddict[word] = worddict.get(word, 0) + 1
    return sorted(worddict.items(), key=lambda e: e[1], reverse=True)


def get_top_words(wordsort):
    """Return the leading entries of *wordsort* that share the top count.

    :param wordsort: ``(word, count)`` tuples sorted by count, highest first
    :return: every entry tied for the highest count; ``[]`` for empty input
    """
    top = []
    for item in wordsort:
        # The input is sorted in descending order, so the first entry with
        # a different count ends the run of most frequent words.
        if item[1] != wordsort[0][1]:
            break
        top.append(item)
    return top


if __name__ == '__main__':
    files = get_myfiles('.')
    wordsort = get_word(files)
    for item in get_top_words(wordsort):
        print(item)
下面通过定义一个类来实现该功能（统计单个文本文件中出现次数最多的前 n 个单词）
import io
import re


class Counter(object):
    """Word-frequency counter for a single UTF-8 text file.

    Words are the runs matched by the regex word class and are
    lower-cased before counting, so counting is case-insensitive.
    """

    def __init__(self, path):
        """Read the file at *path* and count every word in it.

        :param path: path of the text file to read (decoded as UTF-8)
        """
        self.mapping = {}
        with io.open(path, 'r', encoding='utf-8') as f:
            content = f.read()
        # re.UNICODE keeps non-ASCII words intact on Python 2 as well;
        # on Python 3 it is already the default for str patterns.
        for word in re.findall(r'\w+', content, re.UNICODE):
            word = word.lower()
            self.mapping[word] = self.mapping.get(word, 0) + 1

    def most_num(self, n):
        """Return the *n* most frequent words, most frequent first.

        :param n: number of entries to return; must be greater than 0
        :return: list of at most *n* ``(word, count)`` tuples
        :raises AssertionError: if *n* is not greater than 0
        """
        # Raised explicitly rather than through ``assert`` so the check is
        # not stripped under ``python -O``; the exception type is unchanged.
        if not n > 0:
            raise AssertionError("n should be large than 0")
        ranked = sorted(self.mapping.items(), key=lambda s: s[1], reverse=True)
        return ranked[:n]


if __name__ == '__main__':
    top_words = Counter('yes.txt').most_num(6)
    for item in top_words:
        print(item)