python实现简单的英文词频统计
# -*- coding: utf-8 -*-
__author__ = 'Oscar_Yang'
# copyRight by OSCAR
"""
Merge several English text files and compute word frequencies.

The script defines:
1. readFile   -- read one file and return a dict of word -> count;
2. merge1/merge2 -- merge two frequency dicts, summing counts of
   words that appear in both;
3. a test section that chains the files with map + functools.reduce;
4. top_counts -- the top-n (count, word) ranking used for the
   formatted output.
"""
import functools
from collections import Counter

# Punctuation characters stripped from both edges of every token.
_PUNCTUATION = ',.!?;"'


def readFile(file_name):
    """Read *file_name* (UTF-8 text) and return a dict mapping each
    lower-cased word to the number of times it occurs.

    Tokens are split on whitespace and leading/trailing punctuation
    (,.!?;") is removed.  Tokens consisting only of punctuation are
    skipped -- the original code raised IndexError on e.g. "..."
    because it indexed word2[0] after the trailing strip emptied
    the token.
    """
    tf = Counter()
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f:
            for token in line.split():
                # str.strip removes every leading/trailing character in
                # the set, replacing the two hand-rolled while-loops.
                word = token.strip(_PUNCTUATION).lower()
                if word:  # skip tokens that were all punctuation
                    tf[word] += 1
    return dict(tf)


def get_counts(words):
    """Count occurrences of each word in *words* (iterable of strings).

    Words are lower-cased and any internal whitespace removed before
    counting.  Returns a dict word -> count.

    Bug fix: the original built the dict but never returned it, so
    every call yielded None.
    """
    tf = {}
    for word in words:
        word = ''.join(word.lower().split())
        tf[word] = tf.get(word, 0) + 1
    return tf


# Merge two frequency dicts, method 1 (in place).
def merge1(dic1, dic2):
    """Fold *dic1* into *dic2*, summing counts of shared words.

    NOTE: mutates and returns *dic2*, matching the original contract.
    """
    for word, count in dic1.items():
        dic2[word] = dic2.get(word, 0) + count
    return dic2


# Merge two frequency dicts, method 2 (non-destructive).
def merge2(dic1, dic2):
    """Return a new Counter with the element-wise sum of both dicts;
    neither argument is modified."""
    return Counter(dic1) + Counter(dic2)


def top_counts(word_list, n=10):
    """Return the *n* most frequent entries as (count, word) tuples.

    Sorted by descending count; ties are broken by word, descending,
    to match the original sorted(..., reverse=True) ordering.
    """
    value_key_pairs = sorted(
        ((count, word) for word, count in word_list.items()),
        reverse=True,
    )
    return value_key_pairs[:n]


# 测试部分 (test section)
if __name__ == '__main__':
    file_list = [r'E:\graduate\Python\python那些事\articles\article_000.txt',
                 r'E:\graduate\Python\python那些事\articles\article_001.txt',
                 r'E:\graduate\Python\python那些事\articles\article_002.txt',
                 r'E:\graduate\Python\python那些事\articles\article_003.txt',
                 r'E:\graduate\Python\python那些事\articles\article_004.txt',
                 r'E:\graduate\Python\python那些事\articles\article_005.txt']

    # Per-file frequency dicts, lazily produced then folded together.
    word_list = functools.reduce(merge2, map(readFile, file_list))
    # Bug fix: use a distinct name instead of rebinding the function
    # (the original did `top_counts = top_counts(word_list)`).
    ranking = top_counts(word_list)
    print("最常用的单词排行榜:")
    for count, word in ranking[0:10]:
        print("{0:10}{1}".format(word, count))
2016-10-15
运行结果: