- 2017*****7072
- 潘文杰
- 码云地址:https://gitee.com/Fall825/word_frequency.git
- 程序分析:
def process_file(dst):
    """Read the whole text file at ``dst`` into memory.

    Args:
        dst: Path of the file to read. It is opened in text mode with the
            platform's default encoding.

    Returns:
        The file content as a single string, or None when the file cannot
        be opened or read (a message is printed in that case).
    """
    try:
        handle = open(dst, "r")
    except OSError as err:
        print(err)
        return None
    # The with-block closes the file even when read() fails; closing only
    # after a successful read would leak the handle on the error path.
    with handle:
        try:
            return handle.read()
        except (OSError, UnicodeDecodeError):
            print("Read File Error!")
            return None
此函数读取文件内容并将其存入缓冲区;打开或读取失败时输出错误信息并返回 None。
def process_buffer(bvffer):
    """Count how often each word occurs in a text buffer.

    The text is lower-cased, the punctuation characters ``"?!;.,`` are
    replaced by spaces, and the result is split on whitespace.

    Args:
        bvffer: Text to analyse, or None.

    Returns:
        A dict mapping each word to its number of occurrences, keyed in
        order of first appearance, or None when ``bvffer`` is empty or None.
    """
    # Imported locally so this function does not rely on a file-level import.
    from collections import Counter

    if not bvffer:
        return None
    # A single translate() pass replaces every punctuation character at
    # once, instead of rescanning the whole text once per character.
    punctuation = '"?!;.,'
    to_spaces = str.maketrans(punctuation, " " * len(punctuation))
    words = bvffer.lower().translate(to_spaces).split()
    # Counter does the tallying without a Python-level dict.get call per
    # word; convert back so callers still receive a plain dict.
    return dict(Counter(words))
此函数将缓冲区内的文本统一转换为小写,把标点符号替换为空格后按空白符切分成单词,并将每个单词的出现次数统计到字典 word_freq 中。
def output_result(word_freq, top_n=10):
    """Print the most frequent words, one ``(word, count)`` tuple per line.

    Entries are printed from highest to lowest count; entries with equal
    counts keep the order in which they appear in ``word_freq``.

    Args:
        word_freq: Mapping of word to occurrence count. Nothing is printed
            when it is empty or None.
        top_n: Maximum number of entries to print. Defaults to 10.
    """
    # Imported locally so this function does not rely on a file-level import.
    from heapq import nlargest

    if not word_freq:
        return
    # nlargest yields the same entries, in the same order, as sorting all
    # items by count in descending order and slicing the first top_n.
    for item in nlargest(top_n, word_freq.items(), key=lambda pair: pair[1]):
        print(item)
def main(dst="Gone_with_the_wind.txt"):
    """Run the word-frequency pipeline on one file: read, count, print.

    Args:
        dst: Path of the text file to analyse. Defaults to
            "Gone_with_the_wind.txt", the file the original version
            always used.
    """
    bvffer = process_file(dst)
    word_freq = process_buffer(bvffer)
    output_result(word_freq)
统计结果:按词频从高到低输出出现次数最多的前 10 个单词。
if __name__ == "__main__":
    # Script entry point: take the input file path from the command line,
    # then read the file, count its words and print the result.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('dst')
    dst = cli.parse_args().dst

    bvffer = process_file(dst)
    word_freq = process_buffer(bvffer)
    output_result(word_freq)
程序入口:通过 argparse 从命令行读取文件路径 dst,然后依次调用以上函数完成读取、统计和输出。
- 性能分析结果及改进:
执行时间最长的函数是 process_buffer。
调用次数最多的是字典的 get 方法(method 'get' of 'dict' objects)。