Counting words across the files of a given directory in Python: the serial version
Let's go straight to the code.
Practice goals:
1. Use Python's object-oriented features to encapsulate the logic and express the design;
2. Use exception handling and the logging API;
3. Use the file and directory I/O APIs;
4. Use the list, map (dict), and tuple data structures;
5. Use lambda, regular expressions, and other language features (a compact alternative built from these pieces is sketched right after this list).
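For comparison only (this is my sketch, not part of the exercise code below), the same word counting can be expressed very compactly with collections.Counter plus a regular expression; the exercise deliberately spells it out with explicit classes and a plain dict instead:

# Minimal sketch, for comparison with the class-based version below.
# Counter handles the "missing key starts at 0" bookkeeping automatically.
import re
from collections import Counter

WORD_RE = re.compile(r"\w+")

def count_words(lines):
    # lines: a list of strings, e.g. the lines read from many files
    return Counter(WORD_RE.findall(''.join(lines)))

Counter.most_common(n) would also cover the top-N step; the long-hand version below is still worth writing once as practice.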
The next post will implement a concurrent version (one possible direction is sketched after the code below).
#-------------------------------------------------------------------------------
# Name:        wordstat_serial.py
# Purpose:     count the words in the .java files of a given directory (serial version)
#
# Author:      qin.shuq
#
# Created:     08/10/2014
# Copyright:   (c) qin.shuq 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
# NOTE: this listing is Python 2 code (print statement, "except IOError, err" syntax).

import re
import os
import time
import logging

LOG_LEVELS = {
    'DEBUG':    logging.DEBUG,
    'INFO':     logging.INFO,
    'WARN':     logging.WARNING,
    'ERROR':    logging.ERROR,
    'CRITICAL': logging.CRITICAL
}

def initlog(filename):
    # Use the file name as the logger name so that error.log and info.log get
    # separate loggers; the bare root logger would send every record to both files.
    logger = logging.getLogger(filename)
    hdlr = logging.FileHandler(filename)
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(LOG_LEVELS['INFO'])
    return logger

errlog  = initlog("error.log")
infolog = initlog("info.log")


class WordReading(object):
    '''Read all lines from the given list of files.'''

    def __init__(self, fileList):
        self.fileList = fileList

    def readFileInternal(self, filename):
        lines = []
        try:
            f = open(filename, 'r')
            lines = f.readlines()
            infolog.info('[successfully read file %s]\n' % filename)
            f.close()
        except IOError, err:
            errorInfo = 'failed to read file %s: %s\n' % (filename, err)
            errlog.error(errorInfo)
        return lines

    def readFile(self):
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines


class WordAnalyzing(object):
    '''Analyze the lines and return a map<word, count> of occurrence times.'''

    wordRegex = re.compile(r"\w+")

    def __init__(self, allLines):
        self.allLines = allLines

    def analyze(self):
        result = {}
        lineContent = ''.join(self.allLines)
        matches = WordAnalyzing.wordRegex.findall(lineContent)
        if matches:
            for word in matches:
                if result.get(word) is None:
                    result[word] = 0
                result[word] += 1
        return result


class FileObtainer(object):
    '''Walk a directory tree and collect the files accepted by an optional filter.'''

    def __init__(self, dirpath, fileFilterFunc=None):
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        files = []
        for path, dirs, filenames in os.walk(self.dirpath):
            for filename in filenames:
                files.append(os.path.join(path, filename))
        if self.fileFilterFunc is None:
            return files
        else:
            return filter(self.fileFilterFunc, files)


class PostProcessing(object):
    '''Sort the word counts and print the top-N most frequent words.'''

    def __init__(self, resultMap):
        self.resultMap = resultMap

    def sortByValue(self):
        # items() yields (word, count) tuples; sort by count, descending.
        return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

    def obtainTopN(self, topN):
        sortedResult = self.sortByValue()
        sortedNum = len(sortedResult)
        topN = sortedNum if topN > sortedNum else topN
        for i in range(topN):
            topi = sortedResult[i]
            print topi[0], ' counts: ', topi[1]


if __name__ == "__main__":
    dirpath = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"

    starttime = time.time()
    fileObtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
    fileList = fileObtainer.findAllFilesInDir()
    endtime = time.time()
    print 'ObtainFile cost: ', (endtime - starttime) * 1000, 'ms'

    starttime = time.time()
    wr = WordReading(fileList)
    allLines = wr.readFile()
    endtime = time.time()
    print 'WordReading cost: ', (endtime - starttime) * 1000, 'ms'

    starttime = time.time()
    wa = WordAnalyzing(allLines)
    resultMap = wa.analyze()
    endtime = time.time()
    print 'WordAnalyzing cost: ', (endtime - starttime) * 1000, 'ms'

    starttime = time.time()
    postproc = PostProcessing(resultMap)
    postproc.obtainTopN(30)
    endtime = time.time()
    print 'PostProcessing cost: ', (endtime - starttime) * 1000, 'ms'
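One detail worth calling out (my note, not something the original post discusses): initlog should not grab the bare root logger. logging.getLogger() with no name always returns the same root logger, so calling initlog twice would attach both FileHandlers to it and every record would end up in both error.log and info.log. Using the file name as the logger name, as in the listing above, keeps the two logs separate. A tiny demonstration:

import logging

# getLogger() without a name always returns the one root logger ...
assert logging.getLogger() is logging.getLogger()

# ... while distinct names give distinct logger objects with their own handlers.
assert logging.getLogger("error.log") is not logging.getLogger("info.log")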
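As for the concurrent version promised above, this post does not say how it will be structured. Purely as a hedged sketch of one possible direction (the names count_file and count_all are mine, not the author's), the per-file work could be handed to a multiprocessing.Pool and the partial counts merged afterwards:

# A rough sketch of one possible concurrent variant, not the author's implementation.
import re
from collections import Counter
from multiprocessing import Pool

WORD_RE = re.compile(r"\w+")

def count_file(filename):
    # Count the words of a single file; unreadable files simply contribute nothing.
    try:
        with open(filename, 'r') as f:
            return Counter(WORD_RE.findall(f.read()))
    except IOError:
        return Counter()

def count_all(file_list, processes=4):
    pool = Pool(processes)
    try:
        partial_counts = pool.map(count_file, file_list)
    finally:
        pool.close()
        pool.join()
    total = Counter()
    for counts in partial_counts:
        total.update(counts)
    return total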