Python 实现指定目录下批量文件的单词计数:串行版本
直接上代码。
练习目标:
1. 使用 Python 面向对象的方法封装逻辑和表达 ;
2. 使用异常处理和日志API ;
3. 使用文件目录读写API ;
4. 使用 list, dict(映射,即其它语言中的 map), tuple 三种数据结构 ;
5. lambda 、正则使用及其它。
下一篇将实现并发版本。
#-------------------------------------------------------------------------------
# Name:        wordstat_serial.py
# Purpose:     statistic words in java files of given directory by serial
#
# Author:      qin.shuq
#
# Created:     08/10/2014
# Copyright:   (c) qin.shuq 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import re
import os
import sys
import time
import logging

# Symbolic level names mapped onto the numeric levels of the logging module.
LOG_LEVELS = {
    'DEBUG': logging.DEBUG,
    'INFO': logging.INFO,
    'WARN': logging.WARNING,
    'ERROR': logging.ERROR,
    'CRITICAL': logging.CRITICAL
}


def initlog(filename, name=None):
    """Return a logger that writes INFO-and-above records to *filename*.

    Each log file gets its own named logger.  Configuring the root logger
    for every call (as an unnamed ``getLogger()`` does) would attach all
    file handlers to one shared logger, so every message would be written
    to every log file.

    filename -- path of the log file to append to.
    name     -- logger name; defaults to *filename*.
    """
    logger = logging.getLogger(filename if name is None else name)

    # Calling initlog twice for the same file must not duplicate every record.
    target = os.path.abspath(filename)
    attached = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == target
        for h in logger.handlers)
    if not attached:
        # delay=True: open the file on first write, so merely importing this
        # module does not create empty log files.
        hdlr = logging.FileHandler(filename, delay=True)
        hdlr.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(hdlr)

    logger.setLevel(LOG_LEVELS['INFO'])
    return logger


errlog = initlog("error.log")
infolog = initlog("info.log")


class WordReading(object):
    """Read the text lines of every file in a list of file paths."""

    def __init__(self, fileList):
        self.fileList = fileList

    def readFileInternal(self, filename):
        """Return the lines of *filename*, or [] if it cannot be read.

        A failure is written to the error log and does not abort the run.
        """
        lines = []
        try:
            # 'with' closes the file even when readlines() itself fails.
            with open(filename, 'r') as f:
                lines = f.readlines()
        except (IOError, UnicodeDecodeError) as err:
            # Report the real cause: an IOError is not always "not found",
            # and on Python 3 an undecodable file raises UnicodeDecodeError.
            errlog.error('file %s could not be read: %s\n', filename, err)
        else:
            infolog.info('[successful read file %s]\n', filename)
        return lines

    def readFile(self):
        """Return the lines of all files, concatenated in fileList order."""
        allLines = []
        for filename in self.fileList:
            allLines.extend(self.readFileInternal(filename))
        return allLines


class WordAnalyzing(object):
    """Count how many times each word occurs in a list of lines.

    A word is a maximal run of characters matching ``\\w``.
    """

    wordRegex = re.compile(r"[\w]+")

    def __init__(self, allLines):
        self.allLines = allLines

    def analyze(self):
        """Return a dict mapping each word to its number of occurrences."""
        result = {}
        # Match line by line instead of on one joined string: a last line
        # without a trailing newline must not fuse with the first word of
        # the following line or file.
        for line in self.allLines:
            for word in WordAnalyzing.wordRegex.findall(line):
                result[word] = result.get(word, 0) + 1
        return result


class FileObtainer(object):
    """Collect the paths of all files below a directory, optionally filtered."""

    def __init__(self, dirpath, fileFilterFunc=None):
        self.dirpath = dirpath
        self.fileFilterFunc = fileFilterFunc

    def findAllFilesInDir(self):
        """Return a list of file paths found recursively under dirpath.

        When fileFilterFunc is given, only the paths for which it returns a
        true value are kept.
        """
        files = []
        for path, dirs, filenames in os.walk(self.dirpath):
            for filename in filenames:
                files.append(os.path.join(path, filename))
        if self.fileFilterFunc is None:
            return files
        # A comprehension always yields a list; filter() is lazy on Python 3.
        return [f for f in files if self.fileFilterFunc(f)]


class PostProcessing(object):
    """Sort and report a word -> count mapping."""

    def __init__(self, resultMap):
        self.resultMap = resultMap

    def sortByValue(self):
        """Return the (word, count) pairs ordered by descending count."""
        return sorted(self.resultMap.items(), key=lambda e: e[1], reverse=True)

    def obtainTopN(self, topN):
        """Print the topN most frequent words, one per line.

        Fewer lines are printed when there are fewer than topN words;
        nothing is printed for topN <= 0.
        """
        sortedResult = self.sortByValue()
        for word, count in sortedResult[:max(topN, 0)]:
            print('%s  counts:  %s' % (word, count))


def _reportCost(stage, starttime):
    """Print the milliseconds elapsed since *starttime* for *stage*."""
    print('%s cost:  %s ms' % (stage, (time.time() - starttime) * 1000))


def main(dirpath, topN=30):
    """Count the words of all .java files under dirpath; print the topN."""
    starttime = time.time()
    fileObtainer = FileObtainer(dirpath, lambda f: f.endswith('.java'))
    fileList = fileObtainer.findAllFilesInDir()
    _reportCost('ObtainFile', starttime)

    starttime = time.time()
    wr = WordReading(fileList)
    allLines = wr.readFile()
    _reportCost('WordReading', starttime)

    starttime = time.time()
    wa = WordAnalyzing(allLines)
    resultMap = wa.analyze()
    _reportCost('WordAnalyzing', starttime)

    starttime = time.time()
    postproc = PostProcessing(resultMap)
    postproc.obtainTopN(topN)
    _reportCost('PostProcessing', starttime)


if __name__ == "__main__":
    # The directory may be given as the first command-line argument; the
    # original hard-coded location remains the default.
    defaultDir = "c:\\Users\\qin.shuq\\Desktop\\region_master\\src"
    main(sys.argv[1] if len(sys.argv) > 1 else defaultDir)