# python读取doc — read and search Word (.docx) documents with Python
import fnmatch
import os
import time


def _escape_control_chars(text):
    """Return *text* with non-printable characters shown as backslash escapes.

    Keeps every snippet on one output line (a newline becomes ``\\n``) while
    leaving printable characters, including non-ASCII text, readable.
    """
    return "".join(
        ch if ch.isprintable() else ch.encode("unicode_escape").decode("ascii")
        for ch in text
    )


class search:
    """Search the paragraphs of Word documents under a directory for a string.

    Constructing an instance runs the whole search immediately: it prints a
    header, walks ``path``, prints every hit with some surrounding context,
    and finally prints how many files were examined and how long it took.

    Args:
        path: Root directory to walk.
        search_string: Text to look for inside each paragraph.
        file_filter: One fnmatch pattern or an iterable of patterns selecting
            which file names are opened.
        content_extract: Number of characters of context taken before the hit;
            the displayed window ends ``content_extract`` characters after the
            start of the hit.
        max_cutouts: Maximum number of hits displayed per paragraph.
    """

    def __init__(self, path, search_string, file_filter,
                 content_extract=35, max_cutouts=20):
        self.search_path = path
        self.search_string = search_string
        self.file_filter = file_filter
        # Stored on the instance so the class no longer depends on globals
        # that only exist when the file is run as a script.
        self.content_extract = content_extract
        self.max_cutouts = max_cutouts

        print("Search %s in %s..." % (self.search_string, self.search_path))
        print("_" * 80)

        time_begin = time.time()
        file_count = self.walk()

        print("_" * 80)
        print("%s files searched in %0.2fsec." % (
            file_count, (time.time() - time_begin)))

    def walk(self):
        """Visit every file matching the filter and return how many were searched."""
        patterns = self.file_filter
        if isinstance(patterns, str):
            # A bare string would otherwise be iterated character by character.
            patterns = (patterns,)

        file_count = 0
        # NOTE: followlinks=True is kept from the original; os.walk does not
        # detect symlink cycles, so a link to a parent directory can recurse.
        for root, _dirlist, filelist in os.walk(self.search_path, followlinks=True):
            for filename in filelist:
                # any() so a file matching several patterns is searched and
                # counted once, not once per matching pattern.
                if any(fnmatch.fnmatch(filename, pattern) for pattern in patterns):
                    self.search_file(os.path.join(root, filename))
                    file_count += 1
        return file_count

    def search_file(self, filepath):
        """Print the file path and context snippets for each paragraph that matches."""
        # Imported here so the module can be imported, and its text helpers
        # used, on machines without python-docx installed.
        from docx import Document

        try:
            document = Document(filepath)
        except Exception as exc:
            # One unreadable file (e.g. Word's "~$" lock files, which also
            # match "*.docx") must not abort the whole walk. The exception
            # types depend on python-docx internals, hence the broad catch;
            # the failure is reported rather than silently dropped.
            print("!!! skipped %s (%s)" % (filepath, exc))
            return

        path_printed = False
        for para in document.paragraphs:
            text = para.text
            if self.search_string not in text:
                continue
            if not path_printed:
                print(filepath)
                path_printed = True
            self.cutout_content(text)

    def cutout_content(self, content):
        """Print up to ``max_cutouts`` context windows around hits in *content*.

        Each window starts ``content_extract`` characters before the hit
        (clamped to the start of the text) and ends ``content_extract``
        characters after the start of the hit. A blank line follows the
        windows.
        """
        needle = self.search_string
        if not needle:
            # An empty needle "matches" at every position and would just
            # repeat the same window max_cutouts times.
            return

        current_pos = 0
        for _ in range(self.max_cutouts):
            pos = content.find(needle, current_pos)
            if pos == -1:
                break
            # Clamp at 0: a negative slice start would count from the end of
            # the string and yield an empty or unrelated window.
            window_start = max(0, pos - self.content_extract)
            content_window = content[window_start:pos + self.content_extract]
            print(">>>", _escape_control_chars(content_window))
            # Resume right after this hit.
            current_pos = pos + len(needle)
        print()


# Script entry point.
if __name__ == "__main__":
    search_path = r"c:\Users\Administrator\Desktop"
    # fnmatch patterns. The original second entry ".doc" had no wildcard and
    # so only matched a file literally named ".doc"; legacy binary .doc files
    # are not readable by python-docx, so only "*.docx" is kept.
    file_filter = ("*.docx",)
    search_string = "history"
    content_extract = 35  # characters of context shown around a hit
    max_cutouts = 20  # maximum hits shown per paragraph
    search(search_path, search_string, file_filter,
           content_extract=content_extract, max_cutouts=max_cutouts)