python对不同类型文件(doc,txt,pdf)的字符查找
python对不同类型文件的字符查找
TXT文件:
def txt_handler(self, f_name, find_str):
    """Find the first line of a UTF-8 text file that contains ``find_str``.

    :param f_name: path of the text file to search.
    :param find_str: substring looked for in each line (plain ``in`` test,
        so the line's trailing newline is part of the searched text).
    :return: ``{'file_name': f_name, 'line_count': n}`` where ``n`` is the
        1-based number of the first matching line; an empty dict when the
        path does not exist or no line matches.
    """
    file_str_dict = {}
    if os.path.exists(f_name):
        # The context manager closes the handle on every exit path,
        # including the early ``break`` below and decoding errors.
        with open(f_name, 'r', encoding='utf-8') as f:
            for line_count, line in enumerate(f, start=1):
                if find_str in line:
                    file_str_dict['file_name'] = f_name
                    file_str_dict['line_count'] = line_count
                    break
    return file_str_dict
docx文件
需要用到python-docx包(安装名为python-docx,导入名为docx)
pip install python-docx
参考https://python-docx.readthedocs.io/en/latest/
from docx import Document


def docx_handler(self, f_name, find_str):
    """Report whether any paragraph of a Word .docx file contains ``find_str``.

    Only the texts of ``document.paragraphs`` are searched.

    :param f_name: path of the .docx file to search.
    :param find_str: substring looked for in each paragraph's text.
    :return: ``{'file_name': f_name}`` if some paragraph contains
        ``find_str``; an empty dict when the path does not exist or nothing
        matches.
    """
    if not os.path.exists(f_name):
        return {}

    paragraphs = Document(f_name).paragraphs
    # any() stops at the first matching paragraph, like an explicit break.
    if any(find_str in para.text for para in paragraphs):
        return {'file_name': f_name}
    return {}
doc文件:
Python没有专门处理doc文件的包,需要先把doc转换成docx(下面借助win32com调用本机安装的Word完成转换,因此只能在装有Word的Windows上运行),再按docx文件的方式进行处理
from win32com import client as wc


def doc_to_docx(self, fileName):
    """Convert a .doc file to .docx through Word's COM automation interface.

    The output path is ``fileName`` with its last four characters replaced
    by ``.docx``, so the input is expected to end in a four-character
    extension such as ``.doc``.

    :param fileName: path of the .doc file to convert.
    :return: path of the .docx file that was written.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            docx_path = fileName[:-4] + '.docx'
            # File-format code 16 makes Word save the document as .docx;
            # the content can only be read by the docx handler after that.
            doc.SaveAs(docx_path, 16)
        finally:
            # Close the document even if saving failed.
            doc.Close()
    finally:
        # Always quit Word; otherwise a failed Open/SaveAs would leave the
        # Word application started above running.
        word.Quit()
    return docx_path
pdf文件:
这里使用PDFMiner包
python3安装
python -m pip install pdfminer.six
参考文章
https://dzone.com/articles/exporting-data-from-pdfs-with-python
import io

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage


def pdf_handler(self, f_name, find_str):
    """Report whether any page of a PDF file contains ``find_str``.

    Pages are read one at a time through ``self.extract_text_by_page`` and
    the search stops at the first page whose text contains ``find_str``.

    :param f_name: path of the PDF file to search.
    :param find_str: substring looked for in each page's extracted text.
    :return: ``{'file_name': f_name}`` if some page contains ``find_str``;
        an empty dict when the path does not exist or nothing matches.
    """
    file_str_dict = {}
    if os.path.exists(f_name):
        for page_text in self.extract_text_by_page(f_name):
            if find_str in page_text:
                file_str_dict['file_name'] = f_name
                # Leaving the loop early finalizes the generator; its
                # try/finally releases the current page's resources.
                break
    return file_str_dict


@staticmethod
def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF one page at a time.

    This is a generator: each page is only processed when the consumer asks
    for it.

    :param pdf_path: path of the PDF file to read.
    :return: generator of ``str``, one item per page in the order given by
        ``PDFPage.get_pages``.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
            finally:
                # Cleanup written after a bare ``yield`` is skipped when the
                # consumer stops iterating early (the generator is finalized
                # at the yield); ``finally`` runs in that case too, and once
                # for every page.
                converter.close()
                fake_file_handle.close()