python对不同类型文件（doc,txt,pdf）的字符查找

python对不同类型文件的字符查找

TXT文件:

    def txt_handler(self, f_name, find_str):
        """
        Search a UTF-8 text file for the first line containing ``find_str``.

        :param f_name: path of the text file to scan
        :param find_str: substring to look for
        :return: ``{'file_name': f_name, 'line_count': n}`` where ``n`` is the
            1-based number of the first matching line; an empty dict when the
            file does not exist or no line contains ``find_str``
        """
        file_str_dict = {}
        if os.path.exists(f_name):
            # The context manager closes the handle on every exit path,
            # including the early break below and any decoding error.
            with open(f_name, 'r', encoding='utf-8') as f:
                for line_count, line in enumerate(f, start=1):
                    if find_str in line:
                        file_str_dict['file_name'] = f_name
                        file_str_dict['line_count'] = line_count
                        break
        return file_str_dict

docx文件

需要用到python-docx包（安装名为python-docx，代码中的导入名为docx）

pip install python-docx

参考https://python-docx.readthedocs.io/en/latest/

from docx import Document

def docx_handler(self, f_name, find_str):
        """
        Look for ``find_str`` in the paragraphs of a Word .docx file.

        :param f_name: path of the .docx file
        :param find_str: substring to look for in each paragraph's text
        :return: ``{'file_name': f_name}`` when some paragraph contains
            ``find_str``; an empty dict when the file does not exist or no
            paragraph matches
        """
        if not os.path.exists(f_name):
            return {}
        # any() stops at the first matching paragraph.
        paragraphs = Document(f_name).paragraphs
        if any(find_str in para.text for para in paragraphs):
            return {'file_name': f_name}
        return {}

doc文件:

python-docx无法直接读取旧版的.doc文件，需要先把doc转换成docx（下面借助pywin32调用本机安装的Microsoft Word完成转换，因此仅适用于Windows），再用docx文件类型方式进行处理

from win32com import client as wc
  
def doc_to_docx(self, fileName):
        """
        Convert a Word .doc file to .docx through Word COM automation.

        The document is opened in a Word application instance and saved next
        to the original under the same base name with a ``.docx`` extension.

        :param fileName: path of the .doc file to convert
        :return: path of the saved .docx file
        """
        import os  # local import keeps this block self-contained

        # splitext swaps only the real extension, so names whose extension
        # is not exactly three characters long are no longer mangled.
        FileNameDocx = os.path.splitext(fileName)[0] + '.docx'

        word = wc.Dispatch("Word.Application")
        try:
            doc = word.Documents.Open(fileName)
            try:
                # Format code 16 makes Word save the document as docx; the
                # file must be saved as docx before it can be read.
                doc.SaveAs(FileNameDocx, 16)
            finally:
                doc.Close()
        finally:
            # Quit even when opening or saving fails, so a failed conversion
            # does not leave the Word application instance running.
            word.Quit()
        return FileNameDocx

pdf文件：

这里使用PDFMiner包

python3安装

python -m pip install pdfminer.six

参考文章

https://dzone.com/articles/exporting-data-from-pdfs-with-python

import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage   

   
def pdf_handler(self, f_name, find_str):
        """
        Look for ``find_str`` in the text of a PDF file, page by page.

        :param f_name: path of the PDF file
        :param find_str: substring to look for in each page's text
        :return: ``{'file_name': f_name}`` when some page's text contains
            ``find_str``; an empty dict when the file does not exist or no
            page matches
        """
        # Pages are pulled lazily from the generator, and any() stops
        # requesting pages as soon as one of them matches.
        if os.path.exists(f_name) and any(
                find_str in page_text
                for page_text in self.extract_text_by_page(f_name)):
            return {'file_name': f_name}
        return {}

    @staticmethod
    def extract_text_by_page(pdf_path):
        """
        Yield the extracted text of a PDF one page at a time.

        This is a generator: a page is only processed when the caller asks
        for the next item.

        :param pdf_path: path of the PDF file to read
        :return: generator of strings, one per page returned by
            ``PDFPage.get_pages``
        """
        with open(pdf_path, 'rb') as fh:
            # The resource manager does not depend on the page, so build it
            # once instead of once per page.
            resource_manager = PDFResourceManager()
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                fake_file_handle = io.StringIO()
                converter = TextConverter(resource_manager, fake_file_handle)
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager,
                                                          converter)
                    page_interpreter.process_page(page)
                    text = fake_file_handle.getvalue()
                finally:
                    # Close the per-page handles BEFORE yielding: code placed
                    # after a yield never runs when the consumer stops early
                    # (e.g. breaks on the first match).
                    converter.close()
                    fake_file_handle.close()
                yield text

posted @ 2019-12-11 10:06 一只小小的寄居蟹阅读(1032) 评论(0) 编辑收藏举报

刷新页面返回顶部

一只小小寄居蟹

python对不同类型文件（doc,txt,pdf）的字符查找

python对不同类型文件的字符查找

TXT文件:

docx文件

doc文件:

pdf文件：

公告