Python 读取 Word、PDF 文件内容

import docx2txt
import fitz
import docx
from docx.oxml import parse_xml


def get_doc_content(filepath):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    The document body is walked element by element. Every paragraph
    contributes its text as one line. Every table contributes one line per
    row, with the cell texts joined by ``|``.

    python-docx reports a horizontally merged cell once for every grid column
    it spans. Only the first occurrence keeps its text; the repeats are
    emitted as empty strings so each row keeps its column count. Repeats are
    recognised by the identity of the underlying ``<w:tc>`` element, not by
    comparing text, so two *different* cells that happen to hold the same
    text are both kept.

    Args:
        filepath: Path of the .docx file; passed unchanged to
            ``docx.Document`` and, on fallback, to ``docx2txt.process``.

    Returns:
        The document text with items separated by newlines. If the
        structured walk raises for any reason, the result of
        ``docx2txt.process(filepath)`` is returned instead.
    """
    try:
        doc = docx.Document(filepath)
        parts = []
        for element in doc.element.body:
            kind = type(element).__name__
            if kind == 'CT_P':  # paragraph
                # Wrap the live element directly; serialising and re-parsing
                # it just to read its text is unnecessary work.
                paragraph = docx.text.paragraph.Paragraph(element, parent=None)
                parts.append(paragraph.text)
            elif kind == 'CT_Tbl':  # table
                table = docx.table.Table(element, parent=None)
                row_lines = []
                for row in table.rows:
                    # Keep references to the <w:tc> elements already emitted
                    # for this row so identity checks stay valid.
                    seen_tcs = []
                    row_texts = []
                    for cell in row.cells:
                        tc = cell._tc
                        if any(tc is seen for seen in seen_tcs):
                            # Repeat of a merged cell: keep the column slot
                            # but do not duplicate the text.
                            row_texts.append("")
                        else:
                            seen_tcs.append(tc)
                            row_texts.append(cell.text)
                    row_lines.append("|".join(row_texts))
                parts.append("\n".join(row_lines))
        content = "\n".join(parts)
    except Exception:
        # Deliberate best-effort fallback: any failure in the structured walk
        # (including failing to open the file with python-docx) hands the
        # file to docx2txt instead.
        content = docx2txt.process(filepath)
    return content


def get_pdf_content(filepath):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filepath: Path of the PDF file; passed unchanged to ``fitz.Document``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages (an empty string for zero pages).
    """
    with fitz.Document(filepath) as pdf:
        # Pages must be read while the document is still open.
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)


def get_file_content(filepath):
    """Read the text of a .docx, .pdf or .txt file, chosen by its extension.

    The extension is matched case-insensitively against the string form of
    ``filepath``, so names such as ``"REPORT.PDF"`` and ``pathlib.Path``
    arguments are dispatched the same way as lower-case ``str`` paths.

    Args:
        filepath: Path of the file to read. It is passed unchanged to the
            reader selected for its extension; ``.txt`` files are decoded
            as UTF-8.

    Returns:
        The extracted text, or ``None`` when the extension is not one of the
        three supported ones or when reading the file raised an exception
        (missing file, undecodable text, parser failure, ...).
    """
    try:
        name = str(filepath).lower()
        if name.endswith(".docx"):
            content = get_doc_content(filepath)
        elif name.endswith(".pdf"):
            content = get_pdf_content(filepath)
        elif name.endswith(".txt"):
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
        else:
            content = None
    except Exception:
        # Best-effort contract: callers get None instead of an error.
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # still propagate rather than being silently turned into None.
        content = None
    return content
posted @   二月雪  阅读(23)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 【杭电多校比赛记录】2025“钉耙编程”中国大学生算法设计春季联赛(1)
点击右上角即可分享
微信分享提示