Python 读取 Word、PDF 文件内容

import docx2txt
import fitz
import docx
from docx.oxml import parse_xml


def _table_to_text(table):
    """Render a python-docx table as text: cells joined by "|", rows by newlines."""
    lines = []
    for row in table.rows:
        row_texts = []
        previous_tc = None
        for cell in row.cells:
            # python-docx reports a horizontally merged cell once per grid
            # column it spans, each time backed by the same <w:tc> element.
            # Emit the text once and pad the remaining columns with "" so the
            # column count stays stable. Comparing the element (rather than
            # the text) keeps distinct cells that merely share a value.
            tc = cell._tc
            row_texts.append("" if tc is previous_tc else cell.text)
            previous_tc = tc
        lines.append("|".join(row_texts))
    return "\n".join(lines)


def get_doc_content(filepath):
    """Return the text content of a Word (.docx) document.

    Paragraphs and tables are emitted in document order, separated by
    newlines. Each table row becomes one line with its cells joined by "|".

    Args:
        filepath: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        The extracted text as a single string.

    Raises:
        Whatever ``docx2txt.process`` raises if the fallback extraction
        fails as well.
    """
    try:
        doc = docx.Document(filepath)
        content = []
        # Walk the body's direct children so paragraphs and tables keep
        # their relative order in the output.
        for element in doc.element.body:
            kind = element.__class__.__name__
            if kind == 'CT_P':  # paragraph
                paragraph = docx.text.paragraph.Paragraph(element, parent=None)
                content.append(paragraph.text)
            elif kind == 'CT_Tbl':  # table
                table = docx.table.Table(element, parent=None)
                content.append(_table_to_text(table))
        content = "\n".join(content)
    except Exception:
        # Best-effort fallback: if python-docx cannot handle the file, let
        # docx2txt try a plain extraction instead.
        content = docx2txt.process(filepath)
    return content


def get_pdf_content(filepath):
    """Return the text of every page of the PDF at *filepath*, in page order.

    The per-page results of ``page.get_text()`` are concatenated without any
    extra separator. The document is closed when the ``with`` block exits.
    """
    with fitz.Document(filepath) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)


def get_file_content(filepath):
    """Return the text content of a .docx, .pdf or .txt file.

    The reader is chosen from the file extension, matched case-insensitively
    (so ``REPORT.PDF`` is handled like ``report.pdf``). Text files are read
    as UTF-8.

    Args:
        filepath: Path of the file to read, as a string.

    Returns:
        The extracted text, or ``None`` when the extension is not supported
        or the file could not be read.
    """
    try:
        lowered = filepath.lower()
        if lowered.endswith(".docx"):
            content = get_doc_content(filepath)
        elif lowered.endswith(".pdf"):
            content = get_pdf_content(filepath)
        elif lowered.endswith(".txt"):
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
        else:
            content = None
    except Exception:
        # Deliberately best-effort: any read or parse failure yields None.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        content = None
    return content

posted @ 2025-01-22 11:41 二月雪 阅读(23) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· python openpyxl读写excel

· python 文本内容写入 Word

· Python读取pdf、word、excel、ppt、csv和txt文件提取所有文本

· 使用python读取doc、docx文档

· python 读取文件函数

公告

昵称：二月雪
园龄： 7个月
粉丝： 3
关注： 1

+加关注

2025年3月

日

一

二

三

四

五

六

lanjianhua

python 读取word、pdf文件内容

公告

搜索

常用链接

我的标签

随笔档案

阅读排行榜

推荐排行榜