Python处理文档中的文本

提取所有 Word 文档中指定部分的内容，并合并到一个 txt 文件中：

import glob
import docx


def get_text_between_headings(doc, heading1, heading2):
    """Return the text of the paragraphs strictly between two headings.

    The paragraphs of ``doc`` are scanned in order. The section starts after
    a paragraph whose text equals ``heading1`` exactly and ends before the
    first later paragraph whose text equals ``heading2`` exactly. If
    ``heading1`` occurs several times before ``heading2``, the last
    occurrence is used.

    Args:
        doc: Object exposing a ``paragraphs`` sequence whose items have a
            ``text`` attribute (the script passes a ``docx.Document``).
        heading1: Exact text of the paragraph that opens the section.
        heading2: Exact text of the paragraph that closes the section.

    Returns:
        The texts of the enclosed paragraphs joined with ``'\\n'``, or an
        empty string when ``heading1`` is missing or no ``heading2`` follows
        it.
    """
    # Read the texts once instead of indexing doc.paragraphs repeatedly.
    texts = [paragraph.text for paragraph in doc.paragraphs]

    start = None
    for i, text in enumerate(texts):
        if text == heading1:
            start = i
        elif start is not None and text == heading2:
            # Only a closing heading that follows the opening one counts;
            # an earlier heading2 must not end the search.
            return '\n'.join(texts[start + 1:i])

    # Either heading was not found in the required order.
    return ''


# Collect the paths of all Word files in the current directory
file_paths = glob.glob('./*.docx')

# Create the output txt file and write each document's extracted section to it
with open('./output.txt', 'w', encoding='utf-8') as f:
    for file_path in file_paths:
        doc = docx.Document(file_path)
        fruit = get_text_between_headings(doc, "Done", "Introspection")
        # Skip sections whose text begins with 'Figure' or '['
        if fruit.startswith(('Figure', '[')):
            continue
        f.write(fruit)

在 txt 文档的每个空行处按顺序插入带编号的指定内容（如 Week 1、Week 2……）：

# Read the merged text back as UTF-8, the encoding output.txt is written with;
# relying on the platform default can fail or garble non-ASCII text.
with open('output.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Running counter for the labels inserted at blank lines.
week_no = 0
with open('processed_file.txt', 'w', encoding='utf-8') as file:
    for line in lines:
        if not line.strip():  # blank or whitespace-only line
            week_no += 1
            # The label is written without a newline so that the original
            # blank line's own line ending terminates it.
            file.write(f'Week {week_no}')
        file.write(line)

作者：艾孜尔江
转载请务必标明出处！

posted @ 2023-05-08 15:01 艾孜尔江阅读(55) 评论(0) 收藏举报

刷新页面返回顶部

艾孜尔江

Python处理文档中的文本

公告