epub转txt

from bs4 import BeautifulSoup

def exact_p_tag(path, f):
    """Append the text of every <p> element in an XHTML file to ``f``.

    Args:
        path: Path of the XHTML file to read; it is opened as UTF-8 text.
        f: Writable text file object that receives the extracted text.

    The text of each paragraph is written followed by a newline, in the
    order ``find_all('p')`` returns the elements.
    """
    # The with-block guarantees the input file is closed even when reading
    # or parsing raises; the parsed tree lives in memory afterwards.
    with open(path, 'r', encoding='utf-8') as xhtml_file:
        soup = BeautifulSoup(xhtml_file.read(), 'lxml')

    for p in soup.find_all('p'):
        f.write(p.text + '\n')

import os

# The relative glob pattern below is resolved against this directory
# (presumably the root of the unpacked EPUB -- confirm for your machine).
os.chdir('C:/Users/tellw/Downloads/test')

from pathlib import Path

# Path.glob makes no ordering guarantee; sort the paths so the chapters are
# written in a deterministic, name-ordered sequence.
xhtml_file_paths=sorted(Path('EPUB/xhtml').glob('*.xhtml'))

# The with-block closes the output file even if one of the inputs fails
# to open or parse part-way through the run.
with open('C:/Users/tellw/test/test.txt','w',encoding='utf8') as f:
    for xfp in xhtml_file_paths:
        exact_p_tag(xfp,f)

参考：使用 Python 提取 epub 中的文本 https://fanlumaster.github.io/2021/07/08/使用-Python-提取-epub-中的文本/

创建于2409071243,修改于2409071243

posted @ 2024-09-07 12:43  园糯  阅读(16)  评论(0)  编辑  收藏  举报