Python 处理 PDF

 

 

from pdf2image import convert_from_path

def get_page_image(
        hash_value: str
) -> Dict[int, str]:
    """Split a PDF into one image per page and save each page to disk.

    The PDF is read from ``config.PDF_DIR/<hash_value>.pdf`` and each page is
    written into ``config.PDF_IMAGE_DIR`` as ``<hash_value>_<page:03d>.jpg``.

    Args:
        hash_value: Base name (without the ``.pdf`` extension) of the PDF file.

    Returns:
        A dict mapping the 0-based page index to the path of the saved image.
    """
    # Normalise Windows separators so the path is passed with forward slashes.
    pdf_path = os.path.join(config.PDF_DIR, f'{hash_value}.pdf').replace('\\', '/')
    img_pages = convert_from_path(pdf_path)
    res = {}
    for page, img in enumerate(img_pages):
        # The page image is saved directly; no intermediate array copy is needed.
        basename = f'{hash_value}_{page:03d}.jpg'
        image_path = os.path.join(config.PDF_IMAGE_DIR, basename)
        img.save(image_path)
        res[page] = image_path
    return res

 

使用命令行从 PDF 中提取文本

pip install pdfminer.six
if __name__ == '__main__':
    # Example: dump one page of a PDF to XML using pdfminer.six's pdf2txt.py.
    import subprocess

    paper_id = "111111111"
    page = 7
    pdfpath = os.path.join(config.PDF_DIR, paper_id + ".pdf")
    xmlpath = os.path.join(config.PDF_XML_DIR, paper_id + "_{}_pdfminer.xml".format(page))
    python = sys.executable
    # pdf2txt.py is looked up in the virtualenv two directories above this file.
    module_base = pathlib.Path(__file__).absolute().parent.parent
    pdf2txt = os.path.join(module_base, "venv/Scripts/pdf2txt.py").replace('\\', '/')
    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters are handed to pdf2txt.py unchanged.
    # NOTE(review): `page + 1` presumably converts a 0-based page index to the
    # 1-based page number expected by `-p` — confirm against the pdfminer docs.
    subprocess.run(
        [python, pdf2txt, "-o", xmlpath, "-p", str(page + 1), "-t", "xml", pdfpath],
        check=False,  # exit status is ignored, as it was with os.system
    )

https://pdfminersix.readthedocs.io/en/latest/tutorial/commandline.html

posted @ 2021-06-02 16:36  薄荷味日记  阅读(126)  评论(0)  编辑  收藏  举报