Python 处理 PDF
from pdf2image import convert_from_path


def get_page_image(
        hash_value: str
) -> Dict[int, str]:
    """Split a PDF into one image file per page.

    The PDF is read from ``<config.PDF_DIR>/<hash_value>.pdf``. Each rendered
    page is saved into ``config.PDF_IMAGE_DIR`` as
    ``<hash_value>_<page:03d>.jpg``.

    Args:
        hash_value: Base name (without the ``.pdf`` extension) of the PDF.

    Returns:
        Mapping from the zero-based page index to the saved image path.
    """
    pdf_path = os.path.join(config.PDF_DIR, f'{hash_value}.pdf').replace('\\', '/')
    # Create the output directory up front so img.save() does not fail when
    # the directory is missing.
    os.makedirs(config.PDF_IMAGE_DIR, exist_ok=True)

    res = {}
    for page, img in enumerate(convert_from_path(pdf_path)):
        image_path = os.path.join(config.PDF_IMAGE_DIR, f'{hash_value}_{page:03d}.jpg')
        img.save(image_path)
        res[page] = image_path
    return res
使用命令行从 PDF 中提取文本
pip install pdfminer.six
if __name__ == '__main__':
    # Dump one page of a PDF to XML by running pdfminer.six's pdf2txt.py
    # script with the current interpreter.
    import subprocess  # imported here so this script block stays self-contained

    paper_id = "111111111"
    page = 7  # pdf2txt receives page + 1 below (presumably its -p option is 1-based; confirm)
    pdfpath = os.path.join(config.PDF_DIR, paper_id + ".pdf")
    xmlpath = os.path.join(config.PDF_XML_DIR, paper_id + "_{}_pdfminer.xml".format(page))

    python = sys.executable
    module_base = pathlib.Path(__file__).absolute().parent.parent
    pdf2txt = os.path.join(module_base, "venv/Scripts/pdf2txt.py").replace('\\', '/')

    # Pass the arguments as a list instead of a formatted shell string, so
    # paths containing spaces or shell metacharacters are handed over intact.
    # check=True raises CalledProcessError on a non-zero exit status instead
    # of silently ignoring it.
    subprocess.run(
        [python, pdf2txt, "-o", xmlpath, "-p", str(page + 1), "-t", "xml", pdfpath],
        check=True,
    )
参考文档(pdfminer.six 命令行工具说明):https://pdfminersix.readthedocs.io/en/latest/tutorial/commandline.html