首先,安装所需要的库:
1: pdfminer 安装库命令:pip install pdfminer3k
2: docx 安装库命令:pip install python-docx
开始正餐:
(注意:只有 PDF 中非图片形式的内容(即真正的文本)才能被成功转换,以图片形式存在的文字无法提取)
1
2
3
4
import sys
import importlib
# NOTE(review): reloading ``sys`` is presumably a leftover from a Python 2
# encoding workaround; it is kept so the script's start-up sequence is unchanged.
importlib.reload(sys)

# Legacy pdfminer API (PDFDocument is imported from pdfminer.pdfparser).
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
import os


# The script uses relative file names, so switch to the directory that is
# expected to contain them before doing anything else.
os.chdir(r'c:/users/dicey/desktop/codes/pdf-docx')
18
19'''
20解析pdf文件,获取文件中包含的各种对象
21'''
22
def parse(pdf_path, output_path=r'test2.doc'):
    """Extract the text of a PDF and append it to a UTF-8 text file.

    Every page of the PDF is run through pdfminer's layout analysis. The
    text of each horizontal text box is appended to ``output_path``,
    followed by a newline. Images and curves are only counted. A summary
    of the counts is printed once all pages have been processed.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the file that receives the extracted text.
            It is opened in append mode, so existing content is kept.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that text
            extraction is not permitted.
    """
    # Keep the PDF open for the whole run: pages are read from it while
    # iterating, and the ``with`` block guarantees the handle is closed.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()

        # Link parser and document in both directions before initialising.
        parser.set_document(doc)
        doc.set_parser(parser)

        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        # The aggregator collects the laid-out objects of each processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        num_page = num_image = num_curve = num_text_box = 0

        # Open the output once instead of once per text box.
        with open(output_path, 'a', encoding='utf-8') as out:
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)

                # Layout of the page that was just processed.
                for item in device.get_result():
                    if isinstance(item, LTImage):
                        num_image += 1
                    if isinstance(item, LTCurve):
                        num_curve += 1
                    if isinstance(item, LTTextBoxHorizontal):
                        num_text_box += 1
                        out.write(item.get_text())
                        out.write('\n')

    print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
          '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_text_box)
74
75
if __name__ == '__main__':
    # Script entry point: convert the sample PDF that sits in the working
    # directory.
    parse(r'diya.pdf')