Python处理PDF——pdfplumber的安装与使用

https://github.com/hbh112233abc/pdfplumber/blob/stable/README-CN.md

(Code listing follows below.)
# -*- coding:utf-8 -*-
 
"""
@Time :2023/XX/XX
@Auth :Stone
@File :parse_online_pdf.py
@DESC :在线解析PDF文档
"""
import requests
import pdfplumber
import re, time, os
 
 
def online_pdf_parse(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    '''
    Parse a local or remote PDF document with pdfplumber.

    Parameters:
        path_or_url: local file path or URL of the PDF document.
        mode: what to return --
            [1, '1', 'text']            -> document text (str)
            [2, '2', 'table']           -> tables (list)
            [3, '3', 'text_and_table']  -> (text, tables) tuple
            any other value             -> None (text/tables may still be
                                           extracted internally for mode '0')
        url_params: optional dict of keyword arguments forwarded to requests
            when fetching a remote PDF (headers / data / params / proxies).
            Missing keys are filled with sensible defaults.
        proxies: optional requests proxies mapping used for remote PDFs when
            url_params does not already supply one.
        save_as: when fetching a remote PDF, save the download to this path
            and keep it; otherwise a temporary file is used and removed.
    '''

    # A URL is recognized by "scheme://host" at the start of the string.
    url_mode = bool(re.search(
        r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])''',
        path_or_url))

    if url_mode:
        pdf_url = path_or_url
        default_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)'}
        if url_params is None:
            url_params = {}
        # Fill in any missing request options without clobbering values the
        # caller supplied (the original raised KeyError on a partial dict
        # and ignored `proxies` when url_params was given).
        url_params.setdefault('headers', default_headers)
        url_params.setdefault('data', None)
        url_params.setdefault('params', None)
        url_params.setdefault('proxies', proxies)
        if not url_params['headers']:
            url_params['headers'] = default_headers
        # POST when a body or query payload is supplied, plain GET otherwise.
        if url_params['data'] or url_params['params']:
            response = requests.post(pdf_url, **url_params)
        else:
            response = requests.get(pdf_url, **url_params)

        # Write the download to a (temporary) file so pdfplumber can open it.
        pdf_path = save_as if save_as else f'~temp{time.time()}~.pdf'
        with open(pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
    else:
        pdf_path = path_or_url

    pdf_path = os.path.abspath(pdf_path)

    # Extract text and/or tables page by page with pdfplumber.
    mode_key = str(mode).lower()
    pdf_text = ''
    pdf_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if mode_key in ('1', 'text', '0', '3'):
                # extract_text() returns None for image-only pages; the
                # original concatenated the literal string "None" here.
                pdf_text += page.extract_text() or ''
            if mode_key in ('2', 'table', '0', '3'):
                pdf_tables += page.extract_tables()

    # Remove the temporary download unless the caller asked to keep it.
    if url_mode and not save_as:
        try:
            os.remove(pdf_path)
        except OSError:
            pass  # best-effort cleanup only

    if mode_key in ('1', 'text'):
        return pdf_text
    elif mode_key in ('2', 'table'):
        return pdf_tables
    elif mode_key in ('3', 'text_and_table'):
        return pdf_text, pdf_tables
 
 
def replace_str(str_font):
    """Strip newlines and spaces from a cell value.

    The input is coerced to str first, so None becomes 'None'. The original
    also chained .replace(': ', ':'), but that was dead code: every space is
    already removed by the preceding call, so ': ' can never occur.
    """
    return str(str_font).replace('\n', '').replace(' ', '')
 
 
def link_last_list(need_list):
    """Merge rows whose first cell is empty into the previous row.

    A row starting with '' is treated as the continuation of the row before
    it: each cell is concatenated (as str) onto the corresponding cell of
    the previous row. A leading continuation row with nothing before it is
    kept as-is. Returns the merged list of rows (also printed).
    """
    result_total = []
    for current_list in need_list:
        # A non-empty first cell, or no previous row to attach to,
        # means this row stands on its own.
        if current_list[0] != '' or not result_total:
            result_total.append(current_list)
            continue
        # Continuation row: concatenate cell-by-cell onto the last row,
        # indexed by the previous row's length (same as the original).
        prev = result_total[-1]
        result_total[-1] = [
            str(prev[i]) + str(current_list[i]) for i in range(len(prev))
        ]
    print(f"获取到所有数组合并后为={result_total}")
    return result_total
 
 
if __name__ == '__main__':
    # URL of the PDF to parse (redacted in this listing).
    pdf_url = f"********************************"
    # mode='table' returns a list of tables, one entry per page.
    pdf_text = online_pdf_parse(pdf_url, mode='table')
    # # print(f"获取的内容是={pdf_text}")
    # Extraction splits tables per page, so one logical row may be broken
    # into two rows across a page boundary; flatten all pages' rows first,
    # then re-join the split rows with link_last_list below.
    data = []
    for item in pdf_text:
        for dd in item:
            # Normalize every cell (drop newlines/spaces) via replace_str.
            data.append([replace_str(str_item) for str_item in dd])
    # print(f"all_list={data}")
    result_list = link_last_list(data)
    print(f"拼接后的数组为={result_list}")