Python处理PDF——pdfplumber的安装与使用
https://github.com/hbh112233abc/pdfplumber/blob/stable/README-CN.md
# -*- coding:utf-8 -*-
"""
@Time :2023/XX/XX
@Auth :Stone
@File :parse_online_pdf.py
@DESC :Parse PDF documents (local files or files fetched over HTTP).
"""
import os
import re
import time
from itertools import zip_longest

import pdfplumber
import requests

# Matches "<scheme>://<host>" at the very start of the string; anything that
# does not match is treated as a local file path.
_URL_PATTERN = re.compile(
    r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])'''
)

# Headers sent when the caller does not provide any.
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)'
}

# Accepted spellings of the ``mode`` argument, compared after str().lower().
_TEXT_MODES = frozenset({'1', 'text'})
_TABLE_MODES = frozenset({'2', 'table'})
# '0' was already accepted by the extraction step of the original code, so it
# is kept as an alias for "text and tables".
_BOTH_MODES = frozenset({'0', '3', 'text_and_table'})


def _download_pdf(pdf_url, url_params, proxies, target_path):
    """Fetch ``pdf_url`` with requests and write the body to ``target_path``."""
    # Work on a copy so the caller's dict is never mutated.
    request_kwargs = dict(url_params or {})
    if not request_kwargs.get('headers'):
        request_kwargs['headers'] = dict(_DEFAULT_HEADERS)
    # An explicit 'proxies' entry in url_params wins over the proxies argument.
    request_kwargs.setdefault('proxies', proxies)

    # A request carrying a body or query parameters is sent as POST.
    if request_kwargs.get('data') or request_kwargs.get('params'):
        response = requests.post(pdf_url, **request_kwargs)
    else:
        response = requests.get(pdf_url, **request_kwargs)
    # Fail here rather than handing an HTML error page to the PDF parser.
    response.raise_for_status()

    with open(target_path, 'wb') as pdf_file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                pdf_file.write(chunk)


def online_pdf_parse(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    """Extract text and/or tables from a local or online PDF document.

    Args:
        path_or_url: Path of a local PDF file, or a URL. A string starting
            with ``<scheme>://<host>`` is treated as a URL and downloaded.
        mode: What to extract (case-insensitive, compared as a string):
            ``1`` / ``'1'`` / ``'text'`` returns the text as ``str``;
            ``2`` / ``'2'`` / ``'table'`` returns the tables as ``list``;
            ``3`` / ``'3'`` / ``'text_and_table'`` (or ``0``) returns a
            ``(text, tables)`` tuple.
        url_params: Optional dict of keyword arguments forwarded to
            ``requests.get`` / ``requests.post`` when downloading. It is not
            modified.
        proxies: Optional proxies passed to requests when ``url_params`` has
            no ``'proxies'`` entry of its own.
        save_as: When downloading, keep the file at this path instead of
            using a temporary file that is removed afterwards.

    Returns:
        ``str``, ``list`` or ``(str, list)`` depending on ``mode``. The text
        of all pages is concatenated without a separator; the table list
        concatenates the tables found on every page.

    Raises:
        ValueError: If ``mode`` is not one of the accepted values.
        requests.HTTPError: If the download answers with an error status.
    """
    mode_key = str(mode).lower()
    if mode_key not in (_TEXT_MODES | _TABLE_MODES | _BOTH_MODES):
        # Checked up front so a typo does not cost a download and a parse.
        raise ValueError(f'unsupported mode: {mode!r}')
    want_text = mode_key in _TEXT_MODES or mode_key in _BOTH_MODES
    want_tables = mode_key in _TABLE_MODES or mode_key in _BOTH_MODES

    # Only strings can be URLs; path-like objects go straight to the local branch.
    url_mode = isinstance(path_or_url, str) and bool(_URL_PATTERN.search(path_or_url))
    if url_mode:
        pdf_path = save_as if save_as else f'~temp{time.time()}~.pdf'
    else:
        pdf_path = path_or_url
    pdf_path = os.path.abspath(pdf_path)
    is_temporary = url_mode and not save_as

    text_parts = []
    pdf_tables = []
    try:
        if url_mode:
            _download_pdf(path_or_url, url_params, proxies, pdf_path)
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                if want_text:
                    # extract_text() may yield None for a page without text;
                    # str(None) would put the word "None" into the result.
                    text_parts.append(page.extract_text() or '')
                if want_tables:
                    pdf_tables.extend(page.extract_tables())
    finally:
        # Remove the temporary download even when fetching or parsing failed.
        if is_temporary:
            try:
                os.remove(pdf_path)
            except OSError:
                # Best effort: a leftover temp file must not mask the result
                # or the original exception.
                pass

    pdf_text = ''.join(text_parts)
    if mode_key in _TEXT_MODES:
        return pdf_text
    if mode_key in _TABLE_MODES:
        return pdf_tables
    return pdf_text, pdf_tables


def replace_str(str_font):
    """Normalise a table cell: flatten line breaks and tidy colon spacing.

    ``None`` (an empty cell) becomes ``''`` instead of the string ``'None'``,
    so that ``link_last_list`` can recognise continuation rows.
    """
    if str_font is None:
        return ''
    # NOTE(review): the second replace is a no-op as written; its literals may
    # have lost characters when the snippet was published - confirm the intent.
    return str(str_font).replace('\n', ' ').replace(' ', ' ').replace(' : ', ' :')


def link_last_list(need_list):
    """Merge continuation rows into the row that precedes them.

    A row whose first cell is ``''`` is treated as the continuation of the
    previous row (a table row split across pages): its cells are appended,
    column by column, to the previous row's cells. Rows of different lengths
    are padded with ``''`` so no cell is lost. A continuation row with no
    predecessor is kept as-is.

    Args:
        need_list: List of rows, each row a list of cell values.

    Returns:
        A new list of rows; merged rows contain strings.
    """
    result_total = []
    for current_list in need_list:
        is_continuation = bool(current_list) and current_list[0] == ''
        if is_continuation and result_total:
            result_total[-1] = [
                str(previous_cell) + str(current_cell)
                for previous_cell, current_cell in zip_longest(
                    result_total[-1], current_list, fillvalue=''
                )
            ]
        else:
            result_total.append(current_list)
    print(f"获取到所有数组合并后为={result_total}")
    return result_total


if __name__ == '__main__':
    pdf_url = "********************************"
    pdf_text = online_pdf_parse(pdf_url, mode='table')
    # Tables are extracted page by page, so one logical row can arrive split
    # in two; flatten all tables into rows, then stitch the pieces together.
    data = []
    for item in pdf_text:
        for dd in item:
            data.append([replace_str(str_item) for str_item in dd])
    result_list = link_last_list(data)
    print(f"拼接后的数组为={result_list}")
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
2021-10-13 psutil模块