pdf提取信息到excel
1. 报错:No module named 'StringIO'
Python3中出现“No module named 'StringIO'”错误处理方法_浅笑古今的博客-CSDN博客
2. [Python] 自动化办公 PDF提取文字、表格、图片 - 简书 (jianshu.com)
3. python写入excel(方式1) - 景月 - 博客园 (cnblogs.com)
4. 有些PDF文件已损坏或无法解析,所以在循环内部添加了 try/except:某个文件出错时只打印提示并跳过,不会中断整个循环
任务要求:从每个PDF第一页的表格中提取所需信息,并存入excel表格。代码包含三部分:读取PDF、创建excel表格、将提取出的内容写入表格。代码如下:
"""Extract form fields from a batch of PDFs into one Excel worksheet.

For each numbered PDF (7.pdf .. 10.pdf under ``<cwd>/Desktop/test``) the
table on the first page is read, the second cell of selected table rows is
collected, and the values are written as one worksheet row.  PDFs that
cannot be opened or parsed are reported and skipped.
"""
import os

import pdfplumber
import xlsxwriter

# Destination workbook (absolute Windows path, unchanged from the original).
OUTPUT_PATH = r'C:\Users\Eleni\Desktop\test\test.xlsx'

# Header row, written to columns A..AA in this order.
HEADERS = (
    "编号",
    "date",
    "project",
    "property",
    "vendor name",
    "vendor address",
    "agent company",
    "agent address",
    "agent contact",
    "agent phone",
    "agent fax",
    "Purchaser name",
    "Co purchaser name",
    "Purchaser address",
    "Purchaser email",
    "Purchaser phone",
    "Purchaser FIRB",
    "Purchase reason",
    "Purchaser Solicitor company",
    "Purchaser Solicitor address",
    "Purchaser Solicitor contact",
    "Purchaser Solicitor phone",
    "Purchaser Solicitor fax",
    "Purchaser Solicitor email",
    "Purchase Price",
    "Deposit Token",
    "agent notes",
)

# Indices of the table rows whose second cell is exported, in output order.
# They yield 25 values that fill columns B..Z; no value is written under the
# final "agent notes" header (column AA).
FIELD_ROWS = (
    *range(3),
    *range(4, 6),
    *range(14, 19),
    *range(20, 27),
    *range(28, 36),
)


def _extract_fields(pdf_path):
    """Return the exported cell values from the first-page table of a PDF.

    Raises whatever pdfplumber raises for an unreadable file, and
    TypeError/IndexError when the page has no table or the table is
    shorter than expected.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Parse the table once; it does not change between cell lookups.
        table = pdf.pages[0].extract_table()
    return [table[n][1] for n in FIELD_ROWS]


# Create the Excel file.
xl = xlsxwriter.Workbook(OUTPUT_PATH)
try:
    # Add the sheet and its header row.
    sheet = xl.add_worksheet('sheet1')
    for col, title in enumerate(HEADERS):
        sheet.write_string(0, col, title)

    # Zero-based index of the next data row.  Because the header occupies
    # row 0, it is also the 1-based sequence number written to column A.
    next_row = 1
    for i in range(7, 11):
        file_path = os.path.join(os.getcwd(), 'Desktop', 'test', '%d.pdf' % i)
        try:
            fields = _extract_fields(file_path)
        except Exception as exc:
            # Some PDFs are invalid: report which one and why, then carry on
            # with the remaining files instead of aborting the loop.
            print('error', file_path, exc)
            continue

        sheet.write_string(next_row, 0, str(next_row))
        for col, value in enumerate(fields, start=1):
            # str() keeps the original "%s" formatting (None -> "None").
            sheet.write_string(next_row, col, str(value))
        next_row += 1

    sheet.set_column('A:B', 60)
finally:
    # Always finalize the workbook so rows already collected are saved.
    xl.close()