Python: using the pdfplumber library to read PDF files

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os

from openpyxl import Workbook
from openpyxl.styles import PatternFill,Side,Border
import pdfplumber
 
 
 
 
# Module-level accumulator: visitDir() appends every PDF path it finds here,
# and the script entry point later hands this list to writeExcel().
l = []


def visitDir(path):
    """Recursively collect the paths of all PDF files under *path*.

    Matching paths are appended to the module-level list ``l``; nothing is
    returned. If *path* is not an existing directory, an error message is
    printed and ``l`` is left untouched.

    Args:
        path: Directory to search, including all of its sub-directories.
    """
    if not os.path.isdir(path):
        print('Error:"',path,'" is not a directory or does not exist.')
        return
    # os.walk yields one (root, dirs, files) triple per directory in the tree.
    for root, _dirs, files in os.walk(path):
        for file_name in files:
            # Compare case-insensitively so "REPORT.PDF" is not skipped.
            if file_name.lower().endswith(".pdf"):
                l.append(os.path.join(root, file_name))
def writeExcel(l, output_path="data/合同信息导出.xlsx"):
    """Export the text of every page of the given PDFs to an Excel workbook.

    Each PDF page becomes one worksheet row: the page text is split on
    whitespace and each token is written to its own cell, starting at
    column A. Row 1 holds a fixed three-column header.

    Args:
        l: Iterable of PDF file paths to read.
        output_path: Where the workbook is saved. Defaults to the location
            the script has always used.
    """
    wb = Workbook()
    ws1 = wb.active

    rows = []
    for pdf_path in l:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() can give None/"" for pages without a text
                # layer (e.g. scanned images); treat those as an empty row
                # instead of crashing on None.split().
                page_text = page.extract_text() or ""
                rows.append(page_text.split())

    thin_black = Side(border_style='thin',color='000000')
    border = Border(top=thin_black, bottom=thin_black,
                    left=thin_black, right=thin_black)

    # Header row: contract number / contract name / contract amount.
    header_fill = PatternFill(fill_type='solid', fgColor="8B008B")
    for column, title in enumerate(("合同序号", "合同名称", "合同金额"), start=1):
        header_cell = ws1.cell(row=1, column=column, value=title)
        header_cell.fill = header_fill
        header_cell.border = border

    # Data rows start at row 2. Iterate each row by its own length: sizing
    # every row by the first page's token count raised IndexError on shorter
    # pages (and on an empty input list) and truncated longer ones.
    body_fill = PatternFill(fill_type='solid', fgColor="FFC0CB")
    for row_number, tokens in enumerate(rows, start=2):
        for column, token in enumerate(tokens, start=1):
            cell = ws1.cell(row=row_number, column=column, value=token)
            cell.fill = body_fill
            cell.border = border

    wb.save(output_path)
    wb.close()
 
 
if __name__ == '__main__':
    # The original called print_hi(), which is not defined anywhere in this
    # script, so it failed with NameError before doing any work. Print the
    # greeting directly instead.
    # NOTE(review): the "Hi, " prefix assumes print_hi was PyCharm's
    # template helper - confirm or drop the line.
    print('Hi, PyCharm,geovin du study')

    # Collect every PDF under ./data into the module-level list `l`,
    # then export their text to the Excel workbook.
    visitDir('data')
    writeExcel(l)

  

posted @   ®Geovin Du Dream Park™  阅读(7)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 零经验选手,Compose 一天开发一款小游戏!
· 一起来玩mcp_server_sqlite,让AI帮你做增删改查!!
历史上的今天:
2020-07-06 csharp: Emgu.CV.OCR and Tesseract.OCR Optical Character Recognition
2018-07-06 MySQL chartset
2011-07-06 jQuery jToday Plugin
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示