python: 用百度API读取增值税发票信息

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# encoding: utf-8
# 版权所有 2023 涂聚文有限公司
# 许可信息查看:
# 描述:
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/9/30 6:56
# User      : geovindu
# Product   : PyCharm
# Project   : pythonTkinterDemo
# File      : BaiduOCRAPI.py
# explain   : 学习
 
'''
{'words_result_num': 46, 'words_result': {'InvoiceNumDigit': '', 'CommodityUnit': [{'row': '1', 'word': '个'}], 'PurchaserAddress': '', 'SheetNum': '', 'CommodityType': [], 'TotalAmount': '100.83', 'Checker': '李思', 'PurchaserBank': '', 'Agent': '否', 'Password': '*71<09/<5*61/*67/+5-5>0>876-4794<3/2*802209-<->*7/973<52466>1-7<74+86*/*>1882>1-1>87<86588<6>0<803719>+895-*', 'InvoiceTypeOrg': '海南增值税电子普通发票', 'InvoiceCodeConfirm': '046002200111', 'TotalTax': '9.07', 'ServiceType': '日用品食品', 'CommodityTaxRate': [{'row': '1', 'word': '9%'}], 'CommodityTax': [{'row': '1', 'word': '9.07'}], 'SellerBank': '中国工商银行股份有限公司三亚解放支行2201030109200160703', 'Remarks': '订单号:278170028513', 'SellerAddress': '海南省三亚市崖州区崖州湾科技城标准厂房二期三楼C274区22165500-1602/1699', 'NoteDrawer': '王梅', 'InvoiceTag': '其他', 'InvoiceNumConfirm': '82098742', 'OnlinePay': '', 'Payee': '王陆', 'CommodityName': [{'row': '1', 'word': '*其他食品*食品'}], 'CommodityVehicleType': [], 'InvoiceCode': '046002200111', 'AmountInWords': '壹佰零玖圆玖角', 'AmountInFiguers': '109.90', 'City': '', 'InvoiceType': '电子普通发票', 'CommodityEndDate': [], 'PurchaserName': '重庆海开科技有限公司', 'InvoiceDate': '2023年07月31日', 'CommodityNum': [{'row': '1', 'word': '1'}], 'PurchaserRegisterNum': '915001075828145135', 'MachineCode': '661719672092', 'CommodityPlateNum': [], 'CheckCode': '47587549392161874692', 'SellerRegisterNum': '91460200MA5T41103U', 'CommodityPrice': [{'row': '1', 'word': '100.83'}], 'CommodityStartDate': [], 'SellerName': '三亚京东佳禹贸易有限公司', 'CommodityAmount': [{'row': '1', 'word': '100.83'}], 'Province': '海南省', 'InvoiceNum': '82098742'}, 'pdf_file_size': 1, 'log_id': 1707917072629791560}
Json {'words_result_num': 46, 'words_result': {'InvoiceNumDigit': '', 'CommodityUnit': [{'row': '1', 'word': '个'}], 'PurchaserAddress': '', 'SheetNum': '', 'CommodityType': [], 'TotalAmount': '100.83', 'Checker': '李思', 'PurchaserBank': '', 'Agent': '否', 'Password': '*71<09/<5*61/*67/+5-5>0>876-4794<3/2*802209-<->*7/973<52466>1-7<74+86*/*>1882>1-1>87<86588<6>0<803719>+895-*', 'InvoiceTypeOrg': '海南增值税电子普通发票', 'InvoiceCodeConfirm': '046002200111', 'TotalTax': '9.07', 'ServiceType': '日用品食品', 'CommodityTaxRate': [{'row': '1', 'word': '9%'}], 'CommodityTax': [{'row': '1', 'word': '9.07'}], 'SellerBank': '中国工商银行股份有限公司三亚解放支行2201030109200160703', 'Remarks': '订单号:278170028513', 'SellerAddress': '海南省三亚市崖州区崖州湾科技城标准厂房二期三楼C274区22165500-1602/1699', 'NoteDrawer': '王梅', 'InvoiceTag': '其他', 'InvoiceNumConfirm': '82098742', 'OnlinePay': '', 'Payee': '王陆', 'CommodityName': [{'row': '1', 'word': '*其他食品*食品'}], 'CommodityVehicleType': [], 'InvoiceCode': '046002200111', 'AmountInWords': '壹佰零玖圆玖角', 'AmountInFiguers': '109.90', 'City': '', 'InvoiceType': '电子普通发票', 'CommodityEndDate': [], 'PurchaserName': '重庆海开科技有限公司', 'InvoiceDate': '2023年07月31日', 'CommodityNum': [{'row': '1', 'word': '1'}], 'PurchaserRegisterNum': '915001075828145135', 'MachineCode': '661719672092', 'CommodityPlateNum': [], 'CheckCode': '47587549392161874692', 'SellerRegisterNum': '91460200MA5T41103U', 'CommodityPrice': [{'row': '1', 'word': '100.83'}], 'CommodityStartDate': [], 'SellerName': '三亚京东佳禹贸易有限公司', 'CommodityAmount': [{'row': '1', 'word': '100.83'}], 'Province': '海南省', 'InvoiceNum': '82098742'}, 'pdf_file_size': 1, 'log_id': 1707917072629791560}
 
'''
 
import os
import base64
import requests
import pandas as pd
import json
 
 
 
class BaiduOCR(object):
    """
    Read VAT invoice information (PDF or image files) through the Baidu OCR API.

    Typical flow: ``getAccessToken()`` -> ``getContent()`` / ``getContentPng()``
    -> ``getUsefulInfo()``.
    """

    # NOTE(security): these credentials are hard-coded in source. They are kept
    # unchanged so existing callers keep working, but they should be rotated and
    # supplied through the constructor arguments instead.
    AppID="40226401"
    APIKey="QuXMNizc80gTmUznKDRqQX3D"
    SecretKey="h6aHaGLssw51CYGtR3dvX1wGg6BBm0zi"

    # Endpoints used by this class.
    TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
    VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

    # Seconds to wait for an HTTP response; without a timeout `requests`
    # can block forever on a stalled connection.
    TIMEOUT = 30

    def __init__(self, app_id=None, api_key=None, secret_key=None):
        """
        Create a client.

        :param app_id: Baidu application id; defaults to the built-in value.
        :param api_key: Baidu API key; defaults to the built-in value.
        :param secret_key: Baidu secret key; defaults to the built-in value.
        """
        # NOTE(security): the fallbacks below are the original hard-coded
        # instance credentials (they differ from the class-level ones above).
        self.AppID = "40226401" if app_id is None else app_id
        self.APIKey = "SeP54f3RO7GqifYpX8DPQGQE" if api_key is None else api_key
        self.SecretKey = (
            "c2zXHOWM2hlxeECEgwqG6UWlLTN1kQRs" if secret_key is None else secret_key
        )

    def getAccessToken(self):
        """
        Request an OAuth access token using ``self.APIKey`` / ``self.SecretKey``.

        :return: the access token as a string
        :raises RuntimeError: if the response carries no ``access_token``
            (previously the literal string ``"None"`` was returned, which only
            failed later with a confusing error)
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        payload = requests.post(
            self.TOKEN_URL, params=params, timeout=self.TIMEOUT
        ).json()
        token = payload.get("access_token")
        if not token:
            raise RuntimeError(
                f"Baidu OAuth did not return an access token: {payload}"
            )
        return str(token)

    def _recognize(self, accessToken, path, field):
        """
        Upload one file to the VAT-invoice endpoint and return the parsed JSON.

        :param accessToken: token obtained from ``getAccessToken``
        :param path: path of the file to upload
        :param field: form field carrying the base64 data
            (``"pdf_file"`` or ``"image"``)
        :return: the decoded JSON response
        """
        # `with` guarantees the file handle is closed even if reading fails.
        with open(path, 'rb') as fh:
            encoded = base64.b64encode(fh.read())

        # The token and the base64 payload are intentionally not printed:
        # the former is a credential and the latter is just bulk noise.
        response = requests.post(
            self.VAT_INVOICE_URL,
            params={"access_token": accessToken},
            data={field: encoded},
            headers={'content-type': 'application/x-www-form-urlencoded'},
            timeout=self.TIMEOUT,
        )
        result = response.json()  # parse once, reuse for print and return
        if response:
            print(result)
        return result

    def getContent(self,accessToken, pdfFile):
        """
        Run VAT-invoice OCR on a PDF file.

        :param accessToken: token obtained from ``getAccessToken``
        :param pdfFile: path of the PDF file
        :return: the decoded JSON response
        """
        return self._recognize(accessToken, pdfFile, "pdf_file")

    def getContentPng(self,accessToken, pngFile):
        """
        Run VAT-invoice OCR on an image file.

        :param accessToken: token obtained from ``getAccessToken``
        :param pngFile: path of the image file
        :return: the decoded JSON response
        """
        return self._recognize(accessToken, pngFile, "image")

    @staticmethod
    def _first_word(rows):
        """Return the ``'word'`` of the first row, or ``''`` for an empty list."""
        return rows[0]['word'] if rows else ''

    def getUsefulInfo(self,content, pdfName):
        """
        Pick the fields of interest out of an OCR response.

        Only the first row of each per-commodity list is used; an empty list
        yields ``''`` instead of raising ``IndexError``.

        :param content: decoded JSON response containing ``'words_result'``
        :param pdfName: value stored under the file-name column
        :return: dict keyed by the Chinese column titles
        :raises KeyError: if ``'words_result'`` or a scalar field is missing
        """
        print("Json",content)
        words_result = content['words_result']
        first = self._first_word
        info = {'发票文件名': pdfName,
                '发票号码': str(words_result['InvoiceNum']),
                '开票日期': words_result['InvoiceDate'],
                '货物名称': first(words_result['CommodityName']),
                '未税金额': first(words_result['CommodityAmount']),
                '货物税率': first(words_result['CommodityTaxRate']),
                '货物税额': first(words_result['CommodityTax']),
                '合计金额': words_result['TotalAmount'],
                '合计税额': words_result['TotalTax'],
                '价税合计(小写)': words_result['AmountInFiguers'],
                '价税合计(大写)': words_result['AmountInWords'],
                '销售方名称': words_result['SellerName'],
                '销售方纳税人识别号': words_result['SellerRegisterNum'],
                '销售方银行及账户': words_result['SellerBank'],
                '销售方地址及电话': words_result['SellerAddress'],
                '购买方名称':words_result['PurchaserName'],
                '购买方纳税人识别号':words_result['PurchaserRegisterNum'],
                '机器编号':words_result['MachineCode']
                }
        return info

  

 

调用:用京东多张发票测试成功

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Batch-read every PDF invoice in the "invoice/" folder and export the
# extracted fields to an Excel workbook.
ocr = Common.BaiduOCRAPI.BaiduOCR()
invoice_dir = "invoice/"

# Match on the real extension (case-insensitively) so "X.PDF" is included and
# a file literally named "pdf" is not; sort for a deterministic row order.
pdfFilelist = sorted(
    name for name in os.listdir(invoice_dir)
    if os.path.splitext(name)[1].lower() == ".pdf"
)

infolist = []
if pdfFilelist:
    # One token is enough for the whole batch; no need to re-authenticate
    # for every file.
    access_token = ocr.getAccessToken()
    for pdfFile in pdfFilelist:
        # splitext yields the base name as a string ("a.b.pdf" -> "a.b");
        # the previous split(".")[:-1] produced a list such as ['a', 'b'].
        pdfName = os.path.splitext(pdfFile)[0]
        print(pdfFile)
        content = ocr.getContent(access_token, os.path.join(invoice_dir, pdfFile))
        infolist.append(ocr.getUsefulInfo(content, pdfName))

df = pd.DataFrame(infolist)
print(df)
# Pass mode='a' to ExcelWriter to append to an existing workbook instead.
with pd.ExcelWriter('geovindu.xlsx') as writer:
    df.to_excel(writer, sheet_name='geovindu', index=False)

  

 

 

 

 

posted @   ®Geovin Du Dream Park™  阅读(142)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 一文读懂知识蒸馏
· 终于写完轮子一部分:tcp代理 了,记录一下
历史上的今天:
2022-09-30 CSharp: State Pattern
2022-09-30 CSharp:Observer Pattern
2022-09-30 CSharp: Mediator Pattern
2022-09-30 Java: Visitor Pattern
2022-09-30 Java: State Pattern
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示