Python: 用百度 API 读取增值税发票信息
# encoding: utf-8
# Copyright 2023 涂聚文有限公司 (Geovin Du)
# License information:
# Description:
# Author    : geovindu, Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/9/30 6:56
# User      : geovindu
# Product   : PyCharm
# Project   : pythonTkinterDemo
# File      : BaiduOCRAPI.py
# explain   : learning exercise
"""
Read VAT invoice fields from PDF or image files through the Baidu OCR
``vat_invoice`` endpoint.

Sample response (abbreviated; the reply this was taken from reported
``words_result_num`` 46)::

    {'words_result_num': 46,
     'words_result': {
         'InvoiceNum': '82098742',
         'InvoiceDate': '2023年07月31日',
         'CommodityName': [{'row': '1', 'word': '*其他食品*食品'}],
         'CommodityAmount': [{'row': '1', 'word': '100.83'}],
         'CommodityTaxRate': [{'row': '1', 'word': '9%'}],
         'CommodityTax': [{'row': '1', 'word': '9.07'}],
         'CommodityType': [],
         'TotalAmount': '100.83',
         'TotalTax': '9.07',
         'AmountInFiguers': '109.90',
         'AmountInWords': '壹佰零玖圆玖角',
         'SellerName': '三亚京东佳禹贸易有限公司',
         'SellerRegisterNum': '91460200MA5T41103U',
         'PurchaserName': '重庆海开科技有限公司',
         'PurchaserRegisterNum': '915001075828145135',
         'MachineCode': '661719672092',
         ...
     },
     'pdf_file_size': 1,
     'log_id': 1707917072629791560}
"""

import os
import base64
import json
import logging

import pandas as pd

try:
    import requests
except ImportError:
    # Only the network methods need the HTTP stack; deferring the failure keeps
    # the pure parsing helper (getUsefulInfo) importable without it.
    requests = None

logger = logging.getLogger(__name__)

_TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
_VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"


def _require_requests():
    """Return the ``requests`` module or raise a clear error when it is absent."""
    if requests is None:
        raise ImportError(
            "the 'requests' package is required for Baidu OCR network calls"
        )
    return requests


class BaiduOCR(object):
    """
    Read invoice information (PDF or image files) through the Baidu OCR API.
    """

    # NOTE(review): credentials are committed to source. They are kept only so
    # existing callers of BaiduOCR() keep working; prefer passing your own
    # values to the constructor and rotating these keys.
    AppID = "40226401"
    APIKey = "QuXMNizc80gTmUznKDRqQX3D"
    SecretKey = "h6aHaGLssw51CYGtR3dvX1wGg6BBm0zi"

    # Seconds to wait for each HTTP call; set to None to wait indefinitely
    # (the behaviour before a timeout was introduced).
    timeout = 30

    def __init__(self, AppID=None, APIKey=None, SecretKey=None):
        """
        Store the credentials used for every request.

        :param AppID: application id; falls back to the built-in value
        :param APIKey: API key (sent as ``client_id``); falls back to the built-in value
        :param SecretKey: secret key (sent as ``client_secret``); falls back to the built-in value
        """
        self.AppID = "40226401" if AppID is None else AppID
        self.APIKey = "SeP54f3RO7GqifYpX8DPQGQE" if APIKey is None else APIKey
        self.SecretKey = (
            "c2zXHOWM2hlxeECEgwqG6UWlLTN1kQRs" if SecretKey is None else SecretKey
        )

    def getAccessToken(self):
        """
        Exchange ``APIKey`` / ``SecretKey`` for an access token.

        :return: the access token as a string
        :raises RuntimeError: when the reply carries no ``access_token``
        """
        params = {
            "grant_type": "client_credentials",
            "client_id": self.APIKey,
            "client_secret": self.SecretKey,
        }
        payload = _require_requests().post(
            _TOKEN_URL, params=params, timeout=self.timeout
        ).json()
        token = payload.get("access_token")
        if not token:
            # Previously str(None) -> "None" was returned here and the failure
            # only surfaced later as an unrelated KeyError.
            reason = (
                payload.get("error_description") or payload.get("error") or payload
            )
            raise RuntimeError("Baidu token request failed: %s" % (reason,))
        return str(token)

    def _recognise(self, accessToken, filePath, fieldName):
        """
        Post one base64-encoded file to the ``vat_invoice`` endpoint.

        :param accessToken: token returned by :meth:`getAccessToken`
        :param filePath: path of the file to upload
        :param fieldName: form field that carries the payload (``pdf_file`` or ``image``)
        :return: the decoded JSON reply
        """
        # The context manager closes the file even if reading fails.
        with open(filePath, 'rb') as handle:
            encoded = base64.b64encode(handle.read())
        response = _require_requests().post(
            _VAT_INVOICE_URL,
            params={"access_token": accessToken},
            data={fieldName: encoded},
            headers={'content-type': 'application/x-www-form-urlencoded'},
            timeout=self.timeout,
        )
        result = response.json()
        # Debug-level only: the payload and the token are deliberately not logged.
        logger.debug("vat_invoice response for %s: %s", filePath, result)
        return result

    def getContent(self, accessToken, pdfFile):
        """
        Run invoice recognition on a PDF file.

        :param accessToken: token returned by :meth:`getAccessToken`
        :param pdfFile: path of the PDF file
        :return: the decoded JSON reply
        """
        return self._recognise(accessToken, pdfFile, "pdf_file")

    def getContentPng(self, accessToken, pngFile):
        """
        Run invoice recognition on an image file (sent in the ``image`` field).

        :param accessToken: token returned by :meth:`getAccessToken`
        :param pngFile: path of the image file
        :return: the decoded JSON reply
        """
        return self._recognise(accessToken, pngFile, "image")

    def getUsefulInfo(self, content, pdfName):
        """
        Flatten a ``vat_invoice`` reply into one row of report fields.

        Only the first commodity row is reported; a commodity list that is
        empty yields an empty string instead of raising ``IndexError``.

        :param content: decoded JSON reply containing ``words_result``
        :param pdfName: name recorded in the file-name column
        :return: dict keyed by the (Chinese) report column names
        :raises KeyError: when ``words_result`` or an expected field is missing
        """
        logger.debug("Json %s", content)
        words_result = content['words_result']

        def first_word(key):
            rows = words_result[key]
            return rows[0]['word'] if rows else ''

        info = {
            '发票文件名': pdfName,
            '发票号码': str(words_result['InvoiceNum']),
            '开票日期': words_result['InvoiceDate'],
            '货物名称': first_word('CommodityName'),
            '未税金额': first_word('CommodityAmount'),
            '货物税率': first_word('CommodityTaxRate'),
            '货物税额': first_word('CommodityTax'),
            '合计金额': words_result['TotalAmount'],
            '合计税额': words_result['TotalTax'],
            '价税合计(小写)': words_result['AmountInFiguers'],
            '价税合计(大写)': words_result['AmountInWords'],
            '销售方名称': words_result['SellerName'],
            '销售方纳税人识别号': words_result['SellerRegisterNum'],
            '销售方银行及账户': words_result['SellerBank'],
            '销售方地址及电话': words_result['SellerAddress'],
            '购买方名称': words_result['PurchaserName'],
            '购买方纳税人识别号': words_result['PurchaserRegisterNum'],
            '机器编号': words_result['MachineCode'],
        }
        return info
调用:用京东多张发票测试成功
# Read every PDF invoice in ./invoice and export the extracted fields to Excel.
ocr = Common.BaiduOCRAPI.BaiduOCR()
# Request the token once for the whole batch instead of once per file; the
# token response carries an `expires_in` lifetime (confirm it covers a run).
access_token = ocr.getAccessToken()

pdfFilelist = os.listdir("invoice/")
infolist = []
for pdfFile in pdfFilelist:
    # splitext returns the stem as a plain string; the previous
    # split(".")[:-1] stored a list in the file-name column.
    pdfName, extension = os.path.splitext(pdfFile)
    if extension.lower() != ".pdf":
        continue
    print(pdfFile)
    content = ocr.getContent(access_token, "invoice/" + pdfFile)
    info = ocr.getUsefulInfo(content, pdfName)
    infolist.append(info)

df = pd.DataFrame(infolist)
print(df)
# Pass mode='a' to ExcelWriter to append to an existing workbook instead.
with pd.ExcelWriter('geovindu.xlsx') as writer:
    df.to_excel(writer, sheet_name='geovindu', index=False)
哲学管理(学)人生, 文学艺术生活, 自动(计算机学)物理(学)工作, 生物(学)化学逆境, 历史(学)测绘(学)时间, 经济(学)数学金钱(理财), 心理(学)医学情绪, 诗词美容情感, 美学建筑(学)家园, 解构建构(分析)整合学习, 智商情商(IQ、EQ)运筹(学)生存.---Geovin Du(涂聚文)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 一文读懂知识蒸馏
· 终于写完轮子一部分:tcp代理 了,记录一下
2022-09-30 CSharp: State Pattern
2022-09-30 CSharp:Observer Pattern
2022-09-30 CSharp: Mediator Pattern
2022-09-30 Java: Visitor Pattern
2022-09-30 Java: State Pattern