用AutoHotkey调用百度ocr接口提取增值税发票相关字段并写到Excel

功能介绍:

提取指定文件夹下的所有增值税发票(支持 jpg、png、pdf 格式,pdf 暂时只处理第1页),把识别出的信息从 Excel 表当前选中的单元格开始逐行写入,并将原始发票按可指定的规则重命名后复制到新文件夹。
由于要用到百度的接口,所以需要注册百度智能云+实名认证+创建应用+领取资源

  • https://console.bce.baidu.com/ai/#/ai/ocr/overview/index (产品服务→人工智能→文字识别)
  • 实名认证
  • 创建应用→随便输入应用名称→立即创建→查看应用详情→记录 appid apikey secretkey
  • 领取相应的资源:文字识别→概览→右侧【领取免费资源】→选中【财务票据OCR】→全部→0元领取
  • 等待资源到账:右键【资源列表】→已领取资源→核实是否拥有资源

百度返回字段对照表如图

image

使用步骤:

  • 打开任意Excel表,选中第一个要填的单元格
  • 阅读脚本说明,在脚本中搜索 hymodify 并修改相应内容(如 apikey、secretkey)后运行即可

下面是AutoHotkey v2 beta版代码

;注册百度智能云+实名认证+创建应用+领取资源(财务)
;   https://console.bce.baidu.com/ai/#/ai/ocr/overview/index (产品服务→人工智能→文字识别)
;   实名认证
;   创建应用→随便输入应用名称→立即创建→查看应用详情→记录 apikey secretkey
;   领取相应的资源:
;       文字识别→概览→右侧【领取免费资源】→选中【财务票据OCR】→全部→0元领取
;       等待资源到账:右键【资源列表】→已领取资源

;NOTE 搜索 hymodify 修改相应信息
;功能:
;   提取 dn0 文件夹电子发票信息(pdf只提取第1页),并写到当前已打开Excel表(从【当前选中单元格】开始写)
; Entry script: OCR every invoice file under the source folder with Baidu's
; VAT-invoice endpoint, write one row per file into the Excel sheet that is
; currently open (starting at the active cell), and copy each file to the
; destination folder under a new name derived from the OCR result.
#SingleInstance force

if (!ProcessExist("Excel.exe")) {
    msgbox("请打开Excel并选中第一个要写入单元格",,0x40000)
    ExitApp
}

if (0) { ; flip to 1 to use the hard-coded folders below instead of the pickers
    dn0 := "c:\Users\Administrator\Desktop\11" ;hymodify source invoice folder
    dn1 := "c:\Users\Administrator\Desktop\22" ;hymodify destination folder (renamed copies go here)
    if !DirExist(dn1)
        DirCreate(dn1)
} else {
    dn0 := DirSelect(, 2, "选择【旧】发票文件夹")
    if (!strlen(dn0))
        ExitApp
    ; An empty dn1 (dialog cancelled) simply disables the copy step further down.
    dn1 := DirSelect(, 2, "选择【新】发票文件夹")
}

; Column header -> key in the OCR result, in output column order.
arrOcr := [
    ["发票代码","InvoiceCode"],
    ["发票号码","InvoiceNum"],
    ["开票日期","InvoiceDate"],
    ["校验码","CheckCode"],
    ["机器编号","MachineCode"],
    ["金额","AmountInFiguers"],
    ["服务名称1","CommodityName"],
    ["税率1","CommodityTaxRate"],
    ["税额1","CommodityTax"],
    ["大写金额","AmountInWords"],
    ["销售方名称","SellerName"],
    ["销售方纳税人识别号","SellerRegisterNum"],
    ["销售方地址","SellerAddress"],
    ["销售方开户行","SellerBank"],
    ["购买方名称","PurchaserName"],
    ["购买方纳税人识别号","PurchaserRegisterNum"],
    ["购买方地址","PurchaserAddress"],
    ["购买方开户行","PurchaserBank"],
]
; Extra columns appended after the OCR columns.
arrOther := [
    "新文件名", ; depends on the OCR result
    "原文件名",
    "序号",
]

csOcr := arrOcr.length
cs := csOcr+arrOther.length
arrA := ComObjArray(12, 1, cs) ; 1 x cs SafeArray of VT_VARIANT, written to a row in one assignment
xl := ox()
st := xl.ActiveSheet
ac := xl.ActiveCell
r := 0 ; number of data rows written so far
arrError := [] ; names of files whose copy failed
if (ac.row == 1) { ; active cell is on row 1: format the sheet as text and write the header row
    st.cells.NumberFormat := "@"
    for _, arr in arrOcr
        arrA[0,A_Index-1] := arr[1]
    for _, v in arrOther
        arrA[0,csOcr+A_Index-1] := v
    ac.resize(1,cs).value := arrA
    rng1 := ac.offset(1).resize(1,cs) ; first data row sits below the header
} else
    rng1 := ac.resize(1,cs)

loop files, dn0 . "\*.*", "RF" { ;hymodify "R" also recurses into sub-folders
    if (A_LoopFileAttrib ~= "[HS]") ; skip hidden/system files
        continue
    ; Anchored with $ so that only the real extension is tested
    ; (the unanchored form also accepted names such as "a.pdf.bak").
    if !(A_LoopFileName ~= "i)\.(pdf|jpg|png)$") ;hymodify accepted file formats
        continue
    tooltip(A_Index . "`n" . A_LoopFileName)
    objOcr := _Web.baiduOcr_vatInvoice(A_LoopFileFullPath)
    arrA := ComObjArray(12, 1, cs) ; fresh row buffer for every file
    noExt := ""
    ; baiduOcr_vatInvoice returns an empty Array on API errors; indexing an
    ; Array with a string key throws, so Arrays are rejected before the lookup.
    if (isobject(objOcr) && !(objOcr is Array) && objOcr.Has("TotalAmount") && objOcr["TotalAmount"]) {
        for _, arr in arrOcr {
            if !objOcr.Has(arr[2]) ; field absent from this response: leave the cell empty
                continue
            res := objOcr[arr[2]]
            if (isobject(res)) {
                ; Multi-row fields come back as a list; only the first row is written.
                if (res.length)
                    arrA[0,A_Index-1] := res[1]["word"]
            } else
                arrA[0,A_Index-1] := res
        }
        amount := objOcr.Has("AmountInFiguers") ? objOcr["AmountInFiguers"] : ""
        invoiceNum := objOcr.Has("InvoiceNum") ? objOcr["InvoiceNum"] : ""
        noExt := format("{1}-{2}", delete0(amount),invoiceNum) ;hymodify new-file-name rule, default is (amount-invoice number)
        arrA[0,csOcr] := noExt
    }
    ; Columns that do not depend on OCR.
    arrA[0,csOcr+1] := A_LoopFileName
    arrA[0,csOcr+2] := r+1
    rng1.offset(r).value := arrA
    r++
    ; Copy the file; files without an OCR-derived name keep their own name with a "__" prefix.
    if strlen(dn1) {
        SplitPath(A_LoopFileFullPath,,, &ext, &origNoExt)
        if (strlen(noExt))
            target := format("{1}\{2}.{3}", dn1,noExt,ext)
        else
            target := format("{1}\__{2}.{3}", dn1,origNoExt,ext)
        ; Both branches are guarded so that one failed copy (e.g. a name
        ; collision) is reported at the end instead of aborting the batch.
        try
            FileCopy(A_LoopFileFullPath, target)
        catch
            arrError.push(A_LoopFileName)
    }
}
WinActivate("ahk_id " . st.application.hwnd)
tooltip
if arrError.length
    msgbox("以下文件复制时出错了,请核实`n`n" . json.stringify(arrError, 4))
else
    msgbox("已完成",,0x40000)
ExitApp

; Returns the Excel Application COM object.
;   winTitle - window title/criteria of the Excel main window.
; If no matching window exists a new Excel.Application object is created.
; Otherwise the object is obtained from the window's "EXCEL71" control via
; AccessibleObjectFromWindow (OBJID_NATIVEOM), i.e. the already running instance.
; Throws an Error if the Application object stays unreachable after all retries.
ox(winTitle:="ahk_class XLMAIN") {
    if !WinExist(winTitle)
        return ComObject("Excel.application")
    ; WinExist made the Excel window the "last found" window, which the
    ; Control* calls below rely on.
    ctlID := ControlGetHwnd("EXCEL71")
    ; IID_IDispatch = {00020400-0000-0000-C000-000000000046}
    numput('Int64',0x0000000000020400, 'Int64',0x46000000000000C0, IID_IDispatch:=buffer(16))
    ; 0xFFFFFFF0 = OBJID_NATIVEOM
    dllcall("oleacc\AccessibleObjectFromWindow", "ptr",ctlID, "uint",0xFFFFFFF0, "ptr",IID_IDispatch, "ptr*",win:=ComValue(9,0), 'HRESULT')
    ; NOTE(review): the property access presumably fails while a cell is being
    ; edited; Escape is sent to leave that state. The loop used to be unbounded
    ; with no delay, so it is now capped (about 5 s) with a short pause per try.
    loop 50 {
        try
            return win.application
        catch
            ControlSend("{escape}", "EXCEL71")
        Sleep(100)
    }
    throw Error("Excel did not expose its Application object; close any open dialog or cell edit and retry.")
}

; Strips insignificant trailing zeros from a decimal number string.
;   num - value to clean up; anything that is not of the form [-]digits.digits
;         is returned unchanged.
; Returns the value without trailing zeros in the fraction, and without the
; decimal point when no fraction digits remain.
delete0(num) {
    if !RegExMatch(num, "^-?\d+\.\d+$")
        return num
    ; Eight or more fraction digits are treated as a floating-point artefact
    ; and rounded to six places first.
    if RegExMatch(num, "\.\d{8,}$")
        num := round(num+0.00000001, 6)
    withoutZeros := RegExReplace(num, "\.\d*?\K0+$")
    return rtrim(withoutZeros, ".")
}

; Small HTTP helper plus the Baidu OCR calls used by this script.
; Relies on an external `json` helper (json.parse / json.stringify) that is not
; defined in this class; it is assumed to return Map-like objects (they are
; queried with .Has() and ["key"]).
class _Web {

    ; Cached Baidu access token and the local time (YYYYMMDDHH24MISS) until
    ; which it may be reused.
    static _token := ""
    static _tokenExpiry := ""

    ; Performs a GET request.
    ;   url - address to request.
    ; Returns the response text, or "" when the request fails.
    static get(url) {
        rst := ComObject("WinHttp.WinHttpRequest.5.1")
        rst.open("GET", url)
        try {
            rst.send()
            return rst.ResponseText
        }
        catch
            return ""
    }

    ; Performs a form-encoded POST request.
    ;   url      - address to request.
    ;   postData - Map of form fields; keys and values are URL-encoded. An
    ;              optional "timeout" entry is passed to WaitForResponse and is
    ;              not sent as a form field. Any non-object value sends an
    ;              empty body.
    ;   Encoding - when set, the raw response body is decoded with this charset
    ;              through ADODB.Stream instead of using ResponseText.
    ;   headers  - optional Map of extra request headers (empty values are skipped).
    ; Returns the response text.
    ; https://docs.microsoft.com/en-us/windows/win32/winhttp/iwinhttprequest-send
    static post(url, postData:="", Encoding:="", headers:="") {
        rst := ComObject("WinHttp.WinHttpRequest.5.1")
        rst.open("POST", url)
        if isobject(headers) {
            for k, v in headers {
                if v
                    rst.SetRequestHeader(k, v)
            }
        }
        rst.SetRequestHeader("Content-Type", "application/x-www-form-urlencoded")
        if isobject(postData) {
            param := ""
            for k, v in postData {
                if (k == "timeout") ; client-side option, not a form field
                    continue
                param .= format("{1}{2}={3}", (param == "") ? "" : "&", _Web.UrlEncode(k), _Web.UrlEncode(v))
            }
            rst.send(param)
            ; Map entries are items, not properties: postData.timeout would throw.
            rst.WaitForResponse(postData.has("timeout") ? postData["timeout"] : -1)
        } else {
            rst.send()
        }
        if Encoding && rst.ResponseBody {
            ; Decode the raw bytes with the requested charset.
            oADO := ComObject("adodb.stream")
            oADO.Type := 1 ; binary
            oADO.Mode := 3 ; read/write
            oADO.Open()
            oADO.Write(rst.ResponseBody)
            oADO.Position := 0
            oADO.Type := 2 ; text
            oADO.Charset := Encoding
            res := oADO.ReadText()
            oADO.Close()
            return res
        }
        return rst.ResponseText
    }

    ; Returns a Baidu access token for the configured application.
    ; Setup: register on Baidu AI Cloud, verify the account, create an
    ; application, note its apikey/secretkey and claim the OCR resources
    ;   https://console.bce.baidu.com/ai/#/ai/ocr/overview/index
    ; The token is cached and reused until shortly before the "expires_in"
    ; period reported by the server ends, so a batch run requests it once
    ; instead of once per file.
    ; Throws an Error when no token can be obtained.
    static baiduToken() {
        if (_Web._token != "" && A_Now < _Web._tokenExpiry)
            return _Web._token
        apikey := "xxx" ;hymodify
        secretkey := "xxx" ;hymodify
        host := format("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={1}&client_secret={2}&", apikey,secretkey)
        res := _Web.get(host)
        if (res == "")
            throw Error("Baidu token request failed: empty response")
        obj := json.parse(res)
        if !(isobject(obj) && obj.has("access_token"))
            throw Error("Baidu token request failed: " . res)
        ttl := (obj.has("expires_in") && IsNumber(obj["expires_in"])) ? obj["expires_in"] : 0
        _Web._token := obj["access_token"]
        ; Keep a one-minute safety margin; a missing/short ttl disables reuse.
        _Web._tokenExpiry := DateAdd(A_Now, Max(ttl - 60, 0), "Seconds")
        return _Web._token
    }

    ; Runs Baidu VAT-invoice OCR. Docs: https://cloud.baidu.com/doc/OCR/s/nk3h7xy2t
    ;   fp   - path of a jpg/png/pdf file, or (when longer than 256 characters)
    ;          data that is already base64-encoded.
    ;   page - page number to recognise when fp is a pdf file.
    ; Returns the "words_result" object of the response. On an API error the
    ; response is shown in a message box and an empty Array is returned.
    ; Example: _Web.baiduOcr_vatInvoice("c:\Users\Administrator\Desktop\22\1.pdf")
    static baiduOcr_vatInvoice(fp, page:=1) {
        ; One shared test decides "path vs. raw base64" so both uses agree
        ; (a 256-character path used to be encoded as a file but never as a pdf).
        isPath := strlen(fp) <= 256
        b64 := isPath ? _Web._toBase64(fp) : fp
        url := format("{1}?access_token={2}", "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice",_Web.baiduToken())
        if (isPath && fp ~= "i)\.pdf$") {
            params := map(
                "pdf_file" , b64,
                "pdf_file_num" , page,
            )
        } else
            params := map("image" , b64)
        response := _Web.post(url, params, "utf-8")
        obj := json.parse(response)
        if (obj.has("error_code")) {
            ; Drop the cached token so a token-related failure can recover on the next call.
            _Web._token := ""
            msgbox(json.stringify(obj, 4))
            return []
        }
        return obj["words_result"]
    }

    ; Reads a file and returns its content as a single-line base64 string.
    ;   fp - path of the file to encode.
    static _toBase64(fp) {
        buf := FileRead(fp, "raw")
        ; 0x40000001 = CRYPT_STRING_BASE64 | CRYPT_STRING_NOCRLF.
        ; First call only queries the required length in characters.
        dllcall("crypt32\CryptBinaryToString", "Ptr",buf, "UInt",buf.size, "UInt",0x40000001, "Ptr",0, "uint*",&nSize:=0)
        b64 := buffer(nSize << 1, 0) ; two bytes per UTF-16 character
        dllcall("crypt32\CryptBinaryToString", "Ptr",buf, "UInt",buf.size, "UInt",0x40000001, "Ptr",b64, "uint*",&nSize)
        return strget(b64)
    }

    ; Percent-encodes a string for use in a URL or form body.
    ;   str - text to encode (numbers are converted to text).
    ;   enc - encoding whose bytes are escaped, UTF-8 by default.
    ; Unreserved characters (A-Z a-z 0-9 - . _ ~) are passed through unchanged;
    ; every other byte becomes %XX. Escaping every byte, as before, tripled the
    ; size of base64 payloads without changing what the server decodes.
    static UrlEncode(str, enc:="UTF-8") {
        buff := buffer(strput(str, enc))
        strput(str, buff, enc)
        out := "" ; must be initialised: appending to an unset variable is an error in v2
        loop buff.size {
            code := numget(buff, A_Index - 1, "UChar")
            if !code ; stop at the terminator
                break
            if ((code >= 0x30 && code <= 0x39) || (code >= 0x41 && code <= 0x5A) || (code >= 0x61 && code <= 0x7A)
                || code == 0x2D || code == 0x2E || code == 0x5F || code == 0x7E)
                out .= chr(code)
            else
                out .= format("%{:02X}", code)
        }
        return out
    }
}
posted @ 2021-08-12 22:43  火冷  阅读(822)  评论(0编辑  收藏  举报