Java 图片识别文字(中英文混合)

使用 Tess4J 库(Tesseract OCR 引擎的 Java 封装)来识别图片中的文字

 

依赖的maven库

<!-- SLF4J logging API; the TestOCR class obtains its Logger through it -->
<dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.26</version>
        </dependency>

        <!-- Simple SLF4J binding so that log output is actually emitted; same version as slf4j-api -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-simple</artifactId>
            <version>1.7.26</version>
        </dependency>

<!-- Tess4J OCR library: https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
        <dependency>
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>5.1.1</version>
        </dependency>

 

图片识别文字

package com;

import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;

/**
 * Minimal demo that extracts mixed English / Simplified-Chinese text from an image
 * using Tess4J.
 */
public class TestOCR {
    private static final Logger logger = LoggerFactory.getLogger(TestOCR.class);

    /**
     * Runs OCR once and prints the recognized text to standard output.
     *
     * @param args optional; {@code args[0]} is the tessdata directory and {@code args[1]}
     *             is the image path. Any argument that is missing falls back to the
     *             placeholder value of the original demo.
     */
    public static void main(String[] args) {
        // The literals below are placeholders: either replace them or pass real paths as arguments.
        String dataPath = (args != null && args.length > 0) ? args[0] : "字库位置";
        String imgPath = (args != null && args.length > 1) ? args[1] : "要识别的图片地址";

        String result = doOCR(dataPath, imgPath);
        System.out.println(result);
    }

    /**
     * Recognizes the text contained in an image.
     *
     * @param dataPath directory handed to {@code setDatapath}, i.e. where the trained
     *                 language data files are looked up
     * @param imgPath  path of the image to recognize
     * @return the recognized text, or an empty string when the image is missing or
     *         recognition fails (the failure is logged, never thrown to the caller)
     */
    private static String doOCR(String dataPath, String imgPath) {
        if (imgPath == null) {
            logger.error("OCR skipped: image path is null");
            return "";
        }

        File imageFile = new File(imgPath);
        if (!imageFile.isFile()) {
            // Fail early with a clear message instead of a wrapped I/O error from the library.
            logger.error("OCR skipped: image file not found: {}", imageFile.getAbsolutePath());
            return "";
        }

        ITesseract instance = new Tesseract1();
        // Location of the trained language data files.
        instance.setDatapath(dataPath);
        // "eng+chi_sim" = mixed recognition: eng is English, chi_sim is Simplified Chinese.
        instance.setLanguage("eng+chi_sim");

        try {
            return instance.doOCR(imageFile);
        } catch (TesseractException e) {
            // Best-effort contract: log with enough context to diagnose, then return "".
            logger.error("OCR failed for image '{}' (tessdata path: '{}')", imgPath, dataPath, e);
            return "";
        }
    }
}

  

字库下载

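下载中文包:在 https://github.com/tesseract-ocr/tessdata 中选择 chi_sim.traineddata 文件进行下载;英文包 eng.traineddata 可以从 tess4j 的 jar 包中获取。由于识别语言设置为 eng+chi_sim,这两个 .traineddata 文件需要放在同一个目录下,并把该目录作为"字库位置"传给 setDatapath。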

posted @ 2022-02-24 17:26  小码农2017  阅读(983)  评论(0)  编辑  收藏  举报