tesseract的简单试用
通过截图获取多语言文本,与多语言文档对比,确定文本是否正确
ocr.py
截图可以是1.单个文件;2.adb截图;3.目录下所有图片文件
"""Command-line entry point: OCR screenshots and check them against a workbook.

The user picks where the screenshots come from (a single file, repeated adb
captures, or every image in a directory). The text recognised in each image
is passed to ``book.record`` and the workbook is saved at the end.
"""
import image_process
import tesseract_process
import book_process
import os


def _language_of(image_path):
    """Return the language code encoded in an image file name.

    The language is the part of the file name before the first dot. Both
    ``\\`` and ``/`` are accepted as directory separators.
    """
    file_name = image_path.replace("\\", "/").rsplit("/", 1)[-1]
    return file_name.split(".")[0]


def _recognize_and_record(book, image, language, image_file):
    """Run OCR on ``image`` and record the recognised strings for ``image_file``."""
    strings = tesseract_process.text_recognition.get_text(image, language)
    book.record(strings, image_file)


def main():
    """Ask for the screenshot source, process the images, and save the workbook."""
    option = int(input("1=file,2=adb,3=directory:"))

    # NOTE(review): the workbook, image and directory paths had no value in
    # the original script (the assignments were empty), so they are asked for
    # interactively here - replace with fixed paths if that was the intent.
    book_file = input("book file:")
    book = book_process.book(book_file)
    if book.read() is None:
        # read() has already printed "init error"; continuing would only
        # fail later when the workbook is saved.
        raise SystemExit(1)

    if option == 1:
        image_file = input("image file:")
        image = image_process.image_picker.get_image_by_path(image_file)
        _recognize_and_record(book, image, _language_of(image_file), image_file)
    elif option == 2:
        language = input("language:")
        while True:
            image, image_file = image_process.image_picker.get_image_by_adb(language)
            _recognize_and_record(book, image, language, image_file)
            flag = input("input n to stop or enter to continue")
            if flag == "n":
                break
    elif option == 3:
        image_dir = input("directory:")
        for image_name in image_process.image_picker.get_image_from_dir(image_dir):
            language = _language_of(image_name)
            image_file = os.path.join(image_dir, image_name)
            print(image_file)
            image = image_process.image_picker.get_image_by_path(image_file)
            _recognize_and_record(book, image, language, image_file)

    book.save()


if __name__ == "__main__":
    main()
image_process.py
处理获取图片的逻辑,为tesseract返回Pillow的image对象
"""Image acquisition helpers that return Pillow ``Image`` objects."""
from PIL import Image
import os
import subprocess
from datetime import datetime


class image_picker(object):
    """Namespace of helpers that obtain images from a path, adb, or a directory."""

    @staticmethod
    def get_image_by_path(file):
        """Open the image stored at ``file`` and return the Pillow image."""
        return Image.open(file)

    @staticmethod
    def get_image_by_adb(language):
        """Capture a screenshot through adb and open it.

        The screenshot is named ``<language>.<unix timestamp>.png``, captured
        to ``/sdcard/`` on the device, pulled into the current directory and
        then removed from the device.

        Returns:
            A tuple ``(image, image_file)`` where ``image`` is the opened
            Pillow image and ``image_file`` is the bare file name.
        """
        timestamp = str(int(datetime.now().timestamp()))
        image_file = language + "." + timestamp + ".png"
        remote_path = "/sdcard/" + image_file

        # Argument lists (no local shell) so that characters in the
        # user-supplied language cannot be interpreted by the host shell.
        # Exit codes are not checked, matching the previous os.system calls;
        # a failed capture surfaces when the pulled file is opened below.
        subprocess.run(["adb", "shell", "screencap", "-p", remote_path])
        subprocess.run(["adb", "pull", remote_path, "./"])
        subprocess.run(["adb", "shell", "rm", remote_path])

        image = Image.open("./" + image_file)
        return image, image_file

    @staticmethod
    def get_image_from_dir(dir):
        """Return the set of image file names located directly inside ``dir``.

        Files are selected by extension (png, jpg, jpeg; case-insensitive).
        Only bare names are returned, so sub-directories are not searched:
        a nested file's name could not be resolved against ``dir``.
        A missing directory yields an empty set.
        """
        image_types = ("png", "jpg", "jpeg")
        image_set = set()
        for _root, _subdirs, files in os.walk(dir):
            for name in files:
                extension = os.path.splitext(name)[1].lstrip(".").lower()
                if extension in image_types:
                    image_set.add(name)
            # The first entry produced by os.walk is ``dir`` itself; stop
            # before descending into sub-directories.
            break
        return image_set
tesseract_process.py
使用tesseract获取图片上的文本,以换行或连续两个及以上的空白字符作为分隔符,返回文本片段的列表
"""Text extraction from images via pytesseract."""
import pytesseract
import re
import image_process


class text_recognition(object):
    """Namespace for the OCR helper."""

    # A fragment ends at a newline or at a run of two or more whitespace
    # characters.
    _SEPARATOR = re.compile(r"\n|\s{2,}")

    @staticmethod
    def get_text(image, lang):
        """Return the text fragments recognised in ``image``.

        Args:
            image: image passed straight to ``pytesseract.image_to_string``.
            lang: language value forwarded as the ``lang`` argument.

        Returns:
            A list of non-empty fragments with surrounding whitespace
            removed, in the order they appear in the recognised text.
        """
        text = pytesseract.image_to_string(
            image,
            lang=lang,
            config="--psm 3 -c preserve_interword_spaces=1",
        )
        # Splitting leaves empty strings (e.g. between consecutive newlines)
        # and fragments with a stray single leading/trailing space; neither
        # can match a workbook entry, so trim and drop them.
        fragments = (piece.strip() for piece in text_recognition._SEPARATOR.split(text))
        return [piece for piece in fragments if piece]
book_process.py
多语言文档储存在xlsx文件,A列为给定的文本,B列为对比结果,C列为发现文本的次数,D列为发现文本的图片文件
"""Workbook access for recording which expected texts were found."""
import openpyxl


class book(object):
    """Wrapper around the ``Sheet1`` worksheet of an xlsx workbook.

    Column 1 is read as the expected text; ``record`` writes "found" to
    column 2, a hit counter to column 3 and the image file names to column 4.
    """

    def __init__(self, file):
        """Remember the workbook path; nothing is loaded until ``read``."""
        self.__file = file

    def read(self):
        """Load the workbook and check that no hit counter is already set.

        Returns:
            The loaded workbook, or ``None`` (after printing "init error")
            when any cell in column 3 holds a value greater than zero.
        """
        self.__book = openpyxl.load_workbook(self.__file)
        self.__sheet = self.__book["Sheet1"]
        for (count,) in self.__sheet.iter_rows(min_col=3, max_col=3, values_only=True):
            # Empty cells come back as None, which cannot be compared to 0.
            if count is not None and count > 0:
                print("init error")
                self.__book = None
                return None
        return self.__book

    def write(self, row, column, value):
        """Store ``value`` in the cell at ``row``/``column`` (1-based)."""
        self.__sheet.cell(row, column, value)

    def record(self, strings, image_file):
        """Mark every entry of ``strings`` that matches a column-1 text.

        For each match the row gets "found" in column 2, its column-3 counter
        incremented, and ``image_file`` appended to column 4 (entries are
        separated by a carriage return). When a text occurs in several rows,
        the last such row is the one updated.
        """
        row_by_word = {}
        first_column = self.__sheet.iter_rows(max_col=1, values_only=True)
        for rowidx, (word,) in enumerate(first_column, start=1):
            row_by_word[word] = rowidx

        for word in strings:
            if word not in row_by_word:
                continue
            rowidx = row_by_word[word]
            print("found %s at %d" % (word[:15], rowidx))
            self.__sheet.cell(rowidx, 2, "found")

            count_cell = self.__sheet.cell(rowidx, 3)
            # An empty counter cell counts as zero.
            count_cell.value = (count_cell.value or 0) + 1

            path_cell = self.__sheet.cell(rowidx, 4)
            previous = path_cell.value
            if previous is None:
                # First hit: avoid writing the literal text "None".
                path_cell.value = image_file
            else:
                path_cell.value = str(previous) + "\r" + image_file

    def save(self):
        """Write the workbook back to the path it was loaded from."""
        self.__book.save(self.__file)