几个有用的脚本备记
tesseract sh训练脚本
#!/bin/bash
#
# Build a new eng.traineddata for Tesseract from the stock Arial box/tiff
# sample.
#
# Usage: run from the directory in which the ./tessenv work area should be
# created (it is reused if it already exists).
#
# Tools used (must be on PATH): GET, tar, tesseract, unicharset_extractor,
# shapeclustering, mftraining, cntraining, combine_tessdata.

# Stop at the first failing step: every later step depends on the files and
# the working directory produced by the earlier ones.
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail early, before anything is downloaded, if a required tool is missing.
for tool in GET tar tesseract unicharset_extractor shapeclustering \
  mftraining cntraining combine_tessdata; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# build the environment
mkdir -p tessenv
cd tessenv
TROOT=$(pwd)
readonly TROOT
mkdir -p "$TROOT/stockfonts" "$TROOT/build/eng"
echo "Environment built"

# Get the stock english fonts from Google (old, but they work)
cd "$TROOT/stockfonts"
GET http://tesseract-ocr.googlecode.com/files/boxtiff-2.01.eng.tar.gz \
  > boxtiff-2.01.eng.tar.gz
# The redirect creates the file even when the request fails, so make sure
# something was actually written before trying to unpack it.
[[ -s boxtiff-2.01.eng.tar.gz ]] || die "download of boxtiff-2.01.eng.tar.gz failed"
echo "Google box/tiff tar.gz loaded"

# unpack the fonts, a new english (eng) directory is created with tif/box files
tar -xzf boxtiff-2.01.eng.tar.gz
echo "box/tiff file unpacked"

# Move the arial font data into the build space (yes, the exp0 is required)
mv -- "$TROOT/stockfonts/eng/eng.arial.g4.tif" "$TROOT/build/eng.arial.exp0.tif"
mv -- "$TROOT/stockfonts/eng/eng.arial.box" "$TROOT/build/eng.arial.exp0.box"
echo "ariel box/tif moved and renamed"

cd "$TROOT/build"

# Create the font_properties file
echo "arial 0 0 0 0 0" > font_properties

# BEGIN BUILDING NEW eng.traineddata
tesseract eng.arial.exp0.tif eng.arial.exp0 nobatch box.train
unicharset_extractor eng.arial.exp0.box
shapeclustering -F font_properties -U unicharset eng.arial.exp0.tr
mftraining -F font_properties -U unicharset -O eng.unicharset eng.arial.exp0.tr
cntraining eng.arial.exp0.tr
echo "eng.traineddata complete"

# BEGIN combining into an eng.traineddata set
# Note: files are copied into an isolated directory for combining
# Note: files have the language prefix added
cp -- eng.unicharset "$TROOT/build/eng/eng.unicharset"
cp -- normproto "$TROOT/build/eng/eng.normproto"
cp -- inttemp "$TROOT/build/eng/eng.inttemp"
cp -- pffmtable "$TROOT/build/eng/eng.pffmtable"
cp -- shapetable "$TROOT/build/eng/eng.shapetable"
cd "$TROOT/build/eng"
combine_tessdata eng.

# You now have an eng.traineddata file in your $TROOT/build/eng directory
# You must move this file to your /usr/local/share/tessdata directory.
# You will need sudo permission.
# BE SURE to back up your old eng.traineddata FIRST
# Recommend testing your new tesseract with the eng.arial.exp0.tif file in
# the build directory.
opencv 文本图片预处理
# -*- coding: UTF-8 -*-
import cv2


def digitsimg(src):
    """Preprocess an image of digits before handing it to OCR.

    Args:
        src: colour image in BGR channel order (it is converted with
            cv2.COLOR_BGR2GRAY).

    Returns:
        A 128x128 single-channel image that has been binarised with Otsu's
        method, enlarged, eroded then dilated, histogram-equalised and
        median-filtered.
    """
    # Convert to grayscale.
    gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)

    # Binarise; with THRESH_OTSU the threshold is chosen automatically.
    _, result = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 2))
    # NOTE: an earlier revision also eroded the image at this point, but the
    # eroded image was never used (the resize below has always worked on the
    # thresholded image), so that dead computation has been dropped.

    # Enlarge the image so the digits are easier to recognise.
    result = cv2.resize(result, (128, 128), interpolation=cv2.INTER_CUBIC)

    # Erode to remove small specks left after enlarging ...
    eroded = cv2.erode(result, kernel)
    # ... then dilate so the digit strokes become fuller again.
    result = cv2.dilate(eroded, kernel)

    # Histogram equalisation. cv2.equalizeHist returns the equalised image
    # rather than modifying its argument, so the result must be assigned;
    # previously the return value was discarded and this step had no effect.
    result = cv2.equalizeHist(result)

    # Median filter to remove remaining noise.
    result = cv2.medianBlur(result, 5)
    # For debugging: cv2.imshow('median', result); cv2.waitKey(0)
    return result


# Disabled draft of a preprocessing routine for Chinese text, kept for
# reference only. Everything after the first `return result` was unreachable
# in the draft as well.
#
# def chineseimg(src):
#     # Convert to grayscale.
#     img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY)
#     # Otsu thresholding (binarisation).
#     ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
#     # cv2.imshow('otsu',result)
#     # cv2.waitKey(0)
#     # Histogram equalisation to make the image clearer.
#     cv2.equalizeHist(result)
#     # cv2.imshow('直方图',result)
#     # cv2.waitKey(0)
#     return result
#     # Enlarge the result so it is easier to recognise.
#     result = cv2.resize(result,(256,128),interpolation=cv2.INTER_CUBIC)
#     # Erode to remove small specks left after enlarging.
#     kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2))
#     eroded = cv2.erode(result,kernel)
#     cv2.imshow('eroded',eroded)
#     cv2.waitKey(0)
#     # Dilate so the strokes become fuller.
#     result = cv2.dilate(eroded,kernel)
#     cv2.imshow('dilated',result)
#     cv2.waitKey(0)
#     # Histogram equalisation to make the image clearer.
#     cv2.equalizeHist(result)
#     # Median filter to remove noise.
#     result = cv2.medianBlur(result,5)
#     cv2.imshow('median',result)
#     cv2.waitKey(0)
https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/imgproc.py
每天一小步,人生一大步!Good luck~