GOCR v0.50 原理分析
一,简介:
GOCR是一个用C语言编写的开源OCR库,采用GNU General Public License(GPL)许可,作者:Joerg Schulenburg
项目主页:http://jocr.sourceforge.net/index.html
源代码(v0.50) : http://pan.baidu.com/s/1y1Jj1 (VS2005工程项目)
Update : http://pan.baidu.com/s/1c0b278O (windows下通过liblept支持jpeg/png等格式的OCR)
版本(version.h):
/* Version identification as defined in version.h.
 * Each directive must be on its own line: a second "#define" on the same
 * line would become part of the first macro's replacement text. */
#define version_string "0.50"       /* program version */
#define release_string "20130305"   /* release date, YYYYMMDD */
二,原理分析:
1,GOCR的主要流程如下:
/*
 * Main OCR driver: runs the complete recognition pipeline on job->src.p.
 *
 * Stages, in order: threshold selection (otsu + thresholding), box
 * scanning, dust removal, barcode/picture detection and picture removal,
 * hole gluing, rotation detection, text line detection, box
 * splitting/gluing, sorting, pitch measurement, character recognition,
 * post-processing (compare unknown with known chars, dividing boxes,
 * inserting spaces, context correction) and storing the result lines.
 *
 * job: job descriptor, must not be NULL (checked by assert).
 *
 * Returns 1 if scan_boxes() found no boxes (job->res.numC == 0),
 * 0 otherwise.
 *
 * NOTE(review): orig_cs is an automatic variable and is only assigned when
 * multi_image_count is still 0, so from the second call on it is always 0
 * and the threshold is always taken from otsu(). Confirm that this is the
 * intended behaviour for multi-image input.
 */
int pgm2asc(job_t *job)
{
  pix *pp;
  progress_counter_t *pc;
  static int multi_image_count=0; /* number of image within multi-image */
  int orig_cs=0;

  /* validate the argument before its first dereference below */
  assert(job);

  if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */
  multi_image_count++;

  /* FIXME jb: remove pp */
  pp = &(job->src.p);

  pc = open_progress(100,"pgm2asc_main");
  progress(0,pc); /* start progress output 0% 0% */

#if 0 /* dont waste memory */
  /* FIXME jb: malloc */
  if ( job->cfg.verbose & 32 ) {
    // generate 2nd imagebuffer for debugging output
    job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x); // buffer
    assert(job->tmp.ppo.p);
    copybox(&job->src.p, 0, 0, job->src.p.x, job->src.p.y, &job->tmp.ppo, job->src.p.x * job->src.p.y);
  }
#else
  job->tmp.ppo=job->src.p; /* temporarily, removed later */
#endif
  // if (job->cfg.verbose&32) debug_img("out000.ppm",job,0);

  /* ----- count colors ------ create histogram -------
     - this should be used to create a upper and lower limit for cs
     - cs is the optimum gray value between cs_min and cs_max
     - also inverse scans could be detected here later */
  if (orig_cs==0)
    job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
  else // dont set cs, output stats + do inversion if needed 2010-10-07
    otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
  // if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);

  /* renormalize the image and set the normalized threshold value */
  job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
  if( job->cfg.verbose )
    fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
  // if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);

  progress(5,pc); /* progress is only estimated */

  /* this is first step for reorganize the PG
     ---- look for letters, put rectangular frames around letters
     letter = connected points near color F
     should be used by dust removing (faster) and line detection!
     ---- 0..cs = black letters, last change = Mai99 */

  progress(8,pc); /* progress is only estimated */

  // if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
  scan_boxes( job, pp );
  if ( !job->res.numC ){
    fprintf( stderr,"# no boxes found - stopped\n" );
    if(job->cfg.verbose&32) debug_img("out01",job,8);
    /***** should free stuff, etc) */
    /* finish the progress counter here as well, as the normal exit does
       (presumably this also releases pc - confirm in progress.c) */
    close_progress(pc);
    return(1);
  }
  // tmp10/bug100818a.pgm creates artefacts on image
  // if (job->cfg.verbose&32) debug_img("out00",job,4+8);

  progress(10,pc); /* progress is only estimated */

  // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
  // output_list(job); // for debugging
  // ToDo: matrix printer preprocessing

  remove_dust( job ); /* from the &(job->res.boxlist)! */
  // if(job->cfg.verbose&32) debug_img("out02",job,4+8);
  // output_list(job); // for debugging
#if 0 // ToDo 2010-10-15 destroys QR-barcodes
  smooth_borders( job ); /* only for big chars */
#endif
  progress(12,pc); /* progress is only estimated */
  // if(job->cfg.verbose&32) debug_img("out03",job,4+8);
  // output_list(job); // for debugging

  detect_barcode( job ); /* mark barcode */
  // if(job->cfg.verbose&32) debug_img("out04",job,4+8);
  // output_list(job); // for debugging

  detect_pictures( job ); /* mark pictures */
  // if(job->cfg.verbose&32) debug_img("out05",job,4+8);
  // output_list(job); // for debugging

  remove_pictures( job ); /* do this as early as possible, before layout */
  // if(job->cfg.verbose&32) debug_img("out06",job,4+8);
  // output_list(job); // for debugging

  glue_holes_inside_chars( pp ); /* including count subboxes (holes) */

  detect_rotation_angle( job );

#if 1 /* Rotate the whole picture! move boxes */
  if( job->res.lines.dy!=0 ){
    // move down lowest first, move up highest first
    // in work! ??? (at end set dy=0) think on ppo!
  }
#endif
  detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
  // if(job->cfg.verbose&32) debug_img("out07",job,4+8);
  progress(20,pc); /* progress is only estimated */

  add_line_info( job /* , &(job->res.boxlist) */);
  if (job->cfg.verbose&32) debug_img("out10",job,4+8);

  divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
  // if(job->cfg.verbose&32) debug_img("out11",job,0);

  remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
  /* list_ins seems to sort in the boxes on the wrong place ??? */
  // if(job->cfg.verbose&32) debug_img("out12",job,4+8);

  glue_broken_chars( job, pp ); /* 2nd glue */
  // if(job->cfg.verbose&32) debug_img("out14",job,4+8);

  // 2010-09-24 overall box size is correct here, but later broken
  remove_rest_of_dust( job );
  // if(job->cfg.verbose&32) debug_img("out15",job,4+8);

  /* better sort after dust is removed (slow for lot of pixels) */
  list_sort(&(job->res.boxlist), sort_box_func);

  measure_pitch( job );

  if(job->cfg.mode&64) find_same_chars( pp );
  progress(30,pc); /* progress is only estimated */
  // if(job->cfg.verbose&32) debug_img("out16",job,4+8);

  char_recognition( pp, job->cfg.mode);
  progress(60,pc); /* progress is only estimated */
  // if(job->cfg.verbose&32) debug_img("out17",job,4+8);

  if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
    /* may be, characters/pictures have changed line number */
    list_sort(&(job->res.boxlist), sort_box_func);
    // 2nd recognition call if lines are adjusted
    char_recognition( pp, job->cfg.mode);
  }

#define BlownUpDrawing 0 /* german: Explosionszeichnung, temporarily */
#if BlownUpDrawing == 1 /* german: Explosionszeichnung */
  { /* just for debugging */
    int i,ii,ni; struct box *box2;
    i=ii=ni=0;
    for_each_data(&(job->res.boxlist)) { /* count boxes */
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      if (box2->c==UNKNOWN) i++;
      if (box2->c==PICTURE) ii++;
      ni++;
    } end_for_each(&(job->res.boxlist));
    if (job->cfg.verbose)
      fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
  }
#endif
  // ----------- write out20.pgm ----------- mark lines + boxes
  if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);

  compare_unknown_with_known_chars( pp, job->cfg.mode);
  progress(70,pc); /* progress is only estimated */

  try_to_divide_boxes( pp, job->cfg.mode);
  progress(80,pc); /* progress is only estimated */

  /* --- list output ---- for debugging --- */
  if (job->cfg.verbose&6) output_list(job);

  /* ---- insert spaces ---- */
  list_insert_spaces( pp , job );

  // ---- proof difficult chars Il1 by context view ----
  if (job->cfg.verbose)
    fprintf(stderr,"# context correction if !(mode&32)\n");
  if (!(job->cfg.mode&32)) context_correction( job );

  store_boxtree_lines( job, job->cfg.mode );
  progress(90,pc); /* progress is only estimated */

  /* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
   * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
   * awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
   * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
   * 9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
   * 1*1 1*7 not recognized (Oct04)
   * 33*SPC 76*NL = 109 spaces + 36*unknown sum=241
   * 16 missed
   */
#if BlownUpDrawing == 1 /* german: Explosionszeichnung */
  { /* just for debugging */
    int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
    i=ii=ni=0;
    for_each_data(&(job->res.boxlist)) { /* count boxes */
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      if (box2->c==UNKNOWN) i++;
      if (box2->c==PICTURE) ii++;
      if (box2->c>' ' && box2->c<='z') ni++;
    } end_for_each(&(job->res.boxlist));
    if(job->cfg.verbose)
      fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
    for (i=0;i<20;i++) {
      ni=0;
      for_each_data(&(job->res.boxlist)) { /* count boxes */
        box2 = (struct box *)list_get_current(&(job->res.boxlist));
        if (box2->c==testc[i]) ni++;
      } end_for_each(&(job->res.boxlist));
      if(job->cfg.verbose && ni>0)
        fprintf(stderr," (%c)=%d",testc[i],ni);
    }
    if(job->cfg.verbose)
      fprintf(stderr,"\n");
  }
#endif
  // ---- frame-size-histogram
  // ---- (my own defined) distance between letters
  // ---- write internal picture of textsite
  // ----------- write out30.pgm -----------
  if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);

  progress(100,pc); /* progress is only estimated */

  close_progress(pc);

  return 0; /* what should I return? error-state? num-of-chars? */
}
2,Scan boxes分析:
流程:从上往下,分别在X,Y轴方向投影,得到box list。
3,去除噪点:
/* ---- remove dust --------------------------------- What is dust? I think, this is a very small pixel cluster without neighbours. Of course not all dust clusters can be detected correct. This feature should be possible to switch off via option. -> may be, all clusters should be stored here? speed is very slow, I know, but I am happy that it is working well */
4,detect barcode and pictures , remove pictures:
图片:设所有box的平均宽度为avgwidth,平均高度为avgheight;满足 box.width > 4*avgwidth || box.height > 4*avgheight,并且与其大小相近的box少于4个的box,被认为是图像box。
5,glue holes inside chars:
/* ---- join holes to chars( before step1 ) v0.42 ----------------------- join boxes lying inside another box (usually holes, ex: "aeobdg46890") Dont add dust to a char! lines are not detected yet */
6,detect rotation angle:
/* ** Detect rotation angle (one for whole image) ** old: longest text-line and determining the angle of this line. * * search right nearest neighbour of each box and average vectors * to get the text orientation, * upside down decision is not made here (I dont know how to do it) * ToDo: set job->res.lines.{dx,dy} * pass 1: get mean vector to nearest char * pass 2: get mean vector to nearest char without outriders to pass 1 * extimate direction as (dx,dy,num)[pass] * ToDo: estimate an error, boxes only work fine for zero-rotation * for 45 degree use vectors, not boxes to get base line */
7,detect text lines:
http://en.wikipedia.org/wiki/Cap_height
8,measure pitch:
估计空格的宽度。
9,识别字符:
GOCR的识别不基于机器学习,没有训练(training)过程,完全依靠先验规则,因此只能识别英文字母、数字、标点等字符。识别过程主要是一条filter链:每个filter判断box是否为某个字符,若是,则略过后续的filter。
a,从某个坐标(x,y)出发,沿某个方向(上、下、左、右)向box内部引一条射线,射线与字符的第一个交点的位置必须符合该字符的先验规则;
代码:
/* Walk from (x,y) in direction r, one pixel per step, until the test
 * (getpixel(p,x,y)<cs)^col becomes non-zero, the image border is left,
 * or l steps have been made.
 * Returns the number of steps made (0 if the start point lies outside
 * the image or r is none of UP, DO, LE, RI). */
int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
  int step_x = 0, step_y = 0;   /* per-step movement derived from r */
  int steps = 0;

  if (r == UP)      step_y = -1;
  else if (r == DO) step_y =  1;
  else if (r == LE) step_x = -1;
  else if (r == RI) step_x =  1;
  else return 0;                /* unknown direction: nothing to walk */

  /* only one coordinate changes, so the full bounds test is equivalent
     to testing just the border the walk is heading for */
  while (steps < l && x >= 0 && y >= 0 && x < p->x && y < p->y) {
    if ((getpixel(p, x, y) < cs) ^ col)
      break;
    x += step_x;
    y += step_y;
    steps++;
  }
  return steps;
}
b,经过box的一条直线与字符的交点个数必须符合某个字符的先验规则,算法:计算这样的点(如从左向右:Pixel(x,y) = white && Pixel(x+1,y) = black ) 的个数
代码:
int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) { int rc = 0, col = 0, k, x, y, i, d; // rc=crossings col=0=white int dx = x1 - x0, dy = y1 - y0; d = MAX(abs(dx), abs(dy)); for (i = 0, x = x0, y = y0; i <= d; i++) { if (d) { x = x0 + i * dx / d; y = y0 + i * dy / d; } k = ((getpixel(p, x, y) < cs) ? 1 : 0); // 0=white 1=black if (col == 0 && k == 1) // found a white-black transition rc++; col = k; // last color } return rc; }c,孔洞的个数必须符合某个字符的先验规则,比如A有一个洞;这一步只是判断,实际工作在第5步已经完成。
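d,综合使用上述规则,如下面“test {}”一段中识别“}”的代码(代码中 c_ask='}',最后 Setac 设置的字符也是 '}'):
意思是:水平方向穿过字符的dy条扫描线,每条线与字符的交点(由白到黑的跳变)个数均为1;在字符的左半部分,竖直方向的dx/2条扫描线,每条线的交点个数均应为2(对应“}”上下两端向左伸出的部分);等等。注意:第二个循环之后代码判断的是 y<dx/2 而不是 x<dx/2,执行到这里时 y 看起来已经等于 dy,而外层条件保证 2*dy>3*dx,所以该判断恒不成立,这条竖直方向的规则实际上并未生效。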
// --------- test {} -------------------------------- for(ad=d=99;dx>2 && dy>5 && 2*dy>3*dx;){ DBG( wchar_t c_ask='}'; ) if (!hchar) ad=97*ad/100; for(y=0;y<dy;y++){ if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; } if (y<dy) Break; for(x=0;x<dx/2;x++){ if( num_cross(x,x,0,dy-1,bp,cs) != 2 ) break; } if (y<dx/2) Break; if ( num_cross( 0, 0,dy/4,dy-1-dy/4,bp,cs) != 0 ) Break; if ( num_cross(dx-1,dx-1,dy/4,dy-1-dy/4,bp,cs) != 1 ) Break; i1=loop(bp,dx-1 ,dy/4,dx,cs,0,LE); i1=loop(bp,dx-1-i1,dy/4,dx,cs,1,LE); // thickness1 for (i2=dx,i3=y=dy/2-1-dy/16;y<dy/2+2+dy/16;y++) { x=loop(bp,dx-1 , y,dx,cs,0,LE); if (x<i2) {i2=x;i3=y;} } i2= loop(bp,dx-1-i2,i3,dx,cs,1,LE); // thickness2 if (i2<i1+dx/16+1) Break; if ( loop(bp,dx-1,dy-1,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;MSG({})} if ( loop(bp,dx-1, 0,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;MSG({})} // > if ( loop(bp,dx-1, 0,dy,cs,0,DO)<dy/2-dy/8-1 ) {ad=98*ad/100;MSG({})} if ( loop(bp,dx-1,dy-1,dy,cs,0,UP)<dy/2-dy/8-1 ) {ad=98*ad/100;MSG({})} // ) if ( loop(bp,dx-1, 0,dy,cs,0,DO)<=dy/4) Break; if (dy>=8) if ( loop(bp,0, 0,dx,cs,0,RI) + loop(bp,0,dy/4,dx,cs,0,RI) - 2*loop(bp,0,dy/8,dx,cs,0,RI) >=dx/8 ) {ad=98*ad/100;MSG({})} // < if ( loop(bp,1,dy-1,dy,cs,0,UP)>dy/4 ) Break; // ??? if ( get_bw(x1,x1,y0,y0+dy/4,box1->p,cs,1) == 1 || get_bw(x1,x1,y1-dy/4,y1,box1->p,cs,1) == 1 ) Break; Setac(box1,(bc='}'),ad);break; }
10,compare_unknown_with_known_chars、try_to_divide_boxes 等后处理;
11,list_insert_spaces 插入空格;
12,store_boxtree_lines;
13,输出识别结果。