CSharp: Tesseract OCR V5.0 in .NET Core 3.1

Reference resources
https://github.com/alex-doe/open-ocr-dotnet
https://github.com/tleyden/open-ocr/ go
https://github.com/DayBreak-u/chineseocr_lite
https://github.com/pjreddie/darknet
https://sourceforge.net/projects/vietocr/
https://github.com/PaddlePaddle/PaddleOCR
https://github.com/gumblex/tessdata_chi
https://github.com/nobody132/masr
https://code.google.com/p/tesseractdotnet
https://github.com/tesseract-ocr/tesseract
https://github.com/tesseract-ocr/tessdata
https://github.com/charlesw/tesseract
https://github.com/charlesw/tesseract-samples
https://github.com/tesseract-ocr/langdata

 

 

 

  /// <summary>
    /// Language codes for Tesseract trained-data files.
    /// Each member's name equals the base name of the matching <c>.traineddata</c> file,
    /// so code in this file passes <c>member.ToString()</c> as the language argument of
    /// <c>TesseractEngine</c>.
    /// Source of the list: https://tesseract-ocr.github.io/tessdoc/Data-Files
    /// Author: geovindu, Geovin Du
    /// </summary>
    public enum LanguangeList
    {
        /// <summary>
        /// Afrikaans. Data file: afr.traineddata.
        /// </summary>
        afr,


        /// <summary>
        /// Amharic. Data file: amh.traineddata.
        /// </summary>
        amh,


        /// <summary>
        /// Arabic. Data file: ara.traineddata.
        /// </summary>
        ara,


        /// <summary>
        /// Assamese. Data file: asm.traineddata.
        /// </summary>
        asm,


        /// <summary>
        /// Azerbaijani. Data file: aze.traineddata.
        /// </summary>
        aze,


        /// <summary>
        /// Azerbaijani - Cyrillic. Data file: aze_cyrl.traineddata.
        /// </summary>
        aze_cyrl,


        /// <summary>
        /// Belarusian. Data file: bel.traineddata.
        /// </summary>
        bel,


        /// <summary>
        /// Bengali. Data file: ben.traineddata.
        /// </summary>
        ben,
        /// <summary>
        /// Tibetan. Data file: bod.traineddata.
        /// </summary>
        bod,


        /// <summary>
        /// Bosnian. Data file: bos.traineddata.
        /// </summary>
        bos,


        /// <summary>
        /// Bulgarian. Data file: bul.traineddata.
        /// </summary>
        bul,


        /// <summary>
        /// Catalan; Valencian. Data file: cat.traineddata.
        /// </summary>
        cat,


        /// <summary>
        /// Cebuano. Data file: ceb.traineddata.
        /// </summary>
        ceb,


        /// <summary>
        /// Czech. Data file: ces.traineddata.
        /// </summary>
        ces,


        /// <summary>
        /// Simplified Chinese.
        /// Chinese - Simplified. Data file: chi_sim.traineddata.
        /// </summary>
        chi_sim,


        /// <summary>
        /// Traditional Chinese.
        /// Chinese - Traditional. Data file: chi_tra.traineddata.
        /// </summary>
        chi_tra,


        /// <summary>
        /// Cherokee. Data file: chr.traineddata.
        /// </summary>
        chr,


        /// <summary>
        /// Welsh. Data file: cym.traineddata.
        /// </summary>
        cym,


        /// <summary>
        /// Danish. Data file: dan.traineddata.
        /// </summary>
        dan,


        /// <summary>
        /// German. Data file: deu.traineddata.
        /// </summary>
        deu,


        /// <summary>
        /// Dzongkha. Data file: dzo.traineddata.
        /// </summary>
        dzo,


        /// <summary>
        /// Greek, Modern (1453-). Data file: ell.traineddata.
        /// </summary>
        ell,


        /// <summary>
        /// English. Data file: eng.traineddata.
        /// </summary>
        eng,


        /// <summary>
        /// English, Middle (1100-1500). Data file: enm.traineddata.
        /// </summary>
        enm,


        /// <summary>
        /// Esperanto. Data file: epo.traineddata.
        /// </summary>
        epo,


        /// <summary>
        /// Estonian. Data file: est.traineddata.
        /// </summary>
        est,


        /// <summary>
        /// Basque. Data file: eus.traineddata.
        /// </summary>
        eus,


        /// <summary>
        /// Persian. Data file: fas.traineddata.
        /// </summary>
        fas,


        /// <summary>
        /// Finnish. Data file: fin.traineddata.
        /// </summary>
        fin,


        /// <summary>
        /// French. Data file: fra.traineddata.
        /// </summary>
        fra,


        /// <summary>
        /// German Fraktur. Data file: frk.traineddata.
        /// </summary>
        frk,


        /// <summary>
        /// French, Middle (ca. 1400-1600). Data file: frm.traineddata.
        /// </summary>
        frm,


        /// <summary>
        /// Irish. Data file: gle.traineddata.
        /// </summary>
        gle,


        /// <summary>
        /// Galician. Data file: glg.traineddata.
        /// </summary>
        glg,


        /// <summary>
        /// Greek, Ancient (-1453). Data file: grc.traineddata.
        /// </summary>
        grc,


        /// <summary>
        /// Gujarati. Data file: guj.traineddata.
        /// </summary>
        guj,


        /// <summary>
        /// Haitian; Haitian Creole. Data file: hat.traineddata.
        /// </summary>
        hat,


        /// <summary>
        /// Hebrew. Data file: heb.traineddata.
        /// </summary>
        heb,


        /// <summary>
        /// Hindi. Data file: hin.traineddata.
        /// </summary>
        hin,


        /// <summary>
        /// Croatian. Data file: hrv.traineddata.
        /// </summary>
        hrv,


        /// <summary>
        /// Hungarian. Data file: hun.traineddata.
        /// </summary>
        hun,


        /// <summary>
        /// Inuktitut. Data file: iku.traineddata.
        /// </summary>
        iku,


        /// <summary>
        /// Indonesian. Data file: ind.traineddata.
        /// </summary>
        ind,


        /// <summary>
        /// Icelandic. Data file: isl.traineddata.
        /// </summary>
        isl,


        /// <summary>
        /// Italian. Data file: ita.traineddata.
        /// </summary>
        ita,


        /// <summary>
        /// Italian - Old. Data file: ita_old.traineddata.
        /// </summary>
        ita_old,


        /// <summary>
        /// Javanese. Data file: jav.traineddata.
        /// </summary>
        jav,


        /// <summary>
        /// Japanese. Data file: jpn.traineddata.
        /// </summary>
        jpn,


        /// <summary>
        /// Kannada. Data file: kan.traineddata.
        /// </summary>
        kan,


        /// <summary>
        /// Georgian. Data file: kat.traineddata.
        /// </summary>
        kat,


        /// <summary>
        /// Georgian - Old. Data file: kat_old.traineddata.
        /// </summary>
        kat_old,


        /// <summary>
        /// Kazakh. Data file: kaz.traineddata.
        /// </summary>
        kaz,


        /// <summary>
        /// Central Khmer. Data file: khm.traineddata.
        /// </summary>
        khm,


        /// <summary>
        /// Kirghiz; Kyrgyz. Data file: kir.traineddata.
        /// </summary>
        kir,


        /// <summary>
        /// Korean. Data file: kor.traineddata.
        /// </summary>
        kor,


        /// <summary>
        /// Kurdish. Data file: kur.traineddata.
        /// </summary>
        kur,


        /// <summary>
        /// Lao. Data file: lao.traineddata.
        /// </summary>
        lao,


        /// <summary>
        /// Latin. Data file: lat.traineddata.
        /// </summary>
        lat,

        /// <summary>
        /// Latvian. Data file: lav.traineddata.
        /// </summary>
        lav,

        /// <summary>
        /// Lithuanian. Data file: lit.traineddata.
        /// </summary>
        lit,

        /// <summary>
        /// Malayalam. Data file: mal.traineddata.
        /// </summary>
        mal,

        /// <summary>
        /// Marathi. Data file: mar.traineddata.
        /// </summary>
        mar,

        /// <summary>
        /// Macedonian. Data file: mkd.traineddata.
        /// </summary>
        mkd,


        /// <summary>
        /// Maltese. Data file: mlt.traineddata.
        /// </summary>
        mlt,


        /// <summary>
        /// Malay. Data file: msa.traineddata.
        /// </summary>
        msa,


        /// <summary>
        /// Burmese. Data file: mya.traineddata.
        /// </summary>
        mya,


        /// <summary>
        /// Nepali. Data file: nep.traineddata.
        /// </summary>
        nep,


        /// <summary>
        /// Dutch; Flemish. Data file: nld.traineddata.
        /// </summary>
        nld,


        /// <summary>
        /// Norwegian. Data file: nor.traineddata.
        /// </summary>
        nor,


        /// <summary>
        /// Oriya. Data file: ori.traineddata.
        /// </summary>
        ori,


        /// <summary>
        /// Panjabi; Punjabi. Data file: pan.traineddata.
        /// </summary>
        pan,


        /// <summary>
        /// Polish. Data file: pol.traineddata.
        /// </summary>
        pol,


        /// <summary>
        /// Portuguese. Data file: por.traineddata.
        /// </summary>
        por,


        /// <summary>
        /// Pushto; Pashto. Data file: pus.traineddata.
        /// </summary>
        pus,


        /// <summary>
        /// Romanian; Moldavian; Moldovan. Data file: ron.traineddata.
        /// </summary>
        ron,


        /// <summary>
        /// Russian. Data file: rus.traineddata.
        /// </summary>
        rus,


        /// <summary>
        /// Sanskrit. Data file: san.traineddata.
        /// </summary>
        san,


        /// <summary>
        /// Sinhala; Sinhalese. Data file: sin.traineddata.
        /// </summary>
        sin,


        /// <summary>
        /// Slovak. Data file: slk.traineddata.
        /// </summary>
        slk,


        /// <summary>
        /// Slovenian. Data file: slv.traineddata.
        /// </summary>
        slv,


        /// <summary>
        /// Spanish; Castilian. Data file: spa.traineddata.
        /// </summary>
        spa,


        /// <summary>
        /// Spanish; Castilian - Old. Data file: spa_old.traineddata.
        /// </summary>
        spa_old,


        /// <summary>
        /// Albanian. Data file: sqi.traineddata.
        /// </summary>
        sqi,


        /// <summary>
        /// Serbian. Data file: srp.traineddata.
        /// </summary>
        srp,


        /// <summary>
        /// Serbian - Latin. Data file: srp_latn.traineddata.
        /// </summary>
        srp_latn,


        /// <summary>
        /// Swahili. Data file: swa.traineddata.
        /// </summary>
        swa,


        /// <summary>
        /// Swedish. Data file: swe.traineddata.
        /// </summary>
        swe,


        /// <summary>
        /// Syriac. Data file: syr.traineddata.
        /// </summary>
        syr,


        /// <summary>
        /// Tamil. Data file: tam.traineddata.
        /// </summary>
        tam,


        /// <summary>
        /// Telugu. Data file: tel.traineddata.
        /// </summary>
        tel,


        /// <summary>
        /// Tajik. Data file: tgk.traineddata.
        /// </summary>
        tgk,


        /// <summary>
        /// Tagalog. Data file: tgl.traineddata.
        /// </summary>
        tgl,


        /// <summary>
        /// Thai. Data file: tha.traineddata.
        /// </summary>
        tha,


        /// <summary>
        /// Tigrinya. Data file: tir.traineddata.
        /// </summary>
        tir,


        /// <summary>
        /// Turkish. Data file: tur.traineddata.
        /// </summary>
        tur,


        /// <summary>
        /// Uighur; Uyghur. Data file: uig.traineddata.
        /// </summary>
        uig,


        /// <summary>
        /// Ukrainian. Data file: ukr.traineddata.
        /// </summary>
        ukr,


        /// <summary>
        /// Urdu. Data file: urd.traineddata.
        /// </summary>
        urd,


        /// <summary>
        /// Uzbek. Data file: uzb.traineddata.
        /// </summary>
        uzb,


        /// <summary>
        /// Uzbek - Cyrillic. Data file: uzb_cyrl.traineddata.
        /// </summary>
        uzb_cyrl,


        /// <summary>
        /// Vietnamese. Data file: vie.traineddata.
        /// </summary>
        vie,


        /// <summary>
        /// Yiddish. Data file: yid.traineddata.
        /// </summary>
        yid

    }

 

 /// <summary>
    /// Console sample that runs Tesseract OCR over a single image file and prints
    /// the recognition result.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// Loads an image, recognizes it with the <c>chi_sim</c> language data found under
        /// <c>./tessdata</c>, and writes the mean confidence, the full text, and a
        /// word-by-word dump to the console. Any exception is traced and printed rather
        /// than propagated; the method then waits for a key press before returning.
        /// References:
        /// https://tesseract-ocr.github.io/tessdoc/Data-Files
        /// https://github.com/charlesw/tesseract-samples
        /// https://github.com/tesseract-ocr/tessdata
        /// https://github.com/danbloomberg/leptonica/releases
        /// </summary>
        /// <param name="args">
        /// Command-line arguments. When at least one is supplied, the first is used as the
        /// image path instead of the default <c>./geovindu2.jpg</c>.
        /// </param>
        public static void Main(string[] args)
        {
            // Another sample input noted by the original author: phototest.tif
            var imagePath = args.Length > 0 ? args[0] : "./geovindu2.jpg";

            try
            {
                // Other language codes noted by the original author: chi_tra, eng.
                using (var ocr = new TesseractEngine(@"./tessdata", LanguangeList.chi_sim.ToString(), EngineMode.Default))
                using (var picture = Pix.LoadFromFile(imagePath))
                using (var result = ocr.Process(picture))
                {
                    // The text is fetched before the confidence is printed, as in the original flow.
                    var fullText = result.GetText();
                    Console.WriteLine("Mean confidence: {0}", result.GetMeanConfidence());

                    Console.WriteLine("Text (GetText): \r\n{0}", fullText);
                    Console.WriteLine("Text (iterator):");

                    using (var cursor = result.GetIterator())
                    {
                        cursor.Begin();

                        // Four nested do/while loops; going by the PageIteratorLevel arguments
                        // passed to Next, they step (outermost to innermost) by block,
                        // paragraph, text line and word.
                        do
                        {
                            do
                            {
                                do
                                {
                                    do
                                    {
                                        // Mark the start of every block.
                                        if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                        {
                                            Console.WriteLine("<BLOCK>");
                                        }

                                        var word = cursor.GetText(PageIteratorLevel.Word);
                                        Console.Write(word);
                                        Console.Write(" ");

                                        // Break the console line after the last word of a text line.
                                        if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                        {
                                            Console.WriteLine();
                                        }
                                    }
                                    while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                    // Extra blank line after the last text line of a paragraph.
                                    if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                                    {
                                        Console.WriteLine();
                                    }
                                }
                                while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                            }
                            while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                        }
                        while (cursor.Next(PageIteratorLevel.Block));
                    }
                }
            }
            catch (Exception ex)
            {
                // Report the failure to both the trace listeners and the console.
                var details = ex.ToString();
                Trace.TraceError(details);
                Console.WriteLine("Unexpected Error: " + ex.Message);
                Console.WriteLine("Details: ");
                Console.WriteLine(details);
            }

            Console.Write("Press any key to continue . . . ");
            Console.ReadKey(true);
        }
    }

  

  /// <summary>
    /// Web Forms code-behind page that runs Tesseract OCR on an uploaded image and
    /// shows the mean confidence and recognized text.
    /// References leptonica-1.82.0.dll.
    /// geovindu, Geovin Du
    /// </summary>
    public class DefaultPage : System.Web.UI.Page
    {
        #region Data

        // Controls of the upload (input) panel.
        protected Panel inputPanel;
        protected HtmlInputFile imageFile;
        protected HtmlButton submitFile;

        // Controls of the result panel.
        protected Panel resultPanel;
        protected HtmlGenericControl meanConfidenceLabel;
        protected HtmlTextArea resultText;
        protected HtmlButton restartButton;

        #endregion

        #region Event Handlers

        /// <summary>
        /// Handles the submit button: runs OCR on the posted file and switches from the
        /// input panel to the result panel. Does nothing when no file (or an empty file)
        /// was posted.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Event data (unused).</param>
        private void OnSubmitFileClicked(object sender, EventArgs args)
        {
            var upload = imageFile.PostedFile;
            if (upload == null || upload.ContentLength <= 0)
            {
                return;
            }

            // Errors are deliberately not handled here: the original author chose to fail
            // hard in this sample and leave proper error handling to a real application.
            // Language data list: https://tesseract-ocr.github.io/tessdoc/Data-Files
            // (the original author also tried "eng" in place of chi_sim).
            //
            // The upload is decoded into a Bitmap first because, per the original note,
            // Pix does not support loading from a stream.
            using (var ocr = new TesseractEngine(Server.MapPath(@"~/tessdata"), LanguangeList.chi_sim.ToString(), EngineMode.Default))
            using (var bitmap = new System.Drawing.Bitmap(upload.InputStream))
            using (var pix = PixConverter.ToPix(bitmap))
            using (var page = ocr.Process(pix))
            {
                meanConfidenceLabel.InnerText = string.Format("{0:P}", page.GetMeanConfidence());
                resultText.InnerText = page.GetText();
            }

            inputPanel.Visible = false;
            resultPanel.Visible = true;
        }

        /// <summary>
        /// Handles the restart button: hides the result panel and shows the input panel again.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Event data (unused).</param>
        private void OnRestartClicked(object sender, EventArgs args)
        {
            resultPanel.Visible = false;
            inputPanel.Visible = true;
        }

        #endregion

        #region Page Setup

        /// <summary>
        /// Wires up the button handlers, then continues with the base page initialization.
        /// </summary>
        /// <param name="e">Event data forwarded to the base implementation.</param>
        protected override void OnInit(EventArgs e)
        {
            InitializeComponent();
            base.OnInit(e);
        }

        //----------------------------------------------------------------------
        /// <summary>
        /// Subscribes the click handlers of the restart and submit buttons.
        /// </summary>
        private void InitializeComponent()
        {
            this.restartButton.ServerClick += new EventHandler(OnRestartClicked);
            this.submitFile.ServerClick += new EventHandler(OnSubmitFileClicked);
        }

        #endregion
    }

  

输出:

 

 

 

 

GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models
https://arxiv.org/pdf/2303.10130.pdf

Sparks of Artificial General Intelligence: Early experiments with GPT-4
https://arxiv.org/abs/2303.12712
https://arxiv.org/pdf/2303.12712.pdf

posted @ 2023-03-31 14:41  ®Geovin Du Dream Park™  阅读(173)  评论(0编辑  收藏  举报