C#: Tesseract OCR v5.0 on .NET Core 3.1
Reference resources
https://github.com/alex-doe/open-ocr-dotnet
https://github.com/tleyden/open-ocr/ go
https://github.com/DayBreak-u/chineseocr_lite
https://github.com/pjreddie/darknet
https://sourceforge.net/projects/vietocr/
https://github.com/PaddlePaddle/PaddleOCR
https://github.com/gumblex/tessdata_chi
https://github.com/nobody132/masr
https://code.google.com/p/tesseractdotnet
https://github.com/tesseract-ocr/tesseract
https://github.com/tesseract-ocr/tessdata
https://github.com/charlesw/tesseract
https://github.com/charlesw/tesseract-samples
https://github.com/tesseract-ocr/langdata
/// <summary>
/// Tesseract language codes, one member per <c>*.traineddata</c> file.
/// The member name is the language code itself: callers in this file pass
/// <c>member.ToString()</c> as the language argument of <c>TesseractEngine</c>.
/// Source list: https://tesseract-ocr.github.io/tessdoc/Data-Files
/// Author: geovindu, Geovin Du
/// </summary>
public enum LanguangeList
{
    /// <summary>Afrikaans - afr.traineddata</summary>
    afr,

    /// <summary>Amharic - amh.traineddata</summary>
    amh,

    /// <summary>Arabic - ara.traineddata</summary>
    ara,

    /// <summary>Assamese - asm.traineddata</summary>
    asm,

    /// <summary>Azerbaijani - aze.traineddata</summary>
    aze,

    /// <summary>Azerbaijani (Cyrillic) - aze_cyrl.traineddata</summary>
    aze_cyrl,

    /// <summary>Belarusian - bel.traineddata</summary>
    bel,

    /// <summary>Bengali - ben.traineddata</summary>
    ben,

    /// <summary>Tibetan - bod.traineddata</summary>
    bod,

    /// <summary>Bosnian - bos.traineddata</summary>
    bos,

    /// <summary>Bulgarian - bul.traineddata</summary>
    bul,

    /// <summary>Catalan; Valencian - cat.traineddata</summary>
    cat,

    /// <summary>Cebuano - ceb.traineddata</summary>
    ceb,

    /// <summary>Czech - ces.traineddata</summary>
    ces,

    /// <summary>Chinese (Simplified) - chi_sim.traineddata</summary>
    chi_sim,

    /// <summary>Chinese (Traditional) - chi_tra.traineddata</summary>
    chi_tra,

    /// <summary>Cherokee - chr.traineddata</summary>
    chr,

    /// <summary>Welsh - cym.traineddata</summary>
    cym,

    /// <summary>Danish - dan.traineddata</summary>
    dan,

    /// <summary>German - deu.traineddata</summary>
    deu,

    /// <summary>Dzongkha - dzo.traineddata</summary>
    dzo,

    /// <summary>Greek, Modern (1453-) - ell.traineddata</summary>
    ell,

    /// <summary>English - eng.traineddata</summary>
    eng,

    /// <summary>English, Middle (1100-1500) - enm.traineddata</summary>
    enm,

    /// <summary>Esperanto - epo.traineddata</summary>
    epo,

    /// <summary>Estonian - est.traineddata</summary>
    est,

    /// <summary>Basque - eus.traineddata</summary>
    eus,

    /// <summary>Persian - fas.traineddata</summary>
    fas,

    /// <summary>Finnish - fin.traineddata</summary>
    fin,

    /// <summary>French - fra.traineddata</summary>
    fra,

    /// <summary>German Fraktur - frk.traineddata</summary>
    frk,

    /// <summary>French, Middle (ca. 1400-1600) - frm.traineddata</summary>
    frm,

    /// <summary>Irish - gle.traineddata</summary>
    gle,

    /// <summary>Galician - glg.traineddata</summary>
    glg,

    /// <summary>Greek, Ancient (-1453) - grc.traineddata</summary>
    grc,

    /// <summary>Gujarati - guj.traineddata</summary>
    guj,

    /// <summary>Haitian; Haitian Creole - hat.traineddata</summary>
    hat,

    /// <summary>Hebrew - heb.traineddata</summary>
    heb,

    /// <summary>Hindi - hin.traineddata</summary>
    hin,

    /// <summary>Croatian - hrv.traineddata</summary>
    hrv,

    /// <summary>Hungarian - hun.traineddata</summary>
    hun,

    /// <summary>Inuktitut - iku.traineddata</summary>
    iku,

    /// <summary>Indonesian - ind.traineddata</summary>
    ind,

    /// <summary>Icelandic - isl.traineddata</summary>
    isl,

    /// <summary>Italian - ita.traineddata</summary>
    ita,

    /// <summary>Italian (Old) - ita_old.traineddata</summary>
    ita_old,

    /// <summary>Javanese - jav.traineddata</summary>
    jav,

    /// <summary>Japanese - jpn.traineddata</summary>
    jpn,

    /// <summary>Kannada - kan.traineddata</summary>
    kan,

    /// <summary>Georgian - kat.traineddata</summary>
    kat,

    /// <summary>Georgian (Old) - kat_old.traineddata</summary>
    kat_old,

    /// <summary>Kazakh - kaz.traineddata</summary>
    kaz,

    /// <summary>Central Khmer - khm.traineddata</summary>
    khm,

    /// <summary>Kirghiz; Kyrgyz - kir.traineddata</summary>
    kir,

    /// <summary>Korean - kor.traineddata</summary>
    kor,

    /// <summary>Kurdish - kur.traineddata</summary>
    kur,

    /// <summary>Lao - lao.traineddata</summary>
    lao,

    /// <summary>Latin - lat.traineddata</summary>
    lat,

    /// <summary>Latvian - lav.traineddata</summary>
    lav,

    /// <summary>Lithuanian - lit.traineddata</summary>
    lit,

    /// <summary>Malayalam - mal.traineddata</summary>
    mal,

    /// <summary>Marathi - mar.traineddata</summary>
    mar,

    /// <summary>Macedonian - mkd.traineddata</summary>
    mkd,

    /// <summary>Maltese - mlt.traineddata</summary>
    mlt,

    /// <summary>Malay - msa.traineddata</summary>
    msa,

    /// <summary>Burmese - mya.traineddata</summary>
    mya,

    /// <summary>Nepali - nep.traineddata</summary>
    nep,

    /// <summary>Dutch; Flemish - nld.traineddata</summary>
    nld,

    /// <summary>Norwegian - nor.traineddata</summary>
    nor,

    /// <summary>Oriya - ori.traineddata</summary>
    ori,

    /// <summary>Panjabi; Punjabi - pan.traineddata</summary>
    pan,

    /// <summary>Polish - pol.traineddata</summary>
    pol,

    /// <summary>Portuguese - por.traineddata</summary>
    por,

    /// <summary>Pushto; Pashto - pus.traineddata</summary>
    pus,

    /// <summary>Romanian; Moldavian; Moldovan - ron.traineddata</summary>
    ron,

    /// <summary>Russian - rus.traineddata</summary>
    rus,

    /// <summary>Sanskrit - san.traineddata</summary>
    san,

    /// <summary>Sinhala; Sinhalese - sin.traineddata</summary>
    sin,

    /// <summary>Slovak - slk.traineddata</summary>
    slk,

    /// <summary>Slovenian - slv.traineddata</summary>
    slv,

    /// <summary>Spanish; Castilian - spa.traineddata</summary>
    spa,

    /// <summary>Spanish; Castilian (Old) - spa_old.traineddata</summary>
    spa_old,

    /// <summary>Albanian - sqi.traineddata</summary>
    sqi,

    /// <summary>Serbian - srp.traineddata</summary>
    srp,

    /// <summary>Serbian (Latin) - srp_latn.traineddata</summary>
    srp_latn,

    /// <summary>Swahili - swa.traineddata</summary>
    swa,

    /// <summary>Swedish - swe.traineddata</summary>
    swe,

    /// <summary>Syriac - syr.traineddata</summary>
    syr,

    /// <summary>Tamil - tam.traineddata</summary>
    tam,

    /// <summary>Telugu - tel.traineddata</summary>
    tel,

    /// <summary>Tajik - tgk.traineddata</summary>
    tgk,

    /// <summary>Tagalog - tgl.traineddata</summary>
    tgl,

    /// <summary>Thai - tha.traineddata</summary>
    tha,

    /// <summary>Tigrinya - tir.traineddata</summary>
    tir,

    /// <summary>Turkish - tur.traineddata</summary>
    tur,

    /// <summary>Uighur; Uyghur - uig.traineddata</summary>
    uig,

    /// <summary>Ukrainian - ukr.traineddata</summary>
    ukr,

    /// <summary>Urdu - urd.traineddata</summary>
    urd,

    /// <summary>Uzbek - uzb.traineddata</summary>
    uzb,

    /// <summary>Uzbek (Cyrillic) - uzb_cyrl.traineddata</summary>
    uzb_cyrl,

    /// <summary>Vietnamese - vie.traineddata</summary>
    vie,

    /// <summary>Yiddish - yid.traineddata</summary>
    yid
}
/// <summary>
/// Console sample: runs Tesseract OCR over a single image file and prints the
/// recognised text, first as one string and then word by word via the result iterator.
/// </summary>
internal class Program
{
    /// <summary>
    /// Entry point. Loads the image, recognises it with the Simplified Chinese
    /// model found under <c>./tessdata</c>, and writes the results to the console.
    /// Any exception raised during OCR is traced and printed rather than propagated.
    /// Related resources:
    /// https://tesseract-ocr.github.io/tessdoc/Data-Files
    /// https://github.com/charlesw/tesseract-samples
    /// https://github.com/tesseract-ocr/tessdata
    /// https://github.com/danbloomberg/leptonica/releases
    /// </summary>
    /// <param name="args">
    /// Optional command-line arguments; when present, the first one is used as the
    /// image path instead of the built-in default.
    /// </param>
    public static void Main(string[] args)
    {
        var testImagePath = "./geovindu2.jpg"; // alternative sample image: phototest.tif
        if (args != null && args.Length > 0)
        {
            testImagePath = args[0];
        }

        try
        {
            // Other language codes the author tried here: chi_tra, eng.
            using (var engine = new TesseractEngine(@"./tessdata", LanguangeList.chi_sim.ToString(), EngineMode.Default))
            using (var img = Pix.LoadFromFile(testImagePath))
            using (var page = engine.Process(img))
            {
                var text = page.GetText();
                Console.WriteLine("Mean confidence: {0}", page.GetMeanConfidence());
                Console.WriteLine("Text (GetText): \r\n{0}", text);
                Console.WriteLine("Text (iterator):");

                using (var iter = page.GetIterator())
                {
                    iter.Begin();

                    // Four nested do/while loops walk the layout hierarchy:
                    // words within a text line (innermost), text lines within a
                    // paragraph, paragraphs within a block, then blocks (outermost).
                    do
                    {
                        do
                        {
                            do
                            {
                                do
                                {
                                    if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                    {
                                        Console.WriteLine("<BLOCK>");
                                    }

                                    Console.Write(iter.GetText(PageIteratorLevel.Word));
                                    Console.Write(" ");

                                    // End the console line after the last word of a text line.
                                    if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                    {
                                        Console.WriteLine();
                                    }
                                } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                // Extra blank line after the last text line of a paragraph.
                                if (iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                                {
                                    Console.WriteLine();
                                }
                            } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                        } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                    } while (iter.Next(PageIteratorLevel.Block));
                }
            }
        }
        catch (Exception e)
        {
            Trace.TraceError(e.ToString());
            Console.WriteLine("Unexpected Error: " + e.Message);
            Console.WriteLine("Details: ");
            Console.WriteLine(e.ToString());
        }

        // Console.ReadKey throws InvalidOperationException when standard input is
        // redirected (piped input, CI runs), so only pause in an interactive console.
        if (!Console.IsInputRedirected)
        {
            Console.Write("Press any key to continue . . . ");
            Console.ReadKey(true);
        }
    }
}
/// <summary>
/// Web Forms page that runs Tesseract OCR on an uploaded image and shows the
/// recognised text together with the mean confidence.
/// References leptonica-1.82.0.dll.
/// Author: geovindu, Geovin Du
/// </summary>
public class DefaultPage : System.Web.UI.Page
{
    // Controls of the upload (input) panel.
    protected Panel inputPanel;
    protected HtmlInputFile imageFile;
    protected HtmlButton submitFile;

    // Controls of the result panel.
    protected Panel resultPanel;
    protected HtmlGenericControl meanConfidenceLabel;
    protected HtmlTextArea resultText;
    protected HtmlButton restartButton;

    /// <summary>
    /// Hooks up the button handlers before the base page initialisation runs.
    /// </summary>
    /// <param name="e">Event data forwarded to the base implementation.</param>
    protected override void OnInit(EventArgs e)
    {
        InitializeComponent();
        base.OnInit(e);
    }

    /// <summary>
    /// Subscribes the click handlers of the restart and submit buttons.
    /// </summary>
    private void InitializeComponent()
    {
        this.restartButton.ServerClick += OnRestartClicked;
        this.submitFile.ServerClick += OnSubmitFileClicked;
    }

    /// <summary>
    /// Handles the submit button: OCRs the posted file and switches the page
    /// from the input panel to the result panel. Does nothing when no file,
    /// or an empty file, was posted.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="args">Event data (unused).</param>
    private void OnSubmitFileClicked(object sender, EventArgs args)
    {
        var upload = imageFile.PostedFile;
        if (upload == null || upload.ContentLength <= 0)
        {
            return;
        }

        // Errors are intentionally left unhandled here (fail hard); a real
        // application would be expected to handle them properly.
        // Language data: https://tesseract-ocr.github.io/tessdoc/Data-Files (eng is an alternative).
        var tessdataPath = Server.MapPath(@"~/tessdata");
        var language = LanguangeList.chi_sim.ToString();

        // Pix cannot be loaded from a stream, so the upload goes through a Bitmap first.
        using (var engine = new TesseractEngine(tessdataPath, language, EngineMode.Default))
        using (var bitmap = new System.Drawing.Bitmap(upload.InputStream))
        using (var pix = PixConverter.ToPix(bitmap))
        using (var page = engine.Process(pix))
        {
            meanConfidenceLabel.InnerText = String.Format("{0:P}", page.GetMeanConfidence());
            resultText.InnerText = page.GetText();
        }

        inputPanel.Visible = false;
        resultPanel.Visible = true;
    }

    /// <summary>
    /// Handles the restart button: hides the result panel and shows the input panel again.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="args">Event data (unused).</param>
    private void OnRestartClicked(object sender, EventArgs args)
    {
        resultPanel.Visible = false;
        inputPanel.Visible = true;
    }
}
输出:
GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models
https://arxiv.org/pdf/2303.10130.pdf
Sparks of Artificial General Intelligence: Early experiments with GPT-4
https://arxiv.org/abs/2303.12712
https://arxiv.org/pdf/2303.12712.pdf