.NET 多线程读取 PDF 文本
1. 通过 NuGet 安装 UglyToad.PdfPig。
2. 使用 SemaphoreSlim semaphore = new SemaphoreSlim(10); 限制并发数:每一页对应一个任务,但最多同时有 10 个任务在读取各自页面的文本。
C#代码:
/// <summary>
/// Extracts the text of every page of a PDF file, reading the pages concurrently.
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <returns>
/// The text of all pages concatenated in page order, or an empty string when
/// <paramref name="filePath"/> is null/blank, the file extension does not contain "pdf",
/// or reading the document fails for any reason.
/// </returns>
static string GetPdfText(string filePath)
{
    // Upper bound on the number of pages being read at the same time.
    const int MaxConcurrentReads = 10;

    // Best-effort contract: a missing path yields an empty result instead of an
    // exception escaping from the FileInfo constructor.
    if (string.IsNullOrWhiteSpace(filePath))
    {
        return string.Empty;
    }

    FileInfo file = new FileInfo(filePath);

    // Ordinal, case-insensitive check; avoids the culture-sensitive ToLower() call.
    if (file.Extension.IndexOf("pdf", StringComparison.OrdinalIgnoreCase) < 0)
    {
        return string.Empty;
    }

    try
    {
        int pageCount;
        using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(file.FullName))
        {
            pageCount = document.NumberOfPages;
        }

        // Slot i holds the task for page i + 1, so iterating the array later
        // yields the pages in document order without any sorting.
        Task<Tuple<int, string>>[] pageTasks = new Task<Tuple<int, string>>[pageCount];

        using (SemaphoreSlim semaphore = new SemaphoreSlim(MaxConcurrentReads))
        {
            for (int i = 0; i < pageCount; i++)
            {
                int pageIndex = i; // per-iteration copy captured by the lambda
                pageTasks[i] = Task.Run(async () =>
                {
                    await semaphore.WaitAsync();
                    try
                    {
                        return PdfPageText(file.FullName, pageIndex);
                    }
                    finally
                    {
                        semaphore.Release();
                    }
                });
            }

            // WaitAll returns (or throws) only after every task has finished,
            // so the semaphore is no longer in use when it is disposed.
            Task.WaitAll(pageTasks);
        }

        StringBuilder text = new StringBuilder();
        foreach (Task<Tuple<int, string>> pageTask in pageTasks)
        {
            text.Append(pageTask.Result.Item2);
        }

        return text.ToString();
    }
    catch (Exception ex)
    {
        // Still best-effort (callers get an empty string), but the failure is
        // recorded instead of being silently discarded.
        System.Diagnostics.Trace.TraceError("GetPdfText failed for '{0}': {1}", file.FullName, ex);
        return string.Empty;
    }
}

/// <summary>
/// Opens the PDF at <paramref name="fullName"/> and reads the text of a single page.
/// Each call opens and disposes its own document instance.
/// </summary>
/// <param name="fullName">Full path of the PDF file.</param>
/// <param name="i">Zero-based page index; page <c>i + 1</c> is requested from the document.</param>
/// <returns>A tuple of the one-based page number and that page's text.</returns>
static Tuple<int, string> PdfPageText(string fullName, int i)
{
    int pageNumber = i + 1;
    using (UglyToad.PdfPig.PdfDocument document = UglyToad.PdfPig.PdfDocument.Open(fullName))
    {
        var page = document.GetPage(pageNumber);
        return new Tuple<int, string>(pageNumber, page.Text);
    }
}