C#解析PDF

C#解析PDF的方式有很多,比较好用的有iTextSharp和PdfBox。

PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。

文本内容的PDF文档,解析的过程中,我目前仅发现能以字符串的形式读取的,不能够读取其中的表格。据说PDF文档结构中是没有表格概念的,因此这个自然是读不到的,如果果真如此,则PDF中表格内容的解析,只能对获取到的字符串按照一定的逻辑自行解析了。

iTextSharp是一个C#开源项目,PdfBox为Java开源项目,借助于IKVM在.NET平台下有实现。

Pdf转换Image,使用的是GhostScript,可以以API的方式调用,也可以以Windows命令行的方式调用。

OCR使用的是Asprise,识别效果较好(商业),另外还可以使用MS的Office Document Imaging(MODI,2007)或OneNote(2010)(需要依赖Office组件),Tesseract(HP->Google)(效果很差)。

附上iTextSharp、PdfBox对PDF的解析代码。

iTextSharp辅助类

  1 using System;
  2 using System.Collections.Generic;
  3 using System.Text;
  4 
  5 using iTextSharp.text.pdf;
  6 using iTextSharp.text.pdf.parser;
  7 using System.IO;
  8 
  9 namespace eyuan
 10 {
 11     public static class ITextSharpHandler
 12     {
 13         /// <summary>
 14         /// 读取PDF文本内容
 15         /// </summary>
 16         /// <param name="fileName"></param>
 17         /// <returns></returns>
 18         public static string ReadPdf(string fileName)
 19         {
 20             if (!File.Exists(fileName))
 21             {
 22                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
 23                 return string.Empty;
 24             }
 25             //
 26             string fileContent = string.Empty;
 27             StringBuilder sbFileContent = new StringBuilder();
 28             //打开文件
 29             PdfReader reader = null;
 30             try
 31             {
 32                 reader = new PdfReader(fileName);
 33             }
 34             catch (Exception ex)
 35             {
 36                 LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { fileName, ex.ToString() }));
 37 
 38                 if (reader != null)
 39                 {
 40                     reader.Close();
 41                     reader = null;
 42                 }
 43 
 44                 return string.Empty;
 45             }
 46 
 47             try
 48             {
 49                 //循环各页(索引从1开始)
 50                 for (int i = 1; i <= reader.NumberOfPages; i++)
 51                 {
 52                     sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
 53 
 54                 }
 55 
 56             }
 57             catch (Exception ex)
 58             {
 59                 LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", new string[] { fileName, ex.ToString() }));
 60 
 61             }
 62             finally
 63             {
 64                 if (reader != null)
 65                 {
 66                     reader.Close();
 67                     reader = null;
 68                 }
 69             }
 70             //
 71             fileContent = sbFileContent.ToString();
 72             return fileContent;
 73         }
 74         /// <summary>
 75         /// 获取PDF页数
 76         /// </summary>
 77         /// <param name="fileName"></param>
 78         /// <returns></returns>
 79         public static int GetPdfPageCount(string fileName)
 80         {
 81             if (!File.Exists(fileName))
 82             {
 83                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
 84                 return -1;
 85             }
 86             //打开文件
 87             PdfReader reader = null;
 88             try
 89             {
 90                 reader = new PdfReader(fileName);
 91             }
 92             catch (Exception ex)
 93             {
 94                 LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { fileName, ex.ToString() }));
 95 
 96                 if (reader != null)
 97                 {
 98                     reader.Close();
 99                     reader = null;
100                 }
101 
102                 return -1;
103             }
104             //
105             return reader.NumberOfPages;
106         }
107     }
108 }

 PDFBox辅助类

 1 using org.pdfbox.pdmodel;
 2 using org.pdfbox.util;
 3 using System;
 4 using System.Collections.Generic;
 5 using System.IO;
 6 using System.Text;
 7 
 8 namespace eyuan
 9 {
10     public static class PdfBoxHandler
11     {
12         /// <summary>
13         /// 使用PDFBox组件进行解析
14         /// </summary>
15         /// <param name="input">PDF文件路径</param>
16         /// <returns>PDF文本内容</returns>
17         public static string ReadPdf(string input)
18         {
19             if (!File.Exists(input))
20             {
21                 LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
22                 return null;
23             }
24             else
25             {
26                 PDDocument pdfdoc = null;
27                 string strPDFText = null;
28                 PDFTextStripper stripper = null;
29 
30                 try
31                 {
32                     //加载PDF文件
33                     pdfdoc = PDDocument.load(input);
34                 }
35                 catch (Exception ex)
36                 {
37                     LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { input, ex.ToString() }));
38 
39                     if (pdfdoc != null)
40                     {
41                         pdfdoc.close();
42                         pdfdoc = null;
43                     }
44 
45                     return null;
46                 }
47 
48                 try
49                 {
50                     //解析PDF文件
51                     stripper = new PDFTextStripper();
52                     strPDFText = stripper.getText(pdfdoc);
53 
54                    
55 
56                 }
57                 catch (Exception ex)
58                 {
59                     LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", new string[] { input, ex.ToString() }));
60 
61                 }
62                 finally
63                 {
64                     if (pdfdoc != null)
65                     {
66                         pdfdoc.close();
67                         pdfdoc = null;
68                     }
69                 }
70 
71                 return strPDFText;
72             }
73 
74         }
75     }
76 }

 

另外附上PDF转Image,然后对Image进行OCR的代码。

转换PDF为Jpeg图片代码(GhostScript辅助类)

  1 using System;
  2 using System.Collections;
  3 using System.Collections.Generic;
  4 using System.Runtime.InteropServices;
  5 using System.Text;
  6 
  7 namespace eyuan
  8 {
  9     public class GhostscriptHandler
 10     {
 11 
 12         #region GhostScript Import
 13         /// <summary>创建Ghostscript的实例
 14         /// This instance is passed to most other gsapi functions. 
 15         /// The caller_handle will be provided to callback functions.  
 16         ///  At this stage, Ghostscript supports only one instance. </summary>  
 17         /// <param name="pinstance"></param>  
 18         /// <param name="caller_handle"></param>  
 19         /// <returns></returns>   
 20         [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")]
 21         private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle);
 22         /// <summary>This is the important function that will perform the conversion
 23         /// 
 24         /// </summary>  
 25         /// <param name="instance"></param>  
 26         /// <param name="argc"></param>  
 27         /// <param name="argv"></param>  
 28         /// <returns></returns>  
 29         [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")]
 30         private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv);
 31         /// <summary>  
 32         /// Exit the interpreter. 
 33         /// This must be called on shutdown if gsapi_init_with_args() has been called, 
 34         /// and just before gsapi_delete_instance().
 35         /// 退出
 36         /// </summary>  
 37         /// <param name="instance"></param>  
 38         /// <returns></returns>  
 39         [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")]
 40         private static extern int gsapi_exit(IntPtr instance);
 41         /// <summary>  
 42         /// Destroy an instance of Ghostscript. 
 43         /// Before you call this, Ghostscript must have finished. 
 44         /// If Ghostscript has been initialised, you must call gsapi_exit before gsapi_delete_instance.   
 45         /// 销毁实例
 46         /// </summary>  
 47         /// <param name="instance"></param>  
 48         [DllImport("gsdll32.dll", EntryPoint = "gsapi_delete_instance")]
 49         private static extern void gsapi_delete_instance(IntPtr instance);
 50         #endregion
 51 
 52         #region 变量
 53         private string _sDeviceFormat;
 54         private int _iWidth;
 55         private int _iHeight;
 56         private int _iResolutionX;
 57         private int _iResolutionY;
 58         private int _iJPEGQuality;
 59         private Boolean _bFitPage;
 60         private IntPtr _objHandle;
 61         #endregion
 62 
 63         #region 属性
 64         /// <summary>
 65         /// 输出格式
 66         /// </summary>
 67         public string OutputFormat
 68         {
 69             get { return _sDeviceFormat; }
 70             set { _sDeviceFormat = value; }
 71         }
 72         /// <summary>
 73         /// 
 74         /// </summary>
 75         public int Width
 76         {
 77             get { return _iWidth; }
 78             set { _iWidth = value; }
 79         }
 80         /// <summary>
 81         /// 
 82         /// </summary>
 83         public int Height
 84         {
 85             get { return _iHeight; }
 86             set { _iHeight = value; }
 87         }
 88         /// <summary>
 89         /// 
 90         /// </summary>
 91         public int ResolutionX
 92         {
 93             get { return _iResolutionX; }
 94             set { _iResolutionX = value; }
 95         }
 96         /// <summary>
 97         /// 
 98         /// </summary>
 99         public int ResolutionY
100         {
101             get { return _iResolutionY; }
102             set { _iResolutionY = value; }
103         }
104         /// <summary>
105         /// 
106         /// </summary>
107         public Boolean FitPage
108         {
109             get { return _bFitPage; }
110             set { _bFitPage = value; }
111         }
112         /// <summary>Quality of compression of JPG
113         /// Jpeg文档质量
114         /// </summary>  
115         public int JPEGQuality
116         {
117             get { return _iJPEGQuality; }
118             set { _iJPEGQuality = value; }
119         }
120         #endregion
121 
122         #region 初始化(实例化对象)
123         /// <summary>
124         /// 
125         /// </summary>
126         /// <param name="objHandle"></param>
127         public GhostscriptHandler(IntPtr objHandle)
128         {
129             _objHandle = objHandle;
130         }
131         public GhostscriptHandler()
132         {
133             _objHandle = IntPtr.Zero;
134         }
135         #endregion
136 
137         #region 字符串处理
138         /// <summary>
139         /// 转换Unicode字符串到Ansi字符串
140         /// </summary>
141         /// <param name="str">Unicode字符串</param>
142         /// <returns>Ansi字符串(字节数组格式)</returns>
143         private byte[] StringToAnsiZ(string str)
144         {
145             //' Convert a Unicode string to a null terminated Ansi string for Ghostscript.  
146             //' The result is stored in a byte array. Later you will need to convert  
147             //' this byte array to a pointer with GCHandle.Alloc(XXXX, GCHandleType.Pinned)  
148             //' and GSHandle.AddrOfPinnedObject()  
149             int intElementCount;
150             int intCounter;
151             byte[] aAnsi;
152             byte bChar;
153             intElementCount = str.Length;
154             aAnsi = new byte[intElementCount + 1];
155             for (intCounter = 0; intCounter < intElementCount; intCounter++)
156             {
157                 bChar = (byte)str[intCounter];
158                 aAnsi[intCounter] = bChar;
159             }
160             aAnsi[intElementCount] = 0;
161             return aAnsi;
162         }
163         #endregion
164 
165         #region 转换文件
166         /// <summary>
167         /// 转换文件
168         /// </summary>
169         /// <param name="inputFile">输入的PDF文件路径</param>
170         /// <param name="outputFile">输出的Jpeg图片路径</param>
171         /// <param name="firstPage">第一页</param>
172         /// <param name="lastPage">最后一页</param>
173         /// <param name="deviceFormat">格式(文件格式)</param>
174         /// <param name="width">宽度</param>
175         /// <param name="height">高度</param>
176         public void Convert(string inputFile, string outputFile,
177             int firstPage, int lastPage, string deviceFormat, int width, int height)
178         {
179             //判断文件是否存在
180             if (!System.IO.File.Exists(inputFile))
181             {
182                 LogHandler.LogWrite(string.Format("文件{0}不存在", inputFile));
183                 return;
184             }
185             int intReturn;
186             IntPtr intGSInstanceHandle;
187             object[] aAnsiArgs;
188             IntPtr[] aPtrArgs;
189             GCHandle[] aGCHandle;
190             int intCounter;
191             int intElementCount;
192             IntPtr callerHandle;
193             GCHandle gchandleArgs;
194             IntPtr intptrArgs;
195             string[] sArgs = GetGeneratedArgs(inputFile, outputFile,
196                 firstPage, lastPage, deviceFormat, width, height);
197             // Convert the Unicode strings to null terminated ANSI byte arrays  
198             // then get pointers to the byte arrays.  
199             intElementCount = sArgs.Length;
200             aAnsiArgs = new object[intElementCount];
201             aPtrArgs = new IntPtr[intElementCount];
202             aGCHandle = new GCHandle[intElementCount];
203             // Create a handle for each of the arguments after   
204             // they've been converted to an ANSI null terminated  
205             // string. Then store the pointers for each of the handles  
206             for (intCounter = 0; intCounter < intElementCount; intCounter++)
207             {
208                 aAnsiArgs[intCounter] = StringToAnsiZ(sArgs[intCounter]);
209                 aGCHandle[intCounter] = GCHandle.Alloc(aAnsiArgs[intCounter], GCHandleType.Pinned);
210                 aPtrArgs[intCounter] = aGCHandle[intCounter].AddrOfPinnedObject();
211             }
212             // Get a new handle for the array of argument pointers  
213             gchandleArgs = GCHandle.Alloc(aPtrArgs, GCHandleType.Pinned);
214             intptrArgs = gchandleArgs.AddrOfPinnedObject();
215             intReturn = gsapi_new_instance(out intGSInstanceHandle, _objHandle);
216             callerHandle = IntPtr.Zero;
217             try
218             {
219                 intReturn = gsapi_init_with_args(intGSInstanceHandle, intElementCount, intptrArgs);
220             }
221             catch (Exception ex)
222             {
223                  LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.\n错误:{1}",new string[]{inputFile,ex.ToString()}));
224 
225             }
226             finally
227             {
228                 for (intCounter = 0; intCounter < intReturn; intCounter++)
229                 {
230                     aGCHandle[intCounter].Free();
231                 }
232                 gchandleArgs.Free();
233                 gsapi_exit(intGSInstanceHandle);
234                 gsapi_delete_instance(intGSInstanceHandle);
235             }
236         }
237         #endregion
238 
239         #region 转换文件
240         /// <summary>
241         /// 
242         /// </summary>
243         /// <param name="inputFile"></param>
244         /// <param name="outputFile"></param>
245         /// <param name="firstPage"></param>
246         /// <param name="lastPage"></param>
247         /// <param name="deviceFormat"></param>
248         /// <param name="width"></param>
249         /// <param name="height"></param>
250         /// <returns></returns>
251         private string[] GetGeneratedArgs(string inputFile, string outputFile,
252             int firstPage, int lastPage, string deviceFormat, int width, int height)
253         {
254             this._sDeviceFormat = deviceFormat;
255             this._iResolutionX = width;
256             this._iResolutionY = height;
257             // Count how many extra args are need - HRangel - 11/29/2006, 3:13:43 PM  
258             ArrayList lstExtraArgs = new ArrayList();
259             if (_sDeviceFormat == "jpg" && _iJPEGQuality > 0 && _iJPEGQuality < 101)
260                 lstExtraArgs.Add("-dJPEGQ=" + _iJPEGQuality);
261             if (_iWidth > 0 && _iHeight > 0)
262                 lstExtraArgs.Add("-g" + _iWidth + "x" + _iHeight);
263             if (_bFitPage)
264                 lstExtraArgs.Add("-dPDFFitPage");
265             if (_iResolutionX > 0)
266             {
267                 if (_iResolutionY > 0)
268                     lstExtraArgs.Add("-r" + _iResolutionX + "x" + _iResolutionY);
269                 else
270                     lstExtraArgs.Add("-r" + _iResolutionX);
271             }
272             // Load Fixed Args - HRangel - 11/29/2006, 3:34:02 PM  
273             int iFixedCount = 17;
274             int iExtraArgsCount = lstExtraArgs.Count;
275             string[] args = new string[iFixedCount + lstExtraArgs.Count];
276             /* 
277             // Keep gs from writing information to standard output 
278         "-q",                      
279         "-dQUIET", 
280 
281         "-dPARANOIDSAFER", // Run this command in safe mode 
282         "-dBATCH", // Keep gs from going into interactive mode 
283         "-dNOPAUSE", // Do not prompt and pause for each page 
284         "-dNOPROMPT", // Disable prompts for user interaction            
285         "-dMaxBitmap=500000000", // Set high for better performance 
286 
287         // Set the starting and ending pages 
288         String.Format("-dFirstPage={0}", firstPage), 
289         String.Format("-dLastPage={0}", lastPage),    
290 
291         // Configure the output anti-aliasing, resolution, etc 
292         "-dAlignToPixels=0", 
293         "-dGridFitTT=0", 
294         "-sDEVICE=jpeg", 
295         "-dTextAlphaBits=4", 
296         "-dGraphicsAlphaBits=4", 
297             */
298             args[0] = "pdf2img";//this parameter have little real use  
299             args[1] = "-dNOPAUSE";//I don't want interruptions  
300             args[2] = "-dBATCH";//stop after  
301             //args[3]="-dSAFER";  
302             args[3] = "-dPARANOIDSAFER";
303             args[4] = "-sDEVICE=" + _sDeviceFormat;//what kind of export format i should provide  
304             args[5] = "-q";
305             args[6] = "-dQUIET";
306             args[7] = "-dNOPROMPT";
307             args[8] = "-dMaxBitmap=500000000";
308             args[9] = String.Format("-dFirstPage={0}", firstPage);
309             args[10] = String.Format("-dLastPage={0}", lastPage);
310             args[11] = "-dAlignToPixels=0";
311             args[12] = "-dGridFitTT=0";
312             args[13] = "-dTextAlphaBits=4";
313             args[14] = "-dGraphicsAlphaBits=4";
314             //For a complete list watch here:  
315             //http://pages.cs.wisc.edu/~ghost/doc/cvs/Devices.htm  
316             //Fill the remaining parameters  
317             for (int i = 0; i < iExtraArgsCount; i++)
318             {
319                 args[15 + i] = (string)lstExtraArgs[i];
320             }
321             //Fill outputfile and inputfile  
322             args[15 + iExtraArgsCount] = string.Format("-sOutputFile={0}", outputFile);
323             args[16 + iExtraArgsCount] = string.Format("{0}", inputFile);
324             return args;
325         }
326         #endregion
327 
328 
329     }
330 }

 OCR,识别Image代码(Asprise辅助类)

 1 using System;
 2 using System.Collections.Generic;
 3 using System.Runtime.InteropServices;
 4 using System.Text;
 5 
 6 namespace PDFCaptureService
 7 {
 8     public static class AspriseOCRHandler
 9     {
10         #region 外部引用
11         [DllImport("AspriseOCR.dll", EntryPoint = "OCR", CallingConvention = CallingConvention.Cdecl)]
12         public static extern IntPtr OCR(string file, int type);
13         [DllImport("AspriseOCR.dll", EntryPoint = "OCRpart", CallingConvention = CallingConvention.Cdecl)]
14         static extern IntPtr OCRpart(string file, int type, int startX, int
15         startY, int width, int height);
16         [DllImport("AspriseOCR.dll", EntryPoint = "OCRBarCodes", CallingConvention = CallingConvention.Cdecl)]
17         static extern IntPtr OCRBarCodes(string file, int type);
18         [DllImport("AspriseOCR.dll", EntryPoint = "OCRpartBarCodes", CallingConvention = CallingConvention.Cdecl)]
19         static extern IntPtr OCRpartBarCodes(string file, int type, int
20         startX, int startY, int width, int height);
21         #endregion
22 
23         /// <summary>
24         /// 
25         /// </summary>
26         /// <param name="fileName"></param>
27         /// <returns></returns>
28         public static string ReadImage(string fileName)
29         {
30             IntPtr ptrFileContent = OCR(fileName, -1);
31             string fileContent = Marshal.PtrToStringAnsi(ptrFileContent);
32             //
33             return fileContent;
34         }
35     }
36 }

 调用示例

GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);
// GetPdfPageCount returns -1 when the PDF is missing or cannot be opened; skip the conversion then.
if (pdfPageCount > 0)
{
    // The last two arguments are applied by GhostscriptHandler as the X/Y rendering resolution (-r).
    ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 100, 100);
    // Run OCR on the image Ghostscript produced, not on the source PDF.
    // NOTE(review): for multi-page PDFs Ghostscript only writes one file per page when the
    // output name contains a %d placeholder; confirm how GhostScriptImageName is defined.
    fileContent = AspriseOCRHandler.ReadImage(tempJpgFileName);
}

 

 

 

posted @ 2014-05-30 16:40  马洪彪  阅读(13532)  评论(1编辑  收藏  举报