C# 抽取 PDF 文档标题(2)
/// <summary>
/// Entry point for extracting the title of a PDF document.
/// </summary>
public class IETitle
{
    /// <summary>
    /// Per-character records for the document currently being processed.
    /// Cleared at the start of every <see cref="GetTitle"/> call and passed
    /// to <c>Word.MakeWord</c> after <c>PDFBoxLib.HandleContent</c> has run.
    /// Shared static state: concurrent calls to <see cref="GetTitle"/> would
    /// interfere with each other.
    /// </summary>
    public static List<WordInfo> WordsInfo = new List<WordInfo>();

    // Raw text of the most recently processed document. Written by GetTitle;
    // not read anywhere inside this class.
    private static string pdfcontent;

    /// <summary>
    /// Extracts the title of the PDF at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the PDF file.</param>
    /// <param name="realtitle">Forwarded unchanged to <c>InfoExtract.ExtractTitle</c>.</param>
    /// <returns>
    /// The result of <c>InfoExtract.ExtractTitle</c>, or a result whose
    /// <c>Title</c> is the empty string when the extraction step throws.
    /// </returns>
    public static HandleResult GetTitle(string path, string realtitle)
    {
        WordsInfo.Clear();

        string content = ExtractContent(path);
        pdfcontent = content;

        PDFBoxLib.HandleContent(path);

        // Characters -> words -> lines -> blocks.
        Word w = new Word();
        w.MakeWord(WordsInfo);

        Line line = new Line();
        line.MakeLine(w);

        Block block = new Block();
        block.MakeBlock(line);

        // Get the full text; fall back to the content extracted above.
        string text;
        try
        {
            text = ITextSharpLib.ExtractTextFromPdf(path, 0);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
            text = content;
        }

        HandleResult title = new HandleResult() { Title = "" };

        try
        {
            var sentences = text.Split('\n');

            InfoExtract ie = new InfoExtract(sentences, text);

            title = ie.ExtractTitle(block, realtitle);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
        }

        return title;
    }

    // Reads the document text with iTextSharp, falling back to PDFBox; returns
    // the empty string when both fail. Failures are logged, never rethrown.
    private static string ExtractContent(string path)
    {
        try
        {
            return ITextSharpLib.ExtractTextFromPdf(path);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
        }

        try
        {
            return PDFBoxLib.Pdf2txt(path);
        }
        catch (Exception ex)
        {
            Logger.Debug(ex.Message);
        }

        return string.Empty;
    }
}
上面就是获取标题的整体逻辑代码。其中 PDFBoxLib.HandleContent(path) 这一行(原始代码清单第29行)调用 PDFBoxLib,读取 PDF 第一页的内容:
/// <summary>
/// Loads the PDF at <paramref name="fileName"/> and feeds the content stream
/// of its page(s) through a <c>PrintTextLocatins2</c> instance.
/// </summary>
/// <param name="fileName">Path of the PDF file to load.</param>
/// <param name="pageIndex">
/// 0 processes every page; any other value processes only the first page.
/// </param>
/// <returns>Always the empty string.</returns>
/// <remarks>
/// Best-effort: failures are reported through <c>System.Diagnostics.Trace</c>
/// and never propagate to the caller.
/// </remarks>
public static string HandleContent(string fileName, int pageIndex = 1)
{
    PDDocument document = null;
    try
    {
        document = PDDocument.load(fileName);
        List allPages = document.getDocumentCatalog().getAllPages();

        // Clamp to the real page count so a document without pages does not
        // trigger an out-of-range get(0).
        int pageCount = allPages.size();
        int size = pageIndex == 0 ? pageCount : Math.Min(1, pageCount);

        for (int i = 0; i < size; i++)
        {
            var page = (PDPage)allPages.get(i);
            var contents = page.getContents();

            if (contents == null)
            {
                continue;
            }

            PrintTextLocatins2 printer = new PrintTextLocatins2();
            printer.processStream(page, page.findResources(), contents.getStream());
        }
    }
    catch (Exception ex)
    {
        System.Diagnostics.Trace.TraceWarning("HandleContent failed for '" + fileName + "': " + ex.Message);
    }
    finally
    {
        if (document != null)
        {
            try
            {
                document.close();
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceWarning("HandleContent could not close '" + fileName + "': " + ex.Message);
            }
        }
    }

    return "";
}
HandleContent 中的 printer.processStream 调用(原始代码清单第23行)会触发自定义类 PrintTextLocatins2 中的字符处理方法 processTextPosition:
/// <summary>
/// A <c>PDFTextStripper</c> that records every text position it processes
/// as a <c>WordInfo</c> in <c>IETitle.WordsInfo</c>.
/// </summary>
public class PrintTextLocatins2 : PDFTextStripper
{
    // Substrings of a base font name that mark the font as bold / italic.
    private static readonly string[] BoldFlags = { "Bold", "CAJ FNT04" };
    private static readonly string[] ItalicFlags = { "Italic", "CAJ FNT03" };

    // True when the font name contains any of the given flags; a null or
    // empty name matches nothing.
    private static bool ContainsAny(string font, string[] flags)
    {
        if (string.IsNullOrEmpty(font))
        {
            return false;
        }

        foreach (string flag in flags)
        {
            if (font.Contains(flag))
            {
                return true;
            }
        }

        return false;
    }

    private static bool IsBold(String font)
    {
        return ContainsAny(font, BoldFlags);
    }

    private static bool IsItalic(String font)
    {
        return ContainsAny(font, ItalicFlags);
    }

    /// <summary>Creates the stripper with position sorting turned off.</summary>
    public PrintTextLocatins2()
    {
        base.setSortByPosition(false);
    }

    /// <summary>
    /// Converts one text position into a <c>WordInfo</c> and appends it to
    /// <c>IETitle.WordsInfo</c>.
    /// </summary>
    /// <param name="text">The text position being processed.</param>
    protected override void processTextPosition(TextPosition text)
    {
        var font = text.getFont();
        string baseFont = font.getBaseFont();
        var fontSize = text.getFontSize();
        var xScale = text.getXScale();
        var yScale = text.getYScale();

        // A NaN space width is stored as 0. IsNaN is used instead of comparing
        // ToString() with a localized NaN symbol, which only matched under a
        // Chinese culture.
        float spaceWidth = text.getWidthOfSpace();
        if (float.IsNaN(spaceWidth))
        {
            spaceWidth = 0;
        }

        WordInfo info = new WordInfo()
        {
            X = text.getX(),
            Y = text.getY(),
            XDirAdj = text.getXDirAdj(),
            YDirAdj = text.getYDirAdj(),
            FontSize = fontSize,
            Xscale = xScale,
            Yscale = yScale,
            Height = text.getHeight(),
            Space = spaceWidth,
            Width = text.getWidth(),

            Subfont = font.getSubType(),
            Basefont = baseFont,
            IsBold = IsBold(baseFont),
            IsItalic = IsItalic(baseFont),

            XSize = (int)(fontSize * xScale),
            YSize = (int)(fontSize * yScale),

            Word = text.getCharacter()
        };

        IETitle.WordsInfo.Add(info);
    }
}
这样我们就利用 PDFBox 收集了 PDF 文档的字符信息,并保存在 IETitle.WordsInfo 中。