c#抽取pdf文档标题(2)

 /// <summary>
 /// Entry point for extracting the title of a PDF document.
 /// </summary>
 public class IETitle
 {
     /// <summary>
     /// Per-character records for the document currently being processed.
     /// Cleared at the start of every <see cref="GetTitle"/> call and consumed by Word.MakeWord.
     /// NOTE(review): this is shared static state, so concurrent GetTitle calls would
     /// interleave their data — confirm callers are single-threaded.
     /// </summary>
     public static List<WordInfo> WordsInfo = new List<WordInfo>();

     // Raw text of the most recently processed document. Written by GetTitle;
     // nothing in this class reads it back.
     private static string pdfcontent;

     /// <summary>
     /// Extracts the title of the PDF at <paramref name="path"/>.
     /// </summary>
     /// <param name="path">Path of the PDF file.</param>
     /// <param name="realtitle">Passed through unchanged to InfoExtract.ExtractTitle.</param>
     /// <returns>
     /// The result produced by InfoExtract.ExtractTitle, or a result with an empty
     /// <c>Title</c> when title extraction throws.
     /// </returns>
     public static HandleResult GetTitle(string path, string realtitle)
     {
         WordsInfo.Clear();

         // Raw text: try iTextSharp first and fall back to PDFBox. Both failures are
         // logged rather than silently dropped; processing continues best-effort.
         string content = string.Empty;
         try
         {
             content = ITextSharpLib.ExtractTextFromPdf(path);
         }
         catch (Exception primaryError)
         {
             Logger.Debug("ITextSharp text extraction failed, falling back to PDFBox: " + primaryError.Message);

             try
             {
                 content = PDFBoxLib.Pdf2txt(path);
             }
             catch (Exception fallbackError)
             {
                 // Keep going with empty content, as before, but leave a trace of why.
                 Logger.Debug("PDFBox text extraction failed: " + fallbackError.Message);
             }
         }

         pdfcontent = content;

         // Collect per-character layout information (the result lands in WordsInfo).
         PDFBoxLib.HandleContent(path);

         // Process characters: characters -> words -> lines.
         Word w = new Word();
         w.MakeWord(WordsInfo);

         Line line = new Line();
         line.MakeLine(w);

         // Process lines: lines -> blocks.
         Block block = new Block();
         block.MakeBlock(line);

         // Get the full text (presumably the 0 argument selects all pages — confirm
         // against ITextSharpLib). Falls back to the raw content extracted above.
         string text = string.Empty;
         try
         {
             text = ITextSharpLib.ExtractTextFromPdf(path, 0);
         }
         catch (Exception textError)
         {
             Logger.Debug("Full-text extraction failed, using first-pass content: " + textError.Message);
             text = content;
         }

         HandleResult title = new HandleResult() { Title = "" };

         try
         {
             var sentences = text.Split('\n');

             InfoExtract ie = new InfoExtract(sentences, text);

             title = ie.ExtractTitle(block, realtitle);
         }
         catch (Exception ex)
         {
             Logger.Debug(ex.Message);
         }

         return title;
     }
 }

上面就是获取标题的整体逻辑代码。其中第29行的 PDFBoxLib.HandleContent(path) 调用 PDFBoxLib,读取 PDF 第一页的内容:

 /// <summary>
 /// Runs the content stream of one or all pages of a PDF through a
 /// PrintTextLocatins2 instance. This method never throws; failures are
 /// reported through <see cref="System.Diagnostics.Trace"/>.
 /// </summary>
 /// <param name="fileName">Path of the PDF file to load.</param>
 /// <param name="pageIndex">
 /// 0 processes every page; any other value processes only the first page.
 /// The value is not used as an actual page index.
 /// </param>
 /// <returns>Always an empty string.</returns>
 public static string HandleContent(string fileName, int pageIndex = 1)
 {
     PDDocument document = null;
     try
     {
         document = PDDocument.load(fileName);
         List allPages = document.getDocumentCatalog().getAllPages();

         // Clamp to the number of pages actually present so a document without
         // pages does not trigger an out-of-range get(0).
         int available = allPages.size();
         int size = pageIndex == 0 ? available : (available > 0 ? 1 : 0);

         for (int i = 0; i < size; i++)
         {
             var page = (PDPage)allPages.get(i);

             var contents = page.getContents();
             if (contents == null)
             {
                 // Page has no content stream; nothing to process.
                 continue;
             }

             PrintTextLocatins2 printer = new PrintTextLocatins2();
             printer.processStream(page, page.findResources(), contents.getStream());
         }
     }
     catch (Exception ex)
     {
         // Best effort: callers rely on this method not throwing, but the failure
         // should still be visible somewhere.
         System.Diagnostics.Trace.TraceWarning("HandleContent failed for '" + fileName + "': " + ex.Message);
     }
     finally
     {
         if (document != null)
         {
             try
             {
                 document.close();
             }
             catch (Exception closeError)
             {
                 System.Diagnostics.Trace.TraceWarning("Closing '" + fileName + "' failed: " + closeError.Message);
             }
         }
     }

     return "";
 }

第23行的 printer.processStream 方法,会触发自定义类 PrintTextLocatins2 中的字符处理方法 processTextPosition:

 /// <summary>
 /// PDFTextStripper subclass that records every text position it is handed as a
 /// WordInfo entry in IETitle.WordsInfo.
 /// </summary>
 public class PrintTextLocatins2 : PDFTextStripper
 {
     // Substrings of the base font name that are treated as marking bold text.
     private static readonly String[] BOLD_FLAGS = { "Bold", "CAJ FNT04" };

     // Substrings of the base font name that are treated as marking italic text.
     private static readonly String[] ITALIC_FLAGS = { "Italic", "CAJ FNT03" };

     /// <summary>
     /// Returns true when <paramref name="font"/> contains any of <paramref name="flags"/>.
     /// A null font name matches nothing.
     /// </summary>
     private static bool ContainsAnyFlag(String font, String[] flags)
     {
         if (font == null)
         {
             // NOTE(review): getBaseFont() can presumably return null for fonts
             // without a BaseFont entry; treat that as "no flag" instead of throwing.
             return false;
         }

         foreach (String flag in flags)
         {
             if (font.Contains(flag))
             {
                 return true;
             }
         }

         return false;
     }

     /// <summary>Returns true when the font name contains one of the bold markers.</summary>
     private static bool IsBold(String font)
     {
         return ContainsAnyFlag(font, BOLD_FLAGS);
     }

     /// <summary>Returns true when the font name contains one of the italic markers.</summary>
     private static bool IsItalic(String font)
     {
         return ContainsAnyFlag(font, ITALIC_FLAGS);
     }

     /// <summary>Creates the stripper with position sorting switched off.</summary>
     public PrintTextLocatins2()
     {
         base.setSortByPosition(false);
     }

     /// <summary>
     /// Converts one text position into a WordInfo record and appends it to
     /// IETitle.WordsInfo.
     /// </summary>
     /// <param name="text">The text position supplied by the base class.</param>
     protected override void processTextPosition(TextPosition text)
     {
         var font = text.getFont();
         String baseFont = font.getBaseFont();

         // The space width can come back as NaN; store 0 in that case. Checking the
         // value itself is culture-independent, unlike comparing ToString() with a
         // localized NaN symbol.
         float spaceWidth = text.getWidthOfSpace();
         if (float.IsNaN(spaceWidth))
         {
             spaceWidth = 0;
         }

         WordInfo info = new WordInfo()
         {
             X = text.getX(),
             Y = text.getY(),
             XDirAdj = text.getXDirAdj(),
             YDirAdj = text.getYDirAdj(),
             FontSize = text.getFontSize(),
             Xscale = text.getXScale(),
             Yscale = text.getYScale(),
             Height = text.getHeight(),
             Space = spaceWidth,
             Width = text.getWidth(),

             Subfont = font.getSubType(),
             Basefont = baseFont,
             IsBold = IsBold(baseFont),
             IsItalic = IsItalic(baseFont),

             // Font size multiplied by the scale factor, truncated to an integer.
             XSize = (int)(text.getFontSize() * text.getXScale()),
             YSize = (int)(text.getFontSize() * text.getYScale()),

             Word = text.getCharacter()
         };

         IETitle.WordsInfo.Add(info);
     }
 }

这样我们就利用pdfbox收集了pdf文档的字符信息。

 

posted @ 2017-10-27 14:53  micDavid  阅读(696)  评论(0编辑  收藏  举报