首先下载iText包,地址为http://sourceforge.net/projects/itext/。本文写作时该页面提供的版本为5.1.2,完整包名为iText-5.1.2.zip,解压后将得到一组jar包,我们要使用的是其中的itextpdf-5.1.2.jar。
或者引入maven依赖:
<dependency> <groupId>com.itextpdf</groupId> <artifactId>itextpdf</artifactId> <version>5.5.10</version> </dependency>
一、itext读取pdf标题和页号
在本地配置好Java编译和运行环境后,编写如下示例代码:
package com.pdfcom;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.SimpleBookmark;

/**
 * Prints the title of every bookmark (outline entry) of a PDF, at every
 * nesting level, followed by the page number each "GoTo" bookmark points to.
 *
 * @author hp
 */
public class Test {

    /**
     * Opens the sample PDF, prints all bookmark titles, then prints all
     * bookmark page numbers.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        PdfReader reader = new PdfReader("C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\一案.pdf");
        try {
            List<HashMap<String, Object>> list = SimpleBookmark.getBookmark(reader);
            // getBookmark returns null for a document without an outline;
            // there is nothing to print in that case.
            if (list == null) {
                return;
            }
            for (HashMap<String, Object> bookmark : list) {
                showBookmark(bookmark);
            }
            for (HashMap<String, Object> bookmark : list) {
                getPageNumbers(bookmark);
            }
        } finally {
            // Release the underlying file handle even if parsing fails.
            reader.close();
        }
    }

    /**
     * Prints the "Title" entry of the given bookmark and then, recursively,
     * of every bookmark listed under its "Kids" entry.
     *
     * @param bookmark the bookmark map to print
     */
    private static void showBookmark(HashMap<String, Object> bookmark) {
        System.out.println(bookmark.get("Title"));
        @SuppressWarnings("unchecked")
        List<HashMap<String, Object>> kids = (List<HashMap<String, Object>>) bookmark.get("Kids");
        if (kids == null) {
            return;
        }
        for (HashMap<String, Object> kid : kids) {
            showBookmark(kid);
        }
    }

    /**
     * Prints the page number of the given bookmark when its "Action" entry is
     * "GoTo" and it has a "Page" entry, then descends into its "Kids".
     *
     * <p>The text of the "Page" entry up to the first space (or the whole
     * trimmed text when it contains no space) is parsed as the page number.
     * Children are visited regardless of the parent's action, so nested
     * entries under a bookmark without a GoTo action are not skipped.
     *
     * @param bookmark the bookmark map to inspect; {@code null} is ignored
     * @throws NumberFormatException if the leading part of "Page" is not an integer
     */
    public static void getPageNumbers(HashMap<String, Object> bookmark) {
        if (bookmark == null) {
            return;
        }

        if ("GoTo".equals(bookmark.get("Action"))) {
            String page = (String) bookmark.get("Page");
            if (page != null) {
                page = page.trim();
                int idx = page.indexOf(' ');
                if (idx < 0) {
                    int pageNum = Integer.parseInt(page);
                    System.out.println("pageNum :" + pageNum);
                } else {
                    int pageNum = Integer.parseInt(page.substring(0, idx));
                    System.out.println("pageNum:" + pageNum);
                }
            }
        }

        @SuppressWarnings("unchecked")
        List<HashMap<String, Object>> kids = (List<HashMap<String, Object>>) bookmark.get("Kids");
        if (kids == null) {
            return;
        }
        for (HashMap<String, Object> kid : kids) {
            getPageNumbers(kid);
        }
    }
}
二、itext读取pdf所有内容
package com.pdfcom;

import java.io.IOException;
import java.net.URL;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * Reads the text of every page of a PDF and prints it.
 *
 * @author hp
 */
public class TestAll {

    /**
     * Builds the URL of the sample PDF and prints its full text.
     *
     * @param args unused
     * @throws IOException if the URL string is malformed
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("file:/C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\一案.pdf");
        readPdf(url); // read the whole PDF in one go
    }

    /**
     * Extracts the text of pages 1..N of the PDF at {@code url}, concatenates
     * it in page order and prints the result to standard output.
     *
     * <p>Any exception raised while opening or parsing the document is printed
     * as a stack trace and not rethrown. The reader is always closed.
     *
     * @param url location of the PDF to read
     */
    public static void readPdf(URL url) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(url);
            int pageCount = reader.getNumberOfPages();
            // StringBuilder avoids re-copying the accumulated text on every page.
            StringBuilder pageContent = new StringBuilder();
            for (int page = 1; page <= pageCount; page++) {
                pageContent.append(PdfTextExtractor.getTextFromPage(reader, page));
            }
            System.out.println(pageContent.toString());
        } catch (Exception e) {
            // Best-effort demo: report the failure and carry on.
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }
}
三、读取pdf特定内容。目前itext没有提供逐行读取的方法,要想读取某一部分的特定内容,可以先提取全文,再用字符串截取的方法取出所需片段。
package com.pdfcom;

import java.io.IOException;
import java.net.URL;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * Reads a PDF and prints one specific section of its text: the part that
 * follows the contact-person label and ends before the next section heading.
 *
 * @author hp
 */
public class PdfContent {

    /** Label that starts the section to extract. */
    private static final String START_MARKER = "联系人";

    /** Heading that ends the section to extract. */
    private static final String END_MARKER = "二、施工组织措施";

    /**
     * Builds the URL of the sample PDF and prints the extracted section.
     *
     * @param args unused
     * @throws IOException if the URL string is malformed
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("file:/C:\\Users\\hp\\Desktop\\新建文件夹 (2)\\国网京峡ECI光传输系统500kV保北站XDM1000设备停运三措一案.pdf");
        readPdf(url); // read the whole PDF in one go
    }

    /**
     * Extracts the text of every page of the PDF at {@code url}, then prints
     * the text between the start label and the end heading.
     *
     * <p>If either marker is missing, a message is written to standard error
     * and nothing is printed to standard output. Any exception raised while
     * opening or parsing the document is printed as a stack trace and not
     * rethrown. The reader is always closed.
     *
     * @param url location of the PDF to read
     */
    public static void readPdf(URL url) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(url);
            int pageCount = reader.getNumberOfPages();
            // StringBuilder avoids re-copying the accumulated text on every page.
            StringBuilder pageContent = new StringBuilder();
            for (int page = 1; page <= pageCount; page++) {
                pageContent.append(PdfTextExtractor.getTextFromPage(reader, page));
            }

            // Extract the contact-person section.
            String contact = extractContact(pageContent.toString());
            if (contact == null) {
                System.err.println("Markers \"" + START_MARKER + "\" / \"" + END_MARKER
                        + "\" not found in document text");
                return;
            }
            System.out.println(contact);
        } catch (Exception e) {
            // Best-effort demo: report the failure and carry on.
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Returns the text after the start label (plus the one character that
     * follows it) up to the first end heading located after that label.
     *
     * @param text full document text
     * @return the extracted section (possibly empty), or {@code null} when
     *         either marker cannot be found in the expected order
     */
    private static String extractContact(String text) {
        int strStartIndex = text.indexOf(START_MARKER);
        if (strStartIndex < 0) {
            return null;
        }
        // Search only after the label so an earlier heading cannot produce
        // an end index smaller than the start index.
        int strEndIndex = text.indexOf(END_MARKER, strStartIndex + START_MARKER.length());
        if (strEndIndex < 0) {
            return null;
        }
        // Skip the label and the single character after it (4 characters in
        // total, as in the original listing); clamp so a very short section
        // yields an empty string instead of an out-of-range error.
        int contentStart = Math.min(strStartIndex + START_MARKER.length() + 1, strEndIndex);
        return text.substring(contentStart, strEndIndex);
    }
}