【Java解析PDF并获取想要的字符串】

public class pdfAnalysis {
    /**
     * @throws IOException
     * @param从网络上下载PDF,截取PDF字符串,
     */

    public static void main(String[] args) throws IOException {
        // 下载的连接 下载下来的名字 下载下来的路径
        // pdfAnalysis.downLoadByUrl("", "KK.pdf", "F:/");
        // 读取文件
        pdfAnalysis pdf = new pdfAnalysis();

        // 读取文件
        String pdfName = "F:\\CC.pdf";
        // 解析PDF里的值 存入变量pdf_Body
        String pdf_Body = pdf.readFileOfPDF(pdfName);
        //System.out.println(pdf_Body);

        

       /* String str = pdf_Body.substring(pdf_Body.indexOf("Arrival"), pdf_Body.indexOf("Payment Details"));
        String str1 = str.substring(str.indexOf("H ("));
        String [] pp ={"Monday","Tuesday","Wednesday","Thursday","Friday","Saturday" ,"Sunday" };

        for(String sto:pp){
            if(str1.contains(sto)){
                String result = str1.substring(str1.indexOf(sto));
                //System.out.println(result);
                //System.out.println(result.length());
                String result2 = result.substring(0,result.indexOf(","));
                String result3 = result2.trim();
                System.out.println("我要的时间:"+result3+"我是"+pdfName+"文件");
            }

        }*/



        /*if(str1.contains("Monday")||str1.contains("Tuesday")||
                str1.contains("Wednesday")||str1.contains("Thursday")||
                str1.contains("Friday")||str1.contains("Saturday")||str1.contains("Sunday")){



        }*/
        // System.out.println(str1);


        // 取出人名值
        String name_Temp = pdf_Body.substring(pdf_Body.indexOf("Arrive"), pdf_Body.indexOf("passenger details"));
        // System.out.println(str);
        String name_Temp1 = null;
        String result_name = null;
        List<String> list_Name = new ArrayList<>();
        for (int i = 1; i < name_Temp.length(); i++) {

            if (name_Temp.contains(i + ".")) {
                name_Temp1 = name_Temp.substring(name_Temp.indexOf(i + "."));

                result_name = name_Temp1.substring(name_Temp1.indexOf(i + ".") + 3,
                        name_Temp1.indexOf("Seat Number Services"));
                list_Name.add(result_name);
            }
            // System.out.println(add);
            // System.out.println(str2);
            if (name_Temp1.equals("null")) {
                continue;
            }
        }
        for (String i : list_Name) {
            System.out.println("所有的人名:" + i);
        }*/


        if (pdfAnalysis.infile != null) {
            pdfAnalysis.infile.close();
            System.out.println("我要准备关闭PDF文档了");
        }

    }

    /**
     * Counts how many times a regular expression matches within a text.
     * Matches are found one after another with {@link Matcher#find()}.
     *
     * @param srcText text to search
     * @param temp    regular expression to look for (compiled with {@link Pattern#compile(String)})
     * @return number of matches found in {@code srcText}
     */
    public static int appearNumber(String srcText, String temp) {
        final Matcher occurrences = Pattern.compile(temp).matcher(srcText);
        int hits = 0;
        for (boolean found = occurrences.find(); found; found = occurrences.find()) {
            hits++;
        }
        return hits;
    }
    /**
     * Input stream of the PDF most recently opened by {@link #readFileOfPDF(String)}.
     * That method closes it before returning; the field stays public and static
     * because {@code main} still references it.
     */
    public static FileInputStream infile = null;

    /**
     * Extracts the text content of a PDF file.
     *
     * <p>Failures are reported on standard output and signalled by a
     * {@code null} return value instead of an exception.
     *
     * @param pdfName path of the PDF file to read
     * @return the extracted text, or {@code null} if the file could not be read or parsed
     * @throws IOException declared for compatibility with existing callers;
     *                     read and parse failures are caught inside the method
     */
    public String readFileOfPDF(String pdfName) throws IOException {
        String context = null;
        // Used only to show the absolute path in the messages below.
        File file = new File(pdfName);
        // Declared outside the try block so it can be closed in finally.
        PDDocument pdfdocument = null;

        try {
            infile = new FileInputStream(pdfName);
            // Parse the stream into a PDF document object.
            PDFParser parser = new PDFParser(infile);
            parser.parse();
            pdfdocument = parser.getPDDocument();
            // Strip the plain text out of the parsed document.
            PDFTextStripper stripper = new PDFTextStripper();
            context = stripper.getText(pdfdocument);
            System.out.println("PDF文件" + file.getAbsolutePath() + "的文本内容如下:");
        } catch (Exception e) {
            System.out.println("读取PDF文件" + file.getAbsolutePath() + "失败!" + e.getMessage());
        } finally {
            // Release the parsed document as well as the stream; a failure to
            // close is reported but must not hide the text already extracted.
            if (pdfdocument != null) {
                try {
                    pdfdocument.close();
                } catch (IOException e) {
                    System.out.println("关闭PDF文件" + file.getAbsolutePath() + "失败!" + e.getMessage());
                }
            }
            if (infile != null) {
                try {
                    infile.close();
                } catch (IOException e) {
                    System.out.println("关闭PDF文件" + file.getAbsolutePath() + "失败!" + e.getMessage());
                }
            }
        }
        return context;
    }

 

posted @ 2018-05-03 10:28  一粒尘埃丶流年  阅读(6713)  评论(0编辑  收藏  举报