Atitit pdf转文本 pdf2txt v4 t83.docx Atitit pdf转文本 pdfutil 目录 1.1. Pdfbox cmd 模式 TextToPDF 1 1.2. Pdf

Atitit pdf转文本 pdf2txt v4 t83.docx

Atitit pdf转文本 pdfutil

 

 

目录

1.1. Pdfbox cmd 模式 TextToPDF 1

1.2. Pdf util code 2

1.3. Pdf api模式 5

2. ref 6

 

 

import org.apache.poi.hslf.extractor.PowerPointExtractor;

poi工具

public static String readPPT(String f) {

PowerPointExtractor extractor;

try {

extractor = new PowerPointExtractor(new FileInputStream(new File(f)));

return extractor.getText();

} catch (IOException e) {

ExUtilV2t33.throwExV2(e);

}

return null;

 

}

 

 

 java -jar C:\Users\attilax\Pictures\pdfbox-app-2.0.9.jar  ExtractText   "C:\atibeks517\l4 doc v3 r7a ori exted\_0index\一种简单的基于字符形状的验证码识别技术.pdf"   c:\logs\识别技术.pdf.txt

 

 

转html

 

-console

false

Send text to console instead of file.

-html

false

Output in HTML format instead of raw text.

 

 

    1.1. Pdfbox cmd 模式 TextToPDF

This application will create a PDF document from a text file.

Usage: java -jar pdfbox-app-2.y.z.jar TextToPDF [OPTIONS] <outputfile> <textfile>

Command-Line Parameter

Default

Description

-standardFont

Helvetica

The font to use for the text. Either this or -ttf should be specified but not both.

-ttf

 

The TTF font to use for the text. Either this or -standardFont should be specified but not both.

java——PDF转换txt - 乞彦 - 博客园.html

 

 

 java -jar C:\Users\attilax\Pictures\pdfbox-app-2.0.9.jar  ExtractText -console  "C:\atibeks517\l4 doc v3 r7a ori exted\_0index\一种简单的基于字符形状的验证码识别技术.pdf"   c:\logs\识别技术.pdf.txt

 

 

    1.2. Pdf util code

/FulltxtLucenePrj/src/com/attilax/archive/pdfutilV3t88.java

 

pdfutilV3t88.java

 

import java.io.ByteArrayOutputStream;

import java.io.File;

import java.io.IOException;

import java.nio.file.FileVisitResult;

import java.nio.file.Files;

import java.nio.file.Path;

import java.nio.file.Paths;

import java.nio.file.SimpleFileVisitor;

import java.nio.file.attribute.BasicFileAttributes;

 

import org.apache.commons.exec.CommandLine;

import org.apache.commons.exec.DefaultExecuteResultHandler;

import org.apache.commons.exec.DefaultExecutor;

import org.apache.commons.exec.ExecuteException;

import org.apache.commons.exec.ExecuteWatchdog;

import org.apache.commons.exec.PumpStreamHandler;

import org.apache.commons.io.FileUtils;

import org.apache.commons.io.FilenameUtils;

 

import com.attilax.util.ExUtil;

 

public class pdfutilV2 {

 

public static void main(String[] args) throws ExecuteException, IOException {

 

Files.walkFileTree(

Paths.get(

"C:\\Users\\Administrator\\Documents\\WeChat Files\\attilax\\FileStorage\\File\\2019-08\\CityLink接入文档"),

new SimpleFileVisitor<Path>() {

 

// 处理文件

public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {

 

// return super.visitFile(file, attrs);

String fpath = file.toString();

String ext=FilenameUtils.getExtension(fpath);

if(!ext.equals("pdf"))

return FileVisitResult.CONTINUE; // 没找到继续找

String rzt = pdfutilV2.convert2txt_consoleBlockmodeV2(fpath);

FileUtils.writeStringToFile(new File("d:\\0pdfout\\" + file.toFile().getName() + ".txt"), rzt,

true);

return FileVisitResult.CONTINUE; // 没找到继续找

}

 

});

 

String sou = "C:\\atibeks517\\l4 doc v3 r7a ori exted\\_0index\\一种简单的基于字符形状的验证码识别技术.pdf";

String dest = " c:\\logs\\v2识别技术.pdf.txt";

System.out.println();

;

}

 

public static String convert2txt_consoleBlockmodeV2(String sou) {

// String s = " java -jar D:\\0gif sexy\\pdfbox-app-2.0.9.jar  ExtractText  @sou@ @dest@";

// s.replaceAll("@sou@", sou);

// s.replaceAll("@dest@", dest);

// final CommandLine cmdLine = CommandLine.parse(s);

 

final CommandLine cmdLine = new CommandLine("D:\\jdk1.8.0_31\\bin\\java.exe");

cmdLine.addArgument("-jar");

cmdLine.addArgument("D:\\0gif sexy\\pdfbox-app-2.0.9.jar");

cmdLine.addArgument("ExtractText");

cmdLine.addArgument("-console");

 

cmdLine.addArgument(sou);

// cmdLine.addArgument(dest);

 

// DefaultExecuteResultHandler resultHandler = new

// DefaultExecuteResultHandler();

DefaultExecutor executor = new DefaultExecutor();

try {

ByteArrayOutputStream baos = new ByteArrayOutputStream();

executor.setStreamHandler(new PumpStreamHandler(baos, baos));// iytstren

System.out.println( cmdLine);

executor.execute(cmdLine);

 

String result = baos.toString("utf8").trim();

return result;

 

} catch (Exception e) {

ExUtil.throwExV2(e);

}

return "";

 

}

 

 

    1.3. Pdf api模式

 

/bookmarksHtmlEverythingIndexPrj/src/emailPKg/ExtractTextFromPDF.java

 

 

package emailPKg;

 

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

 

import org.apache.commons.io.FileUtils;

import org.apache.commons.io.FilenameUtils;

import org.apache.pdfbox.pdfparser.PDFParser;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.util.PDFTextStripper;

 

 

 

 /**

  * jar 1.8.16

  * @author zhoufeiyue

  *

  */

public class ExtractTextFromPDF {

public static  String readPDFV2WithCache(String filename,String cacheDir) throws  Exception{

 

String basename=FilenameUtils.getName(filename);

File file2 = new File(cacheDir+"\\"+basename+".txt");

if(file2.exists())

{

return FileUtils.readFileToString(file2);

}

 

File file = new File(filename);

FileInputStream in = null;

 

in = new FileInputStream(filename);

PDFParser parser = new PDFParser(in);

parser.parse();

PDDocument pdDocument = parser.getPDDocument();

PDFTextStripper stripper = new PDFTextStripper();

String result = stripper.getText(pdDocument);

 

System.out.println("PDF文件" + file.getAbsolutePath()+"内容如下:");

 

FileUtils.write(file2, result);

return (result);

 

}

 

 

  2. ref

Apache PDFBox _ Command-Line Tools.html

Atitit 读写文件慢的解决方案cache法  pdf转txt

 

posted @ 2019-08-15 19:24  attilaxAti  阅读(46)  评论(0)  编辑  收藏  举报