posts - 145,comments - 23,views - 73万
/**
 * com.jiaoyiping.pdstest.TestTika.java
 * Copyright (c) 2009 Hewlett-Packard Development Company, L.P.
 * All rights reserved.
 */
package com.jiaoyiping.pdstest;
 
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
 
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mail.RFC822Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
 
/**
 * <pre>
 * Desc:
 * @author 焦一平
 * @refactor 焦一平
 * @date   2014年12月4日 下午1:31:09
 * @version 1.0
 * @see 
 * REVISIONS:
 * Version     Date             Author            Description
 * -------------------------------------------------------------------
 * 1.0        2014年12月4日                                  焦一平            1. Created this class.
 * </pre> 
 */
public class TestTika {
     
    //解析PDF
    @Test
    public void testPdf() throws Exception{
        Long start = System.currentTimeMillis();
        Parser parser = new PDFParser();
        InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\\我的微盘\\文档\\参考文档\\Linux Shell脚本攻略.pdf")));
        OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt")));
        Metadata meta = new Metadata(); 
        meta.add(Metadata.CONTENT_ENCODING, "utf-8"); 
        ContentHandler iHandler = new BodyContentHandler(os); 
        parser.parse(is, iHandler, meta, new ParseContext());
        Long end = System.currentTimeMillis();
        Long used = (end-start)/1000;
        System.out.println("耗时: "+used+"秒");
    }
    //解析Word
    @Test
    public void testWrod() throws Exception{
        Long start = System.currentTimeMillis();
        Parser parser = new OfficeParser();
        InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\\我的微盘\\文档\\参考文档\\jBPM5_用户指南中文版.doc")));
        OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt")));
        Metadata meta = new Metadata(); 
        meta.add(Metadata.CONTENT_ENCODING, "utf-8"); 
        ContentHandler iHandler = new BodyContentHandler(os); 
        parser.parse(is, iHandler, meta, new ParseContext());
         
        Long end = System.currentTimeMillis();
        Long used = (end-start)/1000;
        System.out.println("耗时:"+used+"秒");
    }
    //解析EMAIL(只能解析标准的eml格式的,不能解析微软的msg格式)
    //使用commons-email来进行解析的可以得到收件人、发件人、主题、内容等元数据,TIkA是否支持未尝试
    @Test
    public void testEmail() throws Exception{
        Long start = System.currentTimeMillis();
        Parser parser = new RFC822Parser();
        InputStream is = new BufferedInputStream(new FileInputStream(new File("C:\\Users\\Administrator\\Downloads\\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml")));
        OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt")));
        Metadata meta = new Metadata(); 
        meta.add(Metadata.CONTENT_ENCODING, "utf-8");
        ContentHandler iHandler = new BodyContentHandler(os); 
        parser.parse(is, iHandler, meta, new ParseContext());
         
        Long end = System.currentTimeMillis();
        Long used = (end-start)/1000;
        System.out.println("耗时:"+used+"秒");
    }
}

  

 

posted on   梦中彩虹  阅读(1425)  评论(0)  编辑  收藏  举报
(评论功能已被禁用)
编辑推荐:
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
阅读排行:
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 25岁的心里话
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

点击右上角即可分享
微信分享提示