POI读取Word与PowerPoint

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTable;
import org.apache.poi.xslf.usermodel.XSLFTableCell;
import org.apache.poi.xslf.usermodel.XSLFTableRow;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
import org.apache.poi.xslf.usermodel.XSLFTextRun;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
/**
 * Extracts text together with its font size from Office documents using Apache POI and
 * appends the result to {@code ressss.csv} in the current working directory.
 *
 * <p>Supported inputs, selected by (case-insensitive) file extension:
 * <ul>
 *   <li>{@code .pptx} / {@code .pptm} - read through XSLF ({@link XMLSlideShow})</li>
 *   <li>{@code .ppt} - read through HSLF ({@link SlideShow})</li>
 *   <li>{@code .doc} - read through HWPF ({@link HWPFDocument})</li>
 * </ul>
 *
 * <p>Output layout per input file:
 * <pre>
 * ###@@@&lt;file name without extension&gt;
 * ----------------------
 * &lt;slide title&gt;\ttitle##@@            (presentations only, when a title exists)
 * &lt;text&gt;\t&lt;font size&gt;##@@
 * ----------------------                (between slides / sections)
 * </pre>
 */
public class read {

    /** Line written between the file header, slides and sections. */
    private static final String RECORD_SEPARATOR = "----------------------";

    /** Marker terminating every text/size record. */
    private static final String RECORD_SUFFIX = "##@@";

    /** File that {@link #write(String)} appends to. */
    private static final String OUTPUT_FILE = "ressss.csv";

    /** Directory scanned by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INPUT_DIR = "C:\\Users\\ooon\\Desktop\\DKM_data\\DKM_data";

    /**
     * Processes every supported document directly inside {@code path} (sub-directories are
     * not descended into) and appends the extracted text of each one to the output file.
     *
     * <p>Files whose name starts with {@code ~} are skipped (presumably Office temp/lock
     * files such as {@code ~$name.doc} - verify against the data set), as are files with an
     * unsupported extension. An {@link IOException} while reading one file is reported and
     * does not stop the remaining files; whatever was collected for that file so far
     * (at least its header) is still written.
     *
     * @param path directory containing the documents; if it is not a readable directory a
     *             message is printed to {@code System.err} and nothing is processed
     */
    public static void readFiles(String path) {
        File[] files = new File(path).listFiles();
        if (files == null) {
            // listFiles() yields null when path is not a directory or cannot be listed.
            System.err.println("Not a readable directory: " + path);
            return;
        }

        for (File file : files) {
            if (file.isDirectory()) {
                continue;
            }
            String fileName = file.getName();
            if (fileName.startsWith("~")) {
                continue;
            }

            // Require the dot so that names like "mydoc" are not mistaken for Word files;
            // it also guarantees lastIndexOf('.') below is never -1.
            String lowerName = fileName.toLowerCase(Locale.ROOT);
            boolean isPptx = lowerName.endsWith(".pptx") || lowerName.endsWith(".pptm");
            boolean isPpt = lowerName.endsWith(".ppt");
            boolean isDoc = lowerName.endsWith(".doc");
            if (!isPptx && !isPpt && !isDoc) {
                continue;
            }

            System.out.println(fileName);
            StringBuilder sb = new StringBuilder();
            sb.append("###@@@")
              .append(fileName.substring(0, fileName.lastIndexOf('.')))
              .append("\n");
            sb.append(RECORD_SEPARATOR).append("\n");

            // try-with-resources closes the stream on every path, including failures.
            try (InputStream in = new FileInputStream(file)) {
                if (isPptx) {
                    appendPptx(in, sb);
                } else if (isPpt) {
                    appendPpt(in, sb);
                } else {
                    appendDoc(in, sb);
                }
            } catch (IOException e) {
                System.err.println("Failed to read " + file + ": " + e);
                e.printStackTrace();
            }
            write(sb.toString());
        }
    }

    /** Appends title and text records of every non-empty slide of an OOXML presentation. */
    private static void appendPptx(InputStream in, StringBuilder sb) throws IOException {
        XMLSlideShow show = new XMLSlideShow(in);
        XSLFSlide[] slides = show.getSlides();
        for (int i = 0; i < slides.length; i++) {
            XSLFSlide slide = slides[i];
            if (slide.getShapes().length == 0) {
                continue;
            }
            String title = getTitle(slide);
            appendTitle(sb, title);

            for (XSLFShape shape : slide) {
                if (shape instanceof XSLFTextShape) {
                    appendTextShape(sb, (XSLFTextShape) shape, title);
                } else if (shape instanceof XSLFTable) {
                    for (XSLFTableRow row : ((XSLFTable) shape).getRows()) {
                        for (XSLFTableCell cell : row.getCells()) {
                            appendTextShape(sb, cell, title);
                        }
                    }
                }
            }
            if (i != slides.length - 1) {
                sb.append(RECORD_SEPARATOR).append("\n");
            }
        }
    }

    /**
     * Appends the paragraphs of one text shape (or table cell), skipping the slide title.
     *
     * <p>A set of the distinct font sizes in the paragraph decides the granularity: with
     * exactly one size the whole paragraph becomes a single record, otherwise every run is
     * written as its own record with its own size.
     */
    private static void appendTextShape(StringBuilder sb, XSLFTextShape textShape, String title) {
        for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) {
            String paragraphText = paragraph.getText().trim();
            // Title is stored trimmed, so compare against the trimmed paragraph as well.
            if (paragraphText.equals(title)) {
                continue;
            }

            Set<Float> sizes = new HashSet<Float>();
            for (XSLFTextRun run : paragraph.getTextRuns()) {
                if (isSkippable(run.getText(), title)) {
                    continue;
                }
                sizes.add((float) run.getFontSize());
            }

            if (sizes.size() == 1) {
                float size = sizes.iterator().next();
                appendRecord(sb, paragraphText.replaceAll("[\\n\\r]", " "), Float.toString(size));
            } else {
                for (XSLFTextRun run : paragraph.getTextRuns()) {
                    if (isSkippable(run.getText(), title)) {
                        continue;
                    }
                    appendRecord(sb, run.getText().trim(), Float.toString((float) run.getFontSize()));
                }
            }
        }
    }

    /** Appends title and text records of every non-empty slide of a binary presentation. */
    private static void appendPpt(InputStream in, StringBuilder sb) throws IOException {
        SlideShow show = new SlideShow(new HSLFSlideShow(in));
        Slide[] slides = show.getSlides();
        for (int i = 0; i < slides.length; i++) {
            Slide slide = slides[i];
            if (slide.getShapes().length == 0) {
                continue;
            }
            String title = getTitle(slide);
            appendTitle(sb, title);

            for (TextRun textRun : slide.getTextRuns()) {
                RichTextRun[] richRuns = textRun.getRichTextRuns();

                Set<Float> sizes = new HashSet<Float>();
                for (RichTextRun richRun : richRuns) {
                    if (isSkippable(richRun.getText(), title)) {
                        continue;
                    }
                    sizes.add((float) richRun.getFontSize());
                }

                if (sizes.size() == 1) {
                    // Single font size: join all runs (space separated) into one record.
                    for (RichTextRun richRun : richRuns) {
                        if (isSkippable(richRun.getText(), title)) {
                            continue;
                        }
                        sb.append(richRun.getText().trim()).append(" ");
                    }
                    float size = sizes.iterator().next();
                    sb.append("\t").append(size).append(RECORD_SUFFIX).append("\n");
                } else {
                    for (RichTextRun richRun : richRuns) {
                        if (isSkippable(richRun.getText(), title)) {
                            continue;
                        }
                        appendRecord(sb, richRun.getText().trim(),
                                Float.toString((float) richRun.getFontSize()));
                    }
                }
            }
            if (i != slides.length - 1) {
                sb.append(RECORD_SEPARATOR).append("\n");
            }
        }
    }

    /** Appends one record per non-blank character run of a binary Word document. */
    private static void appendDoc(InputStream in, StringBuilder sb) throws IOException {
        HWPFDocument document = new HWPFDocument(in);
        Range range = document.getRange();
        int sectionCount = range.numSections();
        for (int s = 0; s < sectionCount; s++) {
            Section section = range.getSection(s);
            for (int p = 0; p < section.numParagraphs(); p++) {
                Paragraph paragraph = section.getParagraph(p);
                for (int c = 0; c < paragraph.numCharacterRuns(); c++) {
                    CharacterRun run = paragraph.getCharacterRun(c);
                    String text = run.text().trim();
                    if (text.isEmpty()) {
                        continue;
                    }
                    // NOTE(review): POI documents CharacterRun.getFontSize() as half-points,
                    // unlike the slide branches - confirm whether consumers expect that.
                    appendRecord(sb, text, String.valueOf(run.getFontSize()));
                }
            }
            if (s != sectionCount - 1) {
                sb.append(RECORD_SEPARATOR).append("\n");
            }
        }
    }

    /** Returns true for runs that carry no text or merely repeat the slide title. */
    private static boolean isSkippable(String text, String title) {
        if (text == null) {
            return true;
        }
        String trimmed = text.trim();
        return trimmed.isEmpty() || trimmed.equals(title);
    }

    /** Writes the title record of a slide; does nothing when the slide has no title. */
    private static void appendTitle(StringBuilder sb, String title) {
        if (title != null) {
            sb.append(title).append("\t").append("title##@@").append("\n");
        }
    }

    /** Writes a single {@code text<TAB>size##@@} record line. */
    private static void appendRecord(StringBuilder sb, String text, String size) {
        sb.append(text).append("\t").append(size).append(RECORD_SUFFIX).append("\n");
    }

    /**
     * Returns the trimmed title of an OOXML slide.
     *
     * @param slide slide to inspect
     * @return the trimmed title, or {@code null} when the slide has no title or a blank one
     */
    public static String getTitle(XSLFSlide slide) {
        String rawTitle = slide.getTitle();
        if (rawTitle == null) {
            return null;
        }
        String trimmed = rawTitle.trim();
        return trimmed.isEmpty() ? null : trimmed;
    }

    /**
     * Returns the trimmed title of a binary (HSLF) slide.
     *
     * @param slide slide to inspect
     * @return the trimmed title, or {@code null} when the slide has no title or a blank one
     */
    public static String getTitle(Slide slide) {
        String rawTitle = slide.getTitle();
        if (rawTitle == null) {
            return null;
        }
        String trimmed = rawTitle.trim();
        return trimmed.isEmpty() ? null : trimmed;
    }

    /**
     * Appends {@code content} to {@code ressss.csv} (created if missing) encoded as UTF-8.
     * I/O failures are reported on standard error and otherwise ignored, so one failed
     * write does not abort the surrounding batch.
     *
     * @param content text to append
     */
    public static void write(String content) {
        File target = new File(OUTPUT_FILE);
        // Resources are closed in reverse order; the writer flushes on close.
        try (FileOutputStream out = new FileOutputStream(target, true);
             BufferedWriter writer =
                     new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
            writer.write(content);
        } catch (IOException e) {
            System.err.println("Failed to append to " + target + ": " + e);
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args optional single argument: the directory to scan; when absent the
     *             built-in default directory is used
     */
    public static void main(String[] args) throws Exception {
        String directory = args.length > 0 ? args[0] : DEFAULT_INPUT_DIR;
        readFiles(directory);
    }
}

 

posted @ 2015-09-22 09:30  ooon  阅读(939)  评论(0编辑  收藏  举报