// 读取 Doc、Excel、PDF、HTML，生成 Txt 文件；读取 Txt 生成 Excel 文件

package office;

/**
 * Reads Doc, Excel, PDF and HTML files and generates Txt files from them;
 * reads a Txt file and generates an Excel file from it.
 * @author JavaAlpha
 * @date 2011-8-1
 * @version V 1.0
 */

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.textmining.text.extraction.WordExtractor;

/**
 * Converts DOC, XLS, PDF, RTF, HTML and plain-text files to text, and turns
 * a simple text file into a two-column Excel sheet.
 *
 * <p>The static readers write their result to fixed output paths on drive
 * {@code e:}; generated text files are encoded as GB2312. Failures are
 * reported on the console instead of being thrown, except for the instance
 * methods that declare {@code throws Exception}.
 */
public class ReadOffice {

	/** Charset name used for generated text files and for reading HTML / line-based input. */
	private static final String TEXT_ENCODING = "GB2312";

	/** HTML file processed by {@link #main(String[])} when no argument is given. */
	private static final String DEFAULT_HTML_INPUT = "e:/1.html";

	private static final String DOC_TEXT_PATH = "e:/doc.txt";
	private static final String EXCEL_TEXT_PATH = "e:/excel.txt";
	private static final String PDF_TEXT_PATH = "e:/pdf.txt";
	private static final String HTML_TEXT_PATH = "e:/html.txt";
	private static final String HTML_ALL_TEXT_PATH = "e:/htmlAll.txt";
	private static final String EXCEL_OUTPUT_PATH = "e:/2.xls";

	/** Number of leading characters of a line that form the key in {@link #readFileByLine(String)}. */
	private static final int KEY_WIDTH = 3;

	/**
	 * Maximum number of rows {@link #createExcel(Map)} writes.
	 * NOTE(review): 256 is the column limit of the .xls format, not its row
	 * limit; the cap is kept for backward compatibility - confirm whether it
	 * is intended.
	 */
	private static final int MAX_EXCEL_ROWS = 256;

	/**
	 * Extracts the text between the tags of an HTML file and writes it to
	 * {@code e:/htmlAll.txt}.
	 *
	 * <p>The other converters ({@code readDoc}, {@code readExcel},
	 * {@code readPDF}, {@code readHtml}) are invoked the same way.
	 *
	 * @param args optional; {@code args[0]} is the HTML file to process,
	 *             defaulting to {@code e:/1.html}
	 */
	public static void main(String[] args) {
		String input = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_INPUT;
		readHtmlAll(input);
	}

	/**
	 * Writes {@code text} to the file at {@code path}, encoded as GB2312,
	 * replacing any existing file.
	 *
	 * <p>Never throws: problems are printed to standard output. A
	 * {@code null} text is reported and the target file is left untouched.
	 *
	 * @param text content to write
	 * @param path target file path
	 */
	static void createTXTAndWriteDoc(String text, String path) {
		if (text == null) {
			// Callers pass null when extraction failed; do not truncate an existing file for that.
			System.out.println("出现异常: 没有可写入的内容");
			return;
		}
		FileOutputStream out = null;
		try {
			// FileOutputStream truncates an existing file, so no explicit delete is needed.
			out = new FileOutputStream(new File(path));
			out.write(text.getBytes(TEXT_ENCODING));
			out.flush();

			System.out.println("文件生成...");
		} catch (Exception e) {
			// Kept broad on purpose: callers rely on this method never throwing.
			System.out.println("出现异常: " + e);
		} finally {
			closeResource(out);
		}
	}

	/**
	 * Extracts the text of a Word .doc file, prints it, and writes it to
	 * {@code e:/doc.txt}. Nothing is written when extraction fails.
	 *
	 * @param dir path of the .doc file
	 */
	static void readDoc(String dir) {
		String text = null;
		FileInputStream in = null;
		try {
			in = new FileInputStream(new File(dir));
			text = new WordExtractor().extractText(in);
			System.out.println("text1:" + text);
		} catch (Exception e) {
			// Covers a missing file as well as anything the extractor throws.
			e.printStackTrace();
		} finally {
			closeResource(in);
		}
		if (text != null) {
			createTXTAndWriteDoc(text, DOC_TEXT_PATH);
		}
	}

	/**
	 * Reads every sheet of an .xls workbook and writes the numeric and string
	 * cell values, separated by spaces, to {@code e:/excel.txt}.
	 *
	 * <p>Whatever was collected before a read error is still written.
	 *
	 * @param dir path of the .xls file
	 */
	static void readExcel(String dir) {
		StringBuilder buff = new StringBuilder();
		FileInputStream in = null;
		try {
			in = new FileInputStream(dir);
			HSSFWorkbook wb = new HSSFWorkbook(in);
			for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
				HSSFSheet sheet = wb.getSheetAt(sheetIndex);
				if (sheet == null) {
					continue;
				}
				for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
					HSSFRow row = sheet.getRow(rowIndex);
					if (row == null) {
						continue;
					}
					appendRowText(row, buff);
					// Row separator.
					buff.append(' ');
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(in);
		}
		createTXTAndWriteDoc(buff.toString(), EXCEL_TEXT_PATH);
	}

	/**
	 * Appends the numeric and string cells of {@code row} to {@code buff},
	 * each followed by a space. Cells of any other type are skipped.
	 */
	@SuppressWarnings("deprecation")
	private static void appendRowText(HSSFRow row, StringBuilder buff) {
		// The bound is inclusive as in the original code; missing cells are skipped by the null check.
		for (int cellIndex = 0; cellIndex <= row.getLastCellNum(); cellIndex++) {
			HSSFCell cell = row.getCell((short) cellIndex);
			if (cell == null) {
				continue;
			}
			switch (cell.getCellType()) {
			case HSSFCell.CELL_TYPE_NUMERIC:
				buff.append(cell.getNumericCellValue()).append(' ');
				break;
			case HSSFCell.CELL_TYPE_STRING:
				buff.append(cell.getStringCellValue()).append(' ');
				break;
			default:
				// Formula and all remaining cell types are intentionally ignored.
				break;
			}
		}
	}

	/**
	 * Placeholder for reading PowerPoint files; not implemented, does nothing.
	 *
	 * @param dir path of the PowerPoint file (unused)
	 */
	static void readPPT(String dir) {

	}

	/**
	 * Extracts the text of a PDF file and writes it to {@code e:/pdf.txt}.
	 * Nothing is written when extraction fails.
	 *
	 * @param dir path of the PDF file
	 */
	static void readPDF(String dir) {
		String result = null;
		FileInputStream is = null;
		PDDocument document = null;
		try {
			is = new FileInputStream(dir);
			PDFParser parser = new PDFParser(is);
			parser.parse();
			document = parser.getPDDocument();
			result = new PDFTextStripper().getText(document);
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(is);
			if (document != null) {
				try {
					document.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		if (result != null) {
			createTXTAndWriteDoc(result, PDF_TEXT_PATH);
		}
	}

	/**
	 * Extracts the text of a PDF, prints it to standard output, and writes it
	 * (GB2312 encoded) to a .txt file named after the PDF.
	 *
	 * <p>If {@code file} parses as a URL, the output name is the last path
	 * segment of the URL with its final four characters replaced by
	 * {@code .txt}; otherwise the same replacement is applied to {@code file}
	 * itself. Names of four characters or fewer simply get {@code .txt}
	 * appended.
	 *
	 * @param file path (or URL string) of the PDF
	 * @throws Exception if the document cannot be loaded or the text cannot be written
	 */
	public void readPdf(String file) throws Exception {
		// Whether text is sorted by position.
		boolean sort = false;
		// First and last page to extract.
		int startPage = 1;
		int endPage = Integer.MAX_VALUE;

		String textFile;
		FileOutputStream fileOut = null;
		Writer output = null;
		PDDocument document = null;
		try {
			try {
				// Treat the argument as a URL first; fall back to a plain file path.
				URL url = new URL(file);
				// NOTE(review): the document is still loaded by name, not via the URL,
				// as in the original code - confirm this is what the PDFBox version in use needs.
				document = PDDocument.load(file);
				textFile = new File(toTextFileName(url.getFile())).getName();
			} catch (MalformedURLException e) {
				document = PDDocument.load(file);
				textFile = toTextFileName(file);
			}

			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setSortByPosition(sort);
			stripper.setStartPage(startPage);
			stripper.setEndPage(endPage);

			// Extract once, after the page range is configured, and reuse the result.
			String text = stripper.getText(document);
			System.out.print(text);

			// Open the output only after extraction succeeded so a failure leaves no empty file.
			fileOut = new FileOutputStream(textFile);
			output = new OutputStreamWriter(fileOut, TEXT_ENCODING);
			output.write(text);
		} finally {
			try {
				if (output != null) {
					output.close();
				} else if (fileOut != null) {
					fileOut.close();
				}
			} finally {
				// Always release the document, even if closing the writer failed.
				if (document != null) {
					document.close();
				}
			}
		}
	}

	/**
	 * Replaces the last four characters of {@code name} (assumed to be an
	 * extension such as {@code .pdf}) with {@code .txt}; shorter names get
	 * {@code .txt} appended.
	 */
	private static String toTextFileName(String name) {
		int stemLength = name.length() > 4 ? name.length() - 4 : name.length();
		return name.substring(0, stemLength) + ".txt";
	}

	/**
	 * Reads a text file using the platform default charset and returns its
	 * lines joined together, each followed by a single space.
	 *
	 * @param filePath path of the text file
	 * @return the file content with line breaks replaced by spaces
	 * @throws Exception if the file cannot be opened or read
	 */
	public String getTextFromTxt(String filePath) throws Exception {
		BufferedReader br = new BufferedReader(new FileReader(filePath));
		try {
			StringBuilder buff = new StringBuilder();
			String line;
			while ((line = br.readLine()) != null) {
				buff.append(line).append(' ');
			}
			return buff.toString();
		} finally {
			br.close();
		}
	}

	/**
	 * Extracts the plain text of an RTF file.
	 *
	 * @param filePath path of the RTF file
	 * @return the extracted text, or {@code null} if the file could not be read
	 */
	public String getTextFromRtf(String filePath) {
		String result = null;
		InputStream is = null;
		try {
			is = new FileInputStream(new File(filePath));
			DefaultStyledDocument styledDoc = new DefaultStyledDocument();
			new RTFEditorKit().read(is, styledDoc, 0);
			// Per the original author: Chinese text must be re-encoded as ISO8859_1
			// (and decoded with the platform charset) or it comes out garbled.
			result = new String(styledDoc.getText(0, styledDoc.getLength()).getBytes("ISO8859_1"));
		} catch (IOException e) {
			e.printStackTrace();
		} catch (BadLocationException e) {
			e.printStackTrace();
		} finally {
			closeResource(is);
		}
		return result;
	}

	/**
	 * Reads an HTML file (as GB2312), concatenates its lines without
	 * separators, writes the result to {@code e:/html.txt} and returns it.
	 *
	 * @param filePath path of the HTML file
	 * @return the complete HTML content; whatever was read before an error,
	 *         possibly empty, if reading fails
	 */
	public static String readHtml(String filePath) {
		StringBuilder sb = new StringBuilder();
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), TEXT_ENCODING));
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(br);
		}
		String html = sb.toString();
		createTXTAndWriteDoc(html, HTML_TEXT_PATH);
		return html;
	}

	/**
	 * Reads an HTML file via {@link #readHtml(String)} and writes the text
	 * found between its tags to {@code e:/htmlAll.txt}.
	 *
	 * @param filePath path of the HTML file
	 */
	public static void readHtmlAll(String filePath) {
		String html = readHtml(filePath);
		createTXTAndWriteDoc(extractTextBetweenTags(html), HTML_ALL_TEXT_PATH);
	}

	/**
	 * Returns the concatenation of every run of characters that lies between
	 * a {@code '>'} and the next {@code '<'}. Text before the first tag and
	 * after the last one is ignored.
	 */
	private static String extractTextBetweenTags(String html) {
		StringBuilder text = new StringBuilder();
		int close = html.indexOf('>');
		while (close >= 0) {
			int open = html.indexOf('<', close + 1);
			if (open < 0) {
				// No further tag: stop instead of rescanning from the start.
				break;
			}
			if (open - close > 1) {
				text.append(html.substring(close + 1, open));
			}
			close = html.indexOf('>', open + 1);
		}
		return text.toString();
	}

	/**
	 * Reads a GB2312 text file line by line and exports it as a two-column
	 * Excel sheet via {@link #createExcel(Map)}.
	 *
	 * <p>For each non-empty line the first three characters (trimmed) are the
	 * key, the fourth character is skipped as a separator, and the rest
	 * (trimmed) is the value. Short lines yield an empty value. Entries keep
	 * the order of the file; a repeated key keeps its first position but takes
	 * the latest value.
	 *
	 * @param filePath path of the text file
	 */
	public static void readFileByLine(String filePath) {
		Map<String, String> entries = new LinkedHashMap<String, String>();
		BufferedReader reader = null;
		try {
			// Explicit charset conversion is the key point here.
			reader = new BufferedReader(
					new InputStreamReader(new FileInputStream(new File(filePath)), TEXT_ENCODING));
			String line;
			while ((line = reader.readLine()) != null) {
				if (line.length() == 0) {
					continue;
				}
				String key = line.substring(0, Math.min(KEY_WIDTH, line.length())).trim();
				String value = line.length() > KEY_WIDTH + 1 ? line.substring(KEY_WIDTH + 1).trim() : "";
				entries.put(key, value);
			}
			createExcel(entries);
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeResource(reader);
		}
	}

	/**
	 * Writes the map to {@code e:/2.xls} as a sheet with one row per entry:
	 * key in the first column, value in the second. At most 256 rows are
	 * written; further entries are reported and dropped.
	 *
	 * <p>Never throws: problems are printed to standard output.
	 *
	 * @param map entries to export
	 */
	@SuppressWarnings("deprecation")
	static void createExcel(Map<String, String> map) {
		FileOutputStream fOut = null;
		try {
			HSSFWorkbook workbook = new HSSFWorkbook();
			HSSFSheet sheet = workbook.createSheet("联系人用户名和电话");

			int rowIndex = 0;
			for (Map.Entry<String, String> entry : map.entrySet()) {
				if (rowIndex >= MAX_EXCEL_ROWS) {
					System.out.println("仅写入前 " + MAX_EXCEL_ROWS + " 行，其余数据已忽略");
					break;
				}
				HSSFRow row = sheet.createRow((short) rowIndex++);
				HSSFCell keyCell = row.createCell((short) 0);
				HSSFCell valueCell = row.createCell((short) 1);
				// Both cells hold strings.
				keyCell.setCellType(HSSFCell.CELL_TYPE_STRING);
				valueCell.setCellType(HSSFCell.CELL_TYPE_STRING);
				keyCell.setCellValue(entry.getKey());
				valueCell.setCellValue(entry.getValue());
			}

			// Opening the stream truncates an existing file. It must not be deleted
			// afterwards, or the workbook would be written to an unlinked file.
			fOut = new FileOutputStream(EXCEL_OUTPUT_PATH);
			workbook.write(fOut);
			fOut.flush();
			System.out.println("文件生成...");
		} catch (Exception e) {
			// Kept broad on purpose: callers rely on this method never throwing.
			System.out.println("出现异常: " + e);
		} finally {
			closeResource(fOut);
		}
	}

	/** Closes {@code resource} if it is non-null, printing (not throwing) any I/O error. */
	private static void closeResource(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

posted @ 2011-08-01 15:35 java程序代码 阅读(362) 评论(0) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

java07

读取Doc，Excel，PDF，html,生成Txt文件，读取Txt生成Excel文件

公告