阳光不锈

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: :: :: 管理 ::

首先将源网页(原编码为 GB2312)用 UTF-8 重新编码,并保存到一个新的临时文件,

还要注意加上:

 tidy.setInputEncoding("UTF-8");

才能正确显示

源代码如下:


 

import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.*;


import org.w3c.tidy.Tidy;

public class xml {
private String url;
private String outFileName;
private String errOutFileName;

public xml(String url, String outFileName, String
errOutFileName) {
this.url = url;
this.outFileName = outFileName;
this.errOutFileName = errOutFileName;
}

public void convert() {
URL u;
BufferedInputStream in;
FileOutputStream out;

Logger log = Logger.getLogger("convert");
try {
u = new URL(url);

//Create input and output streams
     in = new BufferedInputStream(u.openStream()); // 打开文件,转换为 UTF-8 编码  
    InputStreamReader isr = new InputStreamReader(in, "GB2312"); // 源文件编码为 gb2312
    
    File tmpNewFile = File.createTempFile("GB2312",".html"); // 转换后的文件,设定编码为 utf-8
   out = new FileOutputStream( tmpNewFile ); // 需要将文件转换为字符流
    OutputStreamWriter osw = new OutputStreamWriter( out , "UTF-8"); // 指定目标编码为 utf-8
    osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
   
    char[] buffer = new char[10240];      // 文件缓冲区
    int len = 0;           // 使用字符读取方式,循环读取源文件内容
    while( (len = isr.read(buffer)) !=-1 )     // 转换后写入目标文件中
    {
     osw.write( buffer, 0, len);
    }
    osw.close();           // 转换完成
    isr.close();
    out.close();
    in.close();
   
    if( log.isLoggable( Level.INFO)){
     log.info("HTML 文档转 UTF-8 编码完成!");
    }

//设置tidy
    Tidy tidy = new Tidy();
//  Set file for error messages
    tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
//  Tell Tidy to convert HTML to XML
 tidy.setXmlOut(true);  
 tidy.setInputEncoding("UTF-8");
 FileInputStream in0 = new FileInputStream( tmpNewFile );
 FileOutputStream out0 = new FileOutputStream(outFileName);  
 

//Convert files
tidy.parse(in0, out0);

//Clean up
in.close();
out.close();
tmpNewFile.delete();         // 删除临时文件

} catch (IOException e) {
System.out.println(this.toString() + e.toString());
}
}

public static void main(String[] args) {
/*
* Parameters are:
* URL of HTML file
* Filename of output file
* Filename of error file
*/

String u="http://www.baidu.com/";
String o="index.xml";
String e="error.xml";

xml t = new xml(u, o, e);
 t.convert();
System.out.println("OK!");

 }
}


 

posted on 2009-01-19 12:21  靳小透  阅读(1873)  评论(0)  编辑  收藏  举报