使用 JTidy 将 HTML 转换为 XML
方法一:目前无法解决乱码问题
package spide;
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.IOException;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import java.io.FileOutputStream;
/**
* A sample DOM writer. This sample program illustrates how to
* traverse a DOM tree in order to print a document that is parsed.
*
*/
public class TestDOM {
protected PrintWriter out;
public TestDOM() {
try
{
FileOutputStream outxml=new FileOutputStream("D:/test.xml");
out = new PrintWriter(outxml);
}
catch(Exception e)
{
e.printStackTrace();
}
}
/** Prints the specified node, recursively. */
public void print(Node node) {
if ( node == null ) {
return;
}
int type = node.getNodeType();
switch ( type ) {
case Node.DOCUMENT_NODE:
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.IOException;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import java.io.FileOutputStream;
/**
* A sample DOM writer. This sample program illustrates how to
* traverse a DOM tree in order to print a document that is parsed.
*
*/
public class TestDOM {
protected PrintWriter out;
public TestDOM() {
try
{
FileOutputStream outxml=new FileOutputStream("D:/test.xml");
out = new PrintWriter(outxml);
}
catch(Exception e)
{
e.printStackTrace();
}
}
/** Prints the specified node, recursively. */
public void print(Node node) {
if ( node == null ) {
return;
}
int type = node.getNodeType();
switch ( type ) {
case Node.DOCUMENT_NODE:
out.println("<?xml version=\"1.0\" encoding=\"GBK\"?>");
print(((Document)node).getDocumentElement());
out.flush();
break;
case Node.ELEMENT_NODE:
out.print('<');
out.print(node.getNodeName());
NamedNodeMap attrs = node.getAttributes();
for ( int i = 0; i < attrs.getLength(); i++ ) {
out.print(' ');
out.print(attrs.item(i).getNodeName());
out.print("=\"");
out.print(attrs.item(i).getNodeValue());
out.print('"');
}
out.print('>');
out.println(); // HACK
NodeList children = node.getChildNodes();
if ( children != null ) {
int len = children.getLength();
for ( int i = 0; i < len; i++ ) {
print(children.item(i));
}
}
break;
case Node.TEXT_NODE:
out.print(node.getNodeValue());
break;
}
if ( type == Node.ELEMENT_NODE ) {
out.print("</");
out.print(node.getNodeName());
out.print('>');
out.println(); // HACK
}
out.flush();
}
public static void main(String args[]) {
String conf="D:/tidy.properties";
FileInputStream in;
print(((Document)node).getDocumentElement());
out.flush();
break;
case Node.ELEMENT_NODE:
out.print('<');
out.print(node.getNodeName());
NamedNodeMap attrs = node.getAttributes();
for ( int i = 0; i < attrs.getLength(); i++ ) {
out.print(' ');
out.print(attrs.item(i).getNodeName());
out.print("=\"");
out.print(attrs.item(i).getNodeValue());
out.print('"');
}
out.print('>');
out.println(); // HACK
NodeList children = node.getChildNodes();
if ( children != null ) {
int len = children.getLength();
for ( int i = 0; i < len; i++ ) {
print(children.item(i));
}
}
break;
case Node.TEXT_NODE:
out.print(node.getNodeValue());
break;
}
if ( type == Node.ELEMENT_NODE ) {
out.print("</");
out.print(node.getNodeName());
out.print('>');
out.println(); // HACK
}
out.flush();
}
public static void main(String args[]) {
String conf="D:/tidy.properties";
FileInputStream in;
Tidy tidy = new Tidy();
tidy.setConfigurationFromFile(conf);
TestDOM t = new TestDOM();
try {
in = new FileInputStream("D:/speed.html");
tidy.setConfigurationFromFile(conf);
TestDOM t = new TestDOM();
try {
in = new FileInputStream("D:/speed.html");
tidy.setMakeClean(true);
tidy.setXmlTags(true);
t.print(tidy.parseDOM(in, null));
}
catch ( IOException e ) {
System.err.println( e.toString() );
}
}
}
tidy.setXmlTags(true);
t.print(tidy.parseDOM(in, null));
}
catch ( IOException e ) {
System.err.println( e.toString() );
}
}
}
方法二:可以解决乱码,但解析时出现“White spaces are required between publicId and systemId”错误
package spide;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import org.w3c.tidy.Tidy;
public class Test17 implements Runnable {
private String srcFileName;
private String outFileName;
private String errOutFileName;
private String configFileName;
public class Test17 implements Runnable {
private String srcFileName;
private String outFileName;
private String errOutFileName;
private String configFileName;
public Test17(String srcFileName, String outFileName,??? String confName) {
this.srcFileName = srcFileName;
this.outFileName = outFileName;
this.configFileName= confName;
}
this.srcFileName = srcFileName;
this.outFileName = outFileName;
this.configFileName= confName;
}
public void run() {
BufferedInputStream in;
FileOutputStream out;
Tidy tidy = new Tidy();
tidy.setConfigurationFromFile(configFileName);
try {
// tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
in = new BufferedInputStream(new FileInputStream(srcFileName));
out = new FileOutputStream(outFileName);
String head = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
byte[] bytes = head.getBytes();
out.write(bytes, 0, bytes.length);
BufferedInputStream in;
FileOutputStream out;
Tidy tidy = new Tidy();
tidy.setConfigurationFromFile(configFileName);
try {
// tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
in = new BufferedInputStream(new FileInputStream(srcFileName));
out = new FileOutputStream(outFileName);
String head = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
byte[] bytes = head.getBytes();
out.write(bytes, 0, bytes.length);
tidy.parse(in, out);
} catch (IOException e) {
System.out.println(this.toString() + e.toString());
}
}
} catch (IOException e) {
System.out.println(this.toString() + e.toString());
}
}
public static void main(String[] args) {
String src="D:/speed.html";
String out="D:/result.xml";
String err="D:/err.txt";
String conf="D:/tidy.properties";
Test17 t1 = new Test17(src,out,conf);
Thread th1 = new Thread(t1);
th1.start();
}
}
String src="D:/speed.html";
String out="D:/result.xml";
String err="D:/err.txt";
String conf="D:/tidy.properties";
Test17 t1 = new Test17(src,out,conf);
Thread th1 = new Thread(t1);
th1.start();
}
}