HTML to clean XML conversion

Convert HTML to XML using JTidy:
package testhtml2xml;
import java.net.URL;
import java.io.*;
import org.w3c.tidy.Tidy;
/**
*
* @author Akshay
*/
public class TestHTML2XML {

/**
* @param args the command line arguments
*/

private String url;
private String outFileName;
private String errOutFileName;

public TestHTML2XML(String url, String outFileName, String errOutFileName) {
this.url = url;
this.outFileName = outFileName;
this.errOutFileName = errOutFileName;
}

@Override
public void run() {
URL u;
BufferedInputStream in;
FileOutputStream out;

Tidy tidy = new Tidy();

//Tell Tidy to convert HTML to XML
//tidy.setIndentCdata(true);
// tidy.setPrintBodyOnly(true);
//tidy.setFixUri(true);
//tidy.setHideComments(true);
tidy.setMakeClean(true);
// System.out.println("here");
tidy.setForceOutput(true);
tidy.setXmlOut(true);
System.out.println("Done: "+this.url);
try {
//Set file for error messages
tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
u = new URL(url);

//Create input and output streams
in = new BufferedInputStream(u.openStream());
out = new FileOutputStream(outFileName);

//Convert files
tidy.parse(in, out);

//Clean up
in.close();
out.close();

} catch (IOException e) {
System.out.println(this.toString() + e.toString());
}
}

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s