package net.sf.jpluck.apps.cmdline;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import net.sf.jpluck.apps.ExitCodes;
import net.sf.jpluck.apps.OptionsUtil;
import net.sf.jpluck.http.HttpClient;
import net.sf.jpluck.http.HttpResponse;
import net.sf.jpluck.spider.ContentType;
import net.sf.jpluck.spider.Resource;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.w3c.dom.Document;
/**
 * Command-line tool that loads an HTML document from a local file or an
 * <code>http://</code> URL, parses it into a DOM {@link Document} via
 * {@link Resource#parseHTML()} and serializes that document to an XML file.
 *
 * <p>Usage: <code>java -jar html2xml.jar &lt;HTML filename or URL&gt; &lt;XML filename&gt;</code>.
 * The <code>encoding</code> option selects the character encoding of the
 * written XML file and defaults to ISO-8859-1.</p>
 */
public class HTML2XML {
    private static final String VERSION = "1.0";
    private static final String RELEASE_DATE = "2004-02-07";

    /**
     * Program entry point.
     *
     * <p>Handles the <code>help</code> and <code>version</code> options (both exit
     * with {@link ExitCodes#OK}), validates that exactly two positional
     * arguments were given (otherwise exits with
     * {@link ExitCodes#ERROR_INVALID_NUMBER_OF_ARGUMENTS}), then reads,
     * parses and writes the document.</p>
     *
     * @param args command-line arguments: options followed by the HTML source
     *             (file name or <code>http://</code> URL) and the XML target file name
     * @throws Exception if option parsing, reading, downloading, HTML parsing
     *                   or XML serialization fails
     */
    public static void main(String[] args) throws Exception {
        Options options = OptionsUtil.createGeneric();
        options.addOption("encoding", true, "Character encoding for XML file. Default: ISO-8859-1.");

        GnuParser parser = new GnuParser();
        CommandLine cl = parser.parse(options, args);
        if (cl.hasOption("help")) {
            String usage = "java -jar html2xml.jar <HTML filename or URL> <XML filename> ";
            String description = "Converts HTML to well-formed XML using JTidy.";
            OptionsUtil.printHelp(usage, description, options);
            System.exit(ExitCodes.OK);
        } else if (cl.hasOption("version")) {
            System.out.println("HTML2XML " + VERSION + " (" + RELEASE_DATE + ")");
            System.exit(ExitCodes.OK);
        }

        // From here on, args holds only the positional (non-option) arguments.
        args = cl.getArgs();
        if (args.length != 2) {
            System.err.println("ERROR: Invalid number of arguments " + args.length);
            System.exit(ExitCodes.ERROR_INVALID_NUMBER_OF_ARGUMENTS);
        }
        String html = args[0];
        String xml = args[1];
        String encoding = cl.getOptionValue("encoding", "ISO-8859-1");

        byte[] data;
        if (html.startsWith("http://")) {
            System.out.println("Downloading " + html);
            HttpClient httpClient = new HttpClient();
            HttpResponse response = httpClient.doGet(html);
            data = response.getContent();
        } else {
            File source = new File(html);
            System.out.println("Reading " + source.getAbsolutePath());
            data = readFile(source);
        }

        System.out.println("Parsing HTML");
        Resource resource = new Resource(html, null, new ContentType("text/html"), data, 0, false);
        Document document = resource.parseHTML();

        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        File file = new File(xml);
        System.out.println("Writing " + file.getAbsolutePath());
        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
        transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.transform(new DOMSource(document), new StreamResult(file));
        System.out.println("Done!");
    }

    /**
     * Reads the complete contents of a file into a byte array.
     *
     * @param file the file to read
     * @return the file's bytes
     * @throws IOException if the file is too large for a single array, cannot
     *                     be opened or read, or ends before the number of bytes
     *                     reported by {@link File#length()} has been read
     */
    private static byte[] readFile(File file) throws IOException {
        long length = file.length();
        if (length > Integer.MAX_VALUE) {
            // A plain (int) cast would silently wrap to a wrong or negative size.
            throw new IOException("File too large to read into memory: " + file.getAbsolutePath());
        }
        byte[] data = new byte[(int) length];
        FileInputStream in = new FileInputStream(file);
        try {
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full.
            int offset = 0;
            while (offset < data.length) {
                int count = in.read(data, offset, data.length - offset);
                if (count < 0) {
                    throw new EOFException("Unexpected end of file after " + offset + " of "
                            + data.length + " bytes: " + file.getAbsolutePath());
                }
                offset += count;
            }
        } finally {
            // Close the stream even when reading fails.
            in.close();
        }
        return data;
    }
}