Package net.sf.jpluck.apps.cmdline

Source Code of net.sf.jpluck.apps.cmdline.HTML2XML

package net.sf.jpluck.apps.cmdline;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import net.sf.jpluck.apps.ExitCodes;
import net.sf.jpluck.apps.OptionsUtil;
import net.sf.jpluck.http.HttpClient;
import net.sf.jpluck.http.HttpResponse;
import net.sf.jpluck.spider.ContentType;
import net.sf.jpluck.spider.Resource;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.w3c.dom.Document;

public class HTML2XML {
  private static final String VERSION = "1.0";
  private static final String RELEASE_DATE = "2004-02-07";

  public static void main(String[] args) throws Exception {
    Options options = OptionsUtil.createGeneric();
    options.addOption("encoding", true, "Character encoding for XML file. Default: ISO-8859-1.");   
    GnuParser parser = new GnuParser();
    CommandLine cl = parser.parse(options, args);
    if (cl.hasOption("help")) {
      String usage = "java -jar html2xml.jar <HTML filename or URL> <XML filename> ";
      String description = "Converts HTML to well-formed XML using JTidy.";
      OptionsUtil.printHelp(usage, description, options);
      System.exit(ExitCodes.OK);
    } else if (cl.hasOption("version")) {
      System.out.println("HTML2XML " + VERSION + " (" + RELEASE_DATE + ")");
      System.exit(ExitCodes.OK);
    }
    args=cl.getArgs();
    if (args.length != 2) {
      System.err.println("ERROR: Invalid number of arguments " + args.length);
      System.exit(ExitCodes.ERROR_INVALID_NUMBER_OF_ARGUMENTS);
    }
   
    String html = cl.getArgs()[0];
    String xml = cl.getArgs()[1];
    String encoding = cl.getOptionValue("encoding", "ISO-8859-1");

    byte[] data = null;
    if (html.startsWith("http://")) {
      System.out.println("Downloading " + html);
      HttpClient httpClient = new HttpClient();
      HttpResponse response = httpClient.doGet(html);
      data = response.getContent();     
    } else {
      File file = new File(html);
      System.out.println("Reading " + file.getAbsolutePath());
      data = new byte[(int)file.length()];
      FileInputStream in = new FileInputStream(file);
      in.read(data);
      in.close();
    }
    System.out.println("Parsing HTML");
    Resource resource = new Resource(html, null, new ContentType("text/html"), data, 0, false);
    Document document = resource.parseHTML();
    Transformer transformer = TransformerFactory.newInstance().newTransformer();
    File file = new File(xml);
    System.out.println("Writing " + file.getAbsolutePath());
    transformer.setOutputProperty(OutputKeys.METHOD, "xml");
    transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
    transformer.setOutputProperty(OutputKeys.INDENT, "yes");   
    transformer.transform(new DOMSource(document), new StreamResult(file));
    System.out.println("Done!");
  }
}
TOP

Related Classes of net.sf.jpluck.apps.cmdline.HTML2XML

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.