// Source Code of winterwell.utils.web.WebUtils

/*
 * (c) Winterwell Associates Ltd, 2008-2011
 * All rights reserved except.
 */
package winterwell.utils.web;


import java.awt.Color;
import java.awt.Desktop;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;


import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;


import winterwell.utils.FailureException;
import winterwell.utils.IORException;
import winterwell.utils.NotUniqueException;
import winterwell.utils.Printer;
import winterwell.utils.Process;
import winterwell.utils.StrUtils;
import winterwell.utils.TodoException;
import winterwell.utils.Utils;
import winterwell.utils.containers.ArrayMap;
import winterwell.utils.containers.Containers;
import winterwell.utils.containers.ITree;
import winterwell.utils.containers.Tree;
import winterwell.utils.gui.GuiUtils;
import winterwell.utils.io.FileUtils;
import winterwell.utils.reporting.Log;
import winterwell.utils.time.TUnit;


/**
 * Web and xml-related utils.
 * 
 * @author daniel
 * @testedby {@link WebUtilsTest}
 */
public class WebUtils {


  /**
   * Shared factory used by {@link #parseXml(String)}. Starts out null and is
   * created and configured on first use by {@link #parseXml2_getFactory()}.
   */
  private static DocumentBuilderFactory docBuilderFactory;


  /**
   * Matches four dot-separated groups of 1 to 3 digits. The groups are not
   * range-checked, so e.g. 999.999.999.999 also matches.
   */
  static final Pattern IP4_ADDRESS = Pattern
      .compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}");


  /**
   * Urls for jQuery UI, as a two-element array: [js, css]
   */
  public static final String[] JQUERY_UI_URLS = new String[] {
      "http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.5/jquery-ui.min.js",
      "http://soda.sh/static/style/jquery-ui-1.8.5.custom.css" // Bleurgh - a sodash url
  };


  /**
   * Url for the (minified) jQuery library.
   */
  public static final String JQUERY_URL = "http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js";


  /**
   * Mime type for html pages.
   */
  public static final String MIME_TYPE_HTML = "text/html";
  /**
   * Mime type for JavaScript: application/javascript. Note: text/javascript
   * is officially obsolete but still the most widely used.
   */
  public static final String MIME_TYPE_JAVASCRIPT = "application/javascript";
  /** Mime type for JSON. */
  public static final String MIME_TYPE_JSON = "application/json";
  /** Mime type for multipart/alternative messages. */
  public static final String MIME_TYPE_MULTIPART_ALT = "multipart/alternative";
  /** Mime type for multipart/mixed messages. */
  public static final String MIME_TYPE_MULTIPART_MIXED = "multipart/mixed";


  /** Mime type for RSS feeds. */
  public static final String MIME_TYPE_RSS = "application/rss+xml";


  /**
   * Plain text, utf-8 encoded
   */
  public static final String MIME_TYPE_TXT_UTF8 = "text/plain; charset=UTF-8";


  /** Mime type for xml documents. */
  public static final String MIME_TYPE_XML = "application/xml";


  /**
   * Matches an xml comment - including some bad versions. The pattern is
   * deliberately loose: "&lt;!", any number of dashes, then the shortest run
   * of text (newlines included) that ends in one or more dashes followed by
   * ">".
   */
  public static final Pattern pComment = Pattern.compile("<!-*.*?-+>",
      Pattern.DOTALL);


  /**
   * Matches a doctype element (case-insensitive, may span several lines).
   */
  public static final Pattern pDocType = Pattern.compile("<!DOCTYPE.*?>",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);


  /**
   * Matches href="whatever" and variants (single-quoted or unquoted,
   * case-insensitive). Group 1 is the href value without quotes.
   */
  public static final Pattern pHref = Pattern.compile(
      "href=['\"]?([^'\"> \r\n\t]+)['\"]?", Pattern.CASE_INSENSITIVE);
  /**
   * Used in strip tags to get rid of scripts and css style blocks altogether.
   * Note: the body must contain at least one character, so an empty
   * script/style element is not matched by this pattern.
   */
  public static final Pattern pScriptOrStyle = Pattern.compile(
      "<(script|style)[^<>]*>.+?</(script|style)>",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);


  /**
   * Shared SAX factory. Starts out null; created and configured on first use
   * by {@link #getXMLReader()}.
   */
  static SAXParserFactory saxParserFactory = null;


  /**
   * Matches an xml tag, e.g. &lt;a>, &lt;br/>, or &lt;/a>. Group 1 is the tag
   * name, with a leading "/" for closing tags.
   */
  public static final Pattern TAG_REGEX = Pattern.compile(
      "<(/?[a-zA-Z][a-zA-Z0-9]*)[^>]*>", Pattern.DOTALL);


  /**
   * Matches urls. Note: Excludes any trailing . (the final character must be
   * a letter, digit or one of _ % - &amp; / = +, so trailing punctuation such
   * as . , ? ! is left out of the match).
   * 
   * @testedby {@link WebUtilsTest#testUrlRegex()}
   */
  public static final Pattern URL_REGEX = Pattern
      .compile("[hf]tt?ps?://[a-zA-Z0-9_%\\-\\.,\\?&\\/=\\+'~#!\\*:]+[a-zA-Z0-9_%\\-&\\/=\\+]");


  /**
   * Note: XPaths are not thread safe, so best to create new ones as needed
   */
  public static final XPathFactory XPATH_FACTORY = XPathFactory.newInstance();


  public static List<Node> asList(final NodeList scripts) {
    return new AbstractList<Node>() {
      @Override
      public Node get(int index) {
        return scripts.item(index);
      }


      @Override
      public int size() {
        return scripts.getLength();
      }
    };
  }


  public static Map<String, String> asMap(NamedNodeMap nnMap) {
    int n = nnMap.getLength();
    Map<String, String> map = new ArrayMap<String, String>(n);
    for (int i = 0; i < n; i++) {
      Node item = nnMap.item(i);
      String name = item.getNodeName();
      String val = item.getNodeValue();
      String txt = item.getTextContent();
      map.put(name, val);
    }
    return map;
  }


  /**
   * Encode text so that it can be used as the value of an XML attribute. Does
   * not add surrounding quote marks.
   * 
   * @param text
   *            Can be null, which will return as the empty string.
   * @return
   */
  public static String attributeEncode(String text) {
    if (text == null)
      return "";
    StringBuilder sb = new StringBuilder(text.length() + 5);
    attributeEncode(sb, text);
    return sb.toString();
  }


  /**
   * Encode text so that it can be used as the value of an XML attribute. The
   * W3 spec is a little hazy on this (no, really). We encode ', &quot; and
   * &amp;. All other chars are left alone. Does not add surrounding quote
   * marks.
   * 
   * @param helpText
   */
  public static void attributeEncode(StringBuilder out, CharSequence text) {
    for (int i = 0, len = text.length(); i < len; i++) {
      char c = text.charAt(i);
      if (c == '\'') {
        out.append("&#39;"); // alt &#x27; but NOT &apos;
      } else if (c == '"') {
        out.append("&quot;"); // alt &#x22;
      } else if (c == '&') {
        out.append("&amp;");
      } else {
        out.append(c);
      }
    }
  }


  /**
   * Convert a color into an html code
   * 
   * @param col
   * @return E.g. "#ff0000" for Color.RED, "rgba(0,255,0,128)" for transparent
   *         green. Will always use the #hex form if there is no transparency
   *         (ie alpha=255)
   * @see GuiUtils#getColor(String)
   */
  public static String color2html(Color col) {
    StringBuilder html = new StringBuilder(7);
    int r = col.getRed();
    int g = col.getGreen();
    int b = col.getBlue();
    // alpha
    int a = col.getAlpha();
    if (a != 255) {
      float af = a / 255.0f;
      // this should work in CSS
      html.append("rgba(" + r + "," + g + "," + b + "," + af + ")");
      return html.toString();
    }
    html.append('#');
    color2html2_hex(r, html);
    color2html2_hex(g, html);
    color2html2_hex(b, html);
    return html.toString();
  }


  private static void color2html2_hex(int r, StringBuilder html) {
    String hr = Integer.toHexString(r);
    if (hr.length() == 1) {
      html.append('0');
    } else {
      assert hr.length() == 2;
    }
    html.append(hr);
  }


  /**
   * Try to get the server name or IP address for a site. Linux only at
   * present!
   * 
   * @param site
   *            alias or an IP address
   * @param returnIP
   *            If true, return an IPv4 address. If false, look for a
   *            server-name.
   * @return IP or server name. Never null (exception on failure)
   * @testedby {@link WebUtilsTest#testGetIPof()} WARNING: this can fail
   *           sometimes for no good reason that I can see. Retrying usually
   *           succeeds.
   */
  public static String dig(String site, boolean returnIP) {
    assert site != null;
    if (!Utils.OSisUnix())
      throw new TodoException();
    // Are you after a reverse lookup for a name?
    String x = "";
    if (!returnIP && IP4_ADDRESS.matcher(site).matches()) {
      x = "-x ";
    }
    Process p = new Process("dig +short " + x + site);
    p.run();
    p.waitFor(5000); // this should be fast -- 5 seconds allows aeons of
              // time
    String out = p.getOutput();


    // look for an IPv4 address?
    if (returnIP) {
      Matcher m = IP4_ADDRESS.matcher(out);
      if (m.find())
        return m.group();
      throw new FailureException("Couldn't find IP address for " + site
          + " in " + out);
    }


    // look for a name
    String[] bits = StrUtils.splitLines(out);
    String ip = null;
    for (String string : bits) {
      if (string.isEmpty()) {
        continue;
      }
      if (IP4_ADDRESS.matcher(string).matches()) {
        ip = string;
        continue;
      }
      if (string.endsWith(".")) {
        string = string.substring(0, string.length() - 1);
      }
      return string;
    }


    // try a reverse lookup
    if (ip == null)
      throw new FailureException("Couldn't find server name or ip for "
          + site + " in [" + out + "] " + p.getError());
    return dig(ip, false);
  }


  public static void display(File file) {
    display(file.toURI());
  }


  /**
   * Open an html page in a web browser.
   * 
   * @param page
   *            This is an HTML page. It is not a url!
   */
  public static void display(String page) {
    try {
      File f = File.createTempFile("temp", ".html");
      FileUtils.write(f, page);
      display(f);
      // f.deleteOnExit(); bad idea
    } catch (IOException e) {
      throw new IORException(e);
    }
  }


  /**
   * Open a URI in a system web browser.
   * 
   * If {@link GuiUtils#isInteractive()} is false, this will return
   * immediately.
   * 
   * FIXME On Ubuntu, this leads to a daemon thread that can prevent programs
   * terminating until the browser closes.
   */
  public static void display(URI uri) {
    if (!GuiUtils.isInteractive())
      return;
    try {
      Desktop d = Desktop.getDesktop();
      d.browse(uri);
    } catch (UnsupportedOperationException ex) {
      // KDE isn't supported :(
      if (Utils.getOperatingSystem().contains("linux")
          || Utils.getOperatingSystem().contains("unix")) {
        Process p = new Process("xdg-open " + uri);
        p.run();
      } else
        throw ex;
    } catch (IOException e) {
      throw new IORException(e);
    }
  }


  /**
   * @param tag
   *            E.g. "div" Can be badly formed xml
   * @param includeTag
   *            If false, return just the tag's content text (not the tag or
   *            the attributes). Extract all instances of tag from the xml
   *            page. Does NOT cope with nested tags of the same type. Does
   *            NOT cope with self-closing tags that lack the /> ending
   * @return instances of tag. May be empty, never null
   */
  public static List<String> extractXmlTags(String tag, String xml,
      boolean includeTag) {
    // Open-close
    Pattern p = Pattern.compile("<" + tag + "(\\s+[^>]*)?>(.*?)</" + tag
        + ">", Pattern.DOTALL);
    Matcher m = p.matcher(xml);
    List<String> list = new ArrayList<String>();
    while (m.find()) {
      list.add(includeTag ? m.group() : m.group(2));
    }
    // One tag (e.g. <img />
    if (includeTag) {
      p = Pattern.compile("<" + tag + "[^>]*/>");
      m = p.matcher(xml);
      while (m.find()) {
        list.add(m.group());
      }
    }
    return list;
  }


  public static List<String> extractXmlTagsSelfClosing(String tag, String xml) {
    // Open-close
    Pattern p = Pattern.compile("<" + tag + "[^>]*>", Pattern.DOTALL
        | Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(xml);
    List<String> list = new ArrayList<String>();
    while (m.find()) {
      list.add(m.group());
    }
    return list;
  }


  /**
   * Convenience for accessing an attribute value (the Node interface is
   * rather ugly)
   * 
   * @param attribute
   * @param node
   * @return value or null
   */
  public static String getAttribute(String attribute, Node node) {
    NamedNodeMap att = node.getAttributes();
    Node w = att.getNamedItem(attribute);
    if (w == null)
      return null;
    return w.getTextContent();
  }


  /**
   * FIXME problems in the following situation:
   * getAttribute("href","<a href=abc.com >woo</a>")
   * 
   * Crude XML tag parser: rips the attribute value out using a text scan. NB:
   * Attributes must be quoted
   * 
   * @param attribute
   * @param tag
   * @return attribute-value or null
   */
  public static String getAttribute(String attribute, String tag) {
    attribute += '=';
    int i = tag.indexOf(attribute);
    if (i == -1)
      return null;
    i += attribute.length();
    if (i == tag.length())
      return null;
    char q = tag.charAt(i);
    if (q == '"' || q == '\'') {
      for (int j = i + 1; j < tag.length(); j++) {
        char c = tag.charAt(j);
        // FIXME escaping chars
        if (c == q)
          return tag.substring(i + 1, j);
      }
    } else {
      // unquoted
      for (int j = i; j < tag.length(); j++) {
        char c = tag.charAt(j);
        if (Character.isWhitespace(c))
          return tag.substring(i, j);
      }
    }
    throw new IllegalArgumentException(tag + " is not valid xml");
  }


  /**
   * Try to get the IP address(es) of the local machine. Unix only at present.
   * 
   * @return
   * @testedby {@link WebUtilsTest#testGetMyIP()}
   */
  public static List<String> getMyIP() {
    if (Utils.OSisUnix()) {
      Process p = new Process("ifconfig");
      try {
        p.run();
        p.waitFor(2000);
        String out = p.getOutput();
        Matcher m = IP4_ADDRESS.matcher(out);
        ArrayList<String> ips = new ArrayList<String>();
        while (m.find()) {
          ips.add(m.group());
        }
        return ips;
      } catch (Exception e) {
        // ifconfig failed?!
        Log.report(e + " " + p.getError(), Level.SEVERE);
        return new ArrayList();
      }
    }
    throw new TodoException();
  }


  /**
   * The crudest possible http-get.
   * 
   * @see winterwell.web.FakeBrowser Or Apache's http client
   * @param url
   * @return
   */
  public static String getPage(String url) {
    try {
      HttpURLConnection connection = (HttpURLConnection) new URL(url)
          .openConnection();
      InputStream in = connection.getInputStream();
      String html = FileUtils.read(in);
      return html;
    } catch (Exception ex) {
      throw Utils.runtime(ex);
    }
  }


  /**
   * Create a new XMLReader from a shared, lazily-configured
   * {@link SAXParserFactory}.
   * <p>
   * On the first call the factory is set up and cached in the static
   * saxParserFactory field: the Piccolo factory class is tried first (by
   * reflection, so it is an optional dependency), falling back to the JDK
   * default. Validation and namespace-awareness are switched off, and
   * several DTD-related features are switched off where the parser accepts
   * that. Note that this lazy set-up is not synchronized.
   * <p>
   * Every reader returned resolves all external entities to an empty
   * document, so DTDs are never fetched.
   * 
   * @return an XMLReader. Tries to construct a fault-tolerant fast xml reader
   *         (e.g. switches off DTD loading). This has implications for
   *         entities! Which might not get properly supported!
   * @throws RuntimeException
   *             (via Utils.runtime) if a parser/reader cannot be created
   * @testedby {@link WebUtilsTest#testGetXMLReader()}
   */
  public static XMLReader getXMLReader() {
    if (saxParserFactory == null) {
      // Try to avoid using the built-in
      // Which will probably be Xerces, which is fussy (chokes on valid
      // xml, such as BBC RSS feeds)
      // & slow (always downloads the DTD).
      for (String klassName : new String[] { "com.bluecast.xml.JAXPSAXParserFactory" // Piccolo
      // ,"com.ctc.wstx.sax.WstxSAXParserFactory" // Woodstox
      }) {
        try {
          Class<?> klass = Class.forName(klassName);
          saxParserFactory = (SAXParserFactory) klass.newInstance();
        } catch (Exception e) {
          // oh well - this parser is not on the classpath (or would not
          // instantiate), so move on
        }
      }
      // oh well - fall back to built-in (Xerces)
      if (saxParserFactory == null) {
        saxParserFactory = SAXParserFactory.newInstance();
      }


      // keep it simple
      saxParserFactory.setValidating(false);
      saxParserFactory.setNamespaceAware(false);
      // hopeful switch-offs -- cf
      // http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
      for (String badFeature : new String[] {
          "http://xml.org/sax/features/namespaces",
          "http://xml.org/sax/features/validation",
          "http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
          "http://apache.org/xml/features/nonvalidating/load-external-dtd" }) {
        try {
          saxParserFactory.setFeature(badFeature, false);
        } catch (Exception e) {
          // oh well - this factory does not accept the feature; these
          // are best-effort only
        }
      }
      // saxParserFactory.setXIncludeAware(false); // throws an error!
      // Default is false anyway
      Log.report("Using SAX parser " + saxParserFactory);
    }
    try {
      // A fresh parser and reader for each call; only the factory is shared
      XMLReader reader = saxParserFactory.newSAXParser().getXMLReader();
      // Hopefully switch off DTD -- cf
      // http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
      reader.setEntityResolver(new EntityResolver() {
        @Override
        public InputSource resolveEntity(String publicId,
            String systemId) throws SAXException, IOException {
          // Pretend every external entity is an empty document
          return new InputSource(new StringReader(""));
          // return null; ??
        }
      });
      return reader;
    } catch (Exception e) {
      throw Utils.runtime(e);
    }
  }


  /**
   * @param v
   *            Can be null (returns null)
   * @deprecated Use CGIUtils.htmlEncode() instead (which wraps a Jakarta
   *             StringEscapeUtils library). This method mainly exists as a
   *             reminder! It does provide safety encoding: <>s to protect
   *             against injection attacks
   */
  @Deprecated
  public static String htmlEncode(String v) throws TodoException {
    if (v == null)
      return null;
    v = v.replace("<", "&lt;");
    v = v.replace(">", "&gt;");
    return v;
  }


  public static void main(String[] args) {
    if (args == null || args.length == 0) {
      Printer.out("Usage: java -jar winterwell.utils.jar COMMAND PARAMS");
      Printer.out("Where COMMAND = render, PARAMS = html-input-file pdf-output-file ");
      System.exit(0);
    }
    String cmd = args[0];
    assert cmd.equals("render");
    String html = FileUtils.read(new File(args[1]));
    File pdf = new File(args[2]);
    renderToPdf(html, pdf, null);
    Printer.out("Rendered pdf at: " + pdf);
  }


  /**
   * 
   * @param xml
   * @param namespaceAware
   * @return
   * @testedby {@link WebUtilsTest#testParseXml()}
   */
  public static Document parseXml(String xml) {
    // // ??Pop the first line if its a DTD spec
    // // This is to prevent the baked in xerces behaviour of making a web
    // call,
    // // then throwing an exception unless that web call succeeds
    // if (xml.startsWith("<!DOCTYPE")) {
    // int i = xml.indexOf('<', 1);
    // if (i != -1) {
    // xml = xml.substring(i);
    // }
    // }
    // But then we get other exceptions - with undeclared entities :(
    // TODO find a decent xml parser
    parseXml2_getFactory();


    try {
      DocumentBuilder builder = docBuilderFactory.newDocumentBuilder();
      InputSource input = new InputSource(new StringReader(xml));
      Document doc = builder.parse(input);
      return doc;
    } catch (Exception e) {
      Log.report(e + " with " + docBuilderFactory); // why are these
                              // things broken?
      throw Utils.runtime(e);
    }
  }


  /**
   * Set up the shared docBuilderFactory, if that has not already been done.
   * Does nothing if the factory already exists.
   * <p>
   * The list of alternative parser classes to try is currently empty (all
   * candidates are commented out), so the JDK default factory is always
   * used. It is configured as non-validating and not namespace-aware, and
   * several DTD-related features are switched off where the parser accepts
   * that. Note that this lazy set-up is not synchronized.
   */
  private static void parseXml2_getFactory() {
    if (docBuilderFactory != null)
      return;
    // TODO Try to avoid using the built-in!
    // Which will probably be Xerces, which is fussy (chokes on valid xml,
    // such as BBC RSS feeds)
    // & slow (always downloads the DTD).
    // NB: The candidates below are SAX factories and are all commented out,
    // so this loop currently has nothing to iterate over.
    for (String klassName : new String[] {
    // "com.bluecast.xml.JAXPSAXParserFactory" // Piccolo
    // ,"com.ctc.wstx.sax.WstxSAXParserFactory" // Woodstox
    }) {
      try {
        Class<?> klass = Class.forName(klassName);
        docBuilderFactory = (DocumentBuilderFactory) klass
            .newInstance();
      } catch (Exception e) {
        // oh well - not available, move on
      }
    }
    // oh well - fall back to built-in (Xerces)
    if (docBuilderFactory == null) {
      docBuilderFactory = DocumentBuilderFactory.newInstance();
    }


    // keep it simple
    docBuilderFactory.setNamespaceAware(false);
    docBuilderFactory.setValidating(false);
    // hopeful switch-offs -- cf
    // http://stackoverflow.com/questions/155101/make-documentbuilder-parse-ignore-dtd-references
    for (String badFeature : new String[] {
        "http://xml.org/sax/features/namespaces",
        "http://xml.org/sax/features/validation",
        "http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd" }) {
      try {
        docBuilderFactory.setFeature(badFeature, false);
      } catch (Exception e) {
        // oh well - this factory does not accept the feature; these are
        // best-effort only
      }
    }
    // What bit of non-validating doesn't Xerces understand?!
    // factory.setXIncludeAware(false); unnecessary and causes errors
    Log.report("Using XML parser " + docBuilderFactory);
  }


  /**
   * A lighter-weight alternative to using Document and XPath
   * 
   * @param xml
   * @return a tree. The root node has no XMLNode (it is the document
   *         super-node).
   * 
   * @see CGIUtils#parseHtmlToTree(String)
   */
  public static Tree<XMLNode> parseXmlToTree(String xml) {
    XMLReader xmlReader = getXMLReader();
    XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
    xmlReader.setContentHandler(treeBuilder);
    try {
      xmlReader.parse(new InputSource(new StringReader(xml)));
      return treeBuilder.getTree();
    } catch (Exception e) {
      throw Utils.runtime(e);
    }
    // // Do it dirty
    // treeBuilder = new XmlTreeBuilder();
    // DirtyXmlReader xmlReader2 = new DirtyXmlReader();
    // xmlReader2.setContentHandler(treeBuilder);
    // xmlReader2.parse(new StringReader(xml));
    // return treeBuilder.getTree();
  }


  /**
   * This relies on: linux, and wkhtmltopdf being installed and on the path.
   * 
   * @param html
   * @param file
   *            target pdf file
   * @param waitFor
   *            null or "--window-status FLOT_DONE", "--javascript-delay 2000"
   *            FIXME these options seems to be buggy (Kubuntu, wkhtmltopdf
   *            0.11.0rc1)
   * 
   * @testedby {@link WebUtilsTest#testRenderToPdf()}
   */
  public static void renderToPdf(String html, File file, String waitFor) {
    File temp1 = null;
    try {
      temp1 = File.createTempFile("page", ".html");
      FileUtils.write(temp1, html);
      assert temp1.exists();


      // 1. Render HTML to PDF with wkhtmltopdf
      // The horrendous 7 second delay is to allow time for ajax to run,
      // even on a busy server (2 seconds was too little on egan).
      Process p = new Process("wkhtmltopdf "
          + (waitFor == null ? "" : waitFor) + " "
          + temp1.getAbsolutePath() + " " + file.getAbsolutePath());
      p.setEcho(true);
      p.run();
      int done = p.waitFor(TUnit.MINUTE.getMillisecs());


      if (!file.exists())
        throw new IOException("Failed to create " + file + "\t"
            + p.getError());
      if (p.getOutput().contains("cannot connect to X server")
          || p.getError().contains("cannot connect to X server"))
        throw new IOException(
            "render failed: wkhtmltopdf couldn't connect to an X server");
      // debug spew
      Log.report("html",
          "RenderToPdf: " + p.getOutput() + "\t" + p.getError(),
          Level.FINE);
    } catch (Exception e) {
      throw Utils.runtime(e);
    } finally {
      // clean up
      if (temp1 != null) {
        FileUtils.delete(temp1);
      }
    }
  }


  /**
   * This relies on: linux, and wkhtmltopdf and convert (imagemagick) being
   * installed
   * 
   * @param html
   * @param file
   */
  public static void renderToPng(String html, File file) {
    File temp1 = null;
    try {
      temp1 = File.createTempFile("chart", ".pdf");
      renderToPdf(html, temp1, "--javascript-delay 1000"); // FIXME
      assert temp1.exists() && temp1.length() > 0;


      // 2. Render, trim and convert to PNG with convert
      Process p2 = new Process("convert -trim -antialias -density 300 "
          + temp1.getAbsolutePath() + " " + file.getAbsolutePath());
      p2.run();
      p2.waitFor(TUnit.MINUTE.getMillisecs());


      if (!file.exists())
        throw new IOException("Failed to create " + file + "\t"
            + p2.getError());
    } catch (Exception e) {
      throw Utils.runtime(e);
    } finally {
      // clean up
      if (temp1 != null) {
        FileUtils.delete(temp1);
      }
    }
  }


  /**
   * Convenience for calling {@link URI#resolve(String)}.
   * <p>
   * E.g. "http://winterstein.me.uk"+"images" returns
   * "http://winterstein.me.uk/images"<br>
   * "http://winterstein.me.uk"+"http://google.com" returns
   * "http://google.com"<br>
   * "http://winterstein.me.uk/text"+"/images" returns
   * "http://winterstein.me.uk/images"<br>
   * "http://winterstein.me.uk/text"+"images" returns
   * "http://winterstein.me.uk/images"<br>
   * 
   * This differs from previous methods because it removes "pages" from the
   * base if you don't want the path to be resolved in this (standard) way,
   * then it is your responsibility to ensure that a trailing slash is
   * present. The exception to this is where there is no path e.g.
   * "http://www.google.com" For convenience in these cases a path of "/" will
   * be supplied.
   * 
   * @param base
   *            Typically a website or a directory on a website.
   * @param extension
   *            This may or may not be an extension
   * @return
   */
  public static URI resolveUri(String base, String extension) {
    // if ( ! base.endsWith("/")) base += "/";
    URI b = URI(base);
    URI ext = URI(extension);
    if (Utils.isBlank(b.getPath())) {
      b = b.resolve("/");
    }
    return b.resolve(ext);
  }


  /**
   * Encode text so that it can be used as a string in JavaScript. We encode
   * ', &quot; and &amp;. All other chars are left alone. Does not add
   * surrounding quote marks.
   * 
   * @param msg
   */
  public static String scriptEncode(String msg) {
    // ?? Is this correct?
    return attributeEncode(msg);
  }


  /**
   * Strip any embedded scripts out of an HTML page. Use this to sanitise user
   * content to defend against ajax hacking attacks.
   * <p>
   * WARNING: this is a bit over-zealous and may mangle some innocent text,
   * e.g. "onwards=stuff"
   */
  public static String stripScripts(String xml) {
    // strip script tags
    String noScript = stripTagContents("script", xml);
    // strip onXXX handlers (this is a bit over-zealous)
    noScript = noScript.replaceAll("\\son\\w+=", "");
    // strip href="javascript:..." urls
    noScript = noScript.replaceAll("href='?\"?javascript:", "");
    return noScript;
  }


  /**
   * Strip any embedded CSS out of an HTML page. Call this <i>before</i>
   * stripTags.
   * 
   * @param xml
   */
  public static String stripStyle(String xml) {
    return stripTagContents("style", xml);
  }


  /**
   * Strip out anything embedded in a "tagName" tag from xml. Does not attempt
   * to handle nested tags. Call <i>before</i> calling stripTags.
   */
  public static String stripTagContents(String tagName, String xml) {
    Pattern re = Pattern.compile("<" + tagName + ".*?</" + tagName + ">",
        Pattern.DOTALL);
    return re.matcher(xml).replaceAll(" ");
  }


  /**
   * Remove xml and html tags, e.g. to safeguard against javascript injection
   * attacks, or to get plain text for NLP.
   * 
   * @param xml
   *            can be null, in which case null will be returned
   * @return the text contents - ie input with all tags removed
   * @testedby {@link WebUtilsTest#testStripTags()}
   */
  public static String stripTags(String xml) {
    if (xml == null)
      return null;
    // short cut if there are no tags
    if (xml.indexOf('<') == -1)
      return xml;
    // first all the scripts (cos we must remove the tag contents too)
    Matcher m4 = pScriptOrStyle.matcher(xml);
    xml = m4.replaceAll("");
    // comments
    Matcher m2 = pComment.matcher(xml);
    String txt = m2.replaceAll("");
    // now the tags
    Matcher m = TAG_REGEX.matcher(txt);
    String txt2 = m.replaceAll("");
    Matcher m3 = pDocType.matcher(txt2);
    String txt3 = m3.replaceAll("");
    return txt3;
  }


  public static URI URI(String uri) {
    try {
      return new URI(uri);
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }


  public static URI URI(URL url) {
    try {
      return url.toURI();
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }


  public static URI URI(URLConnection connection) {
    try {
      return connection.getURL().toURI();
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }


  public static String urlDecode(String s) {
    try {
      s = URLDecoder.decode(s, "UTF-8");
    } catch (UnsupportedEncodingException e) {
      s = URLDecoder.decode(s);
    }
    return s;
  }


  /**
   * 
   * @param vars
   * @return query string ripe for appending onto a url. Does not include a ?.
   *         E.g., you could do "http://mysite.com?" + urlEncoded, if you were
   *         so inclined.
   */
  public static String urlEncode(Map vars) {
    StringBuilder encodedData = new StringBuilder();
    for (Object key : vars.keySet()) {
      Object v = vars.get(key);
      String val = urlEncode(v);
      encodedData.append(urlEncode(key));
      encodedData.append('=');
      encodedData.append(val);
      encodedData.append('&');
    }
    return encodedData.toString();
  }


  /**
   * URL encode
   * 
   * @param x
   *            can be null (returns ""). Will be turned into a String using
   *            String.valueOf()
   * @testedby {@link WebUtilsTest#testUrlEncode()}
   */
  public static String urlEncode(Object x) {
    if (x == null)
      return "";
    String s = String.valueOf(x);
    try {
      s = URLEncoder.encode(s, "UTF-8");
    } catch (UnsupportedEncodingException e) {
      s = URLEncoder.encode(s);
    }
    s = s.replace("+", "%20"); // + for " " seems to be out of date.
    return s;
  }


  /**
   * Use an xpath query to extract what is expected to be a single string
   * valued node. This is a convenience method for a common case.
   * 
   * @param XPATH
   * @param node
   * @return the resulting node's text content, or null if there is no such
   *         node.
   * @throws NotUniqueException
   *             if the query returns multiple nodes.
   */
  public static String xpathExtractString(String xpathQuery, Node node)
      throws NotUniqueException {
    List<Node> titles = WebUtils.xpathQuery(xpathQuery, node);
    if (titles.isEmpty())
      return null;
    if (titles.size() != 1)
      throw new NotUniqueException(Printer.toString(Containers.subList(
          titles, 0, 3)));
    Node node2 = titles.get(0);
    String text = node2.getTextContent();
    // Although W3C say
    // "All line-endings reported as a single LF character.", the
    // piece-of-shit Apache
    // parser that ships with Java disagrees.
    text = StrUtils.LINEENDINGS.matcher(text).replaceAll("\n");
    return text;
  }


  /**
   * @see #xpathQuery(String, String, boolean)
   * @param xpathQuery
   * @param node
   * @return
   * @testedby TODO
   */
  public static List<Node> xpathQuery(String xpathQuery, Node node) {
    try {
      XPathExpression expr = XPATH_FACTORY.newXPath().compile(xpathQuery);
      // get a context limited to the node
      Node clone = node.cloneNode(true);
      NodeList nodeList = (NodeList) expr.evaluate(clone,
          XPathConstants.NODESET);
      List<Node> nodes = asList(nodeList);
      return nodes;
    } catch (XPathExpressionException e) {
      throw Utils.runtime(e);
    }
  }


  /**
   * Convenience for {@link xpathQuery} without namespace aware parsing
   * 
   * @param xpathQuery
   *            E.g. "//book[author="Joseph Heller"]/title"
   */
  public static List<Node> xpathQuery(String xpathQuery, String xml) {
    return xpathQuery(xpathQuery, xml, false);
  }


  /**
   * Run an XPath query over an xml document.
   * <p>
   * <h3>XPath syntax</h3>
   * E.g. given the document
   * 
   * <pre>
   * &lt;shelf>
   * &lt;book year='1960'>&lt;title>Catch 22&lt;/title>&lt;author>Joseph Heller&lt;/author>&lt;/book>
   * &lt;book year='2007'>&lt;title>The English Swordsman&lt;/title>&lt;author>Daniel Winterstein&lt;/author>&lt;/book>
   * &lt;/shelf>
   * </pre>
   * 
   * You could have the queries:
   * 
   * <pre>
   * "//book[author='Joseph Heller']/title"
   * "//book[@year='2007']"
   * "/shelf/book/title"
   * </pre>
   * 
   * See http://www.zvon.org/xxl/XPathTutorial/General/examples.html for more
   * info.
   * <p>
   * Note: This method is not optimally efficient if the same query is
   * repeated, or the same document queried multiple times.
   * 
   * @param xpathQuery
   *            E.g. "//book[author="Joseph Heller"]/title"
   * @param xml
   * @param namespaceAware
   * @return
   */
  public static List<Node> xpathQuery(String xpathQuery, String xml,
      boolean namespaceAware) {
    // Parse XML
    assert !namespaceAware : "TODO";
    Document doc = parseXml(xml);
    // Build an XPath query
    try {
      XPath xp = XPATH_FACTORY.newXPath();
      XPathExpression expr = xp.compile(xpathQuery);
      NodeList nodeList = (NodeList) expr.evaluate(doc,
          XPathConstants.NODESET);
      List<Node> nodes = asList(nodeList);
      return nodes;
    } catch (XPathExpressionException e) {
      throw Utils.runtime(e);
    }
  }


}


/**
 * Used by {@link WebUtils#parseXml(String)} to build DOM-like trees. Rationale:
 * Document & associated classes suck.
 * 
 * <p>
 * TODO should we add a filter on nodes, to pre-ignore boring ones?
 * 
 * @author daniel
 */
final class XmlTreeBuilder extends DefaultHandler {


  ITree<XMLNode> activeTree;


  private boolean endFlag;


  private Tree<XMLNode> root;


  StringBuilder text = new StringBuilder();


  public XmlTreeBuilder() {
  }


  @Override
  public void characters(char[] ch, int start, int length)
      throws SAXException {
    text.append(ch, start, length);
  }


  @Override
  public void endDocument() throws SAXException {
    endFlag = true;
  }


  @Override
  public void endElement(String uri, String localName, String name)
      throws SAXException {
    // Create a text node?
    processTextBuffer();
    // Pop node
    activeTree = activeTree.getParent();
  }


  /**
   * The root node has no XMLNode (it is the document super-node).
   * 
   * @return
   */
  public Tree<XMLNode> getTree() {
    assert endFlag;
    return root;
  }


  private void processTextBuffer() {
    if (text.length() == 0)
      return;
    XMLNode textNode = new XMLNode(text.toString(), true);
    Tree<XMLNode> textTreeNode = new Tree<XMLNode>(activeTree, textNode);
    text = new StringBuilder();
  }


  @Override
  public void skippedEntity(String name) throws SAXException {
    // do nothing
  }


  @Override
  public void startDocument() throws SAXException {
    root = new Tree<XMLNode>();
    activeTree = root;
  }


  @Override
  public void startElement(String uri, String localName, String name,
      Attributes atts) throws SAXException {
    // Create a text node?
    processTextBuffer();
    // Build node
    String tag = Utils.isBlank(localName) ? name : localName;
    XMLNode node = new XMLNode(tag);
    for (int i = 0, n = atts.getLength(); i < n; i++) {
      String aName = atts.getQName(i);
      String value = atts.getValue(i);
      node.getAttributes().put(aName, value);
    }
    // New active tree-node
    Tree<XMLNode> tree = new Tree<XMLNode>(activeTree, node);
    activeTree = tree;
  }


}
Source Code of winterwell.utils.web.WebUtils

Related Classes of winterwell.utils.web.WebUtils