Source Code of org.apache.nutch.parse.js.JSParseFilter

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.js;


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;


/**
 * This class is a heuristic link extractor for JavaScript files and
 * code snippets. The general idea of a two-pass regex matching comes from
 * Heritrix. Parts of the code come from OutlinkExtractor.java
 */
public class JSParseFilter implements HtmlParseFilter, Parser {
  public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);


  private static final int MAX_TITLE_LEN = 80;


  private Configuration conf;
  
  public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {


    Parse parse = parseResult.get(content.getUrl());


    String url = content.getBaseUrl();
    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
    walk(doc, parse, metaTags, url, outlinks);
    if (outlinks.size() > 0) {
      Outlink[] old = parse.getData().getOutlinks();
      String title = parse.getData().getTitle();
      List<Outlink> list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());


      // replace original parse obj with new one
      parseResult.put(content.getUrl(), new ParseText(text), parseData);
    }
    return parseResult;
  }
  
  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
    if (n instanceof Element) {
      String name = n.getNodeName();
      if (name.equalsIgnoreCase("script")) {
 /*       String lang = null;
        Node lNode = n.getAttributes().getNamedItem("language");
        if (lNode == null) lang = "javascript";
        else lang = lNode.getNodeValue(); */
        StringBuffer script = new StringBuffer();
        NodeList nn = n.getChildNodes();
        if (nn.getLength() > 0) {
          for (int i = 0; i < nn.getLength(); i++) {
            if (i > 0) script.append('\n');
            script.append(nn.item(i).getNodeValue());
          }
          // if (LOG.isInfoEnabled()) {
          //   LOG.info("script: language=" + lang + ", text: " + script.toString());
          // }
          Outlink[] links = getJSLinks(script.toString(), "", base);
          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
          // no other children of interest here, go one level up.
          return;
        }
      } else {
        // process all HTML 4.0 events, if present...
        NamedNodeMap attrs = n.getAttributes();
        int len = attrs.getLength();
        for (int i = 0; i < len; i++) {
          // Window: onload,onunload
          // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
          // Keyboard: onkeydown,onkeypress,onkeyup
          // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
          Node anode = attrs.item(i);
          Outlink[] links = null;
          if (anode.getNodeName().startsWith("on")) {
            links = getJSLinks(anode.getNodeValue(), "", base);
          } else if (anode.getNodeName().equalsIgnoreCase("href")) {
            String val = anode.getNodeValue();
            if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
              links = getJSLinks(val, "", base);
            }
          }
          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
        }
      }
    }
    NodeList nl = n.getChildNodes();
    for (int i = 0; i < nl.getLength(); i++) {
      walk(nl.item(i), parse, metaTags, base, outlinks);
    }
  }
  
  public ParseResult getParse(Content c) {
    String type = c.getContentType();
    if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
              "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
    String script = new String(c.getContent());
    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
    if (outlinks == null) outlinks = new Outlink[0];
    // Title? use the first line of the script...
    String title;
    int idx = script.indexOf('\n');
    if (idx != -1) {
      if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN;
      title = script.substring(0, idx);
    } else {
      idx = Math.min(MAX_TITLE_LEN, script.length());
      title = script.substring(0, idx);
    }
    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
                                 c.getMetadata());
    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
  }
  
  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
  // A simple pattern. This allows also invalid URL characters.
  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
  // Alternative pattern, which limits valid url characters.
  //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
  
  /**
   *  This method extracts URLs from literals embedded in JavaScript.
   */
  private Outlink[] getJSLinks(String plainText, String anchor, String base) {


    final List<Outlink> outlinks = new ArrayList<Outlink>();
    URL baseURL = null;
    
    try {
      baseURL = new URL(base);
    } catch (Exception e) {
      if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", e); }
    }


    try {
      final PatternCompiler cp = new Perl5Compiler();
      final Pattern pattern = cp.compile(STRING_PATTERN,
          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
              | Perl5Compiler.MULTILINE_MASK);
      final Pattern pattern1 = cp.compile(URI_PATTERN,
              Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
                  | Perl5Compiler.MULTILINE_MASK);
      final PatternMatcher matcher = new Perl5Matcher();


      final PatternMatcher matcher1 = new Perl5Matcher();
      final PatternMatcherInput input = new PatternMatcherInput(plainText);


      MatchResult result;
      String url;


      //loop the matches
      while (matcher.contains(input, pattern)) {
        result = matcher.getMatch();
        url = result.group(2);
        PatternMatcherInput input1 = new PatternMatcherInput(url);
        if (!matcher1.matches(input1, pattern1)) {
          //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); }
          continue;
        }
        if (url.startsWith("www.")) {
            url = "http://" + url;
        } else {
          // See if candidate URL is parseable.  If not, pass and move on to
          // the next match.
          try {
            url = new URL(baseURL, url).toString();
          } catch (MalformedURLException ex) {
            if (LOG.isTraceEnabled()) {
              LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
                  baseURL + "'", ex);
            }
            continue;
          }
        }
        url = url.replaceAll("&amp;", "&");
        if (LOG.isTraceEnabled()) {
          LOG.trace(" - outlink from JS: '" + url + "'");
        }
        outlinks.add(new Outlink(url, anchor));
      }
    } catch (Exception ex) {
      // if it is a malformed URL we just throw it away and continue with
      // extraction.
      if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); }
    }


    final Outlink[] retval;


    //create array of the Outlinks
    if (outlinks != null && outlinks.size() > 0) {
      retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
    } else {
      retval = new Outlink[0];
    }


    return retval;
  }
  
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
      return;
    }
    InputStream in = new FileInputStream(args[0]);
    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
    StringBuffer sb = new StringBuffer();
    String line = null;
    while ((line = br.readLine()) != null) 
      sb.append(line + "\n");
    br.close();
    
    JSParseFilter parseFilter = new JSParseFilter();
    parseFilter.setConf(NutchConfiguration.create());
    Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
    System.out.println("Outlinks extracted: " + links.length);
    for (int i = 0; i < links.length; i++)
      System.out.println(" - " + links[i]);
  }


  public void setConf(Configuration conf) {
    this.conf = conf;
  }


  public Configuration getConf() {
    return this.conf;
  }
}
Source Code of org.apache.nutch.parse.js.JSParseFilter

Related Classes of org.apache.nutch.parse.js.JSParseFilter