Source Code of net.sf.regain.crawler.CrawlerToolkit

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004-2008  Til Schneider, Thomas Tesche
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2011-01-02 18:09:46 +0100 (So, 02 Jan 2011) $
 *   $Author: thtesche $
 * $Revision: 477 $
 */
package net.sf.regain.crawler;


import java.io.*;
import java.net.*;
import java.text.NumberFormat;
import java.util.ArrayList;


import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;


import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.access.AccountPasswordEntry;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.util.io.HtmlEntities;


/**
 * Contains help methods for the crawler.
 *
 * @author Til Schneider, www.murfman.de
 * @author Gerhard Olsson
 * @author Thomas Tesche
 */
public class CrawlerToolkit {


  /** The logger for this class */
  private static Logger mLog = Logger.getLogger(CrawlerToolkit.class);
  /**
   * Matches the left part of a URL. Group 1 captures the protocol, the "://"
   * separator, a run of word characters, dots, colons, digits and hyphens, and
   * one further character that is not a slash.
   */
  private static Pattern urlPatternLeft = Pattern.compile("([\\w]*://[\\w\\.:\\d-]*[^/]).*");


  public static String createURLFromProps(String[] parts) {
    
    String result = "";
    if( parts.length >= 4 ) {
      // We need at least protocol, sld, tld, account/password
      result = parts[0] + "://";
      for( int i=1; i< parts.length-2;i++) {
        // aggregate domain name
        result += parts[i] + ".";
      }
      
      // Remove the last dot
      result = result.substring(0, result.length()-1);
      
      // Analyze length-2 part for portnumber
      if( Pattern.matches("^\\d*$", parts[parts.length-2]) ) {
        result += ":";
      } else {
        result += ".";
      }
      result += parts[parts.length-2] + "/";


    } else {
      mLog.error("This is not a valid authentication entry: " + parts );
    }
    
    return result;
  }
  
  /**
   * Returns a human readable command string for a command.
   *
   * @param commandArr The command separated in executable and parameters.
   * @return The human readable command, where the parameters follow the
   *         execuable separated by spaces.
   */
  private static String toCommand(String[] commandArr) {
    StringBuffer buffer = new StringBuffer();
    for (int i = 0; i < commandArr.length; i++) {
      if (i != 0) {
        buffer.append(" ");
      }
      buffer.append(commandArr[i]);
    }
    
    return buffer.toString();
  }




  /**
   * Executes a native command and returns its output.
   * 
   * @param commandArr An array containing ehe command to execute and its parameters.
   * @return The output of the command as arrays of lines.
   * @throws RegainException If executing failed.
   */
  public static String[] executeNativeCommand(String[] commandArr)
    throws RegainException
  {
    InputStream in = null;
    try {
      long startTime = -1;
      if (mLog.isDebugEnabled()) {
        startTime = System.currentTimeMillis();
      }
      Process proc = Runtime.getRuntime().exec(commandArr);


      in = proc.getInputStream();
      BufferedReader reader = new BufferedReader(new InputStreamReader(in));
      ArrayList list = new ArrayList();
      String line;
      while ((line = reader.readLine()) != null) {
        if (mLog.isDebugEnabled()) {
          mLog.debug("  Got line: '" + line + "'");
        }
        list.add(line);
      }


      int exitCode;
      try {
        exitCode = proc.waitFor();
      } catch (InterruptedException exc) {
        throw new RegainException("Waiting for termination of process failed: "
          + commandArr[0], exc);
      }


      if (mLog.isDebugEnabled()) {
        double duration = (double) (System.currentTimeMillis() - startTime) / 1000.0;


        NumberFormat format = NumberFormat.getInstance();
        format.setMinimumFractionDigits(2);
        format.setMaximumFractionDigits(2);


        mLog.debug("..." + toCommand(commandArr) + " finished ("
            + format.format(duration) + " secs)");
      }


      if (exitCode != 0) {
        throw new RegainException("Native command exited with exit code "
            + exitCode + ": '" + toCommand(commandArr) + "'");
      }


      String[] asArr = new String[list.size()];
      list.toArray(asArr);
      return asArr;
    }
    catch (IOException exc) {
      throw new RegainException("Executing native command failed: '"
          + toCommand(commandArr) + "'", exc);
    }
    finally {
      if (in != null) {
        try { in.close(); } catch (IOException exc) {}
      }
    }
  }




  /**
   * Originally copied from javax.swing.JEditorPane#getStream(...).
   * <p>
   * Fetches a stream for the given URL, which is about to
   * be loaded by the <code>setPage</code> method.  By
   * default, this simply opens the URL and returns the
   * stream.  This can be reimplemented to do useful things
   * like fetch the stream from a cache, monitor the progress
   * of the stream, etc.
   * <p>
   * This method is expected to have the the side effect of
   * establishing the content type, and therefore setting the
   * appropriate <code>EditorKit</code> to use for loading the stream.
   * <p>
   * If this the stream was an http connection, redirects
   * will be followed and the resulting URL will be set as
   * the <code>Document.StreamDescriptionProperty</code> so that relative
   * URL's can be properly resolved.
   *
   * @param url the URL of the page
   *
   * @return a stream reading data from the specified URL.
   * @throws RedirectException if the URL redirects to another URL.
   * @throws HttpStreamException if something went wrong.
   */
  public static InputStream getHttpStream(URL url)
    throws RedirectException, HttpStreamException
  {
    URLConnection conn = null;
    try {


      String userPassword = extractCredentialsFromProtocolHostFragment(createURLWithoutPath(url.toExternalForm()));
      if (userPassword != null && userPassword.length() > 0) {
        final String [] token = userPassword.split(":");
        //System.out.println("username"+token[0]);
        //System.out.println("password"+token[1]);
        Authenticator.setDefault(new Authenticator() {


          @Override
          protected  PasswordAuthentication getPasswordAuthentication() {
            return new PasswordAuthentication(token[0], token[1].toCharArray());
          }
        });
      }
      conn = url.openConnection();
      if (conn instanceof HttpURLConnection) {
        HttpURLConnection hconn = (HttpURLConnection) conn;
        // Required in Java 1.5 (redirect followed automatically)
        // (Not available in Java 1.2.2)
        hconn.setInstanceFollowRedirects(false);


        // Set the preferred charset
        String charset = RegainToolkit.getSystemDefaultEncoding() + ",utf-8,*";
        hconn.setRequestProperty("Accept-Charset", charset);


        // Check the response code
        int response = hconn.getResponseCode();
        boolean redirect = (response >= 300 && response <= 399);


        // In the case of a redirect, we want to actually change the URL
        // that was input to the new, redirected URL
        if (redirect) {
          String loc = conn.getHeaderField("Location");
          if (loc != null) {
            String redirectUrl;
            if (loc.startsWith("http")) {
              redirectUrl = new URL(loc).toString();
            } else {
              redirectUrl = new URL(url, loc).toString();
            }
            throw new RedirectException("Redirect '" + url +
                "' -> '" + redirectUrl + "'", redirectUrl);
          }
          throw new IOException("Redirect did not provide a 'Location' header");
        }
      }


      return conn.getInputStream();
    }
    catch (RedirectException thr) {
      throw thr;
    }
    catch (Throwable thr) {
      throw HttpStreamException.createInstance("Could not get HTTP connection to "
          + url.toString(), thr, conn);
    }
  }






  /**
   * Lädt ein Dokument von einem HTTP-Server herunter und gibt seinen Inhalt
   * zurück.
   *
   * @param url Die URL des zu ladenden Dokuments.
   *
   * @return Den Inhalt des Dokuments.
   * @throws RegainException Wenn das Laden fehl schlug.
   */
  public static byte[] loadHttpDocument(String url) throws RegainException {
    InputStream in = null;
    ByteArrayOutputStream out = null;


    try {
      in = getHttpStream(new URL(url));


      out = new ByteArrayOutputStream();


      RegainToolkit.pipe(in, out);


      out.close();
      return out.toByteArray();
    }
    catch (RedirectException exc) {
      throw exc;
    }
    catch (IOException exc) {
      throw new RegainException("Could not load Document with HTTP", exc);
    }
    finally {
      if (in != null) {
        try { in.close(); } catch (Exception exc) {}
      }
      if (out != null) {
        try { out.close(); } catch (Exception exc) {}
      }
    }
  }




  /**
   * Loads a file from the file system and returns the content
   *
   * @param file The file to load
   * @return byte[] The content of file
   * @throws RegainException in case of problems while loading
   */
  public static byte[] loadFile(File file) throws RegainException {
    if (file.isDirectory()) {
      throw new RegainException("Can't load a directory: "
        + file.getAbsolutePath());
    }


    FileInputStream in = null;
    ByteArrayOutputStream out = null;
    try {
      in = new FileInputStream(file);
      out = new ByteArrayOutputStream((int) file.length());


      RegainToolkit.pipe(in, out);


      return out.toByteArray();
    }
    catch (IOException exc) {
      throw new RegainException("Loading file failed " + file.getAbsolutePath(), exc);
    }
    finally {
      if (out != null) {
        try { out.close(); } catch (IOException exc) {}
      }
      if (in != null) {
        try { in.close(); } catch (IOException exc) {}
      }
    }
  }


  /**
   * Loads content from a InputStream and returns the content
   *
   * @param inputStream the stream to read
   * @return byte[] The content of the source
   * @throws RegainException in case of problems while loading
   */
  public static byte[] loadFileFromStream(InputStream inputStream, int length) throws RegainException {
    
    ByteArrayOutputStream out = null;
    try {
      out = new ByteArrayOutputStream(length);


      RegainToolkit.pipe(inputStream, out);


      return out.toByteArray();
    }
    catch (IOException exc) {
      throw new RegainException("Loading inputstream failed ", exc);
    }
    finally {
      if (out != null) {
        try { out.close(); } catch (IOException exc) {}
      }
    }
  }




  /**
   * Wandelt die gegebene HTTP-URL in eine absolute URL um.
   * <p>
   * Wenn die URL bereits absolut war, so wird sie unverändert zurückgegeben.
   *
   * @param url Die umzuwandelnde URL.
   * @param parentUrl Die URL auf die sich die umzuwandelnde URL bezieht. Diese
   *        URL muss absolut sein.
   *
   * @return Die absolute Version der gegebenen URL.
   */
  public static String toAbsoluteUrl(String url, String parentUrl) {
    if (! (url.startsWith("http://") || url.startsWith("file://"))) {
      // This is a relative URL


      if (parentUrl.startsWith("http://") && url.startsWith("/")) {
        // NOTE: In HTTP there are two kinds of relative URLs:
        // Some start with '/': They are absolute within the domain
        // Others don't start with '/': They are really realtive


        // This URL is absolute within the domain
        // NOTE: 7 for skipping 'http://'
        int firstSlashPos = parentUrl.indexOf('/', 7);
        if (firstSlashPos != -1) {
          String domain = parentUrl.substring(0, firstSlashPos);
          url = domain + url;
        } else {
          // The parentUrl is a domain without a path, e.g. "http://www.murfman.de"
          // -> Use the whole parentUrl
          // NOTE: url start with a /
          url = parentUrl + url;
        }
      } else {
        // This URL is really relative
        int lastSlashPos = parentUrl.lastIndexOf('/');
        // NOTE: http:// has 7 chars
        if (lastSlashPos > 7) {
          String domainWidthPath = parentUrl.substring(0, lastSlashPos + 1);
          url = domainWidthPath + url;
        } else {
          // The parentUrl is a domain without a path, e.g. "http://www.murfman.de"
          // -> Use the whole parentUrl
          url = parentUrl + "/" + url;
        }
      }
    }


    // Check if url contains . in path
    url = RegainToolkit.replace(url, "/./", "/");
    if (url.endsWith("/.")) {
      url = url.substring(0, url.length() - 2);
    }


    // Check if url contains .. in path
    int updirIdx = 0;
    while ((updirIdx = url.indexOf("/..", updirIdx)) != -1) {
      // Check whether a / follows or whether this is the end
      int slashAfterIdx = updirIdx + 3;
      if ((slashAfterIdx >= url.length()) || (url.charAt(slashAfterIdx) == '/')) {
        // We found a "/../" or an "/.." at the end
        // -> Cut the directory before and the .. out 
      
        // Find previous /
        int slashBeforeIdx = url.lastIndexOf('/', updirIdx - 1);


        if (slashBeforeIdx != -1) {
          // Cut the "/somedir/.." out
          url = url.substring(0, slashBeforeIdx) + url.substring(slashAfterIdx);
          updirIdx = slashBeforeIdx;
        } else {
          throw new IllegalArgumentException("Illegal URL: " + url
              + ". (parent URL: " + parentUrl + ") Contains a .. with no / before");
        }
      } else {
        // This is something like "a/..extension/b" -> Go on after the "/.."
        updirIdx += 3;
      }
    }


    return url;
  }


  /**
   * Completes an url which denotes a directory but doesnt end with a slash.
   * 
   * @param url the URL to check and fix
   * @return  fixed URL
   */
  public static String completeDirectory(String url) {


    try {
      URL parsedUrl = new URL(url);
      String path = parsedUrl.getPath();
      String query = parsedUrl.getQuery();


      // Check for replacement only if this an URL without a query
      if ((query == null || query.length() == 0) && path != null && path.length() > 0) {


        if (!(path.contains(".")) && !path.endsWith("/")) {
          // this is an directory and has to end with a slash
          // remove an empty query
          if (url.endsWith("?")) {
            url = url.substring(0, url.length() - 1);
          }


          //@ToDo: Reminder for an unclear feature. 
          //return url + "/";
        }
      }
    } catch (MalformedURLException ex) {
      // This should never happen. We assume the URL where checked before.
    }




    return url;
  }


  /**
   * Removes anchors from URLs like http://mydomain.com/index.html#anchor
   * 
   * @param url an URL with or without an anchor
   * @return the URL without an anchor
   */
  public static String removeAnchor(String url){
    // Remove anchors from link.
    int index = url.indexOf('#');
    if (index != -1) {
      return url.substring(0, index);
    } else {
      return url;
    }
  }
  
  /**
   * Prints the active threads to System.out. Usefull for debugging.
   */
  public static void printActiveThreads() {
    ThreadGroup group = Thread.currentThread().getThreadGroup();
    Thread[] activeArr = new Thread[group.activeCount()];
    group.enumerate(activeArr);
    System.out.print("active threads: ");
    for (int i = 0; i < activeArr.length; i++) {
      if (i != 0) {
        System.out.print(", ");
      }
      System.out.print(activeArr[i].getName());
    }
    System.out.println();
  }




  /**
   * Initializes the HTTP client
   *
   * @param config The configuration to read the settings from.
   */
  public static void initHttpClient(CrawlerConfig config) {
    String httpProxyHost = config.getProxyHost();
    String httpProxyPort = config.getProxyPort();
    String httpProxyUser = config.getProxyUser();
    String httpProxyPassword = config.getProxyPassword();


    String msg = "";
    if (httpProxyHost != null) {
      System.setProperty("http.proxyHost", httpProxyHost);
      msg += " host: " + httpProxyHost;
    }
    if (httpProxyPort != null) {
      System.setProperty("http.proxyPort", httpProxyPort);
      msg += " port: " + httpProxyPort;
    }
    if (httpProxyUser != null) {
      System.setProperty("http.proxyUser", httpProxyUser);
      msg += " user: " + httpProxyUser;
    }
    if (httpProxyPassword != null) {
      System.setProperty("http.proxyPassword", httpProxyPassword);
      msg += " password: (" + httpProxyPassword.length() + " characters)";
    }


    if (msg.length() != 0) {
      mLog.info("Using proxy:" + msg);
    } else {
      mLog.info("Using no proxy");
    }


    String userAgent = config.getUserAgent();
    if (userAgent != null) {
      System.setProperty("http.agent", userAgent);
      mLog.info("Using HTTP user agent:" + userAgent);
    }
  }




  /**
   * Wandelt alle HTML-Entit�ten in ihre Ensprechungen.
   *
   * @param text Den Text, dessen HTML-Entit�ten gewandelt werden sollen.
   *
   * @return Der gewandelte Text.
   */
  public static String replaceHtmlEntities(String text) {
    StringBuffer clean = new StringBuffer();


    int offset = 0;
    int entityStart;
    while ((entityStart = text.indexOf('&', offset)) != -1) {
      // Append the part since the last entity
      String textPart = text.substring(offset, entityStart);
      clean.append(textPart);


      // Find the end of the entity
      int entityEnd = text.indexOf(';', entityStart);
      if (entityEnd == -1) {
        // Syntax error: The entity doesn't end -> Forget that dirty end
        offset = text.length();
        break;
      }


      // Extract, decode and append the entity
      String entity = text.substring(entityStart, entityEnd + 1);
      String decoded;
      try {
        decoded = HtmlEntities.decode(entity);
      }
      catch (Throwable thr) {
        // This doesn't seem to be a wellformed entity -> Leave the text as it is
        decoded = entity;
      }
      clean.append(decoded);


      // Get the next offset
      offset = entityEnd + 1;
    }


    // Append the part since the last entity
    if (offset < text.length()) {
      clean.append(text.substring(offset, text.length()));
    }


    return clean.toString();
  }




  /**
   * S�ubert HTML-Text von seinen Tags und wandelt alle HTML-Entit�ten in ihre
   * Ensprechungen.
   *
   * @param text Der zu s�ubernde HTML-Text.
   *
   * @return Der von Tags gesüberte Text
   */
  public static String cleanFromHtmlTags(String text) {
    StringBuffer clean = new StringBuffer(text.length());


    int offset = 0;
    int tagStart;
    while ((tagStart = text.indexOf('<', offset)) != -1) {
      // Extract the good part since the last tag
      String goodPart = text.substring(offset, tagStart);


      // Check whether the good part is wasted by cascaded tags
      // Example: In the text "<!-- <br> --> Hello" "<!-- <br>" will be
      //          detected as tag and "--> Hello" as good part.
      //          We now have to scan the good part for a tag rest.
      //          (In this example: "-->")
      int tagRestEnd = goodPart.indexOf('>');
      if (tagRestEnd != -1) {
        goodPart = goodPart.substring(tagRestEnd + 1);
      }


      // Trim the good part
      goodPart = goodPart.trim();


      if (goodPart.length() > 0) {
        // Replace all entities in the text and append the result
        goodPart = replaceHtmlEntities(goodPart);
        clean.append(goodPart);


        // Append a space
        clean.append(" ");
      }


      // Find the end of the tag
      int tagEnd = text.indexOf('>', tagStart);
      if (tagEnd == -1) {
        // Syntax error: The tag doesn't end -> Forget that dirty end
        offset = text.length();
        break;
      }


      // Calculate the next offset
      offset = tagEnd + 1;
    }


    // Extract the good part since the last tag, replace all entities and append
    // the result
    if (offset < text.length()) {
      String goodPart = text.substring(offset, text.length()).trim();
      goodPart = replaceHtmlEntities(goodPart);
      clean.append(goodPart);
    }


    return clean.toString();
  }
  
  /**
   * Sets the account and password for a URL if there is a account/password entry matching to 
   * the URL in the store.
   * 
   * @param url the url for enrichment
   * @param authMap accountPasswordStore
   * @return modified url
   * @throws net.sf.regain.RegainException
   */
  public static AccountPasswordEntry findAuthenticationValuesForURL(String url,
    Map<String, AccountPasswordEntry> authMap) throws RegainException {


    String leftUrlPart = createURLWithoutPath(url);
    mLog.debug("search for >" + leftUrlPart + "< in authentication store.");
    // Lookup the key and in case of a match build the final url with account, password enrichment
    if (authMap.containsKey(leftUrlPart)) {
      mLog.debug("Found an authentication entry for " + leftUrlPart);
      return authMap.get(leftUrlPart);
    } else {
      mLog.debug("Don't found an authentication entry for " + leftUrlPart);
      return null;
    }
  }


  /**
   * Sets account and password for an URL
   * 
   * @param url the URL for enrichment
   * @param entry the account password entry
   * @return URL with replacement
   * @throws net.sf.regain.RegainException
   */
  public static String replaceAuthenticationValuesInURL(String url, AccountPasswordEntry entry) {


    String finalUrl = url;
    if (entry != null) {
      // Lookup the key and in case of a match build the final url with account, password enrichment
      finalUrl = url.substring(0, url.indexOf("://"));
      finalUrl += "://" + entry.getAccountName() + ":" +
        entry.getPassword() + "@";
      finalUrl += url.substring(url.indexOf("://") + 3);
    }
    return finalUrl;


  }


  /**
   * Extract left part of URL (protocol, host, port).
   * 
   * @param completeUrl
   * @return the resulting URL (e.g. http://bl.dfs.dk:8080/mypath/fil.jsp?query will be 
   *         http://bl.dfs.dk:8080/)
   */
  public static String createURLWithoutPath(String completeUrl) throws RegainException {


    String result = "";
    Matcher matcher = urlPatternLeft.matcher(completeUrl);
    matcher.find();
    if (matcher.groupCount() > 0) {
      try {
        return matcher.group(1) + "/";
      } catch (IllegalStateException ex) {
        // No match found
        return "";
      }
    } else {
      throw new RegainException("URL is unparsable. url: " + completeUrl);
    }
  }


  /**
   * Removes unwanted parts from the URL.
   *
   * @param url
   * @param urlCleaners
   * @return
   */
  public static String cleanURL(String url, String[] urlCleaners) {


    String result = url;
    for (String pattern : urlCleaners) {
      result = result.replaceAll(pattern, "");
      mLog.debug("Remove " + pattern + " from URL: " + url);
    }


    result = result.replaceAll("&&", "&");


    if (result.endsWith("&")) {
      result = result.substring(0, result.length() - 1);
    }
    if (result.endsWith("?")) {
      result = result.substring(0, result.length() - 1);
    }


    mLog.debug("Resulting Url after replacement: " + result);


    return result;
  }


  /**
   * Extract the username, password from a given protocol, host-domain url fragment.
   * Example: http://tester:secret&amp;host.sld.tld/
   *
   * @param urlFragment the fragment which contains protocol, optional user/pw and host+domain.
   * @return the user:pw if it exist in the urlFragment
   */
  public static String extractCredentialsFromProtocolHostFragment(String urlFragment) {
    String result = "";


    if (urlFragment.contains("@") && urlFragment.contains(":")) {
      // We've found a possible username@password part
      int startPos = urlFragment.indexOf("//") + 2;
      int endPos = urlFragment.indexOf("@");
      // We need at least x:x@ in length
      if (endPos > startPos + 2) {
        String temp = urlFragment.substring(startPos, endPos);
        if (!(temp.startsWith(":") || temp.endsWith(":") || temp.endsWith(":@"))) {
          result = temp;
        }
      }
    }


    return result;
  }
  
}
Source Code of net.sf.regain.crawler.CrawlerToolkit

Related Classes of net.sf.regain.crawler.CrawlerToolkit