Package org.commoncrawl.util.shared

Source Code of org.commoncrawl.util.shared.URLUtils

package org.commoncrawl.util.shared;

import java.net.MalformedURLException;
import java.util.Collection;
import java.util.regex.Pattern;

import org.commoncrawl.protocol.shared.URLFPV2;
import org.commoncrawl.util.shared.GoogleURL;
import org.commoncrawl.util.shared.URLFingerprint;

public class URLUtils {

  /**
   * Internal Helper
   *
   * @param candidateString
   * @param parts
   * @param rootNameIndex
   * @return
   */
  private static String buildRootNameString(String candidateString,
      String[] parts, int rootNameIndex) {
    int partsToInclude = parts.length - rootNameIndex;
    int dotsToInclude = partsToInclude - 1;

    // initial root name length is dot count
    int rootNameLength = dotsToInclude;
    for (int i = rootNameIndex; i < parts.length; ++i) {
      rootNameLength += parts[i].length();
    }
    return candidateString.substring(candidateString.length() - rootNameLength);
  }

  /**
   * Extract Top-Level Domain Name given Host Name
   *
   * @param hostName
   *          - domain name
   * @return tld name or null if invalid domain name
   */
  public static String extractTLDName(String hostName) {

    // special case for ip addresses
    if (ipAddressRegEx.matcher(hostName).matches()) {
      return "inaddr-arpa.arpa";
    }

    if (hostName.endsWith(".")) {
      hostName = hostName.substring(0, hostName.length() - 1);
    }
    if (hostName.startsWith("*") && hostName.length() > 1) {
      hostName = hostName.substring(1);
    }
    if (hostName.length() != 0) {
      if (!invalidDomainCharactersRegEx.matcher(hostName).find()) {
        String parts[] = hostName.split("\\.");
        if (parts.length >= 2) {
          Collection<String> secondaryNames = TLDNamesCollection
              .getSecondaryNames(parts[parts.length - 1]);

          if (secondaryNames.size() != 0) {
            // see if second to last part matches secondary names for this TLD
            // or there is a wildcard expression for secondary name in rule set
            if (secondaryNames.contains(parts[parts.length - 2])
                || secondaryNames.contains("*")) {
              // ok secondary part is potentianlly part of secondary name ...

              // check to see the part in not explicitly excluded ...
              if (secondaryNames.contains("!" + parts[parts.length - 2])) {
                // in this case, second to last part is NOT part of secondary
                // name
                return buildRootNameString(hostName, parts, parts.length - 1);
              } else {
                // otherwise, TLD contains 2 parts
                return buildRootNameString(hostName, parts, parts.length - 2);
              }
            }
            // ok second to last part does not match set of known secondary
            // names
            else {
              // make a wildcard string matching secondary name
              String extendedWildcard = "*." + parts[parts.length - 2];
              // if match, then this implies secondary name has two components
              if (secondaryNames.contains(extendedWildcard)) {

                if (parts.length >= 3) {
                  // this implies that there must be four parts to the name to
                  // extract root
                  // unless exlusion rule applies
                  String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                      + parts[parts.length - 2];

                  // if exclusion rule is present ...
                  if (secondaryNames.contains(exclusionRule2)) {
                    // third part is NOT part of secondary name
                    return buildRootNameString(hostName, parts,
                        parts.length - 2);
                  } else {
                    // ok extended wildcard matched. last 3 parts are part of
                    // the TLD
                    if (parts.length >= 4) {
                      return buildRootNameString(hostName, parts,
                          parts.length - 3);
                    }
                  }
                }
              }
              // at this point ... if the null name exists ...
              else if (secondaryNames.contains("")) {
                // only last item is part of TLD
                return buildRootNameString(hostName, parts, parts.length - 1);
              }
            }
          }
        }
      }
    }
    return null;
  }

  /**
   * Extract the Root Domain Name (domain name up to one name after TLD Name)
   * from a fully qualified domain name
   *
   * @param hostName
   *          fully Qualified Domain Name
   * @return Root Domain Name or null if domain name is not valid
   */
  public static String extractRootDomainName(String hostName) {

    // special case for ip addresses
    if (ipAddressRegEx.matcher(hostName).matches()) {
      return hostName;
    }

    if (hostName.endsWith(".")) {
      hostName = hostName.substring(0, hostName.length() - 1);
    }
    if (hostName.startsWith("*") && hostName.length() > 1) {
      hostName = hostName.substring(1);
    }
    if (hostName.length() != 0) {
      if (!invalidDomainCharactersRegEx.matcher(hostName).find()) {
        String parts[] = hostName.split("\\.");
        if (parts.length >= 2) {
          Collection<String> secondaryNames = TLDNamesCollection
              .getSecondaryNames(parts[parts.length - 1]);

          if (secondaryNames.size() != 0) {
            // see if second to last part matches secondary names for this TLD
            // or there is a wildcard expression for secondary name in rule set
            if (secondaryNames.contains(parts[parts.length - 2])
                || secondaryNames.contains("*")) {
              // ok secondary part is potentianlly part of secondary name ...

              // check to see the part in not explicitly excluded ...
              if (secondaryNames.contains("!" + parts[parts.length - 2])) {
                // in this case, this is an explicit override. second to last
                // part is NOT part of secondary name
                return buildRootNameString(hostName, parts, parts.length - 2);
              } else {
                // otherwise, we need at least three parts
                if (parts.length >= 3) {
                  return buildRootNameString(hostName, parts, parts.length - 3);
                }
              }
            }
            // ok second to last part does not match set of known secondary
            // names
            else {
              // make a wildcard string matching secondary name
              String extendedWildcard = "*." + parts[parts.length - 2];
              // if match, then this implies secondary name has two components
              if (secondaryNames.contains(extendedWildcard)) {

                if (parts.length >= 3) {
                  // this implies that there must be four parts to the name to
                  // extract root
                  // unless exlusion rule applies
                  String exclusionRule2 = "!" + parts[parts.length - 3] + "."
                      + parts[parts.length - 2];

                  // if exclusion rule is present ...
                  if (secondaryNames.contains(exclusionRule2)) {
                    // third part is NOT part of secondary name
                    return buildRootNameString(hostName, parts,
                        parts.length - 3);
                  } else {
                    // ok extended wildcard matched. we need 4 parts minimum
                    if (parts.length >= 4) {
                      return buildRootNameString(hostName, parts,
                          parts.length - 4);
                    }
                  }
                }
              }
              // at this point ... if the null name exists ...
              else if (secondaryNames.contains("")) {
                // return second part as root name
                return buildRootNameString(hostName, parts, parts.length - 2);
              }
            }
          }
        }
      }
    }
    return null;
  }

  /** The maximum length, in characters, of a complete domain name */
  private static final int MAXNAME                      = 255;

  /** The maximum length of a single label within a Name */
  private static final int MAXLABEL                     = 63;

  /** The maximum number of labels in a Name */
  private static final int MAXLABELS                    = 128;

  /**
   * Matches a single character that is NOT a digit, a lower-case ASCII
   * letter, '-', '.' or '_'. Use find() to detect such a character anywhere
   * in a name; matches() would only succeed on a one character input.
   */
  static Pattern           invalidDomainCharactersRegEx = Pattern
                                                            .compile("[^0-9a-z\\-\\._]");
  /**
   * Matches a whole string consisting of four dot separated runs of digits.
   * The value of each run is not range checked.
   */
  static Pattern           ipAddressRegEx               = Pattern
                                                            .compile("^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$");
  /**
   * Matches a (possibly empty) run of digits ending at the end of the input.
   * Not referenced by any method of this class shown here.
   */
  static Pattern           numericOnly                  = Pattern
                                                            .compile("[0-9]*$");

  /**
   * validate a domain name
   *
   * @param domainName
   * @return true if domain name is valid
   */
  public static boolean isValidDomainName(String domainName) {

    // check for invalid length (max 255 characters)
    if (domainName.length() > MAXNAME) {
      return false;
    }

    String candidate = domainName.toLowerCase();

    // check to see if this is an ip address
    if (ipAddressRegEx.matcher(candidate).matches()) {
      return true;
    }

    // check for invalid characters
    if (invalidDomainCharactersRegEx.matcher(candidate).matches()) {
      return false;
    }
    // split into parts
    String[] parts = domainName.split("\\.");

    // check for max labels constraint
    if (parts.length > MAXLABELS) {
      return false;
    }
    return extractRootDomainName(candidate) != null;
  }

  /**
   * calculate a url fingerprint for the passed in url string
   *
   * @param urlString
   * @return URLFPV2 data structure representing canonical fingerprint for url
   *         OR null if the url is invalid
   */
  public static URLFPV2 getURLFPV2FromURL(String urlString) {

    try {
      // canonicalize the incoming url ...
      String canonicalURL = URLUtils.canonicalizeURL(urlString, true);

      if (canonicalURL != null) {
        return getURLFPV2FromCanonicalURL(canonicalURL);
      }
    } catch (MalformedURLException e) {
    }
    return null;
  }

  /**
   * calculate a url fingerprint given a GoogleURL object
   *
   * @param urlObject
   * @return URLFPV2 data structure representing canonical fingerprint for url
   *         OR null if the url is invalid
   */
  public static URLFPV2 getURLFPV2FromURLObject(GoogleURL urlObject) {
    try {
      // canonicalize the incoming url ...
      String canonicalURL = URLUtils.canonicalizeURL(urlObject, true);

      if (canonicalURL != null) {
        return getURLFPV2FromCanonicalURL(canonicalURL);
      }
    } catch (MalformedURLException e) {
    }
    return null;
  }

  /**
   * calculate a url fingerprint given a <<PREVIOUSLY CANONICALIZED>> url.
   *
   * @param canonicalURL
   * @return URLFPV2 data structure representing canonical fingerprint for url
   *         OR null if the url is invalid
   */
  public static URLFPV2 getURLFPV2FromCanonicalURL(String canonicalURL) {

    // create a url fp record
    URLFPV2 urlFP = new URLFPV2();

    urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL));

    String hostName = fastGetHostFromURL(canonicalURL);
    String rootDomainName = null;

    if (hostName != null)
      rootDomainName = URLUtils.extractRootDomainName(hostName);

    if (hostName != null && rootDomainName != null) {
      // ok we want to strip the leading www. if necessary
      if (hostName.startsWith("www.")) {
        // ok now. one nasty hack ... :-(
        // if root name does not equal full host name ...
        if (!rootDomainName.equals(hostName)) {
          // strip the www. prefix
          hostName = hostName.substring(4);
        }
      }
      urlFP.setDomainHash(FPGenerator.std64.fp(hostName));
      urlFP.setRootDomainHash(FPGenerator.std64.fp(rootDomainName));
      return urlFP;
    }
    return null;
  }

  /**
   * Shared session id normalizer; canonicalizeURL passes every rebuilt url
   * through its normalize method to remove common session id patterns.
   */
  private static SessionIDURLNormalizer _sessionIdNormalizer = new SessionIDURLNormalizer();

  /**
   * canonicalize a given url. Use the GoogleURL canonicalization library to
   * canonicalize the url, then apply the session id normalization filter to
   * remove common session id patterns.
   *
   *
   * @param incomingURL
   * @param stripLeadingWWW
   *          - set to true to string www. prefix from the domain if present
   * @return a canonical representation of the passed in URL that can be safely
   *         used as a replacement for the original url
   * @throws MalformedURLException
   */

  public static String canonicalizeURL(String incomingURL,
      boolean stripLeadingWWW) throws MalformedURLException {

    GoogleURL urlObject = new GoogleURL(incomingURL);

    if (!urlObject.isValid()) {
      throw new MalformedURLException("URL:" + incomingURL + " is invalid");
    }

    return canonicalizeURL(urlObject, stripLeadingWWW);
  }

  /**
   * Canonicalize the given GoogleURL object.
   *
   * @param urlObject
   * @param stripLeadingWWW
   * @return
   * @throws MalformedURLException
   */
  public static String canonicalizeURL(GoogleURL urlObject,
      boolean stripLeadingWWW) throws MalformedURLException {

    StringBuilder urlOut = new StringBuilder();

    urlOut.append(urlObject.getScheme());
    urlOut.append("://");

    if (urlObject.getUserName() != GoogleURL.emptyString) {
      urlOut.append(urlObject.getUserName());
      if (urlObject.getPassword() != GoogleURL.emptyString) {
        urlOut.append(":");
        urlOut.append(urlObject.getPassword());
      }
      urlOut.append("@");
    }

    String host = urlObject.getHost();
    if (host.endsWith(".")) {
      host = host.substring(0, host.length() - 1);
    }

    if (stripLeadingWWW) {
      if (host.startsWith("www.")) {
        // ok now. one nasty hack ... :-(
        // if root name is null or root name does not equal full host name ...
        String rootName = extractRootDomainName(host);
        if (rootName == null || !rootName.equals(host)) {
          // striping the www. prefix
          host = host.substring(4);
        }
      }
    }
    urlOut.append(host);

    if (urlObject.getPort() != GoogleURL.emptyString
        && !urlObject.getPort().equals("80")) {
      urlOut.append(":");
      urlOut.append(urlObject.getPort());
    }
    if (urlObject.getPath() != GoogleURL.emptyString) {
      int indexOfSemiColon = urlObject.getPath().indexOf(';');
      if (indexOfSemiColon != -1) {
        urlOut.append(urlObject.getPath().substring(0, indexOfSemiColon));
      } else {
        urlOut.append(urlObject.getPath());
      }
    }
    if (urlObject.getQuery() != GoogleURL.emptyString) {
      urlOut.append("?");
      urlOut.append(urlObject.getQuery());
    }

    String canonicalizedURL = urlOut.toString();

    // phase 2 - remove common session id patterns
    canonicalizedURL = _sessionIdNormalizer.normalize(canonicalizedURL, "");

    return canonicalizedURL;
  }

  private static String fastGetHostFromURL(String urlString) {

    int hostStart = urlString.indexOf(":");
    if (hostStart != -1) {

      hostStart++;

      int urlLength = urlString.length();

      while (hostStart < urlString.length()) {
        char nextChar = urlString.charAt(hostStart);
        if (nextChar != '/' && nextChar != '\\' && nextChar != '\n'
            && nextChar != '\r' && nextChar != '\t' && nextChar != ' ') {
          break;
        }
        hostStart++;
      }

      if (hostStart < urlLength) {

        int hostEnd = hostStart + 1;

        while (hostEnd < urlLength) {
          char nextChar = urlString.charAt(hostEnd);
          if (nextChar == '/' || nextChar == '?' || nextChar == ';'
              || nextChar == '#')
            break;
          hostEnd++;
        }

        int indexOfAt = urlString.indexOf("@", hostStart);
        if (indexOfAt != -1 && indexOfAt < hostEnd) {
          hostStart = indexOfAt + 1;
        }

        String host = urlString.substring(hostStart, hostEnd);

        int hostLength = host.length();
        int colonEnd = host.indexOf(":");
        if (colonEnd != -1) {
          hostLength = colonEnd;
          host = urlString.substring(hostStart, hostStart + hostLength);
        }

        GoogleURL urlObject = new GoogleURL("http://" + host);

        if (urlObject.isValid()) {
          return urlObject.getHost();
        }
      }
    }
    return null;
  }

}
TOP

Related Classes of org.commoncrawl.util.shared.URLUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.