Package org.commoncrawl.protocol.shared

Examples of org.commoncrawl.protocol.shared.URLFPV2


   *         OR null if the url is invalid
   */
  public static URLFPV2 getURLFPV2FromCanonicalURL(String canonicalURL) {

    // create a url fp record
    URLFPV2 urlFP = new URLFPV2();

    urlFP.setUrlHash(URLFingerprint.generate64BitURLFPrint(canonicalURL));

    String hostName = fastGetHostFromURL(canonicalURL);
    String rootDomainName = null;

    if (hostName != null)
      rootDomainName = URLUtils.extractRootDomainName(hostName);

    if (hostName != null && rootDomainName != null) {
      // ok we want to strip the leading www. if necessary
      if (hostName.startsWith("www.")) {
        // ok now. one nasty hack ... :-(
        // if root name does not equal full host name ...
        if (!rootDomainName.equals(hostName)) {
          // strip the www. prefix
          hostName = hostName.substring(4);
        }
      }
      urlFP.setDomainHash(FPGenerator.std64.fp(hostName));
      urlFP.setRootDomainHash(FPGenerator.std64.fp(rootDomainName));
      return urlFP;
    }
    return null;
  }
View Full Code Here

TOP

Related Classes of org.commoncrawl.protocol.shared.URLFPV2

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.