Package org.apache.nutch.util.domain

Examples of org.apache.nutch.util.domain.DomainSuffix


  /**
   * Adds a "tld" field to the document, filled with the value of
   * {@code getDomain()} on the domain suffix resolved from the page URL.
   *
   * <p>Only the beginning of this method is visible in this excerpt; the
   * statements after the try/catch (including the return) are not shown.
   *
   * @param doc the document the "tld" field is added to
   * @param parse not used in the visible part of the method
   * @param urlText textual form of the page URL; it is parsed with {@link java.net.URL}
   * @param datum not used in the visible part of the method
   * @param inlinks not used in the visible part of the method
   * @throws IndexingException declared by the signature; the visible statements
   *         do not throw it, since every exception raised in the try block is
   *         caught and logged
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

    try {
      // new URL(...) throws MalformedURLException for an unparsable URL string.
      URL url = new URL(urlText.toString());
      DomainSuffix d = URLUtil.getDomainSuffix(url);
     
      // NOTE(review): if getDomainSuffix can return null, d.getDomain() raises a
      // NullPointerException that ends up in the catch below and is logged as a
      // warning - confirm whether an explicit null check is wanted here.
      doc.add("tld", d.getDomain());
     
    // Best effort: any failure is logged (message only, no stack trace) and the
    // "tld" field is simply not added.
    }catch (Exception ex) {
      LOG.warn(ex.toString());
    }
View Full Code Here


    // Look up successively shorter suffixes of the host in tlds: each pass
    // drops everything up to and including the first '.' of the current
    // candidate and tries the remainder. The first non-null result is returned.
    int index = 0;
    String candidate = host;
    // The condition is checked before index is refreshed, so the loop stops
    // after the pass in which indexOf found no '.' (index == -1).
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      // When no '.' is left, index is -1 and substring(0) yields the whole
      // candidate, so the last remaining label is still looked up once.
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        // Suffixes are tried from longest to shortest, so the longest match wins.
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

      // Match against the suffix, the domain, and the host, in that order; a
      // more general match overrides a more specific one.
      // The domain name is lower-cased and trimmed before the lookup; the host
      // is used exactly as URLUtil.getHost returns it.
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      // suffix stays null when no DomainSuffix is returned for the URL; in that
      // case domainSet.contains(null) is what gets evaluated below.
      String suffix = null;
      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
      if (domainSuffix != null) {
        suffix = domainSuffix.getDomain();
      }

      // A hit on any one of the three values is enough (short-circuit OR).
      if (domainSet.contains(suffix) || domainSet.contains(domain)
        || domainSet.contains(host)) {
        // Matches, filter!
View Full Code Here

    // Look up successively shorter suffixes of the host in tlds: each pass
    // drops everything up to and including the first '.' of the current
    // candidate and tries the remainder. The first non-null result is returned.
    int index = 0;
    String candidate = host;
    // The condition is checked before index is refreshed, so the loop stops
    // after the pass in which indexOf found no '.' (index == -1).
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      // When no '.' is left, index is -1 and substring(0) yields the whole
      // candidate, so the last remaining label is still looked up once.
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        // Suffixes are tried from longest to shortest, so the longest match wins.
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

    // Scale initScore by the product of the boosts of every "tld" field value
    // that has an entry in tldEntries.
    List<String> tlds = doc.getFieldValues("tld");
    // Neutral factor: with no "tld" values or no matching entries the score
    // is returned as initScore * 1.0f.
    float boost = 1.0f;

    // getFieldValues may hand back null here (hence the guard) when the
    // document carries no "tld" field - presumably; verify against NutchDocument.
    if (tlds != null) {
      for (String tld : tlds) {
        DomainSuffix entry = tldEntries.get(tld);
        // Values without an entry in tldEntries leave the boost untouched.
        if (entry != null)
          boost *= entry.getBoost();
      }
    }
    return initScore * boost;
  }
 
View Full Code Here

  /**
   * Adds a "tld" field to the document, filled with the value of
   * {@code getDomain()} on the domain suffix resolved from the given URL, and
   * returns the document.
   *
   * <p>The closing part of this method lies outside this excerpt.
   *
   * @param doc the document the "tld" field is added to
   * @param url the page URL as a string; it is parsed with {@link java.net.URL}
   * @param page not used in the visible part of the method
   * @return the same {@code doc} instance that was passed in, whether or not
   *         the field could be added
   * @throws IndexingException declared by the signature; the visible statements
   *         do not throw it, since every exception raised in the try block is
   *         caught and logged
   */
  @Override
  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
      throws IndexingException {
    try {
      // new URL(...) throws MalformedURLException for an unparsable URL string.
      URL _url = new URL(url);
      DomainSuffix d = URLUtil.getDomainSuffix(_url);
      // NOTE(review): if getDomainSuffix can return null, d.getDomain() raises a
      // NullPointerException that is caught below and logged as a warning -
      // confirm whether an explicit null check is wanted here.
      doc.add("tld", d.getDomain());
    } catch (Exception ex) {
      // Best effort: the failure is logged with its stack trace and the
      // document is returned without the "tld" field.
      LOG.warn("Exception in TLDIndexingFilter",ex);
    }

    return doc;
View Full Code Here

    // Look up successively shorter suffixes of the host in tlds: each pass
    // drops everything up to and including the first '.' of the current
    // candidate and tries the remainder. The first non-null result is returned.
    int index = 0;
    String candidate = host;
    // The condition is checked before index is refreshed, so the loop stops
    // after the pass in which indexOf found no '.' (index == -1).
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      // When no '.' is left, index is -1 and substring(0) yields the whole
      // candidate, so the last remaining label is still looked up once.
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        // Suffixes are tried from longest to shortest, so the longest match wins.
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

    // Look up successively shorter suffixes of the host in tlds: each pass
    // drops everything up to and including the first '.' of the current
    // candidate and tries the remainder. The first non-null result is returned.
    int index = 0;
    String candidate = host;
    // The condition is checked before index is refreshed, so the loop stops
    // after the pass in which indexOf found no '.' (index == -1).
    for(;index >= 0;) {
      index = candidate.indexOf('.');
      // When no '.' is left, index is -1 and substring(0) yields the whole
      // candidate, so the last remaining label is still looked up once.
      String subCandidate = candidate.substring(index+1);
      DomainSuffix d = tlds.get(subCandidate);
      if(d != null) {
        // Suffixes are tried from longest to shortest, so the longest match wins.
        return d;
      }
      candidate = subCandidate;
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.util.domain.DomainSuffix

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.