Package org.archive.modules.net

Examples of org.archive.modules.net.CrawlHost


        DisposableStoredSortedMap<Long,String> hd = stats.calcReverseSortedHostsDistribution();
        writer.print("[#urls] [#bytes] [host] [#robots] [#remaining] [#novel-urls] [#novel-bytes] [#dup-by-hash-urls] [#dup-by-hash-bytes] [#not-modified-urls] [#not-modified-bytes]\n");
        for (Map.Entry<Long,String> entry : hd.entrySet()) {
            // key is -count, value is hostname
            try {
                CrawlHost host = stats.serverCache.getHostFor(entry.getValue());
                writeReportLine(writer,
                        host.getSubstats().getFetchSuccesses(),
                        host.getSubstats().getTotalBytes(),
                        host.fixUpName(),
                        host.getSubstats().getRobotsDenials(),
                        host.getSubstats().getRemaining(),
                        host.getSubstats().getNovelUrls(),
                        host.getSubstats().getNovelBytes(),
                        host.getSubstats().getDupByHashUrls(),
                        host.getSubstats().getDupByHashBytes(),
                        host.getSubstats().getNotModifiedUrls(),
                        host.getSubstats().getNotModifiedBytes());
            } catch (Exception e) {
                logger.log(Level.WARNING, "unable to tally host stats for " + entry.getValue(), e);
            }
        }
        hd.dispose();
View Full Code Here


            long now = System.currentTimeMillis();
            int maxBandwidthKB = getMaxPerHostBandwidthUsageKbSec();
            if (maxBandwidthKB > 0) {
                // Enforce bandwidth limit
                ServerCache cache = this.getServerCache();
                CrawlHost host = cache.getHostFor(curi.getUURI());
                long minDurationToWait = host.getEarliestNextURIEmitTime()
                        - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
                long processedBytes = curi.getContentSize();
                host
                        .setEarliestNextURIEmitTime((long)(processedBytes / maxBandwidth)
                                + now);

                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
View Full Code Here

    protected boolean evaluate(CrawlURI uri) {       
        ExternalGeoLookupInterface impl = getLookup();
        if (impl == null) {
            return false;
        }
        CrawlHost crawlHost = null;
        String host;
        InetAddress address;
        try {
            host = uri.getUURI().getHost();
            crawlHost = serverCache.getHostFor(host);
            if (crawlHost.getCountryCode() != null) {
                return countryCodes.contains(crawlHost.getCountryCode());
            }
            address = crawlHost.getIP();
            if (address == null) {
                // TODO: handle transient lookup failures better
                address = Address.getByName(host);
            }
            crawlHost.setCountryCode((String) impl.lookup(address));
            if (countryCodes.contains(crawlHost.getCountryCode())) {
                LOGGER.fine("Country Code Lookup: " + " " + host
                        + crawlHost.getCountryCode());
                return true;
            }
        } catch (UnknownHostException e) {
            LOGGER.log(Level.FINE, "Failed dns lookup " + uri, e);
            if (crawlHost != null) {
                crawlHost.setCountryCode("--");
            }
        } catch (URIException e) {
            LOGGER.log(Level.FINE, "Failed to parse hostname " + uri, e);
        }
View Full Code Here

        if (!(scheme.equals(HTTP_SCHEME) || scheme.equals(HTTPS_SCHEME))) {
            // handles only plain http and https
            return false;
        }

        CrawlHost host = getServerCache().getHostFor(curi.getUURI());
        if (host.getIP() == null && host.hasBeenLookedUp()) {
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            return false;
        }

        return true;
View Full Code Here

    }
   
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
        final CrawlServer server = serverCache.getServerFor(curi.getUURI());
        final CrawlHost host = serverCache.getHostFor(curi.getUURI());
        FetchStats.HasFetchStats[] haveStats =
            new FetchStats.HasFetchStats[] {
                server,
                host,
                frontier.getGroup(curi)
View Full Code Here

    protected void decisionMade(CrawlURI uri, DecideRule decisiveRule,
            int decisiveRuleNumber, DecideResult result) {
        if (fileLogger != null) {
            JSONObject extraInfo = null;
            if (logExtraInfo) {
                CrawlHost crawlHost = getServerCache().getHostFor(uri.getUURI());
                String host = "-";
                if (crawlHost != null) {
                    host  = crawlHost.fixUpName();
                }

                extraInfo = new JSONObject();
                extraInfo.put("hopPath", uri.getPathFromSeed());
                extraInfo.put("via", uri.getVia());
View Full Code Here

        // otherwise, host referenced in URI
        // TODO:FIXME: have fetcher insert exact IP contacted into curi,
        // use that rather than inferred by CrawlHost lookup
        String addr = null;
        try {
          CrawlHost crlh = getServerCache().getHostFor(curi.getUURI());
          if (crlh == null) {
              return null;
          }
          InetAddress inetadd = crlh.getIP();
          if (inetadd == null) {
              return null;
          }
          addr = inetadd.getHostAddress();
        } catch (Exception e) {
View Full Code Here

          logger.log(Level.WARNING, "chmod failed", e);
      }
  }

  private String getHostAddress(CrawlURI curi) {
      CrawlHost h = serverCache.getHostFor(curi.getUURI());
      if (h == null) {
          throw new NullPointerException("Crawlhost is null for " + curi + " " +
                  curi.getVia());
      }
      InetAddress a = h.getIP();
      if (a == null) {
          throw new NullPointerException("Address is null for " + curi + " " +
             curi.getVia() + ". Address " +
                 ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ?
                     "was never looked up." :
                     (System.currentTimeMillis() - h.getIpFetched()) + " ms ago."));
      }
      return h.getIP().getHostAddress();
  }
View Full Code Here

        }

        // If we've done a dns lookup and it didn't resolve a host
        // cancel further fetch-processing of this URI, because
        // the domain is unresolvable
        CrawlHost ch = serverCache.getHostFor(curi.getUURI());
        if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine( "no dns for " + ch +
                    " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
//            curi.skipToPostProcessing();
            return true;
        }

        // If we haven't done a dns lookup  and this isn't a dns uri
        // shoot that off and defer further processing
        if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq);
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
View Full Code Here

     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = serverCache.getHostFor(curi.getUURI());
        if (!host.hasBeenLookedUp()) {
            // IP has not been looked up yet.
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP)
            return false;
        }

        long duration = getIpValidityDurationSeconds();
        if (duration == 0) {
            // Never expire ip if duration is null (set by user or more likely,
            // set to zero in case where we tried in FetchDNS but failed).
            return false;
        }
       
        long ttl = host.getIpTTL();
        if (ttl > duration) {
            // Use the larger of the operator-set minimum duration
            // or the DNS record TTL
            duration = ttl;
        }

        // Duration and ttl are in seconds.  Convert to millis.
        if (duration > 0) {
            duration *= 1000;
        }

        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }
View Full Code Here

TOP

Related Classes of org.archive.modules.net.CrawlHost

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.