Examples of org.archive.crawler.reporting.StatisticsTracker

org.archive.crawler.reporting.StatisticsTracker
This is an implementation of the AbstractTracker. It is designed to function with the WUI as well as performing various logging activity.
At the end of each snapshot a line is written to the 'progress-statistics.log' file.
The header of that file is as follows:
```
 [timestamp] [discovered]    [queued] [downloaded] [doc/s(avg)]  [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]
```
First there is a timestamp, accurate down to 1 second.
discovered, queued, downloaded and dl-failures are (respectively) the discovered URI count, pending URI count, successfully fetched count and failed fetch count from the frontier at the time of the snapshot.
KB/s(avg) is the bandwidth usage. We use the total bytes downloaded to calculate average bandwidth usage (KB/sec). Since we also note the value each time a snapshot is made, we can calculate the average bandwidth usage during the last snapshot period to gain a "current" rate. The first number is the current rate and the average is in parentheses.
doc/s(avg) works the same way as KB/s(avg) except it shows the number of documents (URIs) rather than KB downloaded.
busy-threads is the total number of ToeThreads that are not available (and thus presumably busy processing a URI). This information is extracted from the crawl controller.
Finally mem-use-KB is extracted from the run time environment (Runtime.getRuntime().totalMemory()).
In addition to the data collected for the above logs, various other data is gathered and stored by this tracker.
- Successfully downloaded documents per fetch status code
- Successfully downloaded documents per document mime type
- Amount of data per mime type
- Successfully downloaded documents per host
- Amount of data per host
- Disposition of all seeds (this is written to 'reports.log' at end of crawl)
- Successfully downloaded documents per host per source
@contributor Parker Thompson @contributor Kristinn Sigurdsson @contributor gojomo

    
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi)
    throws InterruptedException {
        CrawlController controller = getCrawlController();
        StatisticsTracker stats = getStatisticsTracker();
        long allowedRuntimeMs = getRuntimeSeconds() * 1000L;
        long currentRuntimeMs = stats.getCrawlElapsedTime();
        if(currentRuntimeMs > allowedRuntimeMs){
            Operation op = getExpirationOperation();
            if(op != null){
                if (op.equals(Operation.PAUSE)) {
                    controller.requestCrawlPause();

View Full Code Here

        CrawlController cc = getCrawlController();
        return cc!=null ? cc.getStatisticsTracker() : null;
    }


    public Map<String,Number> rateReportData() {
        StatisticsTracker stats = getStats();
        if (stats == null) {
            return null;
        }
        
        CrawlStatSnapshot snapshot = stats.getSnapshot();
        Map<String,Number> map = new LinkedHashMap<String,Number>();
        map.put("currentDocsPerSecond", snapshot.currentDocsPerSecond);
        map.put("averageDocsPerSecond", snapshot.docsPerSecond);
        map.put("currentKiBPerSec", snapshot.currentKiBPerSec);
        map.put("averageKiBPerSec", snapshot.totalKiBPerSec);

View Full Code Here

        map.put("averageKiBPerSec", snapshot.totalKiBPerSec);
        return map;
    }


    public Object rateReport() {
        StatisticsTracker stats = getStats();
        if(stats==null) {
            return "<i>n/a</i>";
        }
        CrawlStatSnapshot snapshot = stats.getSnapshot();
        StringBuilder sb = new StringBuilder();
        sb
         .append(ArchiveUtils.doubleToString(snapshot.currentDocsPerSecond,2))
         .append(" URIs/sec (")
         .append(ArchiveUtils.doubleToString(snapshot.docsPerSecond,2))

View Full Code Here

         .append(" avg)");
        return sb.toString();
    }


    public Map<String,Number> loadReportData() {
        StatisticsTracker stats = getStats();
        if (stats == null) {
            return null;
        }
        
        CrawlStatSnapshot snapshot = stats.getSnapshot();
        Map<String,Number> map = new LinkedHashMap<String,Number>();
        
        map.put("busyThreads", snapshot.busyThreads);
        map.put("totalThreads", stats.threadCount());
        map.put("congestionRatio", snapshot.congestionRatio);
        map.put("averageQueueDepth", snapshot.averageDepth);
        map.put("deepestQueueDepth", snapshot.deepestUri);
        return map;
    }

View Full Code Here

        map.put("deepestQueueDepth", snapshot.deepestUri);
        return map;
    }


    public Object loadReport() {
        StatisticsTracker stats = getStats();
        if(stats==null) {
            return "<i>n/a</i>";
        }
        CrawlStatSnapshot snapshot = stats.getSnapshot();
        StringBuilder sb = new StringBuilder();
        sb
         .append(snapshot.busyThreads)
         .append(" active of ")
         .append(stats.threadCount())
         .append(" threads; ")
         .append(ArchiveUtils.doubleToString(snapshot.congestionRatio,2))
         .append(" congestion ratio; ")
         .append(snapshot.deepestUri)
         .append("  deepest queue; ")

View Full Code Here

         .append("  average depth");
        return sb.toString();
    }


    public Map<String,Long> uriTotalsReportData() {
        StatisticsTracker stats = getStats();
        if (stats == null) {
            return null;
        }


        CrawlStatSnapshot snapshot = stats.getSnapshot();


        Map<String,Long> totals = new LinkedHashMap<String,Long>();
        totals.put("downloadedUriCount", snapshot.downloadedUriCount);
        totals.put("queuedUriCount", snapshot.queuedUriCount);
        totals.put("totalUriCount", snapshot.totalCount());

View Full Code Here

        }
        return sb.toString(); 
    }


    public Map<String,Long> sizeTotalsReportData() {
        StatisticsTracker stats = getStats();
        if(stats==null) {
            return null;
        }
        
        // stats.crawledBytesSummary() also includes totals, so add those in here
        TreeMap<String, Long> map = new TreeMap<String,Long>(stats.getCrawledBytes());
        map.put("total", stats.getCrawledBytes().getTotalBytes());
        map.put("totalCount", stats.getCrawledBytes().getTotalUrls());
        return map;
    }

View Full Code Here

        map.put("totalCount", stats.getCrawledBytes().getTotalUrls());
        return map;
    }


    public String sizeTotalsReport() {
        StatisticsTracker stats = getStats();
        if(stats==null) {
            return "<i>n/a</i>";
        }
        return stats.crawledBytesSummary();
    }

View Full Code Here

        }
        return stats.crawledBytesSummary();
    }


    public Map<String,Object> elapsedReportData() {
        StatisticsTracker stats = getStats();
        if(stats==null) {
            return null;
        }
        
        Map<String,Object> map = new LinkedHashMap<String,Object>();
        long timeElapsed = stats.getCrawlElapsedTime();
        map.put("elapsedMilliseconds", timeElapsed);
        map.put("elapsedPretty", ArchiveUtils.formatMillisecondsToConventional(timeElapsed));
        
        return map;
    }

View Full Code Here

        
        return map;
    }


    public String elapsedReport() {
        StatisticsTracker stats = getStats();
        if(stats==null) {
            return "<i>n/a</i>";
        }
        long timeElapsed = stats.getCrawlElapsedTime();
        return ArchiveUtils.formatMillisecondsToConventional(timeElapsed);
    }

View Full Code Here

TOP

Related Classes of org.archive.crawler.reporting.StatisticsTracker

org.apache.commons.collections.Closure

org.archive.crawler.event.CrawlStateEvent

org.archive.crawler.event.CrawlURIDispositionEvent

org.archive.crawler.event.StatSnapshotEvent

org.archive.crawler.framework.CrawlJob

org.archive.crawler.prefetch.RuntimeLimitEnforcer

org.archive.crawler.util.TopNSet

org.archive.modules.net.CrawlHost

org.archive.modules.net.ServerCache

org.archive.util.PaddingStringBuffer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.