Package bixo.datum

Examples of bixo.datum.GroupedUrlDatum


        }
    }
   
    private List<TupleEntry> getGroupedurlDatumList(String url) {
        List<TupleEntry> iterValues = new ArrayList<TupleEntry>();
        iterValues.add(new GroupedUrlDatum(url, url).getTupleEntry());
        return iterValues;
    }
View Full Code Here


    public void operate(FlowProcess process, FunctionCall<NullContext> funCall) {
        String key;
        try {
            UrlDatum datum = new UrlDatum(funCall.getArguments());
            key = _generator.getGroupingKey(datum);
            GroupedUrlDatum result = new GroupedUrlDatum(datum, key);
            funCall.getOutputCollector().add(BixoPlatform.clone(result.getTuple(), process));
        } catch (Exception e) {
            // TODO KKr - don't lose the tuple (skipping support)
            LOGGER.error("Unexpected exception while grouping URL (probably badly formed)", e);
        }
    }
View Full Code Here

        LOGGER.info("Processing tuple group: " + group);

        DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
        Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
        while (values.hasNext()) {
            urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
        }
       
        try {
            Runnable doRobots = new ProcessRobotsTask(protocolAndDomain, _scorer, urls, _fetcher, _parser, bufferCall.getOutputCollector(), _flowProcess);
            _executor.execute(doRobots);
View Full Code Here

     * @param urls Queue of URLs to empty out
     * @param groupingKey grouping key to use for all entries.
     * @param collector tuple output collector
     */
    public static void emptyQueue(Queue<GroupedUrlDatum> urls, String groupingKey, TupleEntryCollector collector, FlowProcess process) {
        GroupedUrlDatum datum;
        while ((datum = urls.poll()) != null) {
            ScoredUrlDatum scoreUrl = new ScoredUrlDatum(datum.getUrl(), groupingKey, UrlStatus.UNFETCHED, 1.0);
            scoreUrl.setPayload(datum.getPayload());
            // TODO KKr - move synchronization up, to avoid lots of contention with other threads?
            synchronized (collector) {
                collector.add(BixoPlatform.clone(scoreUrl.getTuple(), process));
            }
        }
View Full Code Here

                    validKey = GroupingKey.makeGroupingKey(domainInfo.getHostAddress(), robotRules.getCrawlDelay());
                    _flowProcess.increment(FetchCounters.DOMAINS_FINISHED, 1);
                }

                // Use the same key for every URL from this domain
                GroupedUrlDatum datum;
                while ((datum = _urls.poll()) != null) {
                    ScoredUrlDatum scoreUrl;
                    FetchCounters counter;
                    String url = datum.getUrl();

                    if (isDeferred) {
                        counter = FetchCounters.URLS_DEFERRED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.DEFERRED_GROUPING_KEY, UrlStatus.SKIPPED_DEFERRED, 0.0);
                    } else if (!robotRules.isAllowed(url)) {
                        counter = FetchCounters.URLS_BLOCKED;
                        scoreUrl = new ScoredUrlDatum(url, GroupingKey.BLOCKED_GROUPING_KEY, UrlStatus.SKIPPED_BLOCKED, 0.0);
                    } else {
                        double score = _scorer.generateScore(domain, pld, datum);
                        if (score == BaseScoreGenerator.SKIP_SCORE) {
                            counter = FetchCounters.URLS_SKIPPED;
                            scoreUrl = new ScoredUrlDatum(url, GroupingKey.SKIPPED_GROUPING_KEY, UrlStatus.UNFETCHED, score);
                        } else {
                            counter = FetchCounters.URLS_ACCEPTED;
                            scoreUrl = new ScoredUrlDatum(url, validKey, UrlStatus.UNFETCHED, score);
                        }
                    }
                   
                    scoreUrl.setPayload(datum.getPayload());
                    _flowProcess.increment(counter, 1);

                    // collectors aren't thread safe
                    synchronized (_collector) {
                        _collector.add(BixoPlatform.clone(scoreUrl.getTuple(), _flowProcess));
View Full Code Here

TOP

Related Classes of bixo.datum.GroupedUrlDatum

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.