Package bixo.datum

Examples of bixo.datum.FetchSetDatum


        final int numElements = 100;
        DiskQueue<FetchSetDatum> queue = new DiskQueue<FetchSetDatum>(numElements/10, new FetchSetComparator());
       
        final long fetchStartTime = System.currentTimeMillis();
        final long fetchDelay = 30000;
        FetchSetDatum datums[] = new FetchSetDatum[numElements];
        for (int i = 0; i < numElements; i++) {
            long fetchTime = fetchStartTime + (i * 10);
            int groupingKey = 100;
            String groupingRef = "groupingRef";
            List<ScoredUrlDatum> scoredUrls = new ArrayList<ScoredUrlDatum>();
            String url = String.format("http://domain-%03d.com/index.html", i);
            scoredUrls.add(new ScoredUrlDatum(url, groupingRef, UrlStatus.UNFETCHED, 0.0));
            FetchSetDatum datum = new FetchSetDatum(scoredUrls, fetchTime, fetchDelay, groupingKey, groupingRef);
            datums[i] = datum;
            assertTrue(queue.offer(datum));
        }
       
        for (int i = 0; i < numElements; i++) {
            FetchSetDatum datum = queue.poll();
            assertNotNull(datum);
            assertEquals(datums[i], datum);
        }
       
        assertNull(queue.poll());
View Full Code Here


         */
        public FetchSetDatum drain() {
            if (!_queue.isEmpty()) {
                return removeFromQueue();
            } else if (safeHasNext()) {
                return new FetchSetDatum(new TupleEntry(_values.next()));
            } else {
                return null;
            }
        }
View Full Code Here

         * Return the top-most item from the queue, or null if the queue is empty.
         *
         * @return fetch set from queue
         */
        private FetchSetDatum removeFromQueue() {
            FetchSetDatum result = _queue.poll();
            if (result != null) {
                _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, -1);
                _flowProcess.increment(FetchCounters.URLS_QUEUED, -result.getUrls().size());
            }
           
            return result;
        }
View Full Code Here

        FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();
       
        // Each value is a PreFetchedDatum that contains a set of URLs to fetch in one request from
        // a single server, plus other values needed to set state properly.
        while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
            FetchSetDatum datum = values.nextOrNull(_fetcherMode);
           
            try {
                if (datum == null) {
                    trace("Nothing ready to fetch, sleeping...");
                    process.keepAlive();
                    Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
                } else {
                    List<ScoredUrlDatum> urls = datum.getUrls();
                    String ref = datum.getGroupingRef();
                    trace("Processing %d URLs for %s", urls.size(), ref);

                    Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
                    if (datum.isLastList()) {
                        makeActive(ref, 0L);
                        trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
                    } else {
                        Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
                        makeActive(ref, nextFetchTime);
                        trace("Executing fetch of %d URLs from %s (next fetch time %d)", urls.size(), ref, nextFetchTime);
                    }

                    long startTime = System.currentTimeMillis();

                    try {
                        _executor.execute(doFetch);
                    } catch (RejectedExecutionException e) {
                        // should never happen.
                        LOGGER.error("Fetch pool rejected our fetch list for " + ref);

                        finished(ref);
                        skipUrls(urls, UrlStatus.SKIPPED_DEFERRED, String.format("Execution rejection skipped %d URLs", urls.size()));
                    }

                    // Adjust for how long it took to get the request queued.
                    adjustActive(ref, System.currentTimeMillis() - startTime);
                }
            } catch (InterruptedException e) {
                LOGGER.warn("FetchBuffer interrupted!");
                Thread.currentThread().interrupt();
            }
        }
       
        // Skip all URLs that we've got left.
        if (!values.isEmpty()) {
            trace("Found unprocessed URLs");
           
            UrlStatus status = Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;
           
            while (!values.isEmpty()) {
                FetchSetDatum datum = values.drain();
                List<ScoredUrlDatum> urls = datum.getUrls();
                trace("Skipping %d urls from %s (e.g. %s) ", urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
                skipUrls(urls, status, null);
            }
        }
    }
View Full Code Here

            // Loop until we have something to return, or there's nothing that we can return, or we've
            // queued up as many fetchsets as we want without any delay.
            while (!isEmpty() && (fetchSetsQueued < MAX_FETCHSETS_TO_QUEUE_PER_DELAY)) {
                // First see if we've got something in the queue, and if so, then check if it's ready
                // to be processed.
                final FetchSetDatum queueDatum = removeFromQueue();
               
                if (queueDatum != null) {
                    String ref = queueDatum.getGroupingRef();
                    if (readyToFetch(ref) || (mode == FetcherMode.IMPOLITE)) {
                        List<ScoredUrlDatum> urls = queueDatum.getUrls();
                        trace("Returning %d urls via queue from %s (e.g. %s)", urls.size(), ref, urls.get(0).getUrl());
                        return queueDatum;
                    }
                }

                // Nothing ready from the top of the queue or nothing in the queue, let's see about the iterator.
                if (safeHasNext()) {
                    // Re-add the thing from the top of the queue, since we're going to want to keep it around.
                    // This is safe to call with a null datum.
                    addToQueue(queueDatum);
                   
                    // Now get our next FetchSet from the Hadoop iterator.
                    FetchSetDatum iterDatum = new FetchSetDatum(new TupleEntry(_values.next()));
                    List<ScoredUrlDatum> urls = iterDatum.getUrls();
                    String ref = iterDatum.getGroupingRef();
                   
                    if (iterDatum.isSkipped()) {
                        trace("Skipping %d urls via iterator from %s (e.g. %s)", urls.size(), ref, urls.get(0).getUrl());
                        skipUrls(urls, UrlStatus.SKIPPED_PER_SERVER_LIMIT, null);
                        continue;
                    }

                    if ((mode == FetcherMode.IMPOLITE) || readyToFetch(ref)) {
                        trace("Returning %d urls via iterator from %s (e.g. %s)", urls.size(), ref, urls.get(0).getUrl());
                        return iterDatum;
                    }

                    // We've got a datum from the iterator that's not ready to be processed, so we'll stuff it into the queue.
                    trace("Queuing %d urls via iterator from %s (e.g. %s)", urls.size(), iterDatum.getGroupingRef(), urls.get(0).getUrl());
                    addToQueue(iterDatum);
                    fetchSetsQueued += 1;
                    continue;
                }
               
View Full Code Here

       
        while (safeHasNext()) {
            ScoredUrlDatum scoredDatum = new ScoredUrlDatum(new TupleEntry(values.next()));
            FetchSetInfo setInfo = _policy.nextFetchSet(scoredDatum);
            if (setInfo != null) {
                FetchSetDatum result = makeFetchSetDatum(setInfo, newKey, safeHasNext());
                collector.add(BixoPlatform.clone(result.getTuple(), process));
            }
        }
       
        // See if we have another partially built datum to add.
        FetchSetInfo setInfo = _policy.endFetchSet();
        if (setInfo != null) {
            FetchSetDatum result = makeFetchSetDatum(setInfo, newKey, false);
            collector.add(BixoPlatform.clone(result.getTuple(), process));
        }
    }
View Full Code Here

    }

    private FetchSetDatum makeFetchSetDatum(FetchSetInfo setInfo, PartitioningKey key, boolean hasNext) {
        LOGGER.trace(String.format("Added %d urls for ref %s in group %d at %d", setInfo.getUrls().size(), key.getRef(), key.getValue(), setInfo.getSortKey()));
       
        FetchSetDatum result = new FetchSetDatum(setInfo.getUrls(), setInfo.getSortKey(), setInfo.getFetchDelay(), key.getValue(), key.getRef());
        result.setLastList(!hasNext || setInfo.isSkipping());
        result.setSkipped(setInfo.isSkipping());
        return result;
    }
View Full Code Here

TOP

Related Classes of bixo.datum.FetchSetDatum

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.