
/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.crawler.frontier;

import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.DEFERRED_FOR_RETRY;
import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.DISREGARDED;
import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.FAILED;
import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.SUCCEEDED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_RUNTIME_EXCEPTION;

import java.io.Closeable;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.collections.iterators.ObjectArrayIterator;
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.crawler.event.CrawlURIDispositionEvent;
import org.archive.crawler.framework.ToeThread;
import org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy;
import org.archive.crawler.frontier.precedence.QueuePrecedencePolicy;
import org.archive.crawler.util.TopNSet;
import org.archive.modules.CrawlURI;
import org.archive.spring.KeyedProperties;
import org.archive.util.ArchiveUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ObjectIdentityMemCache;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.support.AbstractApplicationContext;

import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.DatabaseException;

/**
* A common Frontier base using several queues to hold pending URIs.
*
* Uses in-memory map of all known 'queues' inside a single database.
* Round-robins between all queues.
*
* @author Gordon Mohr
* @author Christian Kohlschuetter
*/
public abstract class WorkQueueFrontier extends AbstractFrontier
implements Closeable,
           ApplicationContextAware {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 570384305871965843L;

    /**
     * If we know that only a small number of queues will be held in memory,
     * we can avoid using a disk-based BigMap.
     * This only works efficiently if the WorkQueue does not hold its
     * entries in memory as well.
     */
    private static final int MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY = 3000;

    /**
     * When a snooze target for a queue is longer than this amount, the queue
     * will be "long snoozed" instead of "short snoozed".  A "long snoozed"
     * queue may be swapped to disk because it's not needed soon. 
     */
    protected long snoozeLongMs = 5L*60L*1000L;
    public long getSnoozeLongMs() {
        return snoozeLongMs;
    }
    public void setSnoozeLongMs(long snooze) {
        this.snoozeLongMs = snooze;
    }
   
    private static final Logger logger =
        Logger.getLogger(WorkQueueFrontier.class.getName());
   
    // ApplicationContextAware implementation, for eventing
    protected AbstractApplicationContext appCtx;
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        this.appCtx = (AbstractApplicationContext)applicationContext;
    }

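    // The instance-initializer blocks below seed default values into the
    // KeyedProperties map ('kp'), so that settings such as balanceReplenishAmount
    // can be overlaid per-URI (via sheet overlays) while the crawl runs.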
    /** amount to replenish budget on each activation (duty cycle) */
    {
        setBalanceReplenishAmount(3000);
    }
    public int getBalanceReplenishAmount() {
        return (Integer) kp.get("balanceReplenishAmount");
    }
    public void setBalanceReplenishAmount(int replenish) {
        kp.put("balanceReplenishAmount",replenish);
    }


    /** budget penalty for an error fetch */
    {
        setErrorPenaltyAmount(100);
    }
    public int getErrorPenaltyAmount() {
        return (Integer) kp.get("errorPenaltyAmount");
    }
    public void setErrorPenaltyAmount(int penalty) {
        kp.put("errorPenaltyAmount",penalty);
    }

    /** total expenditure to allow a queue before 'retiring' it  */
    {
        setQueueTotalBudget(-1L);
    }
    public long getQueueTotalBudget() {
        return (Long) kp.get("queueTotalBudget");
    }
    public void setQueueTotalBudget(long budget) {
        kp.put("queueTotalBudget",budget);
    }
   
    /** queue precedence assignment policy to use. */
    {
        setQueuePrecedencePolicy(new BaseQueuePrecedencePolicy());
    }
    public QueuePrecedencePolicy getQueuePrecedencePolicy() {
        return (QueuePrecedencePolicy) kp.get("queuePrecedencePolicy");
    }
    public void setQueuePrecedencePolicy(QueuePrecedencePolicy policy) {
        kp.put("queuePrecedencePolicy",policy);
    }

    /** precedence rank at or below which queues are not crawled */
    protected int precedenceFloor = 255;
    public int getPrecedenceFloor() {
        return this.precedenceFloor;
    }
    public void setPrecedenceFloor(int floor) {
        this.precedenceFloor = floor;
    }

    /** truncate reporting of queues at this large but not unbounded number */
    protected int maxQueuesPerReportCategory = 2000;
    public int getMaxQueuesPerReportCategory() {
        return this.maxQueuesPerReportCategory;
    }
    public void setMaxQueuesPerReportCategory(int max) {
        this.maxQueuesPerReportCategory = max;
    }

    /** All known queues.
     */
    protected ObjectIdentityCache<WorkQueue> allQueues = null;
    // of classKey -> ClassKeyQueue

    /**
     * All per-class queues whose first item may be handed out.
     * Linked-list of keys for the queues.
     */
    protected BlockingQueue<String> readyClassQueues;
   
    /** all per-class queues from which a URI is currently outstanding */
    protected Set<WorkQueue> inProcessQueues =
        Collections.newSetFromMap(new ConcurrentHashMap<WorkQueue, Boolean>()); // of ClassKeyQueue
   
    /**
     * All per-class queues held in snoozed state, sorted by wake time.
     */
    transient protected DelayQueue<DelayedWorkQueue> snoozedClassQueues;
    protected StoredSortedMap<Long,DelayedWorkQueue> snoozedOverflow;
    protected AtomicInteger snoozedOverflowCount = new AtomicInteger(0);
    protected static int MAX_SNOOZED_IN_MEMORY = 10000;
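    // Snoozed queues are kept two ways: up to MAX_SNOOZED_IN_MEMORY in the
    // in-memory DelayQueue, with any excess spilled to the disk-backed
    // snoozedOverflow map keyed by wake time (see snoozeQueue() and wakeQueues()).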
   
    /** URIs scheduled to be re-enqueued at future date */
    protected StoredSortedMap<Long, CrawlURI> futureUris;
   
    /** remember keys of small number of largest queues for reporting */
    transient protected TopNSet largestQueues = new TopNSet(20);
    /** remember this many largest queues for reporting's sake; actual tracking
     *  can be somewhat approximate when some queues shrink before others'
     *  sizes are again noted, or if the size is adjusted mid-crawl. */
    public int getLargestQueuesCount() {
        return largestQueues.getMaxSize();
    }
    public void setLargestQueuesCount(int count) {
        largestQueues.setMaxSize(count);
    }
   
    protected int highestPrecedenceWaiting = Integer.MAX_VALUE;

    /** The UriUniqFilter to use, tracking those UURIs which are
     * already in-process (or processed), and thus should not be
     * rescheduled. Also known as the 'alreadyIncluded' or
     * 'alreadySeen' structure */
    protected UriUniqFilter uriUniqFilter;
    public UriUniqFilter getUriUniqFilter() {
        return this.uriUniqFilter;
    }
    @Autowired
    public void setUriUniqFilter(UriUniqFilter uriUniqFilter) {
        this.uriUniqFilter = uriUniqFilter;
    }

    /**
     * Constructor.
     */
    public WorkQueueFrontier() {
        super();
    }
   
    public void start() {
        if(isRunning()) {
            return;
        }
        uriUniqFilter.setDestination(this);
        super.start();
        try {
            initInternalQueues();
        } catch (Exception e) {
            throw new IllegalStateException(e);
        }

    }

    /**
     * Initializes internal queues. If work-queue data is kept on disk and the
     * queue-assignment policy promises only a small, bounded number of keys
     * (see {@link QueueAssignmentPolicy#maximumNumberOfKeys}), all queues are
     * held in a memory-based cache; otherwise {@link #initAllQueues()} is
     * invoked to set up the queue store in an implementation-specific way.
     *
     * @throws IOException
     * @throws DatabaseException
     */
    protected void initInternalQueues()
    throws IOException, DatabaseException {
        this.initOtherQueues();
        if (workQueueDataOnDisk()
                && preparer.getQueueAssignmentPolicy().maximumNumberOfKeys() >= 0
                && preparer.getQueueAssignmentPolicy().maximumNumberOfKeys() <=
                    MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY) {
            this.allQueues =
                new ObjectIdentityMemCache<WorkQueue>(701, .9f, 100);
        } else {
            this.initAllQueues();
        }
    }
   
    /**
     * Initialize the allQueues field in an implementation-appropriate
     * way.
     * @throws DatabaseException
     */
    protected abstract void initAllQueues() throws DatabaseException;
   
    /**
     * Initialize all other internal queues in an implementation-appropriate
     * way.
     * @throws DatabaseException
     */
    protected abstract void initOtherQueues() throws DatabaseException;

   
   
    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.AbstractFrontier#stop()
     */
    @Override
    public void stop() {
        super.stop();
    }
   
    public void destroy() {
        // release resources and trigger end-of-frontier actions
        close();
    }
   
    /**
     * Release resources only needed when running
     */
    public void close() {
        ArchiveUtils.closeQuietly(uriUniqFilter);    
        ArchiveUtils.closeQuietly(allQueues);
    }
   
    /**
     * Accept the given CrawlURI for scheduling, as it has
     * passed the alreadyIncluded filter.
     *
     * Choose a per-classKey queue and enqueue it. If this
     * item has made an unready queue ready, place that
     * queue on the readyClassQueues queue.
     * @param curi CrawlURI to schedule
     */
    protected void processScheduleAlways(CrawlURI curi) {
//        assert Thread.currentThread() == managerThread;
        assert KeyedProperties.overridesActiveFrom(curi);
       
        prepForFrontier(curi);
        sendToQueue(curi);
    }
   
   
    /**
     * Arrange for the given CrawlURI to be visited, if it is not
     * already enqueued/completed.
     *
     * Differs from superclass in that it operates in calling thread, rather
     * than deferring operations via in-queue to managerThread. TODO: settle
     * on either defer or in-thread approach after testing.
     *
     * @see org.archive.crawler.framework.Frontier#schedule(org.archive.modules.CrawlURI)
     */
    @Override
    public void schedule(CrawlURI curi) {
        sheetOverlaysManager.applyOverlaysTo(curi);
        try {
            KeyedProperties.loadOverridesFrom(curi);
            if(curi.getClassKey()==null) {
                // remedial processing
                preparer.prepare(curi);
            }
            processScheduleIfUnique(curi);
        } finally {
            KeyedProperties.clearOverridesFrom(curi);
        }
    }

    /**
     * Arrange for the given CrawlURI to be visited, if it is not
     * already scheduled/completed.
     *
     * @see org.archive.crawler.framework.Frontier#schedule(org.archive.modules.CrawlURI)
     */
    protected void processScheduleIfUnique(CrawlURI curi) {
//        assert Thread.currentThread() == managerThread;
        assert KeyedProperties.overridesActiveFrom(curi);
       
        // Canonicalization may set forceFetch flag.  See
        // #canonicalization(CrawlURI) javadoc for circumstance.
        String canon = curi.getCanonicalString();
        if (curi.forceFetch()) {
            uriUniqFilter.addForce(canon, curi);
        } else {
            uriUniqFilter.add(canon, curi);
        }
    }

    /**
     * Send a CrawlURI to the appropriate subqueue.
     *
     * @param curi
     */
    protected void sendToQueue(CrawlURI curi) {
//        assert Thread.currentThread() == managerThread;
       
        WorkQueue wq = getQueueFor(curi.getClassKey());
        synchronized(wq) {
            int originalPrecedence = wq.getPrecedence();
            wq.enqueue(this, curi);
            // always take budgeting values from current curi
            // (whose overlay settings should be active here)
            wq.setSessionBudget(getBalanceReplenishAmount());
            wq.setTotalBudget(getQueueTotalBudget());
           
            if(!wq.isRetired()) {
                incrementQueuedUriCount();
                int currentPrecedence = wq.getPrecedence();
                if(!wq.isManaged() || currentPrecedence < originalPrecedence) {
                    // queue newly filled or bumped up in precedence; ensure enqueuing
                    // at precedence level (perhaps duplicate; if so that's handled elsewhere)
                    deactivateQueue(wq);
                }
            }
        }
        // Update recovery log.
        doJournalAdded(curi);
        wq.makeDirty();
        largestQueues.update(wq.getClassKey(), wq.getCount());
    }

    /**
     * Put the given queue on the readyClassQueues queue
     * @param wq
     */
    protected void readyQueue(WorkQueue wq) {
//        assert Thread.currentThread() == managerThread;

        try {
            readyClassQueues.put(wq.getClassKey());
            if(logger.isLoggable(Level.FINE)) {
                logger.log(Level.FINE,
                        "queue readied: " + wq.getClassKey());
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
            System.err.println("unable to ready queue "+wq);
            // propagate interrupt up
            throw new RuntimeException(e);
        }
    }

    /**
     * Put the given queue on the inactiveQueues queue
     * @param wq
     */
    protected void deactivateQueue(WorkQueue wq) {
        int precedence = wq.getPrecedence();

        synchronized(wq) {
            wq.noteDeactivated();
            inProcessQueues.remove(wq);
            if(wq.getCount()==0) {
                System.err.println("deactivate empty queue?");
            }

            synchronized (getInactiveQueuesByPrecedence()) {
                getInactiveQueuesForPrecedence(precedence).add(wq.getClassKey());
                if(wq.getPrecedence() < highestPrecedenceWaiting ) {
                    highestPrecedenceWaiting = wq.getPrecedence();
                }
            }

            if(logger.isLoggable(Level.FINE)) {
                logger.log(Level.FINE,
                        "queue deactivated to p" + precedence
                        + ": " + wq.getClassKey());
            }
        }
    }
   
    /**
     * Get the queue of inactive uri-queue names at the given precedence,
     * creating it if it does not yet exist.
     *
     * @param precedence
     * @return queue of inactive queue names (classKeys) at that precedence
     */
    protected Queue<String> getInactiveQueuesForPrecedence(int precedence) {
        Map<Integer,Queue<String>> inactiveQueuesByPrecedence =
            getInactiveQueuesByPrecedence();
        Queue<String> candidate = inactiveQueuesByPrecedence.get(precedence);
        if(candidate==null) {
            candidate = createInactiveQueueForPrecedence(precedence);
            inactiveQueuesByPrecedence.put(precedence,candidate);
        }
        return candidate;
    }

    /**
     * Return a sorted map of the queues of inactive WorkQueue keys (classKeys),
     * keyed by precedence
     * @return SortedMap<Integer, Queue<String>> of inactiveQueues
     */
    protected abstract SortedMap<Integer, Queue<String>> getInactiveQueuesByPrecedence();

    /**
     * Create an inactiveQueue to hold queue names at the given precedence
     * @param precedence
     * @return Queue<String> for names of inactive queues
     */
    protected abstract Queue<String> createInactiveQueueForPrecedence(int precedence);

    /**
     * Put the given queue on the retiredQueues queue
     * @param wq
     */
    protected void retireQueue(WorkQueue wq) {
//        assert Thread.currentThread() == managerThread;

        inProcessQueues.remove(wq);
        getRetiredQueues().add(wq.getClassKey());
        decrementQueuedCount(wq.getCount());
        wq.setRetired(true);
        if(logger.isLoggable(Level.FINE)) {
            logger.log(Level.FINE,
                    "queue retired: " + wq.getClassKey());
        }
    }
   
    /**
     * Return queue of all retired queue names.
     *
     * @return Queue<String> of retired queue names
     */
    protected abstract Queue<String> getRetiredQueues();

    /**
     * Accommodate any changes in retirement-determining settings (like
     * total-budget or force-retire changes/overlays).
     *
     * (Essentially, exists to be called from tools like the UI
     * Scripting Console when the operator knows it's necessary.)
     */
    public void reconsiderRetiredQueues() {

        // The rules for a 'retired' queue may have changed; so,
        // unretire all queues to 'inactive'. If they still qualify
        // as retired/overbudget next time they come up, they'll
        // be re-retired; if not, they'll get a chance to become
        // active under the new rules.
       
        // TODO: Do this automatically, only when necessary.
       
        String key = getRetiredQueues().poll();
        while (key != null) {
            WorkQueue q = (WorkQueue)this.allQueues.get(key);
            if(q != null) {
                unretireQueue(q);
                q.makeDirty();
            }
            key = getRetiredQueues().poll();
        }
    }
    /**
     * Restore a retired queue to the 'inactive' state.
     *
     * @param q
     */
    private void unretireQueue(WorkQueue q) {
//        assert Thread.currentThread() == managerThread;

        deactivateQueue(q);
        q.setRetired(false);
        incrementQueuedUriCount(q.getCount());
    }

    /**
     * Return the work queue for the given classKey, or null
     * if no such queue exists.
     *
     * @param classKey key to look for
     * @return the found WorkQueue
     */
    protected abstract WorkQueue getQueueFor(String classKey);
   
    /**
     * Return the next CrawlURI eligible to be processed (and presumably
     * visited/fetched) by a worker thread.
     *
     * Relies on the readyClassQueues having been loaded with
     * any work queues that are eligible to provide a URI.
     *
     * @return next CrawlURI eligible to be processed, or null if none available
     *
     * @see org.archive.crawler.framework.Frontier#next()
     */
    protected CrawlURI findEligibleURI() {
            // wake any snoozed queues
            wakeQueues();
            // consider rescheduled URIS
            checkFutures();
                  
            // find a non-empty ready queue, if any
            // TODO: refactor to untangle these loops, early-exits, etc!
            WorkQueue readyQ = null;
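            // Outer 'findauri' loop: the inner 'findaqueue' loop claims a
            // non-empty, in-budget queue (activating inactive queues as
            // needed); 'returnauri' then emits that queue's top URI, or
            // requeues the URI if its assigned classKey has changed.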
            findauri: while(true) {
                findaqueue: do {
                    String key = readyClassQueues.poll();
                    if(key==null) {
                        // no ready queues; try to activate one
                        if(!getInactiveQueuesByPrecedence().isEmpty()
                            && highestPrecedenceWaiting < getPrecedenceFloor()) {
                            activateInactiveQueue();
                            continue findaqueue;
                        } else {
                            // nothing ready or readyable
                            break findaqueue;
                        }
                    }
                    readyQ = getQueueFor(key);
                    if(readyQ==null) {
                         // readyQ key wasn't in all queues: unexpected
                        logger.severe("Key "+ key +
                            " in readyClassQueues but not allQueues");
                        break findaqueue;
                    }
                    if(readyQ.getCount()==0) {
                        // readyQ is empty and ready: it's exhausted
                        readyQ.noteExhausted();
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                    if(!inProcessQueues.add(readyQ)) {
                        // double activation; discard this and move on
                        // (this guard allows other enqueuings to ready or
                        // the various inactive-by-precedence queues to
                        // sometimes redundantly enqueue a queue key)
                        readyQ = null;
                        continue;
                    }
                    // queue has gone 'in process'
                    readyQ.considerActive();
                    readyQ.setWakeTime(0); // clear obsolete wake time, if any
                   
                    // we know readyQ is not empty (getCount()!=0) so peek() shouldn't return null
                    CrawlURI readyQUri = readyQ.peek(this);
                    // see HER-1973 and HER-1946
                    sheetOverlaysManager.applyOverlaysTo(readyQUri);
                    try {
                        KeyedProperties.loadOverridesFrom(readyQUri);
                        readyQ.setSessionBudget(getBalanceReplenishAmount());
                        readyQ.setTotalBudget(getQueueTotalBudget());
                    } finally {
                        KeyedProperties.clearOverridesFrom(readyQUri);
                    }
                   
                    if (readyQ.isOverSessionBudget()) {
                        deactivateQueue(readyQ);
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                    if (readyQ.isOverTotalBudget()) {
                        retireQueue(readyQ);
                        readyQ.makeDirty();
                        readyQ = null;
                        continue;
                    }
                } while (readyQ == null);
               
                if (readyQ == null) {
                    // no queues left ready or readyable
                    break findauri;
                }
          
                returnauri: while(true) { // loop left by explicit return or break on empty
                    CrawlURI curi = null;
                    curi = readyQ.peek(this);  
                    if(curi == null) {
                        // should not reach
                        logger.severe("No CrawlURI from ready non-empty queue "
                                + readyQ.classKey + "\n"
                                + readyQ.shortReportLegend() + "\n"
                                + readyQ.shortReportLine() + "\n");
                        break returnauri;
                    }
                   
                    // from queues, override names persist but not map source
                    curi.setOverlayMapsSource(sheetOverlaysManager);
                    // TODO: consider optimizations avoiding this recalc of
                    // overrides when not necessary
                    sheetOverlaysManager.applyOverlaysTo(curi);
                    // check if curi belongs in different queue
                    String currentQueueKey;
                    try {
                        KeyedProperties.loadOverridesFrom(curi);
                        currentQueueKey = getClassKey(curi);
                    } finally {
                        KeyedProperties.clearOverridesFrom(curi);
                    }
                    if (currentQueueKey.equals(curi.getClassKey())) {
                        // curi was in right queue, emit
                        noteAboutToEmit(curi, readyQ);
                        return curi;
                    }
                    // URI's assigned queue has changed since it
                    // was queued (eg because its IP has become
                    // known). Requeue to new queue.
                    // TODO: consider synchronization on readyQ
                    readyQ.dequeue(this,curi);
                    doJournalRelocated(curi);
                    curi.setClassKey(currentQueueKey);
                    decrementQueuedCount(1);
                    curi.setHolderKey(null);
                    sendToQueue(curi);
                    if(readyQ.getCount()==0) {
                        // readyQ is empty and ready: it's exhausted
                        // release held status, allowing any subsequent
                        // enqueues to again put queue in ready
                        // FIXME: tiny window here where queue could
                        // receive new URI, be readied, fail not-in-process?
                        inProcessQueues.remove(readyQ);
                        readyQ.noteExhausted();
                        readyQ.makeDirty();
                        readyQ = null;
                        continue findauri;
                    }
                }
            }
               
            if(inProcessQueues.size()==0) {
                // Nothing was ready or in progress or imminent to wake; ensure
                // any piled-up pending-scheduled URIs are considered
                uriUniqFilter.requestFlush();
            }
           
            // if truly nothing ready, wait a moment before returning null
            // so that loop in surrounding next() has a chance of getting something
            // next time
            if(getTotalEligibleInactiveQueues()==0) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    //
                }
            }
           
            // nothing eligible
            return null;
    }

    /**
     * Check for any future-scheduled URIs now eligible for reenqueuing
     */
    protected void checkFutures() {
//        assert Thread.currentThread() == managerThread;
        // TODO: consider only checking this every set interval
        if(!futureUris.isEmpty()) {
            synchronized(futureUris) {
                Iterator<CrawlURI> iter =
                    futureUris.headMap(System.currentTimeMillis())
                        .values().iterator();
                while(iter.hasNext()) {
                    CrawlURI curi = iter.next();
                    curi.setRescheduleTime(-1); // unless again set elsewhere
                    iter.remove();
                    futureUriCount.decrementAndGet();
                    receive(curi);
                }
            }
        }
    }
   
    /**
     * Activate an inactive queue, if any are available: move its key from
     * the inactive-by-precedence structure to readyClassQueues.
     *
     * @return true if a queue was activated, false if none was available
     */
    protected boolean activateInactiveQueue() {
        for (Entry<Integer, Queue<String>> entry: getInactiveQueuesByPrecedence().entrySet()) {
            int expectedPrecedence = entry.getKey();
            Queue<String> queueOfWorkQueueKeys = entry.getValue();

            while (true) {
                String workQueueKey;
                synchronized (getInactiveQueuesByPrecedence()) {
                    workQueueKey = queueOfWorkQueueKeys.poll();
                    if (workQueueKey == null) {
                        break;
                    }
                    updateHighestWaiting(expectedPrecedence);
                }

                WorkQueue candidateQ = (WorkQueue) this.allQueues.get(workQueueKey);
                if (candidateQ.getPrecedence() > expectedPrecedence) {
                    // queue demoted since placed; re-deactivate
                    deactivateQueue(candidateQ);
                    candidateQ.makeDirty();
                    continue;
                }

                try {
                    readyClassQueues.put(workQueueKey);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }

                return true;
            }
        }

        return false;
    }

    /**
     * Recalculate the value of the highest-precedence queue waiting
     * among inactive queues.
     *
     * @param startFrom start looking at this precedence value
     */
    protected void updateHighestWaiting(int startFrom) {
        // probe for new highestWaiting
        for(int precedenceKey : getInactiveQueuesByPrecedence().tailMap(startFrom).keySet()) {
            if(!getInactiveQueuesByPrecedence().get(precedenceKey).isEmpty()) {
                highestPrecedenceWaiting = precedenceKey;
                return;
            }
        }
        // nothing waiting
        highestPrecedenceWaiting = Integer.MAX_VALUE;
    }

    /**
     * Enqueue the given queue to either readyClassQueues or inactiveQueues,
     * as appropriate.
     *
     * @param wq
     */
    protected void reenqueueQueue(WorkQueue wq) {
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("queue reenqueued: " +
                wq.getClassKey());
        }
        if(highestPrecedenceWaiting < wq.getPrecedence()
            || wq.getPrecedence() >= getPrecedenceFloor()) {
            // if still over budget, deactivate
            deactivateQueue(wq);
        } else {
            readyQueue(wq);
        }
    }
   
    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.AbstractFrontier#getMaxInWait()
     */
    @Override
    protected long getMaxInWait() {
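        // Delay until the soonest snoozed queue wakes; default to 60s if none snoozed.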
        Delayed next = snoozedClassQueues.peek();
        return next == null ? 60000 : next.getDelay(TimeUnit.MILLISECONDS);
    }

    /**
     * Utility method for advanced users/experimentation: force wake all snoozed
     * queues -- for example to kick a crawl where connectivity problems have
     * put all queues in slow-retry-snoozes back to busy-ness.
     */
    public void forceWakeQueues() {
        Iterator<DelayedWorkQueue> iterSnoozed = snoozedClassQueues.iterator();
        while(iterSnoozed.hasNext()) {
            WorkQueue queue = iterSnoozed.next().getWorkQueue(WorkQueueFrontier.this);
            queue.setWakeTime(0);
            reenqueueQueue(queue);
            queue.makeDirty();
            iterSnoozed.remove();
        }
        Iterator<DelayedWorkQueue> iterOverflow = snoozedOverflow.values().iterator();
        while(iterOverflow.hasNext()) {
            WorkQueue queue = iterOverflow.next().getWorkQueue(WorkQueueFrontier.this);
            queue.setWakeTime(0);
            reenqueueQueue(queue);
            queue.makeDirty();
            iterOverflow.remove();
            snoozedOverflowCount.decrementAndGet();
        }
    }
   
    /**
     * Wake any queues sitting in the snoozed queue whose time has come.
     */
    protected void wakeQueues() {
        DelayedWorkQueue waked;
        while((waked = snoozedClassQueues.poll())!=null) {
            WorkQueue queue = waked.getWorkQueue(this);
            queue.setWakeTime(0);
            queue.makeDirty();
            reenqueueQueue(queue);
        }
        // also consider overflow (usually empty)
        if(!snoozedOverflow.isEmpty()) {
            synchronized(snoozedOverflow) {
                Iterator<DelayedWorkQueue> iter =
                    snoozedOverflow.headMap(System.currentTimeMillis()).values().iterator();
                while(iter.hasNext()) {
                    DelayedWorkQueue dq = iter.next();
                    iter.remove();
                    snoozedOverflowCount.decrementAndGet();
                    WorkQueue queue = dq.getWorkQueue(this);
                    queue.setWakeTime(0);
                    queue.makeDirty();
                    reenqueueQueue(queue);
                }
            }
        }
    }
   
    /**
     * Note that the previously emitted CrawlURI has completed
     * its processing (for now).
     *
     * The CrawlURI may be scheduled to retry, if appropriate,
     * and other related URIs may become eligible for release
     * via the next next() call, as a result of finished().
     *
     * TODO: make as many decisions about what happens to the CrawlURI
     * (success, failure, retry) and queue (retire, snooze, ready) as
     * possible elsewhere, such as in DispositionProcessor. Then, break
     * this into simple branches or focused methods for each case.
     * 
     * @see org.archive.crawler.framework.Frontier#finished(org.archive.modules.CrawlURI)
     */
    protected void processFinish(CrawlURI curi) {
//        assert Thread.currentThread() == managerThread;
       
        long now = System.currentTimeMillis();

        curi.incrementFetchAttempts();
        logNonfatalErrors(curi);
       
        WorkQueue wq = (WorkQueue) curi.getHolder();
        // always refresh budgeting values from current curi
        // (whose overlay settings should be active here)
        wq.setSessionBudget(getBalanceReplenishAmount());
        wq.setTotalBudget(getQueueTotalBudget());
       
        assert (wq.peek(this) == curi) : "unexpected peek " + wq;

        int holderCost = curi.getHolderCost();

        if (needsReenqueuing(curi)) {
            // codes/errors which don't consume the URI, leaving it atop queue
            if(curi.getFetchStatus()!=S_DEFERRED) {
                wq.expend(holderCost); // all retries but DEFERRED cost
            }
            long delay_ms = retryDelayFor(curi) * 1000;
            curi.processingCleanup(); // lose state that shouldn't burden retry
            wq.unpeek(curi);
            wq.update(this, curi); // rewrite any changes
            handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DEFERRED_FOR_RETRY));
            doJournalReenqueued(curi);
            wq.makeDirty();
            return; // no further dequeueing, logging, rescheduling to occur
        }

        // Curi will definitely be disposed of without retry, so remove from queue
        wq.dequeue(this,curi);
        decrementQueuedCount(1);
        largestQueues.update(wq.getClassKey(), wq.getCount());
        log(curi);

       
        if (curi.isSuccess()) {
            // codes deemed 'success'
            incrementSucceededFetchCount();
            totalProcessedBytes.addAndGet(curi.getRecordedSize());
            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,SUCCEEDED));
            doJournalFinishedSuccess(curi);
          
        } else if (isDisregarded(curi)) {
            // codes meaning 'undo' (even though URI was enqueued,
            // we now want to disregard it from normal success/failure tallies)
            // (eg robots-excluded, operator-changed-scope, etc)
            incrementDisregardedUriCount();
            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DISREGARDED));
            holderCost = 0; // no charge for disregarded URIs
            // TODO: consider reinstating forget-URI capability, so URI could be
            // re-enqueued if discovered again
            doJournalDisregarded(curi);
           
        } else {
            // codes meaning 'failure'
            incrementFailedFetchCount();
            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,FAILED));
            // if exception, also send to crawlErrors
            if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
                Object[] array = { curi };
                loggerModule.getRuntimeErrors().log(Level.WARNING, curi.getUURI()
                        .toString(), array);
            }       
            // charge queue any extra error penalty
            wq.noteError(getErrorPenaltyAmount());
            doJournalFinishedFailure(curi);
           
        }

        wq.expend(holderCost); // successes & failures charge cost to queue
       
        long delay_ms = curi.getPolitenessDelay();
        handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
        wq.makeDirty();
       
        if(curi.getRescheduleTime()>0) {
            // marked up for forced-revisit at a set time
            curi.processingCleanup();
            curi.resetForRescheduling();
            futureUris.put(curi.getRescheduleTime(),curi);
            futureUriCount.incrementAndGet();
        } else {
            curi.stripToMinimal();
            curi.processingCleanup();
        }
    }
   
    /**
     * Send an active queue to its next state -- retired, snoozed, or
     * re-enqueued -- based on the supplied parameters.
     *
     * @param wq work queue to transition
     * @param forceRetire true to retire the queue regardless of delay
     * @param now current time in ms
     * @param delay_ms snooze delay in ms; zero or less means re-enqueue now
     */
    protected void handleQueue(WorkQueue wq, boolean forceRetire, long now, long delay_ms) {
        inProcessQueues.remove(wq);
        if(forceRetire) {
            retireQueue(wq);
        } else if (delay_ms > 0) {
            snoozeQueue(wq, now, delay_ms);
        } else {
            getQueuePrecedencePolicy().queueReevaluate(wq);
            reenqueueQueue(wq);
        }
    }

    /**
     * Place the given queue into 'snoozed' state, ineligible to
     * supply any URIs for crawling, for the given amount of time.
     *
     * @param wq queue to snooze
     * @param now time now in ms
     * @param delay_ms time to snooze in ms
     */
    private void snoozeQueue(WorkQueue wq, long now, long delay_ms) {
        long nextTime = now + delay_ms;
        wq.setWakeTime(nextTime);
        DelayedWorkQueue dq = new DelayedWorkQueue(wq);
        if(snoozedClassQueues.size()<MAX_SNOOZED_IN_MEMORY) {
            snoozedClassQueues.add(dq);
        } else {
            synchronized(snoozedOverflow) {
                snoozedOverflow.put(nextTime, dq);
                snoozedOverflowCount.incrementAndGet();
            }
        }
    }

    /**
     * Forget the given CrawlURI. This allows a new instance
     * to be created in the future, if it is reencountered under
     * different circumstances.
     *
     * @param curi The CrawlURI to forget
     */
    protected void forget(CrawlURI curi) {
        logger.finer("Forgetting " + curi);
        uriUniqFilter.forget(curi.getCanonicalString(), curi);
    }

    /**  (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
        return (this.uriUniqFilter != null)? this.uriUniqFilter.count(): 0;
    }

    /**
     * Delete, from queues whose classKey matches queueRegex, all URIs
     * matching uriRegex.
     *
     * @param queueRegex regex to match against queue classKeys
     * @param uriRegex regex to match against URIs within matching queues
     * @return number of URIs deleted from non-retired queues
     */
    public long deleteURIs(String queueRegex, String uriRegex) {
        long count = 0;
        Pattern queuePat = Pattern.compile(queueRegex);
        for (String qname: allQueues.keySet()) {
            if (queuePat.matcher(qname).matches()) {
                WorkQueue wq = getQueueFor(qname);
                wq.unpeek(null);
                long delCount = wq.deleteMatching(this, uriRegex);
                if (!wq.isRetired()) {
                  count += delCount;
                }
                wq.makeDirty();
            }
        }
        decrementQueuedCount(count);
        return count;
    }

    //
    // Reporter implementation
    //
   
   
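    /**
     * Build a map of queue counts, by state, for the short status report.
     */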
    @Override
    public Map<String, Object> shortReportMap() {
        if (this.allQueues == null) {
            return null;
        }
       
        int allCount = allQueues.size();
        int inProcessCount = inProcessQueues.size();
        int readyCount = readyClassQueues.size();
        int snoozedCount = getSnoozedCount();
        int activeCount = inProcessCount + readyCount + snoozedCount;
        int inactiveCount = getTotalEligibleInactiveQueues();
        int ineligibleCount = getTotalIneligibleInactiveQueues();
        int retiredCount = getRetiredQueues().size();
        int exhaustedCount = allCount - activeCount - inactiveCount - retiredCount;

        Map<String,Object> map = new LinkedHashMap<String, Object>();
        map.put("totalQueues", allCount);
        map.put("inProcessQueues", inProcessCount);
        map.put("readyQueues", readyCount);
        map.put("snoozedQueues", snoozedCount);
        map.put("activeQueues", activeCount);
        map.put("inactiveQueues", inactiveCount);
        map.put("ineligibleQueues", ineligibleCount);
        map.put("retiredQueues", retiredCount);
        map.put("exhaustedQueues", exhaustedCount);
        map.put("lastReachedState", lastReachedState);

        return map;
    }

    /**
     * @param w Where to write to.
     */
    @Override
    public void shortReportLineTo(PrintWriter w) {
        if (!isRunning()) return; //???
       
        if (this.allQueues == null) {
            return;
        }
        int allCount = allQueues.size();
        int inProcessCount = inProcessQueues.size();
        int readyCount = readyClassQueues.size();
        int snoozedCount = getSnoozedCount();
        int activeCount = inProcessCount + readyCount + snoozedCount;
        int inactiveCount = getTotalEligibleInactiveQueues();
        int ineligibleCount = getTotalIneligibleInactiveQueues();
        int retiredCount = getRetiredQueues().size();
        int exhaustedCount =
            allCount - activeCount - inactiveCount - retiredCount;
        State last = lastReachedState;
        w.print(last);
        w.print(" - ");
        w.print(allCount);
        w.print(" URI queues: ");
        w.print(activeCount);
        w.print(" active (");
        w.print(inProcessCount);
        w.print(" in-process; ");
        w.print(readyCount);
        w.print(" ready; ");
        w.print(snoozedCount);
        w.print(" snoozed); ");
        w.print(inactiveCount);
        w.print(" inactive; ");
        w.print(ineligibleCount);
        w.print(" ineligible; ");
        w.print(retiredCount);
        w.print(" retired; ");
        w.print(exhaustedCount);
        w.print(" exhausted");       
        w.flush();
    }

    /**
     * Total number of inactive queues, at all precedences
     * @return int total
     */
    protected int getTotalInactiveQueues() {
        return tallyInactiveTotals(getInactiveQueuesByPrecedence());
    }
   
    /**
     * Total number of inactive queues at precedences above the floor
     * @return int total
     */
    protected int getTotalEligibleInactiveQueues() {
        return tallyInactiveTotals(
                getInactiveQueuesByPrecedence().headMap(getPrecedenceFloor()));
    }
   
    /**
     * Total number of inactive queues at precedences at or below the floor
     * @return int total
     */
    protected int getTotalIneligibleInactiveQueues() {
        return tallyInactiveTotals(
                getInactiveQueuesByPrecedence().tailMap(getPrecedenceFloor()));
    }

    /**
     * @param iqueues map of inactive queue-name queues, keyed by precedence
     * @return total number of queue names across the given queues
     */
    private int tallyInactiveTotals(SortedMap<Integer,Queue<String>> iqueues) {
        int inactiveCount = 0;
        for(Queue<String> q : iqueues.values()) {
            inactiveCount += q.size();
        }
        return inactiveCount;
    }
   
    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineLegend()
     */
    @Override
    public String shortReportLegend() {
        return "total active in-process ready snoozed inactive retired exhausted";
    }

    /**
     * This method compiles a human readable report on the status of the frontier
     * at the time of the call.
     * @param writer Where to write to.
     */
    @Override
    public synchronized void reportTo(PrintWriter writer) {
        int allCount = allQueues.size();
        int inProcessCount = inProcessQueues.size();
        int readyCount = readyClassQueues.size();
        int snoozedCount = getSnoozedCount();
        int activeCount = inProcessCount + readyCount + snoozedCount;
        int inactiveCount = getTotalInactiveQueues();
        int retiredCount = getRetiredQueues().size();
        int exhaustedCount =
            allCount - activeCount - inactiveCount - retiredCount;
       
        writer.print("Frontier report - ");
        writer.print(ArchiveUtils.get12DigitDate());
        writer.print("\n");
        writer.print(" Job being crawled: ");
        writer.print(controller.getMetadata().getJobName());
        writer.print("\n");
        writer.print("\n -----===== STATS =====-----\n");
        writer.print(" Discovered:    ");
        writer.print(Long.toString(discoveredUriCount()));
        writer.print("\n");
        writer.print(" Queued:        ");
        writer.print(Long.toString(queuedUriCount()));
        writer.print("\n");
        writer.print(" Finished:      ");
        writer.print(Long.toString(finishedUriCount()));
        writer.print("\n");
        writer.print("  Successfully: ");
        writer.print(Long.toString(succeededFetchCount()));
        writer.print("\n");
        writer.print("  Failed:       ");
        writer.print(Long.toString(failedFetchCount()));
        writer.print("\n");
        writer.print("  Disregarded:  ");
        writer.print(Long.toString(disregardedUriCount()));
        writer.print("\n");
        writer.print("\n -----===== QUEUES =====-----\n");
        writer.print(" Already included size:     ");
        writer.print(Long.toString(uriUniqFilter.count()));
        writer.print("\n");
        writer.print("               pending:     ");
        writer.print(Long.toString(uriUniqFilter.pending()));
        writer.print("\n");
        writer.print("\n All class queues map size: ");
        writer.print(Long.toString(allCount));
        writer.print("\n");
        writer.print( "             Active queues: ");
        writer.print(activeCount);
        writer.print("\n");
        writer.print("                    In-process: ");
        writer.print(inProcessCount);
        writer.print("\n");
        writer.print("                         Ready: ");
        writer.print(readyCount);
        writer.print("\n");
        writer.print("                       Snoozed: ");
        writer.print(snoozedCount);
        writer.print("\n");
        writer.print("           Inactive queues: ");
        writer.print(inactiveCount);
        writer.print(" (");
        Map<Integer,Queue<String>> inactives = getInactiveQueuesByPrecedence();
        boolean betwixt = false;
        for(Integer k : inactives.keySet()) {
            if(betwixt) {
                writer.print("; ");
            }
            writer.print("p");
            writer.print(k);
            writer.print(": ");
            writer.print(inactives.get(k).size());
            betwixt = true;
        }
        writer.print(")\n");
        writer.print("            Retired queues: ");
        writer.print(retiredCount);
        writer.print("\n");
        writer.print("          Exhausted queues: ");
        writer.print(exhaustedCount);
        writer.print("\n");
       
        State last = lastReachedState;
        writer.print("\n             Last state: "+last);       
       
        writer.print("\n -----===== MANAGER THREAD =====-----\n");
        ToeThread.reportThread(managerThread, writer);
       
        writer.print("\n -----===== "+largestQueues.size()+" LONGEST QUEUES =====-----\n");
        appendQueueReports(writer, "LONGEST", largestQueues.getEntriesDescending().iterator(), largestQueues.size(), largestQueues.size());
       
        writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
        Collection<WorkQueue> inProcess = inProcessQueues;
        ArrayList<WorkQueue> copy = extractSome(inProcess, maxQueuesPerReportCategory);
        appendQueueReports(writer, "IN-PROCESS", copy.iterator(), copy.size(), maxQueuesPerReportCategory);
       
        writer.print("\n -----===== READY QUEUES =====-----\n");
        appendQueueReports(writer, "READY", this.readyClassQueues.iterator(),
            this.readyClassQueues.size(), maxQueuesPerReportCategory);
       
        writer.print("\n -----===== SNOOZED QUEUES =====-----\n");
        Object[] objs = snoozedClassQueues.toArray();
        DelayedWorkQueue[] qs = Arrays.copyOf(objs,objs.length,DelayedWorkQueue[].class);
        Arrays.sort(qs);
        appendQueueReports(writer, "SNOOZED", new ObjectArrayIterator(qs), getSnoozedCount(), maxQueuesPerReportCategory);
       
        writer.print("\n -----===== INACTIVE QUEUES =====-----\n");
        SortedMap<Integer,Queue<String>> sortedInactives = getInactiveQueuesByPrecedence();
        for(Integer prec : sortedInactives.keySet()) {
            Queue<String> inactiveQueues = sortedInactives.get(prec);
            appendQueueReports(writer, "INACTIVE-p"+prec, inactiveQueues.iterator(),
                    inactiveQueues.size(), maxQueuesPerReportCategory);
        }
       
        writer.print("\n -----===== RETIRED QUEUES =====-----\n");
        appendQueueReports(writer, "RETIRED", getRetiredQueues().iterator(),
            getRetiredQueues().size(), maxQueuesPerReportCategory);
       
        writer.flush();
    }
   
    /** Compact report of all nonempty queues (one queue per line)
     *
     * @param writer
     */
    public void allNonemptyReportTo(PrintWriter writer) {
        ArrayList<WorkQueue> inProcessQueuesCopy;
        synchronized(this.inProcessQueues) {
            // grab a copy that will be stable against mods for report duration
            Collection<WorkQueue> inProcess = this.inProcessQueues;
            inProcessQueuesCopy = new ArrayList<WorkQueue>(inProcess);
        }
        writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
        queueSingleLinesTo(writer, inProcessQueuesCopy.iterator());

        writer.print("\n -----===== READY QUEUES =====-----\n");
        queueSingleLinesTo(writer, this.readyClassQueues.iterator());

        writer.print("\n -----===== SNOOZED QUEUES =====-----\n");
        queueSingleLinesTo(writer, this.snoozedClassQueues.iterator());
        queueSingleLinesTo(writer, this.snoozedOverflow.values().iterator());
       
        writer.print("\n -----===== INACTIVE QUEUES =====-----\n");
        for(Queue<String> inactiveQueues : getInactiveQueuesByPrecedence().values()) {
            queueSingleLinesTo(writer, inactiveQueues.iterator());
        }
       
        writer.print("\n -----===== RETIRED QUEUES =====-----\n");
        queueSingleLinesTo(writer, getRetiredQueues().iterator());
    }

    /** Compact report of all known queues (one queue per line)
     *
     * @param writer
     */
    public void allQueuesReportTo(PrintWriter writer) {
        queueSingleLinesTo(writer, allQueues.keySet().iterator());
    }
   
    /**
     * Write the single-line reports of all queues in the
     * iterator to the writer.
     *
     * @param writer to receive report
     * @param iterator iterator over queues (or queue keys) of interest
     */
    private void queueSingleLinesTo(PrintWriter writer, Iterator<?> iterator) {
        Object obj;
        WorkQueue q;
        boolean legendWritten = false;
        while( iterator.hasNext()) {
            obj = iterator.next();
            if (obj ==  null) {
                continue;
            }
            if(obj instanceof WorkQueue) {
                q = (WorkQueue)obj;
            } else if (obj instanceof DelayedWorkQueue) {
                q = ((DelayedWorkQueue)obj).getWorkQueue(this);
            } else {
                try {
                    q = this.allQueues.get((String)obj);
                } catch (ClassCastException cce) {
                    logger.log(Level.SEVERE,"not convertible to workqueue:"+obj,cce);
                    q = null;
                }
            }

            if(q != null) {
                if(!legendWritten) {
                    writer.println(q.shortReportLegend());
                    legendWritten = true;
                }
                q.shortReportLineTo(writer);
            } else {
                writer.print(" ERROR: "+obj);
            }
        }      
    }

    /**
     * Extract some of the elements in the given collection to an
     * ArrayList.  This method synchronizes on the given collection's
     * monitor.  The returned list will never contain more than the
     * specified maximum number of elements.
     *
     * @param c    the collection whose elements to extract
     * @param max  the maximum number of elements to extract
     * @return  the extraction
     */
    private static <T> ArrayList<T> extractSome(Collection<T> c, int max) {
        // Try to guess a sane initial capacity for ArrayList
        // Hopefully given collection won't grow more than 10 items
        // between now and the synchronized block...
        int initial = Math.min(c.size() + 10, max);
        int count = 0;
        ArrayList<T> list = new ArrayList<T>(initial);
        synchronized (c) {
            Iterator<T> iter = c.iterator();
            while (iter.hasNext() && (count < max)) {
                list.add(iter.next());
                count++;
            }
        }
        return list;
    }

    /**
     * Append queue reports to the general Frontier report.
     * @param w PrintWriter to append to
     * @param label label prefix for each reported queue
     * @param iterator iterator over queues (or queue keys) of interest
     * @param total total number of queues in this category
     * @param max maximum number of queues to report in full
     */
    @SuppressWarnings("rawtypes")
    protected void appendQueueReports(PrintWriter w, String label, Iterator<?> iterator,
            int total, int max) {
        Object obj;
        WorkQueue q;
        int count;
        for(count = 0; iterator.hasNext() && (count < max); count++) {
            obj = iterator.next();
            if (obj ==  null) {
                continue;
            }
            if(obj instanceof WorkQueue) {
                q = (WorkQueue)obj;
            } else if (obj instanceof DelayedWorkQueue) {
                q = (WorkQueue)((DelayedWorkQueue)obj).getWorkQueue(this);
            } else if (obj instanceof Map.Entry) {
                q = this.allQueues.get((String)((Map.Entry)obj).getKey());
            } else {
                q = this.allQueues.get((String)obj);
            }
            if(q != null) {
                w.println(label+"#"+count+":");
                q.reportTo(w);
            } else {
                w.print("WARNING: No report for queue "+obj);
            }
        }
        count++;
        if(count < total) {
            w.print("...and " + (total - count) + " more "+label+".\n");
        }
    }

    /**
     * Force logging, etc. of operator-deleted CrawlURIs
     *
     * @see org.archive.crawler.framework.Frontier#deleted(org.archive.modules.CrawlURI)
     */
    public void deleted(CrawlURI curi) {
        //treat as disregarded
        appCtx.publishEvent(
            new CrawlURIDispositionEvent(this,curi,DISREGARDED));
        log(curi);
        incrementDisregardedUriCount();
        curi.stripToMinimal();
        curi.processingCleanup();
    }

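    /**
     * Note the given CrawlURI as already-included (seen) without scheduling
     * it, charging its holder cost against the appropriate queue's budget.
     */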
    public void considerIncluded(CrawlURI curi) {
        sheetOverlaysManager.applyOverlaysTo(curi);
        if(curi.getClassKey()==null) {
            // remedial processing
            preparer.prepare(curi);
        }
        this.uriUniqFilter.note(curi.getCanonicalString());
        try {
            KeyedProperties.loadOverridesFrom(curi);
            curi.setClassKey(getClassKey(curi));
            WorkQueue wq = getQueueFor(curi.getClassKey());
            wq.expend(curi.getHolderCost());
            wq.makeDirty();
        } finally {
            KeyedProperties.clearOverridesFrom(curi);
        }
    }
   
    /**
     * Returns <code>true</code> if the WorkQueue implementation of this
     * Frontier stores its workload on disk instead of relying
     * on serialization mechanisms.
     *
     * TODO: rename! (this is a very misleading name) or kill (don't
     * see any implementations that return false)
     *
     * @return a constant boolean value for this class/instance
     */
    protected abstract boolean workQueueDataOnDisk();

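    /** Average number of queued URIs per known active or inactive queue. */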
    public long averageDepth() {
        if(inProcessQueues==null || readyClassQueues==null || snoozedClassQueues==null) {
            return 0;
        }
        int inProcessCount = inProcessQueues.size();
        int readyCount = readyClassQueues.size();
        int snoozedCount = getSnoozedCount();
        int activeCount = inProcessCount + readyCount + snoozedCount;
        int inactiveCount = getTotalInactiveQueues();
        int totalQueueCount = (activeCount+inactiveCount);
        return (totalQueueCount == 0) ? 0 : queuedUriCount.get() / totalQueueCount;
    }
   
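    /** Number of snoozed queues, both in-memory and spilled to disk. */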
    protected int getSnoozedCount() {
        return snoozedClassQueues.size() + snoozedOverflowCount.get();
    }
   
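    /**
     * Rough congestion measure: ratio of queues wanting attention (active
     * plus eligible inactive) to queues currently occupying crawl capacity
     * (in-process plus snoozed).
     */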
    public float congestionRatio() {
        if(inProcessQueues==null || readyClassQueues==null || snoozedClassQueues==null) {
            return 0;
        }
        int inProcessCount = inProcessQueues.size();
        int readyCount = readyClassQueues.size();
        int snoozedCount = getSnoozedCount();
        int activeCount = inProcessCount + readyCount + snoozedCount;
        int eligibleInactiveCount = getTotalEligibleInactiveQueues();
        return (float)(activeCount + eligibleInactiveCount) / (inProcessCount + snoozedCount);
    }
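    /** Size of the single largest known queue, or -1 if none tracked yet. */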
    public long deepestUri() {
        return largestQueues.getTopSet().size()==0 ? -1 : largestQueues.getTopSet().get(largestQueues.getLargest());
    }
   
    /**
     * Return whether frontier is exhausted: all crawlable URIs done (none
     * waiting or pending). Only gives precise answer inside managerThread.
     *
     * @see org.archive.crawler.framework.Frontier#isEmpty()
     */
    public boolean isEmpty() {
        return queuedUriCount.get() == 0
            && (uriUniqFilter == null || uriUniqFilter.pending() == 0)
            && futureUriCount.get() == 0;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.AbstractFrontier#getInProcessCount()
     */
    @Override
    protected int getInProcessCount() {
        return inProcessQueues.size();
    }
   
} // TODO: slim class! Suspect it should be < 800 lines, shedding budgeting/reporting