/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.operations;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.FetcherMode;
import bixo.datum.FetchSetDatum;
import bixo.datum.FetchedDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.datum.UrlStatus;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.FetchTask;
import bixo.fetcher.IFetchMgr;
import bixo.hadoop.FetchCounters;
import bixo.utils.DiskQueue;
import bixo.utils.ThreadedExecutor;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import com.scaleunlimited.cascading.BaseDatum;
import com.scaleunlimited.cascading.LoggingFlowProcess;
import com.scaleunlimited.cascading.LoggingFlowReporter;
import com.scaleunlimited.cascading.NullContext;
@SuppressWarnings( { "serial", "rawtypes" })
public class FetchBuffer extends BaseOperation<NullContext> implements Buffer<NullContext>, IFetchMgr {
private static final Logger LOGGER = LoggerFactory.getLogger(FetchBuffer.class);
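// Orders queued fetch sets by when their grouping ref can next be fetched: refs that are
// neither active nor pending sort first, pending refs sort by their next fetch time, and
// refs that are currently being fetched sort last. Ties go to the larger fetch set.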
private class QueuedFetchSetsComparator implements Comparator<FetchSetDatum> {
private long getFetchTime(String groupingRef) {
if (_activeRefs.get(groupingRef) == null) {
Long nextFetchTime = _pendingRefs.get(groupingRef);
if (nextFetchTime == null) {
return(0);
} else {
return(nextFetchTime);
}
} else {
// fetch set is active, so sort at end
return(Long.MAX_VALUE);
}
}
@Override
public int compare(FetchSetDatum o1, FetchSetDatum o2) {
long o1FetchTime = getFetchTime(o1.getGroupingRef());
long o2FetchTime = getFetchTime(o2.getGroupingRef());
// The entry that's ready sooner sorts sooner. If both
// are ready, return the one with the bigger fetch set.
if (o1FetchTime < o2FetchTime) {
return -1;
} else if (o1FetchTime > o2FetchTime) {
return 1;
} else if (o1.getUrls().size() > o2.getUrls().size()) {
return -1;
} else if (o1.getUrls().size() < o2.getUrls().size()) {
return 1;
} else {
return 0;
}
}
}
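// Wraps the Hadoop values iterator with a disk-backed priority queue, so fetch sets whose
// server isn't ready yet can be parked and re-examined later without holding them all in memory.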
private class QueuedValues {
// TODO - make this part of CrawlPolicy. We'd actually like to constrain by the total number of URLs, not by the number of FetchSetDatums.
private static final int MAX_ELEMENTS_IN_MEMORY = 10000;
private static final int MAX_FETCHSETS_TO_QUEUE_PER_DELAY = 100;
private DiskQueue<FetchSetDatum> _queue;
private Iterator<TupleEntry> _values;
private boolean _iteratorDone;
public QueuedValues(Iterator<TupleEntry> values) {
_values = values;
_iteratorDone = false;
_queue = new DiskQueue<FetchSetDatum>(MAX_ELEMENTS_IN_MEMORY, new QueuedFetchSetsComparator());
}
/**
* Return true if the iterator has another Tuple. This avoids calling
* the hasNext() method after it returns false, as doing so with
* Cascading 1.2 will trigger an NPE.
*
* @return true if there's another Tuple waiting to be read.
*/
private boolean safeHasNext() {
_iteratorDone = _iteratorDone || !_values.hasNext();
return !_iteratorDone;
}
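// True only when both the spill queue and the values iterator are exhausted.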
public boolean isEmpty() {
return _queue.isEmpty() && !safeHasNext();
}
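// A grouping ref is ready to fetch when it has no active fetch and either no pending
// next-fetch time or a pending time that has already passed.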
private boolean readyToFetch(String ref) {
if (_activeRefs.get(ref) == null) {
Long nextFetchTime = _pendingRefs.get(ref);
if ((nextFetchTime == null) || (nextFetchTime <= System.currentTimeMillis())) {
return true;
}
}
return false;
}
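/**
* Return the next fetch set that's ready to be fetched given {@code mode}, parking not-ready
* sets from the iterator in the queue as we go. Returns null when nothing is ready (so
* FetchBuffer can sleep) or once we've queued MAX_FETCHSETS_TO_QUEUE_PER_DELAY sets without a delay.
*
* @param mode fetcher mode that controls how politeness delays are handled
* @return next fetch set to process, or null if the caller should delay
*/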
public FetchSetDatum nextOrNull(FetcherMode mode) {
int fetchSetsQueued = 0;
// Loop until we have something to return, or there's nothing that we can return, or we've
// queued up as many fetchsets as we want without any delay.
while (!isEmpty() && (fetchSetsQueued < MAX_FETCHSETS_TO_QUEUE_PER_DELAY)) {
// First see if we've got something in the queue, and if so, then check if it's ready
// to be processed.
final FetchSetDatum queueDatum = removeFromQueue();
if (queueDatum != null) {
String ref = queueDatum.getGroupingRef();
if (readyToFetch(ref) || (mode == FetcherMode.IMPOLITE)) {
List<ScoredUrlDatum> urls = queueDatum.getUrls();
trace("Returning %d urls via queue from %s (e.g. %s)", urls.size(), ref, urls.get(0).getUrl());
return queueDatum;
}
}
// Nothing in the queue, or the top of the queue isn't ready, so let's see what the iterator has.
if (safeHasNext()) {
// Re-add the thing from the top of the queue, since we're going to want to keep it around.
// This is safe to call with a null datum.
addToQueue(queueDatum);
// Now get our next FetchSet from the Hadoop iterator.
FetchSetDatum iterDatum = new FetchSetDatum(new TupleEntry(_values.next()));
List<ScoredUrlDatum> urls = iterDatum.getUrls();
String ref = iterDatum.getGroupingRef();
if (iterDatum.isSkipped()) {
trace("Skipping %d urls via iterator from %s (e.g. %s)", urls.size(), ref, urls.get(0).getUrl());
skipUrls(urls, UrlStatus.SKIPPED_PER_SERVER_LIMIT, null);
continue;
}
if ((mode == FetcherMode.IMPOLITE) || readyToFetch(ref)) {
trace("Returning %d urls via iterator from %s (e.g. %s)", urls.size(), ref, urls.get(0).getUrl());
return iterDatum;
}
// We've got a datum from the iterator that's not ready to be processed, so we'll stuff it into the queue.
trace("Queuing %d urls via iterator from %s (e.g. %s)", urls.size(), iterDatum.getGroupingRef(), urls.get(0).getUrl());
addToQueue(iterDatum);
fetchSetsQueued += 1;
continue;
}
// Nothing ready from top of queue, and iterator is empty too. If we had something from the top of the queue (which then
// must not be ready), decide what to do based on our FetcherMode.
if (queueDatum != null) {
List<ScoredUrlDatum> urls = queueDatum.getUrls();
switch (mode) {
case COMPLETE:
// Re-add the datum, since we don't want to skip it, and return immediately; otherwise
// we'd be trapped in this loop instead of giving FetchBuffer time to delay.
trace("Blocked on %d urls via queue from %s (e.g. %s)", urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
addToQueue(queueDatum);
return null;
case IMPOLITE:
trace("Impolitely returning %d urls via queue from %s (e.g. %s)", urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
return queueDatum;
case EFFICIENT:
// In efficient fetching, we punt on items that aren't ready. And immediately return, so that FetchBuffer's loop has
// time to delay, as otherwise we'd likely skip everything that's in the in-memory queue (since the item we're skipping
// is the "best" in terms of when it's going to be ready).
trace("Efficiently skipping %d urls via queue from %s (e.g. %s)", urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
skipUrls(urls, UrlStatus.SKIPPED_INEFFICIENT, null);
return null;
}
}
}
// Either we're all out of FetchSets to process (nothing left in iterator or queue) or we've queued up lots of sets, and
// we want to give FetchBuffer a chance to sleep.
return null;
}
/**
* Empty the queue, then the iterator, without worrying about mode or state.
*
* @return the next fetch set from the queue or the iterator, or null if both are empty
*/
public FetchSetDatum drain() {
if (!_queue.isEmpty()) {
return removeFromQueue();
} else if (safeHasNext()) {
return new FetchSetDatum(new TupleEntry(_values.next()));
} else {
return null;
}
}
/**
* Remove and return the top-most fetch set from the queue, decrementing the queued counters,
* or return null if the queue is empty.
*
* @return fetch set from the queue, or null if the queue is empty
*/
private FetchSetDatum removeFromQueue() {
FetchSetDatum result = _queue.poll();
if (result != null) {
_flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, -1);
_flowProcess.increment(FetchCounters.URLS_QUEUED, -result.getUrls().size());
}
return result;
}
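// Add a fetch set to the spill queue (a no-op for null), bumping the queued counters.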
private void addToQueue(FetchSetDatum datum) {
if (datum != null) {
_flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, 1);
_flowProcess.increment(FetchCounters.URLS_QUEUED, datum.getUrls().size());
_queue.add(datum);
}
}
}
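// Extra field appended to the FetchedDatum fields, holding the fetch result (a string, a status, or an exception).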
private static final Fields FETCH_RESULT_FIELD = new Fields(BaseDatum.fieldName(FetchBuffer.class, "fetch-exception"));
// Time to sleep when we don't have any URLs that can be fetched.
private static final long NOTHING_TO_FETCH_SLEEP_TIME = 1000;
private static final long HARD_TERMINATION_CLEANUP_DURATION = 10 * 1000L;
private BaseFetcher _fetcher;
private FetcherMode _fetcherMode;
private transient ThreadedExecutor _executor;
private transient LoggingFlowProcess _flowProcess;
private transient TupleEntryCollector _collector;
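// Ref-tracking state: _activeRefs maps grouping refs currently being fetched to their next
// allowed fetch time (0 for the last batch), _pendingRefs maps refs waiting out their fetch
// delay to that time, and _refLock guards updates that must touch both maps atomically.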
private transient Object _refLock;
private transient ConcurrentHashMap<String, Long> _activeRefs;
private transient ConcurrentHashMap<String, Long> _pendingRefs;
private transient AtomicBoolean _keepCollecting;
public FetchBuffer(BaseFetcher fetcher) {
// We're going to output a tuple that contains a FetchedDatum, plus meta-data,
// plus a result that could be a string, a status, or an exception
super(FetchedDatum.FIELDS.append(FETCH_RESULT_FIELD));
_fetcher = fetcher;
_fetcherMode = _fetcher.getFetcherPolicy().getFetcherMode();
}
@Override
public boolean isSafe() {
// We definitely DO NOT want to be called multiple times for the same
// scored datum, so let Cascading 1.1 know that the output from us should
// be stashed in tempHfs if need be.
return false;
}
@SuppressWarnings({"unchecked" })
@Override
public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
super.prepare(flowProcess, operationCall);
_flowProcess = new LoggingFlowProcess(flowProcess);
_flowProcess.addReporter(new LoggingFlowReporter());
_executor = new ThreadedExecutor(_fetcher.getMaxThreads(), _fetcher.getFetcherPolicy().getRequestTimeout());
_refLock = new Object();
_pendingRefs = new ConcurrentHashMap<String, Long>();
_activeRefs = new ConcurrentHashMap<String, Long>();
_keepCollecting = new AtomicBoolean(true);
}
@Override
public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());
_collector = buffCall.getOutputCollector();
FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();
// Each value is a FetchSetDatum that contains a set of URLs to fetch in one request from
// a single server, plus other values needed to set state properly.
while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
FetchSetDatum datum = values.nextOrNull(_fetcherMode);
try {
if (datum == null) {
trace("Nothing ready to fetch, sleeping...");
process.keepAlive();
Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
} else {
List<ScoredUrlDatum> urls = datum.getUrls();
String ref = datum.getGroupingRef();
trace("Processing %d URLs for %s", urls.size(), ref);
Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
if (datum.isLastList()) {
makeActive(ref, 0L);
trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
} else {
Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
makeActive(ref, nextFetchTime);
trace("Executing fetch of %d URLs from %s (next fetch time %d)", urls.size(), ref, nextFetchTime);
}
long startTime = System.currentTimeMillis();
try {
_executor.execute(doFetch);
} catch (RejectedExecutionException e) {
// This should never happen, but handle it by skipping the whole fetch set.
LOGGER.error("Fetch pool rejected our fetch list for " + ref);
finished(ref);
skipUrls(urls, UrlStatus.SKIPPED_DEFERRED, String.format("Execution rejection skipped %d URLs", urls.size()));
}
// Adjust for how long it took to get the request queued.
adjustActive(ref, System.currentTimeMillis() - startTime);
}
} catch (InterruptedException e) {
LOGGER.warn("FetchBuffer interrupted!");
Thread.currentThread().interrupt();
}
}
// Skip all URLs that we've got left.
if (!values.isEmpty()) {
trace("Found unprocessed URLs");
UrlStatus status = Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;
while (!values.isEmpty()) {
FetchSetDatum datum = values.drain();
List<ScoredUrlDatum> urls = datum.getUrls();
trace("Skipping %d urls from %s (e.g. %s) ", urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
skipUrls(urls, status, null);
}
}
}
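// Shut down fetching: give queued FetchTasks a chance to start, try to terminate the executor
// within the request timeout, hard-abort active fetches if that fails, then stop collecting.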
private synchronized void terminate() {
if (_executor == null) {
return;
}
try {
// We don't know the worst-case amount of time a worker thread will effectively
// "sleep" while waiting for a FetchTask to be queued up, so sleep for the executor's
// maximum poll time to cover that window before asking it to terminate.
long pollTime = ThreadedExecutor.MAX_POLL_TIME;
Thread.sleep(pollTime);
long requestTimeout = _fetcher.getFetcherPolicy().getRequestTimeout();
if (!_executor.terminate(requestTimeout)) {
LOGGER.warn("Had to do a hard termination of general fetching");
// Abort any active connections, which should give the FetchTasks a chance
// to clean things up.
_fetcher.abort();
// Now give everybody who had to be interrupted some time to
// actually write out their remaining URLs.
Thread.sleep(HARD_TERMINATION_CLEANUP_DURATION);
}
// Now stop collecting results. If somebody is in the middle of the collect() call,
// we want them to finish before we set it to false and drop out of this method.
synchronized (_keepCollecting) {
_keepCollecting.set(false);
}
} catch (InterruptedException e) {
// FUTURE What's the right thing to do here? E.g. do I need to worry about
// losing URLs still to be processed?
LOGGER.warn("Interrupted while waiting for termination");
} finally {
_executor = null;
}
}
@Override
public void flush(FlowProcess process, OperationCall<NullContext> operationCall) {
LOGGER.info("Flushing FetchBuffer");
terminate();
super.flush(process, operationCall);
}
@Override
public void cleanup(FlowProcess process, OperationCall<NullContext> operationCall) {
LOGGER.info("Cleaning up FetchBuffer");
terminate();
_flowProcess.dumpCounters();
super.cleanup(process, operationCall);
}
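// Called when a batch for a grouping ref has been fetched: remove the ref from the active map,
// and put it back in the pending map if more batches remain (non-zero next fetch time).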
@Override
public void finished(String ref) {
synchronized (_refLock) {
Long nextFetchTime = _activeRefs.remove(ref);
if (nextFetchTime == null) {
throw new RuntimeException("finished called on non-active ref: " + ref);
}
// If there's going to be more to fetch, put it back in the pending pool.
if (nextFetchTime != 0) {
trace("Finished batch fetch for %s, with next batch at %d", ref, nextFetchTime);
_pendingRefs.put(ref, nextFetchTime);
} else {
trace("Finished last batch fetch for %s", ref);
}
}
}
@Override
public void collect(Tuple tuple) {
// Prevent two bad things from happening:
// 1. Another thread changes _keepCollecting after we've tested that it's true.
// 2. Two threads call collector.add() at the same time (it's not thread-safe).
synchronized (_keepCollecting) {
if (_keepCollecting.get()) {
_collector.add(BixoPlatform.clone(tuple, _flowProcess));
} else {
LOGGER.warn("Losing an entry: " + tuple);
}
}
}
@Override
public LoggingFlowProcess getProcess() {
return _flowProcess;
}
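/**
* Emit a FetchedDatum with the given skipped status for each URL, bump the skip counters,
* and optionally log a trace message (a format string that takes the URL count).
*/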
private void skipUrls(List<ScoredUrlDatum> urls, UrlStatus status, String traceMsg) {
for (ScoredUrlDatum datum : urls) {
FetchedDatum result = new FetchedDatum(datum);
Tuple tuple = result.getTuple();
tuple.add(status.toString());
_collector.add(BixoPlatform.clone(tuple, _flowProcess));
}
_flowProcess.increment(FetchCounters.URLS_SKIPPED, urls.size());
if (status == UrlStatus.SKIPPED_PER_SERVER_LIMIT) {
_flowProcess.increment(FetchCounters.URLS_SKIPPED_PER_SERVER_LIMIT, urls.size());
}
if ((traceMsg != null) && LOGGER.isTraceEnabled()) {
LOGGER.trace(String.format(traceMsg, urls.size()));
}
}
/**
* Make {@code ref} active, removing it from the pending map if necessary.
*
* @param ref grouping ref to mark as active
* @param nextFetchTime time when the next batch for this ref may be fetched (0 if this is the last batch)
*/
private void makeActive(String ref, Long nextFetchTime) {
synchronized (_refLock) {
trace("Making %s active", ref);
_pendingRefs.remove(ref);
_activeRefs.put(ref, nextFetchTime);
}
}
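// Push back an active ref's next fetch time by deltaTime (e.g. the time spent getting the
// fetch task queued), so the politeness interval isn't effectively shortened.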
private void adjustActive(String ref, long deltaTime) {
synchronized (_refLock) {
Long nextFetchTime = _activeRefs.get(ref);
if ((nextFetchTime != null) && (nextFetchTime != 0) && (deltaTime != 0)) {
_activeRefs.put(ref, nextFetchTime + deltaTime);
}
}
}
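// Log a formatted message at trace level, doing the formatting only when trace is enabled.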
private void trace(String template, Object... params) {
if (LOGGER.isTraceEnabled()) {
LOGGER.trace(String.format(template, params));
}
}
}