/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.operations;

import java.util.Iterator;
import java.util.concurrent.RejectedExecutionException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import bixo.config.UserAgent;
import bixo.datum.GroupedUrlDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.fetcher.BaseFetcher;
import bixo.hadoop.FetchCounters;
import bixo.robots.BaseRobotsParser;
import bixo.robots.RobotUtils;
import bixo.utils.DiskQueue;
import bixo.utils.GroupingKey;
import bixo.utils.ThreadedExecutor;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.operation.OperationCall;
import cascading.tuple.TupleEntry;

import com.scaleunlimited.cascading.LoggingFlowProcess;
import com.scaleunlimited.cascading.LoggingFlowReporter;
import com.scaleunlimited.cascading.NullContext;

/**
 * Filter out URLs, either by domain (e.g. the domain isn't popular enough to
 * be worth fetching) or because they're blocked by the domain's robots.txt,
 * and score the survivors, emitting one ScoredUrlDatum per URL.
 *
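 * <p>A minimal wiring sketch (illustrative only, not taken from this file):
 * it assumes {@code urlPipe} emits {@link GroupedUrlDatum} tuples whose first
 * field is the protocol+domain grouping key, and that the parser and score
 * generator implementations shown are available on your classpath.</p>
 *
 * <pre>{@code
 * UserAgent userAgent = new UserAgent("mycrawler", "crawler@domain.com", "http://domain.com/crawler");
 * BaseRobotsParser parser = new SimpleRobotRulesParser();
 * BaseScoreGenerator scorer = new FixedScoreGenerator();
 *
 * Pipe robotsPipe = new GroupBy(urlPipe, new Fields("groupingKey"));
 * robotsPipe = new Every(robotsPipe,
 *     new FilterAndScoreByUrlAndRobots(userAgent, 10, parser, scorer),
 *     Fields.RESULTS);
 * }</pre>
 */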
@SuppressWarnings({"serial", "rawtypes"})
public class FilterAndScoreByUrlAndRobots extends BaseOperation<NullContext> implements Buffer<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(FilterAndScoreByUrlAndRobots.class);

    // Give a robots.txt fetch task as much time as the longest possible fetch.
    private static final long COMMAND_TIMEOUT = RobotUtils.getMaxFetchTime();
    private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT;

    // Max URLs the DiskQueue keeps in memory per group before spilling to disk.
    private static final int MAX_URLS_IN_MEMORY = 100;

    private BaseScoreGenerator _scorer;
    private BaseFetcher _fetcher;
    private BaseRobotsParser _parser;

    private transient ThreadedExecutor _executor;
    private transient LoggingFlowProcess _flowProcess;

    public FilterAndScoreByUrlAndRobots(UserAgent userAgent, int maxThreads, BaseRobotsParser parser, BaseScoreGenerator scorer) {
        // We're going to output a ScoredUrlDatum (what FetcherBuffer expects).
        super(ScoredUrlDatum.FIELDS);

        _scorer = scorer;
        _parser = parser;
        _fetcher = RobotUtils.createFetcher(userAgent, maxThreads);
    }

    public FilterAndScoreByUrlAndRobots(BaseFetcher fetcher, BaseRobotsParser parser, BaseScoreGenerator scorer) {
        // We're going to output a ScoredUrlDatum (what FetcherBuffer expects).
        super(ScoredUrlDatum.FIELDS);

        _scorer = scorer;
        _parser = parser;
        _fetcher = fetcher;
    }

    @Override
    public boolean isSafe() {
        // Fetching robots.txt is a side effect, so Cascading must not re-run
        // this operation; we only want to fetch robots once.
        return false;
    }

    @SuppressWarnings("unchecked")
    @Override
    public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);

        // FUTURE KKr - use Cascading process vs. creating our own, once it
        // supports logging in local mode, and a setStatus() call.
        _flowProcess = new LoggingFlowProcess(flowProcess);
        _flowProcess.addReporter(new LoggingFlowReporter());
    }

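    /**
     * Shut down the robots-fetching executor, waiting up to TERMINATE_TIMEOUT
     * for in-flight fetches to finish before forcing a hard shutdown.
     */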
    private synchronized void terminate() {
        if (_executor == null) {
            return;
        }

        try {
            if (!_executor.terminate(TERMINATE_TIMEOUT)) {
                LOGGER.warn("Had to do a hard shutdown of robots fetching");
            }
        } catch (InterruptedException e) {
            // FUTURE What's the right thing to do here? E.g. do I need to worry about
            // losing URLs still to be processed?
            LOGGER.warn("Interrupted while waiting for termination");
            Thread.currentThread().interrupt();
        } finally {
            _executor = null;
        }
    }

    @Override
    public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        LOGGER.info("Flushing FilterAndScoreByUrlAndRobots");
        terminate();

        super.flush(flowProcess, operationCall);
    }

    @Override
    public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");
        terminate();

        _flowProcess.dumpCounters();
        super.cleanup(flowProcess, operationCall);
    }

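    /**
     * For each group of URLs sharing a protocol+domain, queue the URLs
     * (spilling to disk if needed) and hand them to a ProcessRobotsTask, which
     * fetches/parses robots.txt, then scores and emits the surviving URLs.
     */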
    @Override
    public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
        TupleEntry group = bufferCall.getGroup();
        String protocolAndDomain = group.getString(0);
        LOGGER.info("Processing tuple group: " + group);

        // Queue up this group's URLs, spilling to disk past MAX_URLS_IN_MEMORY.
        DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
        Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
        while (values.hasNext()) {
            // Cascading reuses the TupleEntry it hands us, so make a copy.
            urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
        }

        try {
            Runnable doRobots = new ProcessRobotsTask(protocolAndDomain, _scorer, urls, _fetcher, _parser, bufferCall.getOutputCollector(), _flowProcess);
            _executor.execute(doRobots);
        } catch (RejectedExecutionException e) {
            // Should never happen.
            LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
            _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
            ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
        } catch (Throwable t) {
            LOGGER.error("Caught an unexpected throwable - robots handling rejected our request for " + protocolAndDomain, t);
            _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
            ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
        }
    }
}