Package bixo.operations

Source Code of bixo.operations.FilterAndScoreByUrlAndRobots

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.operations;

import java.util.Iterator;
import java.util.concurrent.RejectedExecutionException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import bixo.config.UserAgent;
import bixo.datum.GroupedUrlDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.fetcher.BaseFetcher;
import bixo.hadoop.FetchCounters;
import bixo.robots.BaseRobotsParser;
import bixo.robots.RobotUtils;
import bixo.utils.DiskQueue;
import bixo.utils.GroupingKey;
import bixo.utils.ThreadedExecutor;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.operation.OperationCall;
import cascading.tuple.TupleEntry;

import com.scaleunlimited.cascading.LoggingFlowProcess;
import com.scaleunlimited.cascading.LoggingFlowReporter;
import com.scaleunlimited.cascading.NullContext;


/**
* Filter out URLs by either domain (not popular enough) or if they're blocked by robots.txt
*
*/

@SuppressWarnings({"serial", "rawtypes"})
public class FilterAndScoreByUrlAndRobots extends BaseOperation<NullContext> implements Buffer<NullContext> {
  private static final Logger LOGGER = LoggerFactory.getLogger(FilterAndScoreByUrlAndRobots.class);
 
    private static final long COMMAND_TIMEOUT = RobotUtils.getMaxFetchTime();
    private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT;

    private static final int MAX_URLS_IN_MEMORY = 100;

    private BaseScoreGenerator _scorer;
  private BaseFetcher _fetcher;
  private BaseRobotsParser _parser;
 
    private transient ThreadedExecutor _executor;
    private transient LoggingFlowProcess _flowProcess;
   
    public FilterAndScoreByUrlAndRobots(UserAgent userAgent, int maxThreads, BaseRobotsParser parser, BaseScoreGenerator scorer) {
        super(ScoredUrlDatum.FIELDS);

        _scorer = scorer;
        _parser = parser;
        _fetcher = RobotUtils.createFetcher(userAgent, maxThreads);
    }

    public FilterAndScoreByUrlAndRobots(BaseFetcher fetcher, BaseRobotsParser parser, BaseScoreGenerator scorer) {
        // We're going to output a ScoredUrlDatum (what FetcherBuffer expects).
        super(ScoredUrlDatum.FIELDS);

        _scorer = scorer;
        _parser = parser;
        _fetcher = fetcher;
    }

    @Override
    public boolean isSafe() {
        // We only want to fetch robots once.
        return false;
    }
   
    @SuppressWarnings("unchecked")
    @Override
    public void prepare(FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
        _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);
       
        // FUTURE KKr - use Cascading process vs creating our own, once it
        // supports logging in local mode, and a setStatus() call.
        _flowProcess = new LoggingFlowProcess(flowProcess);
        _flowProcess.addReporter(new LoggingFlowReporter());
    }
   
    private synchronized void terminate() {
        if (_executor == null) {
            return;
        }
       
        try {
            if (!_executor.terminate(TERMINATE_TIMEOUT)) {
                LOGGER.warn("Had to do a hard shutdown of robots fetching");
            }
        } catch (InterruptedException e) {
            // FUTURE What's the right thing to do here? E.g. do I need to worry about
            // losing URLs still to be processed?
            LOGGER.warn("Interrupted while waiting for termination");
            Thread.currentThread().interrupt();
        } finally {
            _executor = null;
        }
    }

    @Override
    public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        LOGGER.info("Flushing FilterAndScoreByUrlAndRobots");
       
        terminate();
       
        super.flush(flowProcess, operationCall);
    }
   
    @Override
    public void cleanup(FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
        LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");
       
        terminate();
       
        _flowProcess.dumpCounters();
        super.cleanup(flowProcess, operationCall);
    }
   
  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
        TupleEntry group = bufferCall.getGroup();
        String protocolAndDomain = group.getString(0);
        LOGGER.info("Processing tuple group: " + group);

        DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
        Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
        while (values.hasNext()) {
            urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
        }
       
        try {
            Runnable doRobots = new ProcessRobotsTask(protocolAndDomain, _scorer, urls, _fetcher, _parser, bufferCall.getOutputCollector(), _flowProcess);
            _executor.execute(doRobots);
        } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
            _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
            ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
        } catch (Throwable t) {
           LOGGER.error("Caught an unexpected throwable - robots handling rejected our request for " + protocolAndDomain, t);
           _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
           _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
           ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
      }
  }

 

}
TOP

Related Classes of bixo.operations.FilterAndScoreByUrlAndRobots

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.