Package bixo.operations

Source Code of bixo.operations.UrlLengthener

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.operations;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.RejectedExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.RedirectMode;
import bixo.config.UserAgent;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.hadoop.FetchCounters;
import bixo.utils.ThreadedExecutor;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryCollector;

import com.scaleunlimited.cascading.LoggingFlowProcess;
import com.scaleunlimited.cascading.LoggingFlowReporter;
import com.scaleunlimited.cascading.NullContext;


@SuppressWarnings({"serial", "rawtypes"})
public class UrlLengthener extends BaseOperation<NullContext> implements Function<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(UrlLengthener.class);
   
    public static final String URL_FN = "url";
   
    // Seems like things work better when we fetch the content versus aborting the connection
    private static final int MAX_CONTENT_SIZE = 2 * 1024;
   
    // Lots of sites fail when they get a HEAD request, so avoid many of those by only
    // processing a single redirect (typically from the link shortening service)
    private static final int MAX_REDIRECTS = 1;
   
    private static final int REDIRECT_CONNECTION_TIMEOUT = 20 * 1000;
    private static final int REDIRECT_SOCKET_TIMEOUT = 10 * 1000;
    private static final int REDIRECT_RETRY_COUNT = 1;

    private static final long COMMAND_TIMEOUT = (REDIRECT_CONNECTION_TIMEOUT + REDIRECT_SOCKET_TIMEOUT) * REDIRECT_RETRY_COUNT;
    private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT * 2;

    private static final Pattern HOSTNAME_PATTERN = Pattern.compile("^http://([^/:?]{3,})");

    private static final Fields DEFAULT_FIELD = new Fields(URL_FN);
   
    private BaseFetcher _fetcher;
    private int _maxThreads;
    private Set<String> _urlShorteners;

    private transient LoggingFlowProcess _flowProcess;
    private transient TupleEntryCollector _collector;
    private transient ThreadedExecutor _executor;

    /**
     * Return a SimpleHttpFetcher that's appropriate for lengthening URLs.
     *
     * @param maxThreads - number of requests to make in parallel. Should be 1 to 100?
     * @param userAgent - what to use when making requests.
     * @return BaseFetcher that can be passed to the UrlLengthener constructor.
     */
    public static BaseFetcher makeFetcher(int maxThreads, UserAgent userAgent) {
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_NONE);
        policy.setMaxRedirects(MAX_REDIRECTS);
        policy.setMaxConnectionsPerHost(maxThreads);

        SimpleHttpFetcher result = new SimpleHttpFetcher(maxThreads, policy, userAgent);
        result.setDefaultMaxContentSize(MAX_CONTENT_SIZE);
       
        // We don't want any encoding (compression) of the data.
        result.setAcceptEncoding("");
        return result;
    }
   
    public UrlLengthener(BaseFetcher fetcher) throws IOException {
        this(fetcher, DEFAULT_FIELD);
    }

    public UrlLengthener(BaseFetcher fetcher, Fields resultField) throws IOException {
        super(resultField);
       
        if (resultField.size() != 1) {
            throw new IllegalArgumentException("resultField must contain a single field");
        }
       
        _fetcher = fetcher;
        _maxThreads = fetcher.getMaxThreads();
       
        _urlShorteners = loadUrlShorteners();
    }

    @SuppressWarnings("unchecked")
    @Override
    public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        super.prepare(flowProcess, operationCall);
       
        _flowProcess = new LoggingFlowProcess(flowProcess);
        _flowProcess.addReporter(new LoggingFlowReporter());

        _executor = new ThreadedExecutor(_maxThreads, COMMAND_TIMEOUT);
    }
   
    @Override
    public void flush(FlowProcess flowProcess, OperationCall<NullContext> perationCall) {
        try {
            if (!_executor.terminate(TERMINATE_TIMEOUT)) {
                LOGGER.warn("Had to do a hard shutdown of robots fetching");
            }
        } catch (InterruptedException e) {
            // FUTURE What's the right thing to do here? E.g. do I need to worry about
            // losing URLs still to be processed?
            LOGGER.warn("Interrupted while waiting for termination");
            Thread.currentThread().interrupt();
        }
       
        super.flush(flowProcess, perationCall);
    }
   
    @Override
    public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        _flowProcess.dumpCounters();
        super.cleanup(flowProcess, operationCall);
    }
   
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
        _collector = functionCall.getOutputCollector();

        String url = functionCall.getArguments().getTuple().getString(0);
       
        // Figure out if this is a URL from a shortener service.
        // If so, then we want to try to lengthen it.
        // If not, see if it looks like shortened URL, and try anyway.
       
        Matcher m = HOSTNAME_PATTERN.matcher(url);
        if (!m.find()) {
            emitTuple(url);
            return;
        }

        String hostname = m.group(1);
        if (!_urlShorteners.contains(hostname)) {
            // FUTURE - see if this looks like a shortened URL
            emitTuple(url);
            return;
        }
       
        try {
            ResolveRedirectsTask task = new ResolveRedirectsTask(url, _fetcher, _collector, _flowProcess);
            _executor.execute(task);
        } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Redirection handling pool rejected our request for " + url);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, 1);
           
            emitTuple(url);
        } catch (Throwable t) {
            LOGGER.error("Caught an unexpected throwable - redirection code rejected our request for " + url, t);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, 1);
           
            emitTuple(url);
        }
    }
   
    public static Set<String> loadUrlShorteners() throws IOException {
        Set<String> result = new HashSet<String>();
        List<String> lines = IOUtils.readLines(UrlLengthener.class.getResourceAsStream("/url-shorteners.txt"), "UTF-8");
        for (String line : lines) {
            line = line.trim();
            if ((line.length() == 0) || (line.startsWith("#"))) {
                continue;
            }
           
            int commentIndex = line.indexOf('#');
            if (commentIndex != -1) {
                line = line.substring(0, commentIndex).trim();
            }
           
            result.add(line);
        }
       
        return result;
    }
   

    private void emitTuple(String url) {
        synchronized(_collector) {
            _collector.add(BixoPlatform.clone(new Tuple(url), _flowProcess));
        }
    }
   
}
TOP

Related Classes of bixo.operations.UrlLengthener

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.