/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.operations;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.RejectedExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import bixo.config.BixoPlatform;
import bixo.config.FetcherPolicy;
import bixo.config.FetcherPolicy.RedirectMode;
import bixo.config.UserAgent;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.hadoop.FetchCounters;
import bixo.utils.ThreadedExecutor;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryCollector;
import com.scaleunlimited.cascading.LoggingFlowProcess;
import com.scaleunlimited.cascading.LoggingFlowReporter;
import com.scaleunlimited.cascading.NullContext;
/**
 * Cascading function that tries to expand ("lengthen") URLs produced by link
 * shortening services.
 *
 * <p>For each incoming tuple the first field is read as a URL. If the URL's
 * hostname is in the set loaded from <code>/url-shorteners.txt</code>, a
 * {@link ResolveRedirectsTask} is queued on a thread pool to resolve it; otherwise
 * the URL is emitted unchanged. Any failure to queue the task also results in the
 * original URL being emitted, so no input URL is dropped by this class itself.</p>
 */
@SuppressWarnings({"serial", "rawtypes"})
public class UrlLengthener extends BaseOperation<NullContext> implements Function<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(UrlLengthener.class);

    /** Name of the default output field. */
    public static final String URL_FN = "url";

    // Seems like things work better when we fetch the content versus aborting the connection
    private static final int MAX_CONTENT_SIZE = 2 * 1024;

    // Lots of sites fail when they get a HEAD request, so avoid many of those by only
    // processing a single redirect (typically from the link shortening service)
    private static final int MAX_REDIRECTS = 1;

    private static final int REDIRECT_CONNECTION_TIMEOUT = 20 * 1000;
    private static final int REDIRECT_SOCKET_TIMEOUT = 10 * 1000;
    private static final int REDIRECT_RETRY_COUNT = 1;

    // Timeout handed to the ThreadedExecutor; derived from the connection and socket
    // timeouts above, multiplied by the retry count.
    private static final long COMMAND_TIMEOUT = (REDIRECT_CONNECTION_TIMEOUT + REDIRECT_SOCKET_TIMEOUT) * REDIRECT_RETRY_COUNT;

    // How long flush() waits for the executor to terminate.
    private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT * 2;

    // Captures the hostname (at least 3 chars, stopping at '/', ':' or '?') of a URL.
    // NOTE(review): only lower-case "http://" URLs match, so "https://" short links
    // are passed through without being lengthened - confirm whether that is intended.
    private static final Pattern HOSTNAME_PATTERN = Pattern.compile("^http://([^/:?]{3,})");

    private static final Fields DEFAULT_FIELD = new Fields(URL_FN);

    private BaseFetcher _fetcher;
    private int _maxThreads;
    private Set<String> _urlShorteners;

    // Per-task state, (re)created in prepare()/operate() rather than serialized.
    private transient LoggingFlowProcess _flowProcess;
    private transient TupleEntryCollector _collector;
    private transient ThreadedExecutor _executor;

    /**
     * Return a SimpleHttpFetcher that's appropriate for lengthening URLs.
     *
     * @param maxThreads - number of requests to make in parallel. Should be 1 to 100?
     * @param userAgent - what to use when making requests.
     * @return BaseFetcher that can be passed to the UrlLengthener constructor.
     */
    public static BaseFetcher makeFetcher(int maxThreads, UserAgent userAgent) {
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_NONE);
        policy.setMaxRedirects(MAX_REDIRECTS);
        policy.setMaxConnectionsPerHost(maxThreads);

        SimpleHttpFetcher result = new SimpleHttpFetcher(maxThreads, policy, userAgent);
        result.setDefaultMaxContentSize(MAX_CONTENT_SIZE);

        // We don't want any encoding (compression) of the data.
        result.setAcceptEncoding("");
        return result;
    }

    /**
     * Create a lengthener that emits its result in the default {@link #URL_FN} field.
     *
     * @param fetcher fetcher used to resolve redirects (see {@link #makeFetcher}).
     * @throws IOException if the list of URL shorteners can't be loaded.
     */
    public UrlLengthener(BaseFetcher fetcher) throws IOException {
        this(fetcher, DEFAULT_FIELD);
    }

    /**
     * Create a lengthener that emits its result in <code>resultField</code>.
     *
     * @param fetcher fetcher used to resolve redirects; its max thread count sizes
     *        the thread pool created in {@link #prepare}.
     * @param resultField the single output field.
     * @throws IllegalArgumentException if <code>resultField</code> doesn't contain exactly one field.
     * @throws IOException if the list of URL shorteners can't be loaded.
     */
    public UrlLengthener(BaseFetcher fetcher, Fields resultField) throws IOException {
        super(resultField);

        if (resultField.size() != 1) {
            throw new IllegalArgumentException("resultField must contain a single field");
        }

        _fetcher = fetcher;
        _maxThreads = fetcher.getMaxThreads();
        _urlShorteners = loadUrlShorteners();
    }

    /**
     * Set up the logging flow process wrapper and the thread pool used to run
     * redirect resolution tasks.
     */
    @SuppressWarnings("unchecked")
    @Override
    public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        super.prepare(flowProcess, operationCall);

        _flowProcess = new LoggingFlowProcess(flowProcess);
        _flowProcess.addReporter(new LoggingFlowReporter());

        _executor = new ThreadedExecutor(_maxThreads, COMMAND_TIMEOUT);
    }

    /**
     * Wait (up to {@link #TERMINATE_TIMEOUT}) for the thread pool to terminate before
     * delegating to the superclass.
     */
    @Override
    public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        try {
            if (!_executor.terminate(TERMINATE_TIMEOUT)) {
                LOGGER.warn("Had to do a hard shutdown of URL lengthening");
            }
        } catch (InterruptedException e) {
            // FUTURE What's the right thing to do here? E.g. do I need to worry about
            // losing URLs still to be processed?
            LOGGER.warn("Interrupted while waiting for termination");

            // Restore the interrupt status so callers further up can see it.
            Thread.currentThread().interrupt();
        }

        super.flush(flowProcess, operationCall);
    }

    /**
     * Dump the accumulated counters, then delegate to the superclass.
     */
    @Override
    public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        _flowProcess.dumpCounters();
        super.cleanup(flowProcess, operationCall);
    }

    /**
     * Process one input tuple, whose first field is read as the URL.
     *
     * <p>URLs that don't match {@link #HOSTNAME_PATTERN}, or whose hostname isn't a
     * known shortener, are emitted unchanged. Otherwise a redirect resolution task is
     * queued; if queuing fails for any reason the original URL is emitted and
     * {@link FetchCounters#URLS_REJECTED} is incremented.</p>
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
        _collector = functionCall.getOutputCollector();

        String url = functionCall.getArguments().getTuple().getString(0);

        // Figure out if this is a URL from a shortener service.
        // If so, then we want to try to lengthen it.
        // If not, see if it looks like shortened URL, and try anyway.
        Matcher m = HOSTNAME_PATTERN.matcher(url);
        if (!m.find()) {
            emitTuple(url);
            return;
        }

        String hostname = m.group(1);
        if (!_urlShorteners.contains(hostname)) {
            // FUTURE - see if this looks like a shortened URL
            emitTuple(url);
            return;
        }

        try {
            // The task is handed the collector and flow process; presumably it emits
            // the resolved URL itself (its implementation lives outside this class).
            ResolveRedirectsTask task = new ResolveRedirectsTask(url, _fetcher, _collector, _flowProcess);
            _executor.execute(task);
        } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Redirection handling pool rejected our request for " + url);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, 1);
            emitTuple(url);
        } catch (Throwable t) {
            LOGGER.error("Caught an unexpected throwable - redirection code rejected our request for " + url, t);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, 1);
            emitTuple(url);
        }
    }

    /**
     * Load the set of known URL shortener hostnames from the
     * <code>/url-shorteners.txt</code> classpath resource.
     *
     * <p>Each line is trimmed; blank lines and lines starting with '#' are skipped,
     * and anything from a '#' to the end of a line is discarded as a comment.</p>
     *
     * @return set of hostnames listed in the resource.
     * @throws IOException if the resource is missing or can't be read.
     */
    public static Set<String> loadUrlShorteners() throws IOException {
        InputStream stream = UrlLengthener.class.getResourceAsStream("/url-shorteners.txt");
        if (stream == null) {
            // getResourceAsStream signals a missing resource with null; report it as
            // the declared IOException rather than letting it surface as an NPE.
            throw new IOException("Can't find resource /url-shorteners.txt on the classpath");
        }

        List<String> lines;
        try {
            lines = IOUtils.readLines(stream, "UTF-8");
        } finally {
            // readLines() doesn't close the stream, so we have to.
            IOUtils.closeQuietly(stream);
        }

        Set<String> result = new HashSet<String>();
        for (String line : lines) {
            line = line.trim();
            if ((line.length() == 0) || (line.startsWith("#"))) {
                continue;
            }

            // Strip any trailing comment.
            int commentIndex = line.indexOf('#');
            if (commentIndex != -1) {
                line = line.substring(0, commentIndex).trim();
            }

            result.add(line);
        }

        return result;
    }

    /**
     * Emit <code>url</code> as a single-field tuple.
     *
     * <p>Access to the collector is synchronized on the collector object, presumably
     * because tasks running on the executor's threads write to the same collector.</p>
     */
    private void emitTuple(String url) {
        synchronized(_collector) {
            _collector.add(BixoPlatform.clone(new Tuple(url), _flowProcess));
        }
    }
}