/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.fetcher;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.scaleunlimited.cascading.Payload;
import bixo.config.FetcherPolicy;
import bixo.config.UserAgent;
import bixo.datum.ContentBytes;
import bixo.datum.FetchedDatum;
import bixo.datum.HttpHeaders;
import bixo.datum.ScoredUrlDatum;
import bixo.exceptions.BaseFetchException;
import bixo.exceptions.HttpFetchException;
import bixo.exceptions.UrlFetchException;
@SuppressWarnings("serial")
public class LoggingFetcher extends BaseFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(LoggingFetcher.class);
public static final String FAKE_CONTENT_LOCATION = "Fake-LoggingFetcher";
// Generic HTML page we send back for every request - only customization is the URL
private static final String HTML_TEMPLATE =
"<!DOCTYPE HTML PUBLIC \"-//BBSW//DTD Compact HTML 2.0//EN\">\n" +
"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n" +
"<title>LoggingFetcher</title>\n" +
"</head><body>URL = %s</body></html>\n";
public LoggingFetcher(int maxThreads) {
super(maxThreads, new FetcherPolicy(), new UserAgent("agentName", "agentName@domain.com", "http://agentName.domain.com"));
}
@Override
public FetchedDatum get(ScoredUrlDatum datum) throws BaseFetchException {
String url = datum.getUrl();
Payload payload = datum.getPayload();
logPayload(url, payload);
// Create a simple HTML page here, where we fill in the URL as
// the field, and return that as the BytesWritable. we could add
// more of the datum values to the template if we cared.
try {
return makeFetchedDatum(url, String.format(HTML_TEMPLATE, url), payload);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Should never happen", e);
} catch (MalformedURLException e) {
throw new UrlFetchException(url, e.getMessage());
}
}
private FetchedDatum makeFetchedDatum(String url, String htmlContent, Payload payload) throws MalformedURLException, HttpFetchException, UnsupportedEncodingException {
URL theUrl = new URL(url);
if (theUrl.getFile().equals("/robots.txt")) {
throw new HttpFetchException(url, "Never return robots.txt from LoggingFetcher", HttpStatus.SC_NOT_FOUND, null);
}
byte[] content = htmlContent.getBytes("UTF-8");
HttpHeaders headers = new HttpHeaders();
headers.add(HttpHeaderNames.CONTENT_LENGTH, "" + content.length);
headers.add(HttpHeaderNames.CONTENT_TYPE, "text/html");
// Set the location to a fixed value, so that when we're processing entries from
// the URL DB that might have been set using fake content, we know to ignore the
// refetch time if we're doing a real fetch.
headers.add(HttpHeaderNames.CONTENT_LOCATION, FAKE_CONTENT_LOCATION);
FetchedDatum result = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), "text/html", 100000);
result.setPayload(payload);
return result;
}
private void logPayload(String url, Payload payload) {
StringBuilder msg = new StringBuilder(url);
msg.append(" ( ");
for (String key : payload.keySet()) {
msg.append(key);
msg.append(':');
Object value = payload.get(key);
msg.append(value == null ? "null" : value.toString());
msg.append(' ');
}
msg.append(")");
LOGGER.info(msg.toString());
}
@Override
public void abort() {
// Do nothing
}
}