/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.frontier;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.framework.Frontier;
import org.archive.modules.CrawlURI;
import org.archive.modules.SchedulingConstants;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationListener;
import org.springframework.context.Lifecycle;
import com.rabbitmq.client.AMQP.BasicProperties;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;
import com.rabbitmq.client.Consumer;
import com.rabbitmq.client.DefaultConsumer;
import com.rabbitmq.client.Envelope;
import com.rabbitmq.client.ShutdownSignalException;
/**
 * Consumes URL requests from an AMQP queue and schedules them into the crawl
 * {@link Frontier}.
 *
 * <p>{@link #start()} spawns a background thread that declares and binds the
 * configured queue, registers a {@link UrlConsumer} on it, and then checks
 * periodically whether the consumer is still running, re-registering it if it
 * has shut down. {@link #stop()} closes the AMQP connection and terminates
 * that thread. Crawl state changes pause or resume the flow of messages on the
 * channel (see {@link #onApplicationEvent(CrawlStateEvent)}).
 *
 * <p>Access to the shared connection, channel and starter thread is guarded by
 * a single fair lock.
 *
 * @contributor nlevitt
 */
public class AMQPUrlReceiver implements Lifecycle, ApplicationListener<CrawlStateEvent> {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger logger =
            Logger.getLogger(AMQPUrlReceiver.class.getName());

    /** Annotation added to every {@link CrawlURI} created from an AMQP message. */
    public static final String A_RECEIVED_FROM_AMQP = "receivedFromAMQP";

    /**
     * How long the starter/restarter thread waits between checks of
     * {@link #isRunning}, in milliseconds. Matches the "30 seconds" promised
     * in the startup failure log message.
     */
    private static final long RESTART_CHECK_INTERVAL_MS = 30000L;

    protected Frontier frontier;

    /** @return the frontier that received URLs are scheduled into */
    public Frontier getFrontier() {
        return this.frontier;
    }

    /** @param frontier the frontier that received URLs are scheduled into */
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    protected String amqpUri = "amqp://guest:guest@localhost:5672/%2f";

    /** @return the AMQP broker URI used when opening a connection */
    public String getAmqpUri() {
        return this.amqpUri;
    }

    /** @param uri the AMQP broker URI used when opening a connection */
    public void setAmqpUri(String uri) {
        this.amqpUri = uri;
    }

    protected String exchange = "umbra";

    /** @return the name of the exchange the queue is bound to */
    public String getExchange() {
        return exchange;
    }

    /** @param exchange the name of the exchange the queue is bound to */
    public void setExchange(String exchange) {
        this.exchange = exchange;
    }

    protected String queueName = "requests";

    /**
     * @return the name of the queue consumed from; also used as the routing
     *         key when binding the queue to the exchange
     */
    public String getQueueName() {
        return queueName;
    }

    /** @param queueName the name of the queue to consume from */
    public void setQueueName(String queueName) {
        this.queueName = queueName;
    }

    /**
     * True while a consumer is registered. Volatile because it is written by
     * {@link UrlConsumer#handleShutdownSignal} and read by {@link #isRunning()}
     * without holding {@link #lock}.
     */
    protected volatile boolean isRunning = false;

    @Override
    public boolean isRunning() {
        return isRunning;
    }

    /** Guards {@link #connection}, {@link #channel} and {@link #starterRestarter}. */
    private transient Lock lock = new ReentrantLock(true);

    /**
     * Background thread that registers the AMQP consumer and re-registers it
     * whenever {@link #isRunning} is found to be false. Runs until interrupted.
     */
    private class StarterRestarter extends Thread {

        public StarterRestarter(String name) {
            super(name);
        }

        @Override
        public void run() {
            while (!Thread.interrupted()) {
                try {
                    lock.lockInterruptibly();
                    // The lock is only released if it was actually acquired;
                    // an interrupt while waiting for it skips straight to the
                    // catch below.
                    try {
                        if (!isRunning) {
                            startConsumer();
                        }
                    } finally {
                        lock.unlock();
                    }
                    // Sleep without holding the lock so that stop() and the
                    // connection()/channel() accessors are not blocked for
                    // the whole interval.
                    Thread.sleep(RESTART_CHECK_INTERVAL_MS);
                } catch (InterruptedException e) {
                    // interrupted by stop(): terminate the thread
                    return;
                }
            }
        }

        /**
         * Declares and binds the queue and registers a {@link UrlConsumer} on
         * it, setting {@link #isRunning} on success. Failures are logged and
         * retried on the next pass of {@link #run()}. Must be called with
         * {@link #lock} held.
         */
        private void startConsumer() {
            try {
                Channel ch = channel();
                Consumer consumer = new UrlConsumer(ch);
                // non-durable, non-exclusive, auto-delete queue
                ch.queueDeclare(getQueueName(), false, false, true, null);
                ch.queueBind(getQueueName(), getExchange(), getQueueName());
                // autoAck=false: messages are acked explicitly in handleDelivery
                ch.basicConsume(getQueueName(), false, consumer);
                isRunning = true;
                logger.info("started AMQP consumer uri=" + getAmqpUri() + " exchange=" + getExchange() + " queueName=" + getQueueName());
            } catch (IOException e) {
                logger.log(Level.SEVERE, "problem starting AMQP consumer (will try again after 30 seconds)", e);
            }
        }
    }

    transient private StarterRestarter starterRestarter;

    /**
     * Spawns the starter/restarter thread, unless the consumer is already
     * running or a starter/restarter thread is already alive.
     */
    @Override
    public void start() {
        lock.lock();
        try {
            // spawn off a thread to start up the amqp consumer, and try to restart it if it dies
            boolean restarterAlive = starterRestarter != null && starterRestarter.isAlive();
            if (!isRunning && !restarterAlive) {
                starterRestarter = new StarterRestarter(AMQPUrlReceiver.class.getSimpleName() + "-starter-restarter");
                starterRestarter.start();
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * Closes the AMQP connection, interrupts the starter/restarter thread and
     * waits for it to finish, then clears all connection state.
     */
    @Override
    public void stop() {
        lock.lock();
        try {
            logger.info("shutting down");
            if (connection != null && connection.isOpen()) {
                try {
                    connection.close();
                } catch (IOException e) {
                    logger.log(Level.SEVERE, "problem closing AMQP connection", e);
                }
            }
            if (starterRestarter != null && starterRestarter.isAlive()) {
                starterRestarter.interrupt();
                try {
                    starterRestarter.join();
                } catch (InterruptedException e) {
                    // finish cleaning up, but preserve the interrupt for the caller
                    Thread.currentThread().interrupt();
                }
            }
            starterRestarter = null;
            connection = null;
            channel = null;
            isRunning = false;
        } finally {
            lock.unlock();
        }
    }

    transient protected Connection connection = null;
    transient protected Channel channel = null;

    /**
     * Returns the current AMQP connection, opening a new one if there is none
     * or the existing one is closed.
     *
     * @return an AMQP connection
     * @throws IOException if the configured URI cannot be applied or the
     *         connection cannot be established
     */
    protected Connection connection() throws IOException {
        lock.lock();
        try {
            if (connection != null && !connection.isOpen()) {
                logger.warning("connection is closed, creating a new one");
                connection = null;
            }
            if (connection == null) {
                ConnectionFactory factory = new ConnectionFactory();
                try {
                    factory.setUri(getAmqpUri());
                } catch (Exception e) {
                    throw new IOException("problem with AMQP uri " + getAmqpUri(), e);
                }
                connection = factory.newConnection();
            }
            return connection;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Returns the current AMQP channel, creating a new one on
     * {@link #connection()} if there is none or the existing one is not open.
     *
     * @return an AMQP channel
     * @throws IOException if the connection or channel cannot be created
     */
    protected Channel channel() throws IOException {
        lock.lock();
        try {
            if (channel != null && !channel.isOpen()) {
                logger.warning("channel is not open, creating a new one");
                channel = null;
            }
            if (channel == null) {
                channel = connection().createChannel();
            }
            return channel;
        } finally {
            lock.unlock();
        }
    }

    // XXX should we be using QueueingConsumer because of possible blocking in
    // frontier.schedule()?
    // "Note: all methods of this interface are invoked inside the Connection's
    // thread. This means they a) should be non-blocking and generally do little
    // work, b) must not call Channel or Connection methods, or a deadlock will
    // ensue. One way of ensuring this is to use/subclass QueueingConsumer."
    /**
     * Consumer that turns each delivered JSON message into a {@link CrawlURI}
     * and schedules it into the frontier.
     */
    protected class UrlConsumer extends DefaultConsumer {

        public UrlConsumer(Channel channel) {
            super(channel);
        }

        /**
         * Decodes the message body as UTF-8 JSON and, if its "method" is GET,
         * schedules the described URL. Messages that cannot be parsed or
         * converted are logged and dropped. The message is acked in every case
         * that does not throw, so a malformed message is not left unacked.
         *
         * @throws IOException if acknowledging the message fails
         */
        @Override
        public void handleDelivery(String consumerTag, Envelope envelope,
                BasicProperties properties, byte[] body) throws IOException {
            String decodedBody;
            try {
                decodedBody = new String(body, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e); // can't happen
            }

            try {
                // Parsing happens inside the try so that invalid JSON or a
                // missing "method" is logged and acked instead of escaping
                // from the consumer callback.
                JSONObject jo = new JSONObject(decodedBody);
                if ("GET".equals(jo.getString("method"))) {
                    CrawlURI curi = makeCrawlUri(jo);
                    // bypasses scoping (unless rechecking is configured)
                    getFrontier().schedule(curi);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("scheduled " + curi);
                    }
                } else {
                    logger.warning("ignoring url with method other than GET - " + decodedBody);
                }
            } catch (URIException e) {
                logger.log(Level.WARNING,
                        "problem creating CrawlURI from json received via AMQP "
                                + decodedBody, e);
            } catch (JSONException e) {
                logger.log(Level.SEVERE,
                        "problem creating CrawlURI from json received via AMQP "
                                + decodedBody, e);
            }

            this.getChannel().basicAck(envelope.getDeliveryTag(), false);
        }

        /**
         * Logs the shutdown and marks the receiver as not running, which lets
         * the starter/restarter thread register a new consumer on its next
         * check.
         */
        @Override
        public void handleShutdownSignal(String consumerTag,
                ShutdownSignalException sig) {
            if (!sig.isInitiatedByApplication()) {
                logger.log(Level.SEVERE, "amqp channel/connection unexpectedly shut down consumerTag=" + consumerTag, sig);
            } else {
                logger.info("amqp channel/connection shut down consumerTag=" + consumerTag);
            }
            isRunning = false;
        }

        // Example message (abridged):
        // {
        //  "headers": {
        //   "Referer": "https://archive.org/",
        //   "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/32.0.1700.102 Chrome/32.0.1700.102 Safari/537.36",
        //   "Accept": "image/webp,*/*;q=0.8"
        //  },
        //  "url": "https://analytics.archive.org/0.gif?server_ms=256&server_name=www19.us.archive.org&service=ao&loadtime=358&timediff=-8&locale=en-US&referrer=-&version=2&count=9",
        //  "method": "GET"
        // }
        // makeCrawlUri() additionally reads "parentUrl" and "parentUrlMetadata"
        // (with "pathFromSeed" and "heritableData"), which the example omits.
        /**
         * Builds a high-priority {@link CrawlURI} from a received JSON message.
         *
         * @param jo the parsed message; must contain "headers", "url",
         *        "parentUrl" and "parentUrlMetadata" (the latter with
         *        "pathFromSeed" and "heritableData")
         * @return the CrawlURI, annotated with {@link #A_RECEIVED_FROM_AMQP}
         * @throws URIException if "url" or "parentUrl" cannot be turned into a UURI
         * @throws JSONException if a required field is missing or has the wrong type
         */
        protected CrawlURI makeCrawlUri(JSONObject jo) throws URIException,
                JSONException {
            JSONObject joHeaders = jo.getJSONObject("headers");

            UURI uuri = UURIFactory.getInstance(jo.getString("url"));
            UURI via = UURIFactory.getInstance(jo.getString("parentUrl"));

            JSONObject parentUrlMetadata = jo.getJSONObject("parentUrlMetadata");
            String parentHopPath = parentUrlMetadata.getString("pathFromSeed");
            String hopPath = parentHopPath + Hop.INFERRED.getHopString();

            CrawlURI curi = new CrawlURI(uuri, hopPath, via, LinkContext.INFERRED_MISC);

            // set the heritable data from the parent url, passed back to us via amqp
            // XXX brittle, only goes one level deep, and only handles strings and arrays, the latter of which it converts to a Set.
            // 'heritableData': {'source': 'https://facebook.com/whitehouse/', 'heritable': ['source', 'heritable']}
            JSONObject heritableData = parentUrlMetadata.getJSONObject("heritableData");
            for (Object keyObj : heritableData.keySet()) {
                String key = keyObj.toString();
                Object value = heritableData.get(key);
                if (value instanceof JSONArray) {
                    JSONArray arr = (JSONArray) value;
                    Set<String> valueSet = new HashSet<String>();
                    for (int i = 0; i < arr.length(); i++) {
                        valueSet.add(arr.getString(i));
                    }
                    curi.getData().put(key, valueSet);
                } else {
                    curi.getData().put(key, value);
                }
            }

            // set the http headers from the amqp message
            Map<String, String> customHttpRequestHeaders = new HashMap<String, String>();
            for (Object keyObj : joHeaders.keySet()) {
                String headerName = keyObj.toString();
                customHttpRequestHeaders.put(headerName, joHeaders.getString(headerName));
            }
            curi.getData().put("customHttpRequestHeaders", customHttpRequestHeaders);

            /* Use HighestUriQueuePrecedencePolicy to ensure these high priority
             * urls really get crawled ahead of others.
             * See https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes
             */
            curi.setSchedulingDirective(SchedulingConstants.HIGH);
            curi.setPrecedence(1);

            curi.getAnnotations().add(A_RECEIVED_FROM_AMQP);

            return curi;
        }
    }

    /**
     * Pauses message flow on the channel when the crawl is pausing or paused,
     * and resumes it when the crawl is running, empty or preparing. Other
     * states are ignored.
     */
    @Override
    public void onApplicationEvent(CrawlStateEvent event) {
        switch (event.getState()) {
        case PAUSING: case PAUSED:
            setChannelFlow(false);
            break;
        case RUNNING: case EMPTY: case PREPARING:
            setChannelFlow(true);
            break;
        default:
            break;
        }
    }

    /**
     * Enables or disables message flow on the current channel, if there is an
     * open one. Failures are logged, not propagated.
     *
     * @param active true to resume delivery, false to pause it
     */
    private void setChannelFlow(boolean active) {
        // Snapshot the field once: stop() may null it concurrently.
        Channel ch = channel;
        if (ch == null || !ch.isOpen()) {
            return;
        }
        try {
            ch.flow(active);
        } catch (IOException e) {
            if (active) {
                logger.log(Level.SEVERE, "failed to resume flow on amqp channel", e);
            } else {
                logger.log(Level.WARNING, "failed to pause flow on amqp channel", e);
            }
        }
    }
}