package distributedRedditAnalyser.spout;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import org.apache.http.HttpVersion;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.cookie.BasicClientCookie2;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import distributedRedditAnalyser.reddit.Post;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;
/**
 * Directly interfaces with the Reddit API to create a raw stream of Reddit posts
 *
 * @author Luke Barnett 1109967
 * @author Tony Chen 1111377
 *
 */
public class RawRedditSpout extends BaseRichSpout {
    private static final long serialVersionUID = -4867266218997902575L;
    private SpoutOutputCollector collector;
    //Whether the next scrape is the first one (triggers the backlog fetch)
    private boolean initialPull = true;
    //Creation time of the newest post seen so far, used to avoid emitting duplicates
    private long latestTimestamp = Long.MIN_VALUE;
    private final String SUBREDDIT;
    private final String URL;
    private final ArrayBlockingQueue<Post> QUEUE;
    //The number of pages to fetch on the initial scrape
    private final int INITIAL_PAGE_COUNT = 1;
    //Running count of posts already fetched, passed to the API for paging
    private int count = 0;

    /**
     * Creates a new raw reddit spout for the provided sub-reddit
     * @param subReddit The sub-reddit to use for the spout
     */
    public RawRedditSpout(String subReddit){
        SUBREDDIT = subReddit;
        //limit=100 is the largest page size the reddit listing API allows
        URL = "http://www.reddit.com/r/" + SUBREDDIT + "/new/.json?sort=new&limit=100";
        QUEUE = new ArrayBlockingQueue<Post>(10000);
    }
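
    /*
     * A minimal usage sketch (the "redditSpout" id, the subreddit name and the rest of
     * the topology wiring are illustrative, not part of this class):
     *
     *   TopologyBuilder builder = new TopologyBuilder();
     *   builder.setSpout("redditSpout", new RawRedditSpout("programming"), 1);
     *   //Bolts attached downstream receive one "redditPost" field per tuple
     */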

    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void nextTuple() {
        //Sleep briefly to reduce congestion
        Utils.sleep(50);
        //Try to get the next post
        Post nextPost = getNextPost();
        //If we got a post, emit it (no message id is attached, so the tuple is not tracked for acking or replay)
        if(nextPost != null)
            collector.emit(new Values(nextPost));
    }
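
    /*
     * Tuples carry a single field named "redditPost"; a downstream bolt would typically
     * read it with something like (illustrative): Post post = (Post) tuple.getValueByField("redditPost");
     */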
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("redditPost"));
    }

    private Post getNextPost(){
        /*
         * If the queue is empty then we need to try to fill it up
         */
        if(QUEUE.isEmpty()){
            //Set up the HTTP client
            HttpParams parameters = new BasicHttpParams();
            parameters.setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
            parameters.setParameter(CoreProtocolPNames.HTTP_CONTENT_CHARSET, "ISO-8859-1");
            parameters.setBooleanParameter(CoreConnectionPNames.TCP_NODELAY, true);
            parameters.setIntParameter(CoreConnectionPNames.SOCKET_BUFFER_SIZE, 8192);
            parameters.setParameter(CoreProtocolPNames.USER_AGENT, "DistributedRedditAnalyser /u/Technicolour/");
            DefaultHttpClient httpClient = new DefaultHttpClient(parameters);
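            /*
             * The listing endpoint returns JSON of (abridged, values illustrative) the form:
             *   { "data": { "children": [ { "data": { "title": "...", "created": 1234567890.0, ... } }, ... ],
             *               "after": "t3_xxxxx" } }
             * so the parsing below walks data -> children -> [i] -> data for each post,
             * and uses data -> "after" as the paging token.
             */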
            try {
                //If this is our first scrape of the API, pull a backlog of posts first
                if(initialPull){
                    //For every page of the subreddit we are to scrape from
                    String lastItemId = "";
                    for(int i = 0; i < INITIAL_PAGE_COUNT; i++){
                        //Retrieve the page
                        HttpGet getRequest = new HttpGet(URL + "&count=" + count + "&after=" + lastItemId);
                        ResponseHandler<String> responseHandler = new BasicResponseHandler();
                        String responseBody = httpClient.execute(getRequest, responseHandler);
                        //Parse it as JSON
                        JSONParser parser = new JSONParser();
                        JSONObject wrappingObject = (JSONObject) parser.parse(responseBody);
                        JSONObject wrappingObjectData = (JSONObject) wrappingObject.get("data");
                        JSONArray children = (JSONArray) wrappingObjectData.get("children");
                        if(children.size() == 0)
                            break;
                        //Iterate in reverse so posts are queued oldest-first
                        for(int c = children.size() - 1; c >= 0; c--){
                            JSONObject childData = (JSONObject) ((JSONObject) children.get(c)).get("data");
                            QUEUE.add(new Post((String) childData.get("title"), SUBREDDIT));
                        }
                        lastItemId = (String) wrappingObjectData.get("after");
                        //On the first (newest) page, record the newest post's timestamp so later polls do not emit duplicates
                        if(i == 0){
                            latestTimestamp = ((Number) ((JSONObject) ((JSONObject) children.get(0)).get("data")).get("created")).longValue();
                        }
                        //Rate limit between page requests
                        if(i != INITIAL_PAGE_COUNT - 1)
                            Utils.sleep(1000);
                        count += 100;
                    }
                    initialPull = false;
                }else{
                    //Rate limit for the API (pages are cached for 30 seconds)
                    Utils.sleep(10000);
                    //Get the page
                    HttpGet getRequest = new HttpGet(URL);
                    ResponseHandler<String> responseHandler = new BasicResponseHandler();
                    String responseBody = httpClient.execute(getRequest, responseHandler);
                    //Parse it
                    JSONParser parser = new JSONParser();
                    JSONObject wrappingObject = (JSONObject) parser.parse(responseBody);
                    JSONObject wrappingObjectData = (JSONObject) wrappingObject.get("data");
                    JSONArray children = (JSONArray) wrappingObjectData.get("children");
                    if(children.size() > 0){
                        //Iterate in reverse so posts are queued oldest-first, giving a continuous stream
                        for(int c = children.size() - 1; c >= 0; c--){
                            JSONObject childData = (JSONObject) ((JSONObject) children.get(c)).get("data");
                            //Only queue posts newer than the newest one already seen
                            if(latestTimestamp < ((Number) childData.get("created")).longValue())
                                QUEUE.add(new Post((String) childData.get("title"), SUBREDDIT));
                        }
                        latestTimestamp = ((Number) ((JSONObject) ((JSONObject) children.get(0)).get("data")).get("created")).longValue();
                    }
                }
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            } finally {
                httpClient.getConnectionManager().shutdown();
            }
        }
        return QUEUE.poll();
    }

    @Override
    public void close() {
    }
}