Package com.datasalt.trident

Source Code of com.datasalt.trident.FakeTweetsBatchSpout

package com.datasalt.trident;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Map;
import java.util.Random;

import org.apache.commons.io.IOUtils;

import storm.trident.operation.TridentCollector;
import storm.trident.spout.IBatchSpout;
import backtype.storm.Config;
import backtype.storm.task.TopologyContext;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

/**
* A Spout that emits fake tweets. It calculates a random probability distribution for hashtags and actor activity. It
* uses a dataset of 500 english sentences. It has a fixed set of actors and subjects which you can also modify at your own will.
* Tweet text is one of the random 500 sentences followed by a hashtag of one subject.
*
* @author pere
*/
@SuppressWarnings({ "serial", "rawtypes" })
public class FakeTweetsBatchSpout implements IBatchSpout {

  private int batchSize;

  public final static String[] ACTORS = { "stefan", "dave", "pere", "nathan", "doug", "ted", "mary",
      "rose" };
  public final static String[] LOCATIONS = { "Spain", "USA", "Spain", "USA", "USA", "USA", "UK",
      "France" };
  public final static String[] SUBJECTS = { "berlin", "justinbieber", "hadoop", "life", "bigdata" };

  private double[] activityDistribution;
  private double[][] subjectInterestDistribution;
  private Random randomGenerator;
  private String[] sentences;

  private long tweetId = 0;

  public FakeTweetsBatchSpout() throws IOException {
    this(5);
  }

  public FakeTweetsBatchSpout(int batchSize) throws IOException {
    this.batchSize = batchSize;
  }

  @SuppressWarnings("unchecked")
  @Override
  public void open(Map conf, TopologyContext context) {
    // init
    System.err.println("Open Spout instance");
    this.randomGenerator = new Random();
    // read a resource with 500 sample english sentences
    try {
      sentences = (String[]) IOUtils.readLines(
          ClassLoader.getSystemClassLoader().getResourceAsStream("500_sentences_en.txt")).toArray(new String[0]);
    } catch(IOException e) {
      throw new RuntimeException(e);
    }
    // will define which actors are more proactive than the others
    this.activityDistribution = getProbabilityDistribution(ACTORS.length, randomGenerator);
    // will define what subjects each of the actors are most interested in
    this.subjectInterestDistribution = new double[ACTORS.length][];
    for(int i = 0; i < ACTORS.length; i++) {
      this.subjectInterestDistribution[i] = getProbabilityDistribution(SUBJECTS.length, randomGenerator);
    }
  }

  @Override
  public void emitBatch(long batchId, TridentCollector collector) {
    // emit batchSize fake tweets
    for(int i = 0; i < batchSize; i++) {
      collector.emit(getNextTweet());
    }
  }

  @Override
  public void ack(long batchId) {
    // nothing to do here
  }

  @Override
  public void close() {
    // nothing to do here
  }

  @Override
  public Map getComponentConfiguration() {
    // no particular configuration here
    return new Config();
  }

  @Override
  public Fields getOutputFields() {
    return new Fields("id", "text", "actor", "location", "date");
  }

  // --- Helper methods --- //
  // SimpleDateFormat is not thread safe!
  private SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss aa");

  private Values getNextTweet() {
    int actorIndex = randomIndex(activityDistribution, randomGenerator);
    String author = ACTORS[actorIndex];
    String text = sentences[randomGenerator.nextInt(sentences.length)].trim() + " #"
        + SUBJECTS[randomIndex(subjectInterestDistribution[actorIndex], randomGenerator)];
    return new Values(++tweetId + "", text, author, LOCATIONS[actorIndex], DATE_FORMAT.format(System
        .currentTimeMillis()));
  }

  /**
   * Code snippet: http://stackoverflow.com/questions/2171074/generating-a-probability-distribution Returns an array of
   * size "n" with probabilities between 0 and 1 such that sum(array) = 1.
   */
  private static double[] getProbabilityDistribution(int n, Random randomGenerator) {
    double a[] = new double[n];
    double s = 0.0d;
    for(int i = 0; i < n; i++) {
      a[i] = 1.0d - randomGenerator.nextDouble();
      a[i] = -1 * Math.log(a[i]);
      s += a[i];
    }
    for(int i = 0; i < n; i++) {
      a[i] /= s;
    }
    return a;
  }

  private static int randomIndex(double[] distribution, Random randomGenerator) {
    double rnd = randomGenerator.nextDouble();
    double accum = 0;
    int index = 0;
    for(; index < distribution.length && accum < rnd; index++, accum += distribution[index - 1])
      ;
    return index - 1;
  }

  public static void main(String[] args) throws IOException, ParseException {
    FakeTweetsBatchSpout spout = new FakeTweetsBatchSpout();
    spout.open(null, null);
    for(int i = 0; i < 30; i++)
      System.out.println(spout.getNextTweet());
  }
}
TOP

Related Classes of com.datasalt.trident.FakeTweetsBatchSpout

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.