Package distributedRedditAnalyser

Source Code of distributedRedditAnalyser.Main

package distributedRedditAnalyser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import weka.core.Attribute;
import weka.core.Instances;

import distributedRedditAnalyser.bolt.DistributedOzaBoostBolt;
import distributedRedditAnalyser.bolt.InstanceBolt;
import distributedRedditAnalyser.bolt.OzaBoostBolt;
import distributedRedditAnalyser.bolt.PrinterBolt;
import distributedRedditAnalyser.bolt.StatisticsBolt;
import distributedRedditAnalyser.bolt.StatsPrinterBolt;
import distributedRedditAnalyser.bolt.StatsWriterBolt;
import distributedRedditAnalyser.bolt.StringToWordVectorBolt;
import distributedRedditAnalyser.spout.RawRedditSpout;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.topology.BoltDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.utils.Utils;

/**
* Using multiple sub-reddits tries to predict what sub-reddit a post came from based on it's topic.
*
* Uses storm to distribute the task over a potential cluster
*
* Built against storm 0.7.2 (https://github.com/nathanmarz/storm/tree/0.7.2) (https://github.com/downloads/nathanmarz/storm/storm-0.7.2-rc1.zip)
*
* @author Luke Barnett 1109967
* @author Tony Chen 1111377
*
*/
public class Main {

  /**
   * Main entry point for the program.
   *
   * Takes a list of sub-reddits to perform the analysis with
   *
   * @param args The sub-reddits (case sensitive) to use
   */
  public static void main(String[] args) {
   
    if(args.length == 0){
      System.err.println("No sub-reddits given, unable to continue");
      showUsage();
      System.exit(2);
    }
    if(args.length < 2){
      System.err.println("Need to provide at least two subreddits");
      showUsage();
      System.exit(2);
    }
   
    //Copy the sub-reddits into a separate list for future use with more complex arguments
    ArrayList<String> subreddits = new ArrayList<String>();
    for(int i = 0; i < args.length; i++){
      subreddits.add(args[i]);
    }
   
    //TUNE PARAMETERS HERE
    final int STAT_RES = 5;
    final int FILTER_SET_SIZE = args.length * 100;//batch size is 100 posts per subreddit
    final int WORDS_TO_KEEP = 200; //need larger word vectors for better results
    final int RUNTIME = 300 * (60000);//change first term to number of minutes
   
    //Build the Instances Header
    ArrayList<Attribute> att = new ArrayList<Attribute>();
    att.add(new Attribute("title", (List<String>)null, 0));
    att.add(new Attribute("redditClass", subreddits, 1));
    Instances instHeaders = new Instances("reddit",att, 10);
    instHeaders.setClassIndex(1);
   
    /**
     * Start building the topology
     * Examples can be found at: https://github.com/nathanmarz/storm-starter
     */
   
    //Create the topology builder
    TopologyBuilder builder = new TopologyBuilder();
   
    BoltDeclarer instanceBolt = builder.setBolt("instancebolt", new InstanceBolt(instHeaders));
   
    String resultsFolder = ((Long)System.currentTimeMillis()).toString();
   
    //Add a spout for each sub-reddit
    for(String subreddit : subreddits){
      resultsFolder = String.format("[%s]", subreddit) + resultsFolder;
      builder.setSpout("raw:" + subreddit, new RawRedditSpout(subreddit));
      instanceBolt.shuffleGrouping("raw:" + subreddit);
    }
   
    builder.setBolt("stringToWordBolt", new StringToWordVectorBolt(FILTER_SET_SIZE, WORDS_TO_KEEP, instHeaders)).shuffleGrouping("instancebolt");
   
    //Normal OzaBoost
   
    //NaiveBayesMultinomial
    builder.setBolt("ozaBoostBolt:naiveBayesMultinomial", new OzaBoostBolt("bayes.NaiveBayesMultinomial")).shuffleGrouping("stringToWordBolt");
    builder.setBolt("statistics:naiveBayesMultinomial", new StatisticsBolt(subreddits.size(),STAT_RES)).shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial");
   
    builder.setBolt("StatsPrinterBolt:naiveBayesMultinomial", new StatsPrinterBolt("naiveBayesMultinominal")).shuffleGrouping("statistics:naiveBayesMultinomial");
    builder.setBolt("StatsWriterBolt:naiveBayesMultinomial", new StatsWriterBolt("naiveBayesMultinominal", resultsFolder)).shuffleGrouping("statistics:naiveBayesMultinomial");
   
    //NaiveBayes
    builder.setBolt("ozaBoostBolt:naiveBayes", new OzaBoostBolt("bayes.NaiveBayes")).shuffleGrouping("stringToWordBolt");
    builder.setBolt("statistics:naiveBayes", new StatisticsBolt(subreddits.size(),STAT_RES)).shuffleGrouping("ozaBoostBolt:naiveBayes");
   
    builder.setBolt("StatsPrinterBolt:naiveBayes", new StatsPrinterBolt("naiveBayes")).shuffleGrouping("statistics:naiveBayes");
    builder.setBolt("StatsWriterBolt:naiveBayes", new StatsWriterBolt("naiveBayes", resultsFolder)).shuffleGrouping("statistics:naiveBayes");
   
    //Perceptron
    builder.setBolt("ozaBoostBolt:perceptron", new OzaBoostBolt("functions.Perceptron")).shuffleGrouping("stringToWordBolt");
    builder.setBolt("statistics:perceptron", new StatisticsBolt(subreddits.size(),STAT_RES)).shuffleGrouping("ozaBoostBolt:perceptron");
   
    builder.setBolt("StatsPrinterBolt:perceptron", new StatsPrinterBolt("perceptron")).shuffleGrouping("statistics:perceptron");
    builder.setBolt("StatsWriterBolt:perceptron", new StatsWriterBolt("perceptron", resultsFolder)).shuffleGrouping("statistics:perceptron");
   
    //Distributed OzaBoost
    //NaiveBayesMultinomial
    //Create the bolts
    /*BoltDeclarer naiveBayesMultinomial1 = builder.setBolt("ozaBoostBolt:naiveBayesMultinomial1", new DistributedOzaBoostBolt("bayes.NaiveBayesMultinomial", 1)).shuffleGrouping("stringToWordBolt");
    BoltDeclarer naiveBayesMultinomial2 = builder.setBolt("ozaBoostBolt:naiveBayesMultinomial2", new DistributedOzaBoostBolt("bayes.NaiveBayesMultinomial", 2)).shuffleGrouping("stringToWordBolt");
    BoltDeclarer naiveBayesMultinomial3 = builder.setBolt("ozaBoostBolt:naiveBayesMultinomial3", new DistributedOzaBoostBolt("bayes.NaiveBayesMultinomial", 3)).shuffleGrouping("stringToWordBolt");
   
    //Add each other to each other
    naiveBayesMultinomial1.shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial2");
    naiveBayesMultinomial1.shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial3");
   
    naiveBayesMultinomial2.shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial1");
    naiveBayesMultinomial2.shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial3");
   
    naiveBayesMultinomial3.shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial2");
    naiveBayesMultinomial3.shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial1");
   
    //Only subscribed to the first bolt for statistics
    builder.setBolt("statistics:naiveBayesMultinomial", new StatisticsBolt(subreddits.size(),STAT_RES)).shuffleGrouping("ozaBoostBolt:naiveBayesMultinomial1");
   
    builder.setBolt("statsPrinterBolt:naiveBayesMultinomial", new StatsPrinterBolt("naiveBayesMultinominal")).shuffleGrouping("statistics:naiveBayesMultinomial");
    builder.setBolt("statsWriterBolt:naiveBayesMultinomial", new StatsWriterBolt("naiveBayesMultinominal", resultsFolder)).shuffleGrouping("statistics:naiveBayesMultinomial");
   
    //NaiveBayes
    //Create the bolts
    BoltDeclarer naiveBayes1 = builder.setBolt("ozaBoostBolt:naiveBayes1", new DistributedOzaBoostBolt("bayes.NaiveBayes", 1)).shuffleGrouping("stringToWordBolt");
    BoltDeclarer naiveBayes2 = builder.setBolt("ozaBoostBolt:naiveBayes2", new DistributedOzaBoostBolt("bayes.NaiveBayes", 2)).shuffleGrouping("stringToWordBolt");
    BoltDeclarer naiveBayes3 = builder.setBolt("ozaBoostBolt:naiveBayes3", new DistributedOzaBoostBolt("bayes.NaiveBayes", 3)).shuffleGrouping("stringToWordBolt");
   
    //Add each other to each other
    naiveBayes1.shuffleGrouping("ozaBoostBolt:naiveBayes2");
    naiveBayes1.shuffleGrouping("ozaBoostBolt:naiveBayes3");
   
    naiveBayes2.shuffleGrouping("ozaBoostBolt:naiveBayes1");
    naiveBayes2.shuffleGrouping("ozaBoostBolt:naiveBayes3");
   
    naiveBayes3.shuffleGrouping("ozaBoostBolt:naiveBayes2");
    naiveBayes3.shuffleGrouping("ozaBoostBolt:naiveBayes1");
   
    //Only subscribed to the first bolt for statistics
    builder.setBolt("statistics:naiveBayes", new StatisticsBolt(subreddits.size(),STAT_RES)).shuffleGrouping("ozaBoostBolt:naiveBayes1");
   
    builder.setBolt("statsPrinterBolt:naiveBayes", new StatsPrinterBolt("naiveBayes")).shuffleGrouping("statistics:naiveBayes");
    builder.setBolt("statsWriterBolt:naiveBayes", new StatsWriterBolt("naiveBayes", resultsFolder)).shuffleGrouping("statistics:naiveBayes");
   
    //NaiveBayes
    //Create the bolts
    BoltDeclarer perceptron1 = builder.setBolt("ozaBoostBolt:perceptron1", new DistributedOzaBoostBolt("functions.Perceptron", 1)).shuffleGrouping("stringToWordBolt");
    BoltDeclarer perceptron2 = builder.setBolt("ozaBoostBolt:perceptron2", new DistributedOzaBoostBolt("functions.Perceptron", 2)).shuffleGrouping("stringToWordBolt");
    BoltDeclarer perceptron3 = builder.setBolt("ozaBoostBolt:perceptron3", new DistributedOzaBoostBolt("functions.Perceptron", 3)).shuffleGrouping("stringToWordBolt");
   
    //Add each other to each other
    perceptron1.shuffleGrouping("ozaBoostBolt:perceptron2");
    perceptron1.shuffleGrouping("ozaBoostBolt:perceptron3");
   
    perceptron2.shuffleGrouping("ozaBoostBolt:perceptron1");
    perceptron2.shuffleGrouping("ozaBoostBolt:perceptron3");
   
    perceptron3.shuffleGrouping("ozaBoostBolt:perceptron2");
    perceptron3.shuffleGrouping("ozaBoostBolt:perceptron1");
   
    //Only subscribed to the first bolt for statistics
    builder.setBolt("statistics:perceptron", new StatisticsBolt(subreddits.size(),STAT_RES)).shuffleGrouping("ozaBoostBolt:perceptron1");
   
    builder.setBolt("statsPrinterBolt:perceptron", new StatsPrinterBolt("perceptron")).shuffleGrouping("statistics:perceptron");
    builder.setBolt("statsWriterBolt:perceptron", new StatsWriterBolt("perceptron", resultsFolder)).shuffleGrouping("statistics:perceptron");*/
   
   
    //Create the configuration object
    Config conf = new Config();
   
    //Create a local cluster
    LocalCluster cluster = new LocalCluster();
   
    //Submit the topology to the cluster for execution
    cluster.submitTopology("redditAnalyser", conf, builder.createTopology());
   
    //Give a timeout period
    Utils.sleep(RUNTIME);
   
    //Kill the topology first
    cluster.killTopology("redditAnalyser");
   
    //Close the cluster
    cluster.shutdown();
   
  }
 
  /**
   * Prints the required usage of the program
   */
  private static void showUsage(){
    System.out.println("Usage:");
    System.out.println("java distributedRedditAnalyser.Main subreddit subreddit [subreddit...]");
  }

}
TOP

Related Classes of distributedRedditAnalyser.Main

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.