Package: uk.ac.cam.ha293.tweetlabel.util

Examples of uk.ac.cam.ha293.tweetlabel.util.Stopwords


      for(Long uid : Tools.getCSVUserIDs()) {
        //System.out.println(totalCount);
        totalCount++;
        Set<String> svmTopicSet = new HashSet<String>();
        Set<String> baselineTopicSet = new HashSet<String>();
        FullSVMClassification svm = new FullSVMClassification(topicType,uid);
        int kCount=0;
        for(String topic : svm.getCategorySet()) {
          if(kCount == k) break;
          kCount++;
          svmTopicSet.add(topic);
          System.out.println("Adding topic "+topic+" "+svm.getScore(topic));
        }
        if(topicType.equals("alchemy")) {
          FullAlchemyClassification baseline = new FullAlchemyClassification(uid);
          kCount=0;
          for(String topic : baseline.getCategorySet()) {
View Full Code Here


    Corpus corpus = null;
    if(stem) corpus = Corpus.load("allprofiles-stemmed");
    else corpus = Corpus.load("allprofiles-unstemmed");
   
    //Check for model existence
    LDATopicModel lda = null;
    if(new File("models/lda/"+description+".model").exists()) {
      System.out.println("Found LDA model "+description);
      lda = LDATopicModel.load(description);
    } else {
      System.out.println("Couldn't find LDA model "+description+", creating new one");
      lda = new LDATopicModel(corpus,numTopics,burn,sample,0,alpha,0.01);
      lda.runGibbsSampling();
      lda.save(description);
    }
   
    try {
      //Get the document topic distributions and store these
      List<List<WordScore>> docTopics = lda.getDocuments();
      int docID = 0;
      for(List<WordScore> document : docTopics) {
        Long userID = lda.getDocIDFromIndex(docID);
        FileOutputStream fileOut = new FileOutputStream(dirName+"/"+userID+".csv");
        PrintWriter writeOut = new PrintWriter(fileOut);
        writeOut.println("\"topic\",\"probability\"");
        for(WordScore topic : document) {
          writeOut.println(topic.getWord()+","+topic.getScore());
        }
        writeOut.close();
        docID++;
      }
     
     
      //NOTE: We are saving these for now. However, we always have a saved model
      //and we can get these attributes from the model
     
      //should also save the topic-word distributions
      //okay, so we should definitely serialize topics and vocab
      Map<String,Integer> vocab = lda.getVocab();
      double[][] topics = lda.getTopicsUnsorted();
     
      //Save topics
      FileOutputStream topicsFileOut = new FileOutputStream(dirName+"/TOPICS.obj");
      ObjectOutputStream topicsObjectOut = new ObjectOutputStream(topicsFileOut);
      topicsObjectOut.writeObject(topics);
View Full Code Here

    Corpus corpus = null;
    if(stem) corpus = Corpus.loadLabelled(topicType, "allprofiles-stemmed");
    else corpus = Corpus.loadLabelled(topicType, "allprofiles-unstemmed");
   
    //Check for model existence
    LLDATopicModel llda = null;
    if(new File("models/llda/"+topicType+"/"+description+".model").exists()) {
      System.out.println("Found LLDA model "+description);
      llda = LLDATopicModel.load(topicType,description);
    } else {
      System.out.println("Couldn't find LLDA model "+description+", creating new one");
      llda = new LLDATopicModel(corpus,burn,sample,lag,1,0.01);
      llda.runGibbsSampling();
      llda.save(description);
    }
   
    try {
      //Get the document topic distributions and store these
      List<List<WordScore>> docTopics = llda.getDocuments();
      int docID = 0;
      for(List<WordScore> document : docTopics) {
        Long userID = llda.getDocIDFromIndex(docID);
        FileOutputStream fileOut = new FileOutputStream(dirName+"/"+userID+".csv");
        PrintWriter writeOut = new PrintWriter(fileOut);
        writeOut.println("\"topic\",\"probability\"");
        for(WordScore topic : document) {
          writeOut.println(topic.getWord()+","+topic.getScore());
        }
        writeOut.close();
        docID++;
      }
     
     
      //NOTE: We are saving these for now. However, we always have a saved model
      //and we can get these attributes from the model
     
      //should also save the topic-word distributions
      //okay, so we should definitely serialize topics and vocab
      Map<String,Integer> vocab = llda.getVocab();
      double[][] topics = llda.getTopicsUnsorted();
      ArrayList<String> topicIDs = llda.getTopicsIDList();
     
      //Save topics
      FileOutputStream topicsFileOut = new FileOutputStream(dirName+"/TOPICS.obj");
      ObjectOutputStream topicsObjectOut = new ObjectOutputStream(topicsFileOut);
      topicsObjectOut.writeObject(topics);
View Full Code Here

    alphaSet.add(1.25);
    alphaSet.add(1.5);
    alphaSet.add(1.75);
    alphaSet.add(2.00);
    for(Double alpha : alphaSet) {
      LLDATopicModel llda = new LLDATopicModel(corpus,1000,100,0,alpha,0.01);
      llda.runQuickCVGibbsSampling(0);
    }
  }
View Full Code Here

      final double alph=al;
      Thread thread = new Thread(){
        public void run() {
          System.out.println("THREAD: "+"Running for alpha="+alph);
          Corpus corpus = Corpus.loadLabelled("textwiseproper", "allprofiles-unstemmed-textwiseproper-top3");
          LLDATopicModel llda = new LLDATopicModel(corpus,1000,100,0,alph,0.01,fThread);
          llda.runQuickCVGibbsSampling(0);
        }
      };
      thread.start();
    }
    System.out.println("All threads started");
View Full Code Here

        Thread thread = new Thread(){
          public void run() {
            double[] alphas = {0.25,0.5,0.75,1.0,1.25,1.5,1.75,2.0};
            for(double alpha : alphas) {
              System.out.println("THREAD "+fThread+": Running for alpha="+alpha);
              LLDATopicModel llda = new LLDATopicModel(fCorpus,1000,100,0,alpha,0.01,fThread);
              llda.runQuickCVGibbsSampling(iReduction);
            }
          }
        };
        thread.start();
      }
View Full Code Here

        Thread thread = new Thread(){
          public void run() {
            double[] alphas = {0.25,0.5,0.75,1.0,1.25,1.5,1.75,2.0};
            for(double alpha : alphas) {
              System.out.println("THREAD "+fThread+": Running for alpha="+alpha);
              LLDATopicModel llda = new LLDATopicModel(fCorpus,1000,100,0,alpha,0.01,fThread);
              llda.runQuickCVGibbsSampling(-1*iReduction); //sooo hacky
            }
          }
        };
        thread.start();
      }
View Full Code Here

    return userIDs;
  }
 
  public static void alchemyClassificationRoutine() {
    System.out.println("Beginning complete Alchemy classification...");
    Profiler profiler = new Profiler();
    while(!synchronisedUserIDList.isEmpty()) {
      long currentID = synchronisedUserIDList.remove(0);
      SimpleProfile profile = profiler.loadCSVProfile(currentID);
      if(!profile.classifyAlchemy()) {
        System.err.println("Failed to classify profile properly, probably reached the daily limit");
        return;
      }
    }
View Full Code Here

    }
  }
 
  public static void textwiseClassificationRoutine() {
    System.out.println("Beginning complete Textwise classification...");
    Profiler profiler = new Profiler();
    while(!synchronisedUserIDList.isEmpty()) {
      long currentID = synchronisedUserIDList.remove(0);
      SimpleProfile profile = profiler.loadCSVProfile(currentID);
      if(!profile.classifyTextwise()) {
        System.err.println("Failed to classify profile "+currentID+" properly using Textwise");
        return;
      }
    }
View Full Code Here

    }
  }
 
  public static void properTextwiseClassificationRoutine() {
    System.out.println("Beginning complete Textwise classification...");
    Profiler profiler = new Profiler();
    while(!synchronisedUserIDList.isEmpty()) {
      long currentID = synchronisedUserIDList.remove(0);
      SimpleProfile profile = profiler.loadCSVProfile(currentID);
      if(!profile.classifyTextwiseProper()) {
        System.err.println("Failed to classify profile "+currentID+" properly using Textwise");
        return;
      }
    }
View Full Code Here

TOP

Related Classes of uk.ac.cam.ha293.tweetlabel.util.Stopwords

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.