Examples of uk.ac.cam.ha293.tweetlabel.util.Stemmer

uk.ac.cam.ha293.tweetlabel.types.WordScore
Stemmer, implementing the Porter Stemming Algorithm. The Stemmer class transforms a word into its root form. The input word can be provided a character at a time (by calling add()), or all at once by calling one of the various stem(something) methods.

    if(strippedData == null) return;
    System.out.println(userID+","+tweetID+","+"\""+strippedData+"\"");
  }
  
  public Document asDocument() {
    return new Document(text);
  }

View Full Code Here

  }
  
  public Set<Document> asDocumentSet() {
    Set<Document> documents = new HashSet<Document>();
    for(Status tweet : tweets) {
      Document document = new Document(tweet.getText());
      documents.add(document);
    }
    return documents;
  }

View Full Code Here

    hasRun = false;
    Set<Document> documentSet = corpus.getDocuments();
    
    //Remove all no-topic documents to avoid breaking LLDA - use an iterator to avoid exceptions when removing
    for(Iterator<Document> iter = documentSet.iterator(); iter.hasNext();) {
      Document document = iter.next();
      if(document.getTopics().isEmpty()) {
        iter.remove();
      }
    }
    
    numDocs = documentSet.size();  
    System.out.println("THREAD "+threadNum+": numDocs = "+numDocs);
    documents = new int[numDocs][];
    numWordsInDocument = new int[numDocs];
    numTopicsInDocument = new int[numDocs];
    wordIDs = new HashMap<String,Integer>();
    topicIDs = new HashMap<String,Integer>();
    idLookup = new ArrayList<String>();
    topicLookup = new ArrayList<String>();
    docIDLookup = new ArrayList<Document>();
    docLabels = new ArrayList<ArrayList<Integer>>();
    int docID = 0;
    for(Document document : documentSet) {
      docIDLookup.add(document);
      ArrayList<Integer> labels = new ArrayList<Integer>();
      for(String topic : document.getTopics()) {
        int topicID;
        if(topicIDs.containsKey(topic)) {
          topicID = topicIDs.get(topic); 
        } else {
          topicID = topicIDs.keySet().size();
          topicIDs.put(topic, topicID);
          topicLookup.add(topic);
        }
        labels.add(topicID);
      }
      docLabels.add(labels); //In correct position docID...
      numTopicsInDocument[docID] = document.getTopics().size();
      String[] tokens = document.getDocumentString().split("\\s+");
      documents[docID] = new int[tokens.length];
      numWordsInDocument[docID] = tokens.length;
      for(int i=0; i<documents[docID].length; i++) {
        //Add the token's ID to the documents array
        int wordID;

View Full Code Here

      return topicLookup;
    }


    public Map<String,Double> inferTopicDistribution(SimpleProfile sp, int burnIn, int sampling, double alpha, double beta) {
      //Get FV from SP
      Document d = sp.asDocument();
    String[] tokens = d.getDocumentString().split("\\s+");
    ArrayList<Integer> fv = new ArrayList<Integer>();
    int numExistingWords = 0;
    for(int i=0; i<tokens.length; i++) {
      if(wordIDs.containsKey(tokens[i])) { 
        fv.add(wordIDs.get(tokens[i]));

View Full Code Here

    double[][] phi = getPhi();
    for(int topicID=0; topicID<numTopics; topicID++) {
      double[] wordProbs = phi[topicID];
      List<WordScore> wordScores = new LinkedList<WordScore>();
      for(int wordID=0; wordID<numWords; wordID++) {
        wordScores.add(new WordScore(idLookup.get(wordID), wordProbs[wordID]));
      }
      Collections.sort(wordScores);
      Collections.reverse(wordScores);
      topics.add(wordScores);
    }

View Full Code Here

    double[][] theta = getTheta();
    for(int docID=0; docID<numDocs; docID++) {
      double[] topicProbs = theta[docID];
      List<WordScore> wordScores = new LinkedList<WordScore>();
      for(int topicID=0; topicID<numTopics; topicID++) {
        wordScores.add(new WordScore("Topic"+topicID, topicProbs[topicID]));
      }
      Collections.sort(wordScores);
      Collections.reverse(wordScores);
      topics.add(wordScores);
    }

View Full Code Here

    for(int topicID=0; topicID<numTopics; topicID++) {
      double[] wordProbs = phi[topicID];
      
      List<WordScore> wordScores = new LinkedList<WordScore>();
      for(int wordID=0; wordID<numWords; wordID++) {
        wordScores.add(new WordScore(idLookup.get(wordID), wordProbs[wordID]));
      }
      Collections.sort(wordScores);
      Collections.reverse(wordScores);
      
      System.out.print("Topic "+topicID+": ");

View Full Code Here

    double[][] phi = getPhi();
    for(int topicID=0; topicID<numTopics; topicID++) {
      double[] wordProbs = phi[topicID];
      List<WordScore> wordScores = new LinkedList<WordScore>();
      for(int wordID=0; wordID<numWords; wordID++) {
        wordScores.add(new WordScore(idLookup.get(wordID), wordProbs[wordID]));
      }
      Collections.sort(wordScores);
      Collections.reverse(wordScores);
      topics.add(wordScores);
    }

View Full Code Here

    for(int topicID=0; topicID<numTopics; topicID++) {
      double[] wordProbs = phi[topicID];
      
      List<WordScore> wordScores = new LinkedList<WordScore>();
      for(int wordID=0; wordID<numWords; wordID++) {
        wordScores.add(new WordScore(idLookup.get(wordID), wordProbs[wordID]));
      }
      Collections.sort(wordScores);
      Collections.reverse(wordScores);
      
      System.out.print("Topic "+topicLookup.get(topicID)+": ");

View Full Code Here

    double[][] theta = getTheta();
    for(int docID=0; docID<numDocs; docID++) {
      double[] topicProbs = theta[docID];
      List<WordScore> wordScores = new LinkedList<WordScore>();
      for(int topicID=0; topicID<numTopics; topicID++) {
        wordScores.add(new WordScore(topicLookup.get(topicID), topicProbs[topicID]));
      }
      Collections.sort(wordScores);
      Collections.reverse(wordScores);
      topics.add(wordScores);
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of uk.ac.cam.ha293.tweetlabel.util.Stemmer

cc.mallet.pipe.CharSequence2TokenSequence

cc.mallet.pipe.Pipe

cc.mallet.pipe.PrintInputAndTarget

cc.mallet.pipe.SerialPipes

cc.mallet.pipe.TokenSequence2FeatureSequence

cc.mallet.pipe.TokenSequenceLowercase

cc.mallet.pipe.TokenSequenceRemoveStopwords

cc.mallet.topics.ParallelTopicModel

cc.mallet.topics.SimpleLDA

cc.mallet.types.Instance

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.