boolean shouldSaveState,
boolean readjustTopicsAndStats /* currently ignored */) {
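// Gibbs-sample a new topic for every token in this document. The per-token
// sampling mass factors into three buckets (cf. the SparseLDA decomposition
// of Yao, Mimno & McCallum, KDD 2009):
//
//   p(z = t) ∝ (alpha_t + n_{t|d}) * (beta + n_{w|t}) / (tokensPerTopic[t] + betaSum)
//            =  alpha_t * beta / (tokensPerTopic[t] + betaSum)                  [smoothingOnlyMass]
//            +  n_{t|d} * beta / (tokensPerTopic[t] + betaSum)                  [topicBetaMass]
//            + (alpha_t + n_{t|d}) * n_{w|t} / (tokensPerTopic[t] + betaSum)    [topicTermMass]
//
// where n_{t|d} is the document-topic count (localTopicCounts), n_{w|t} the
// type-topic count (typeTopicCounts), and tokensPerTopic[t] the total number
// of tokens currently assigned to topic t.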
int[] oneDocTopics = topicSequence.getFeatures();
TIntIntHashMap currentTypeTopicCounts;
int type, oldTopic, newTopic;
double topicWeightsSum;
int docLength = tokenSequence.getLength();
// Populate the per-document topic counts
TIntIntHashMap localTopicCounts = new TIntIntHashMap();
for (int position = 0; position < docLength; position++) {
localTopicCounts.adjustOrPutValue(oneDocTopics[position], 1, 1);
}
// Initialize the topic count/beta sampling bucket
double topicBetaMass = 0.0;
for (int topic: localTopicCounts.keys()) {
int n = localTopicCounts.get(topic);
// accumulate the normalizing mass for the (beta * n_{t|d}) bucket
topicBetaMass += beta * n / (tokensPerTopic[topic] + betaSum);
// update the coefficients for the non-zero topics
cachedCoefficients[topic] = (alpha[topic] + n) / (tokensPerTopic[topic] + betaSum);
}
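// cachedCoefficients[t] now holds (alpha_t + n_{t|d}) / (tokensPerTopic[t] + betaSum)
// for topics that occur in this document; for every other topic it stays at the
// smoothing-only value alpha_t / (tokensPerTopic[t] + betaSum), which the reset
// loop at the end of this method restores. smoothingOnlyMass is adjusted
// incrementally below rather than rebuilt per document.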
double topicTermMass = 0.0;
double[] topicTermScores = new double[numTopics];
int[] topicTermIndices;
int[] topicTermValues;
int i;
double score;
// Iterate over the positions (words) in the document
for (int position = 0; position < docLength; position++) {
type = tokenSequence.getIndexAtPosition(position);
oldTopic = oneDocTopics[position];
currentTypeTopicCounts = typeTopicCounts[type];
// The token is currently assigned to oldTopic, so its type-topic count must be positive.
assert(currentTypeTopicCounts.get(oldTopic) > 0);
// Remove this token from all counts.
// Note that we actually want to remove the key if it goes
// to zero, not set it to 0.
if (currentTypeTopicCounts.get(oldTopic) == 1) {
currentTypeTopicCounts.remove(oldTopic);
}
else {
currentTypeTopicCounts.adjustValue(oldTopic, -1);
}
smoothingOnlyMass -= alpha[oldTopic] * beta /
(tokensPerTopic[oldTopic] + betaSum);
topicBetaMass -= beta * localTopicCounts.get(oldTopic) /
(tokensPerTopic[oldTopic] + betaSum);
if (localTopicCounts.get(oldTopic) == 1) {
localTopicCounts.remove(oldTopic);
}
else {
localTopicCounts.adjustValue(oldTopic, -1);
}
tokensPerTopic[oldTopic]--;
smoothingOnlyMass += alpha[oldTopic] * beta /
(tokensPerTopic[oldTopic] + betaSum);
topicBetaMass += beta * localTopicCounts.get(oldTopic) /
(tokensPerTopic[oldTopic] + betaSum);
cachedCoefficients[oldTopic] =
(alpha[oldTopic] + localTopicCounts.get(oldTopic)) /
(tokensPerTopic[oldTopic] + betaSum);
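// All three bucket masses and cachedCoefficients[oldTopic] now describe the
// document with the current token removed. If oldTopic was just dropped from
// localTopicCounts, Trove's get() returns 0 for the missing key, so the
// beta-bucket contribution vanishes and the coefficient falls back to its
// smoothing-only value, as intended.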
topicTermMass = 0.0;
topicTermIndices = currentTypeTopicCounts.keys();
topicTermValues = currentTypeTopicCounts.getValues();
for (i=0; i < topicTermIndices.length; i++) {
int topic = topicTermIndices[i];
score =
cachedCoefficients[topic] * topicTermValues[i];
// ((alpha[topic] + localTopicCounts.get(topic)) *
// topicTermValues[i]) /
// (tokensPerTopic[topic] + betaSum);
// Note: I tried only doing this next bit if
// score > 0, but it didn't make any difference,
// at least in the first few iterations.
topicTermMass += score;
topicTermScores[i] = score;
// topicTermIndices[i] = topic;
}
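// topicTermMass now covers only the topics with a non-zero count for this
// word type; every other topic contributes through the smoothing and beta
// buckets alone.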
// indicate that this is the last topic
// topicTermIndices[i] = -1;
double sample = random.nextUniform() * (smoothingOnlyMass + topicBetaMass + topicTermMass);
double origSample = sample;
// Make sure it actually gets set
newTopic = -1;
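// Decide which bucket the draw falls in. The topic-term bucket is checked
// first: it typically holds most of the mass and only requires a walk over
// the topics already scored above.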
if (sample < topicTermMass) {
//topicTermCount++;
i = -1;
while (sample > 0) {
i++;
sample -= topicTermScores[i];
}
newTopic = topicTermIndices[i];
}
else {
sample -= topicTermMass;
if (sample < topicBetaMass) {
//betaTopicCount++;
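// Each beta-bucket entry is beta * n_{t|d} / (tokensPerTopic + betaSum), so
// dividing the residual sample by beta lets the loop below subtract the
// simpler n_{t|d} / (tokensPerTopic + betaSum) terms.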
sample /= beta;
topicTermIndices = localTopicCounts.keys();
topicTermValues = localTopicCounts.getValues();
for (i=0; i < topicTermIndices.length; i++) {
newTopic = topicTermIndices[i];
sample -= topicTermValues[i] /
(tokensPerTopic[newTopic] + betaSum);
if (sample <= 0.0) {
break;
}
}
}
else {
//smoothingOnlyCount++;
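// Each smoothing-bucket entry is alpha_t * beta / (tokensPerTopic + betaSum);
// strip off the beta-bucket mass and the common beta factor, then walk all
// topics subtracting alpha_t / (tokensPerTopic + betaSum).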
sample -= topicBetaMass;
sample /= beta;
for (int topic = 0; topic < numTopics; topic++) {
sample -= alpha[topic] /
(tokensPerTopic[topic] + betaSum);
if (sample <= 0.0) {
newTopic = topic;
break;
}
}
}
}
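// A draw can overshoot all three buckets if the incrementally maintained
// masses have drifted slightly from their true values (floating-point
// error); log the state and fall back to the last topic rather than
// aborting the sweep.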
if (newTopic == -1) {
System.err.println("LDAHyper sampling error: "+ origSample + " " + sample + " " + smoothingOnlyMass + " " +
topicBetaMass + " " + topicTermMass);
newTopic = numTopics-1; // TODO is this an appropriate fallback?
//throw new IllegalStateException ("LDAHyper: New topic not sampled.");
}
//assert(newTopic != -1);
// Put that new topic into the counts
oneDocTopics[position] = newTopic;
currentTypeTopicCounts.adjustOrPutValue(newTopic, 1, 1);
smoothingOnlyMass -= alpha[newTopic] * beta /
(tokensPerTopic[newTopic] + betaSum);
topicBetaMass -= beta * localTopicCounts.get(newTopic) /
(tokensPerTopic[newTopic] + betaSum);
localTopicCounts.adjustOrPutValue(newTopic, 1, 1);
tokensPerTopic[newTopic]++;
// update the coefficients for the non-zero topics
cachedCoefficients[newTopic] =
(alpha[newTopic] + localTopicCounts.get(newTopic)) /
(tokensPerTopic[newTopic] + betaSum);
smoothingOnlyMass += alpha[newTopic] * beta /
(tokensPerTopic[newTopic] + betaSum);
topicBetaMass += beta * localTopicCounts.get(newTopic) /
(tokensPerTopic[newTopic] + betaSum);
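// The bucket masses, cached coefficient, and all counts now include the
// newly sampled assignment again, so the next token sees a consistent state.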
assert(currentTypeTopicCounts.get(newTopic) > 0);
}
// Clean up our mess: reset the coefficients to values with only
// smoothing. The next doc will update its own non-zero topics...
for (int topic: localTopicCounts.keys()) {
cachedCoefficients[topic] =
alpha[topic] / (tokensPerTopic[topic] + betaSum);
}
if (shouldSaveState) {
// Update the document-topic count histogram,
// for dirichlet estimation
docLengthCounts[ docLength ]++;
for (int topic: localTopicCounts.keys()) {
topicDocCounts[topic][ localTopicCounts.get(topic) ]++;
}
}
}