Package com.carrotsearch.hppc

Examples of com.carrotsearch.hppc.IntIntOpenHashMap$KeysIterator


public class HppcOomPut
{
    public static void main(String [] args)
    throws Exception
    {
        IntIntOpenHashMap map = new IntIntOpenHashMap(100, 1f);
        Field f = map.getClass().getDeclaredField("keys");
        f.setAccessible(true);

        boolean hitOOM = false;
        for (int i = 0;; i++) {
            try {
                if (hitOOM) { System.out.println("put(" + i + ")"); }
                map.put(i,  i);
            } catch (OutOfMemoryError e) {
                hitOOM = true;
                System.out.println("OOM, map: " + map.size() + " " + ((int[])f.get(map)).length);
            }
        }
    }
View Full Code Here


    public int get(int k) { return instance.get(k); }

    @Override
    public int containKeys(int [] keys)
    {
        final IntIntOpenHashMap prepared = this.instance;
        int count = 0;
        for (int i = 0; i < keys.length; i++)
            count += prepared.containsKey(keys[i]) ? 1 : 0;
        return count;
    }
View Full Code Here

    }
   
    @Override
    public int putAll(int [] keys, int [] values)
    {
        final IntIntOpenHashMap instance = this.instance;
        int count = 0;
        for (int i = 0; i < keys.length; i++)
        {
            count += instance.put(keys[i], values[i]);
        }
        return count;
    }
View Full Code Here

        /*
         * We'll use a int -> int map for counting. A bigram can be encoded
         * as an int by shifting one of the bigram's characters by 16 bits
         * and then ORing the other character to form a 32-bit int.
         */
        final IntIntOpenHashMap counts = new IntIntOpenHashMap(
            IntIntOpenHashMap.DEFAULT_CAPACITY,
            IntIntOpenHashMap.DEFAULT_LOAD_FACTOR);

        for (int i = 0; i < CHARS.length - 1; i++)
        {
            counts.putOrAdd((CHARS[i] << 16 | CHARS[i+1]), 1, 1);
        }
        // [[[end:bigram-counting]]]
    }
View Full Code Here

        final byte [] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

        if (documentCount == 0)
        {
            vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
            vsmContext.stemToRowIndex = new IntIntOpenHashMap();
            return;
        }

        // Determine the index of the title field
        int titleFieldIndex = -1;
        final String [] fieldsName = preprocessingContext.allFields.name;
        for (int i = 0; i < fieldsName.length; i++)
        {
            if (Document.TITLE.equals(fieldsName[i]))
            {
                titleFieldIndex = i;
                break;
            }
        }

        // Determine the stems we, ideally, should include in the matrix
        int [] stemsToInclude = computeRequiredStemIndices(preprocessingContext);

        // Sort stems by weight, so that stems get included in the matrix in the order
        // of frequency
        final double [] stemsWeight = new double [stemsToInclude.length];
        for (int i = 0; i < stemsToInclude.length; i++)
        {
            final int stemIndex = stemsToInclude[i];
            stemsWeight[i] = termWeighting.calculateTermWeight(stemsTf[stemIndex],
                stemsTfByDocument[stemIndex].length / 2, documentCount)
                * getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);
        }
        final int [] stemWeightOrder = IndirectSort.mergesort(0, stemsWeight.length,
            new IndirectComparator.DescendingDoubleComparator(stemsWeight));

        // Calculate the number of terms we can include to fulfill the max matrix size
        final int maxRows = maximumMatrixSize / documentCount;
        final DoubleMatrix2D tdMatrix = new DenseDoubleMatrix2D(Math.min(maxRows,
            stemsToInclude.length), documentCount);

        for (int i = 0; i < stemWeightOrder.length && i < maxRows; i++)
        {
            final int stemIndex = stemsToInclude[stemWeightOrder[i]];
            final int [] tfByDocument = stemsTfByDocument[stemIndex];
            final int df = tfByDocument.length / 2;
            final byte fieldIndices = stemsFieldIndices[stemIndex];

            for (int j = 0; j < df; j++) {
                double weight = termWeighting.calculateTermWeight(
                    tfByDocument[j * 2 + 1], df, documentCount);

                weight *= getWeightBoost(titleFieldIndex, fieldIndices);
                tdMatrix.set(i, tfByDocument[j * 2], weight);
            }
        }

        // Convert stemsToInclude into tdMatrixStemIndices
        final IntIntOpenHashMap stemToRowIndex = new IntIntOpenHashMap();
        for (int i = 0; i < stemWeightOrder.length && i < tdMatrix.rows(); i++)
        {
            stemToRowIndex.put(stemsToInclude[stemWeightOrder[i]], i);
        }

        // Store the results
        vsmContext.termDocumentMatrix = tdMatrix;
        vsmContext.stemToRowIndex = stemToRowIndex;
View Full Code Here

     * {@link VectorSpaceModelContext#termPhraseMatrix} will remain <code>null</code>.
     */
    public void buildTermPhraseMatrix(VectorSpaceModelContext context)
    {
        final PreprocessingContext preprocessingContext = context.preprocessingContext;
        final IntIntOpenHashMap stemToRowIndex = context.stemToRowIndex;
        final int [] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
        final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;

        if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0)
        {
            // Build phrase matrix
            int [] phraseFeatureIndices = new int [labelsFeatureIndex.length
                - firstPhraseIndex];
            for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++)
View Full Code Here

     * same term space as the original term-document matrix.
     */
    static DoubleMatrix2D buildAlignedMatrix(VectorSpaceModelContext vsmContext,
        int [] featureIndex, ITermWeighting termWeighting)
    {
        final IntIntOpenHashMap stemToRowIndex = vsmContext.stemToRowIndex;
        if (featureIndex.length == 0)
        {
            return new DenseDoubleMatrix2D(stemToRowIndex.size(), 0);
        }

        final DoubleMatrix2D phraseMatrix = new SparseDoubleMatrix2D(stemToRowIndex
            .size(), featureIndex.length);

        final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
        final int [] wordsStemIndex = preprocessingContext.allWords.stemIndex;
        final int [] stemsTf = preprocessingContext.allStems.tf;
        final int [][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        final int [][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
        final int documentCount = preprocessingContext.documents.size();
        final int wordCount = wordsStemIndex.length;

        for (int i = 0; i < featureIndex.length; i++)
        {
            final int feature = featureIndex[i];
            final int [] wordIndices;
            if (feature < wordCount)
            {
                wordIndices = new int []
                {
                    feature
                };
            }
            else
            {
                wordIndices = phrasesWordIndices[feature - wordCount];
            }

            for (int wordIndex = 0; wordIndex < wordIndices.length; wordIndex++)
            {
                final int stemIndex = wordsStemIndex[wordIndices[wordIndex]];
                if (stemToRowIndex.containsKey(stemIndex))
                {
                    final int rowIndex = stemToRowIndex.lget();

                    double weight = termWeighting.calculateTermWeight(stemsTf[stemIndex],
                        stemsTfByDocument[stemIndex].length / 2, documentCount);

                    phraseMatrix.setQuick(rowIndex, i, weight);
View Full Code Here

            // arrays.
            Collections.sort(rcs, new SubstringComparator(wordIndexesArray, stemIndexes));

            int totalPhraseTf = rcs.get(0).frequency;
            Substring mostFrequentOriginal = rcs.get(0);
            IntIntOpenHashMap phraseTfByDocument = new IntIntOpenHashMap();
            phraseTfByDocument.putAll(mostFrequentOriginal.tfByDocument);

            // Don't change the rcs list type from ArrayList or we'll
            // run into O(n^2) iteration cost :)
            for (int i = 0; i < rcs.size() - 1; i++)
            {
                final Substring substring = rcs.get(i);
                final Substring nextSubstring = rcs.get(i + 1);

                if (substring
                    .isEquivalentTo(nextSubstring, wordIndexesArray, stemIndexes))
                {
                    totalPhraseTf += nextSubstring.frequency;
                    addAllWithOffset(phraseTfByDocument, nextSubstring.tfByDocument, -1);
                    if (mostFrequentOriginal.frequency < nextSubstring.frequency)
                    {
                        mostFrequentOriginal = nextSubstring;
                    }
                }
                else
                {
                    int [] wordIndexes = new int [(mostFrequentOriginal.to - mostFrequentOriginal.from)];
                    for (int j = 0; j < wordIndexes.length; j++)
                    {
                        wordIndexes[j] = wordIndexesArray[mostFrequentOriginal.from + j];
                    }
                    phraseWordIndexes.add(wordIndexes);
                    phraseTf.add(totalPhraseTf);
                    phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));

                    totalPhraseTf = nextSubstring.frequency;
                    mostFrequentOriginal = nextSubstring;
                    phraseTfByDocument.clear();
                    phraseTfByDocument.putAll(nextSubstring.tfByDocument);
                }
            }

            // Add the last substring
            final Substring substring = rcs.get(rcs.size() - 1);
View Full Code Here

                            currentSuffixIndex + currentLcp - j, (j == 0 ? 2 : 1));

                        // By document tf. Again, topmost phrase gets tf = 2, the other
                        // ones get tf = 1. This time, we need to track from which document's
                        // tf we need to set off the "minus 1", hence the documentIndexToOffset field.
                        rcsStack[sp].tfByDocument = new IntIntOpenHashMap();
                        rcsStack[sp].tfByDocument.put(
                            documentIndexArray[suffixArray[i - 1]], 1);
                        if (j == 0)
                        {
                            rcsStack[sp].tfByDocument.putOrAdd(currentDocumentIndex,
                                1, 1);
                        }
                        else
                        {
                            rcsStack[sp].documentIndexToOffset = documentIndexArray[suffixArray[i - 1]];
                        }
                    }
                }

                i++;
            }
            else
            {
                Substring r = rcsStack[sp];
                if ((r.to - r.from) < currentLcp)
                {
                    Substring r1 = rcsStack[sp];

                    // The phrase we're about to add is an extension of the topmost
                    // phrase on the stack. The new phrase will contribute to the
                    // topmost phrase's tf, so we need to track the document index
                    // from which we'd set off the "minus 1".
                    r1.documentIndexToOffset = documentIndexArray[suffixArray[i - 1]];

                    // Add the intermediate phrases too (which makes
                    // the algorithm no longer linear btw)
                    int length = currentLcp - (r1.to - r1.from);
                    for (int j = length - 1; j >= 0; j--)
                    {
                        if (currentLcp - j >= MIN_PHRASE_LENGTH)
                        {
                            sp++;
                            rcsStack[sp] = new Substring(i, currentSuffixIndex,
                                currentSuffixIndex + currentLcp - j, (j == 0 ? 2 : 1));

                            rcsStack[sp].tfByDocument = new IntIntOpenHashMap();
                            rcsStack[sp].tfByDocument.put(
                                documentIndexArray[suffixArray[i - 1]], 1);
                            if (j == 0)
                            {
                                rcsStack[sp].tfByDocument.putOrAdd(currentDocumentIndex,
View Full Code Here

            matrixBuilder.buildTermDocumentMatrix(vsmContext);
            matrixBuilder.buildTermPhraseMatrix(vsmContext);

            // Prepare rowIndex -> stemIndex mapping for labeling
            final IntIntOpenHashMap rowToStemIndex = new IntIntOpenHashMap();
            for (IntIntCursor c : vsmContext.stemToRowIndex)
            {
                rowToStemIndex.put(c.value, c.key);
            }

            final DoubleMatrix2D tdMatrix;
            if (useDimensionalityReduction && clusterCount * 2 < preprocessingContext.documents.size())
            {
View Full Code Here

TOP

Related Classes of com.carrotsearch.hppc.IntIntOpenHashMap$KeysIterator

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.