Package com.carrotsearch.hppc

Examples of com.carrotsearch.hppc.IntArrayList$ValueIterator


    @Test
    public void testIterableCursor() throws Exception
    {
        // [[[start:iteration-lists-using-iterator]]]
        // Prepare some list to iterate over
        final IntArrayList list = prepare(10);
       
        // Lists implement the Iterable interface that returns [type]Cursor elements.
        // The cursor contains the index and value of the current element.
        for (IntCursor c : list)
        {
View Full Code Here


    @Test
    public void testSimpleGetLoop() throws Exception
    {
        // [[[start:iteration-lists-using-get]]]
        final IntArrayList list = prepare(10);
       
        // Another way to iterate over array list is to access each element
        // of the list using the get() method.
        final int size = list.size();
        for (int i = 0; i < size; i++)
        {
            System.out.println(i + ": " + list.get(i));
        }
        // [[[end:iteration-lists-using-get]]]
    }
View Full Code Here

    @Test
    public void testWithProcedureClosure()
    {
        // [[[start:iteration-lists-using-procedures]]]
        final IntArrayList list = prepare(10);

        // Lists also support iteration through [type]Procedure interfaces.
        // The apply() method will be called once for each element in the list.
        list.forEach(new IntProcedure()
        {
            public void apply(int value)
            {
                System.out.println(value);
            }
View Full Code Here

    @Test
    public void testDirectBufferLoop() throws Exception
    {
        // [[[start:iteration-lists-using-direct-buffer-access]]]
        final IntArrayList list = prepare(10);

        // For the fastest iteration, you can access the list's data buffer directly.
        final int [] buffer = list.buffer;
       
        // Make sure you take the list.size() and not the length of the data buffer.
        final int size = list.size();
       
        // Iterate over the array as usual.
        for (int i = 0; i < size; i++)
        {
            System.out.println(i + ": " + buffer[i]);
View Full Code Here

        final CharacterSequence seq = new CharacterSequence("cocoa$");
        final SuffixTree stree = SuffixTreeBuilder.from(seq).build();

        stree.visit(new VisitorAdapter()
        {
            final IntArrayList states = new IntArrayList();

            public void post(int state)
            {
                if (stree.getRootState() != state)
                {
                    final StringBuilder buffer = new StringBuilder();
                    for (int i = 0; i < states.size(); i += 2)
                        for (int j = states.get(i); j <= states.get(i + 1); j++)
                            buffer.append((char) seq.objectAt(j));

                    if (stree.isLeaf(state)) buffer.append(" [leaf]");
                    nodes.add(buffer.toString());

                    states.remove(states.size() - 1);
                    states.remove(states.size() - 1);
                }
            };

            public boolean edge(int fromState, int toState, int startIndex, int endIndex)
            {
                states.add(startIndex);
                states.add(endIndex);
                return true;
            }
        });

        Collections.sort(nodes);
View Full Code Here

        }

        // Filter out labels that do not meet the minimum cluster size
        if (minClusterSize > 1)
        {
            final IntArrayList newFeatureIndex = new IntArrayList(
                labelsFeatureIndex.length);
            final ArrayList<BitSet> newDocumentIndices = Lists
                .newArrayListWithExpectedSize(labelsFeatureIndex.length);

            for (int i = 0; i < labelsFeatureIndex.length; i++)
            {
                if (labelsDocumentIndices[i].cardinality() >= minClusterSize)
                {
                    newFeatureIndex.add(labelsFeatureIndex[i]);
                    newDocumentIndices.add(labelsDocumentIndices[i]);
                }
            }
            context.allLabels.documentIndices = newDocumentIndices
                .toArray(new BitSet [newDocumentIndices.size()]);
            context.allLabels.featureIndex = newFeatureIndex.toArray();
            LabelFilterProcessor.updateFirstPhraseIndex(context);
        }
        else
        {
            context.allLabels.documentIndices = labelsDocumentIndices;
View Full Code Here

        final int [] tokenImagesOrder = IndirectSort.mergesort(tokenImages, 0,
            tokenImages.length, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR);

        // Create holders for new arrays
        final List<char []> normalizedWordImages = Lists.newArrayList();
        final IntArrayList normalizedWordTf = new IntArrayList();
        final List<int []> wordTfByDocumentList = Lists.newArrayList();
        final ByteArrayList fieldIndexList = new ByteArrayList();
        final ShortArrayList types = new ShortArrayList();

        final int [] wordIndexes = new int [tokenCount];
        Arrays.fill(wordIndexes, -1);

        // Initial values for counters
        int tf = 1;
        int maxTf = 1;
        int maxTfVariantIndex = tokenImagesOrder[0];
        int totalTf = 1;
        int variantStartIndex = 0;

        // A bit set for tracking word fields
        final BitSet fieldIndices = new BitSet(context.allFields.name.length);

        // A stack for pushing information about the term's documents.
        final IntStack wordDocuments = new IntStack();

        if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
        {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
        }

        // Go through the ordered token images
        for (int i = 0; i < tokenImagesOrder.length - 1; i++)
        {
            final char [] image = tokenImages[tokenImagesOrder[i]];
            final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
            final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
            final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];

            // Reached the end of non-null tokens?
            if (image == null)
            {
                break;
            }

            // Check if we want to index this token at all
            if (isNotIndexed(tokenType))
            {
                variantStartIndex = i + 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];

                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
                continue;
            }

            fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);

            // Now check if image case is changing
            final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;
            if (sameCase)
            {
                // Case has not changed, just increase counters
                tf++;
                totalTf++;
                wordDocuments.push(documentIndex);
                continue;
            }

            // Case (or even token image) has changed. Update most frequent case
            // variant
            if (maxTf < tf)
            {
                maxTf = tf;
                maxTfVariantIndex = tokenImagesOrder[i];
                tf = 1;
            }

            final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;

            // Check if token image has changed
            if (sameImage)
            {
                totalTf++;
                wordDocuments.push(documentIndex);
            }
            else
            {
                // The image has changed completely.
                // Before we start processing the new image, we need to
                // see if we want to store the previous image, and if so
                // we need add some data about it to the arrays
               
                // wordDocuments may contain duplicate entries from the same document,
                // but checking its size is faster than deduping, so we do that first.
                if (wordDocuments.size() >= dfThreshold)
                {
                    // Flatten the list of documents this term occurred in.
                    final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
                    final int df = (sparseEncoding.length >> 1);
                    if (df >= dfThreshold)
                    {
                        wordTfByDocumentList.add(sparseEncoding);
   
                        // Add the word to the word list
                        normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
                        types.add(tokenTypesArray[maxTfVariantIndex]);
                        normalizedWordTf.add(totalTf);
                        fieldIndexList.add((byte) fieldIndices.bits[0]);

                        // Add this word's index in AllWords to all its instances
                        // in the AllTokens multiarray
                        for (int j = variantStartIndex; j < i + 1; j++)
                        {
                            wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1;
                        }
                    }
                }

                // Reinitialize counters
                totalTf = 1;
                tf = 1;
                maxTf = 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];
                variantStartIndex = i + 1;

                // Re-initialize int set used for document frequency calculation
                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
            }
        }

        // Mapping from allTokens
        context.allTokens.wordIndex = wordIndexes;

        context.allWords.image = normalizedWordImages
            .toArray(new char [normalizedWordImages.size()] []);
        context.allWords.tf = normalizedWordTf.toArray();
        context.allWords.tfByDocument =
            wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []);
        context.allWords.fieldIndices = fieldIndexList.toArray();
        context.allWords.type = types.toArray();
    }
View Full Code Here

            // Convert word indices to stem indices.
            assert cc.phrases.size() == 1;
            int [] stemIndices = context.allWords.stemIndex;
            int [] phraseWords = cc.phrases.get(0);
            IntArrayList stemList = new IntArrayList(phraseWords.length);
            for (int seqIndex : phraseWords)
            {
                int termIndex = sb.input.get(seqIndex);
                stemList.add(stemIndices[termIndex]);
            }
           
            // Check if we have stem-equivalent phrase like this.
            ClusterCandidate equivalent = merged.get(stemList);
            if (equivalent == null)
View Full Code Here

        // Find all subphrases
        List<Substring> rcs = discoverRcs(suffixArray, lcpArray, documentIndexArray);

        List<int []> phraseWordIndexes = Lists.newArrayList();
        IntArrayList phraseTf = new IntArrayList();
        List<int []> phraseTfByDocumentList = Lists.newArrayList();

        if (rcs.size() > 0)
        {
            // Determine most frequent originals and create the final phrase
            // array. Also merge the phrase tf by document maps into flat
            // arrays.
            Collections.sort(rcs, new SubstringComparator(wordIndexesArray, stemIndexes));

            int totalPhraseTf = rcs.get(0).frequency;
            Substring mostFrequentOriginal = rcs.get(0);
            IntIntOpenHashMap phraseTfByDocument = new IntIntOpenHashMap();
            phraseTfByDocument.putAll(mostFrequentOriginal.tfByDocument);

            // Don't change the rcs list type from ArrayList or we'll
            // run into O(n^2) iteration cost :)
            for (int i = 0; i < rcs.size() - 1; i++)
            {
                final Substring substring = rcs.get(i);
                final Substring nextSubstring = rcs.get(i + 1);

                if (substring
                    .isEquivalentTo(nextSubstring, wordIndexesArray, stemIndexes))
                {
                    totalPhraseTf += nextSubstring.frequency;
                    addAllWithOffset(phraseTfByDocument, nextSubstring.tfByDocument, -1);
                    if (mostFrequentOriginal.frequency < nextSubstring.frequency)
                    {
                        mostFrequentOriginal = nextSubstring;
                    }
                }
                else
                {
                    int [] wordIndexes = new int [(mostFrequentOriginal.to - mostFrequentOriginal.from)];
                    for (int j = 0; j < wordIndexes.length; j++)
                    {
                        wordIndexes[j] = wordIndexesArray[mostFrequentOriginal.from + j];
                    }
                    phraseWordIndexes.add(wordIndexes);
                    phraseTf.add(totalPhraseTf);
                    phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));

                    totalPhraseTf = nextSubstring.frequency;
                    mostFrequentOriginal = nextSubstring;
                    phraseTfByDocument.clear();
                    phraseTfByDocument.putAll(nextSubstring.tfByDocument);
                }
            }

            // Add the last substring
            final Substring substring = rcs.get(rcs.size() - 1);
            int [] wordIndexes = new int [(substring.to - substring.from)];
            for (int j = 0; j < wordIndexes.length; j++)
            {
                wordIndexes[j] = wordIndexesArray[mostFrequentOriginal.from + j];
            }
            phraseWordIndexes.add(wordIndexes);
            phraseTf.add(totalPhraseTf);
            phraseTfByDocumentList.add(IntMapUtils.flatten(phraseTfByDocument));
        }

        // Store the results to allPhrases
        context.allPhrases.wordIndices = phraseWordIndexes
            .toArray(new int [phraseWordIndexes.size()] []);
        context.allPhrases.tf = phraseTf.toArray();
        context.allPhrases.tfByDocument = phraseTfByDocumentList
            .toArray(new int [phraseTfByDocumentList.size()] []);
    }
View Full Code Here

        }

        // Prepare arrays
        images = Lists.newArrayList();
        tokenTypes = new ShortArrayList();
        documentIndices = new IntArrayList();
        fieldIndices = new ByteArrayList();

        final Iterator<Document> docIterator = documents.iterator();
        int documentIndex = 0;
        final ITokenizer ts = context.language.getTokenizer();
View Full Code Here

TOP

Related Classes of com.carrotsearch.hppc.IntArrayList$ValueIterator

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.