Package com.carrotsearch.hppc

Examples of com.carrotsearch.hppc.ShortArrayList$ValueIterator


        // Create holders for new arrays
        final List<char []> normalizedWordImages = Lists.newArrayList();
        final IntArrayList normalizedWordTf = new IntArrayList();
        final List<int []> wordTfByDocumentList = Lists.newArrayList();
        final ByteArrayList fieldIndexList = new ByteArrayList();
        final ShortArrayList types = new ShortArrayList();

        final int [] wordIndexes = new int [tokenCount];
        Arrays.fill(wordIndexes, -1);

        // Initial values for counters
        int tf = 1;
        int maxTf = 1;
        int maxTfVariantIndex = tokenImagesOrder[0];
        int totalTf = 1;
        int variantStartIndex = 0;

        // A byte set for word fields tracking
        final BitSet fieldIndices = new BitSet(context.allFields.name.length);

        // A stack for pushing information about the term's documents.
        final IntStack wordDocuments = new IntStack();

        if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
        {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
        }

        // Go through the ordered token images
        for (int i = 0; i < tokenImagesOrder.length - 1; i++)
        {
            final char [] image = tokenImages[tokenImagesOrder[i]];
            final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
            final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
            final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];

            // Reached the end of non-null tokens?
            if (image == null)
            {
                break;
            }

            // Check if we want to index this token at all
            if (isNotIndexed(tokenType))
            {
                variantStartIndex = i + 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];

                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
                continue;
            }

            fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);

            // Now check if image case is changing
            final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;
            if (sameCase)
            {
                // Case has not changed, just increase counters
                tf++;
                totalTf++;
                wordDocuments.push(documentIndex);
                continue;
            }

            // Case (or even token image) has changed. Update most frequent case
            // variant
            if (maxTf < tf)
            {
                maxTf = tf;
                maxTfVariantIndex = tokenImagesOrder[i];
                tf = 1;
            }

            final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;

            // Check if token image has changed
            if (sameImage)
            {
                totalTf++;
                wordDocuments.push(documentIndex);
            }
            else
            {
                // The image has changed completely.
                // Before we start processing the new image, we need to
                // see if we want to store the previous image, and if so
                // we need add some data about it to the arrays
               
                // wordDocuments.size() may contain duplicate entries from the same document,
                // but this check is faster than deduping, so we do it first. 
                if (wordDocuments.size() >= dfThreshold)
                {
                    // Flatten the list of documents this term occurred in.
                    final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
                    final int df = (sparseEncoding.length >> 1);
                    if (df >= dfThreshold)
                    {
                        wordTfByDocumentList.add(sparseEncoding);
   
                        // Add the word to the word list
                        normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
                        types.add(tokenTypesArray[maxTfVariantIndex]);
                        normalizedWordTf.add(totalTf);
                        fieldIndexList.add((byte) fieldIndices.bits[0]);

                        // Add this word's index in AllWords to all its instances
                        // in the AllTokens multiarray
                        for (int j = variantStartIndex; j < i + 1; j++)
                        {
                            wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1;
                        }
                    }
                }

                // Reinitialize counters
                totalTf = 1;
                tf = 1;
                maxTf = 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];
                variantStartIndex = i + 1;

                // Re-initialize int set used for document frequency calculation
                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
            }
        }

        // Mapping from allTokens
        context.allTokens.wordIndex = wordIndexes;

        context.allWords.image = normalizedWordImages
            .toArray(new char [normalizedWordImages.size()] []);
        context.allWords.tf = normalizedWordTf.toArray();
        context.allWords.tfByDocument =
            wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []);
        context.allWords.fieldIndices = fieldIndexList.toArray();
        context.allWords.type = types.toArray();
    }
View Full Code Here


            throw new ProcessingException("Maximum number of tokenized fields is 8.");
        }

        // Prepare arrays
        images = Lists.newArrayList();
        tokenTypes = new ShortArrayList();
        documentIndices = new IntArrayList();
        fieldIndices = new ByteArrayList();

        final Iterator<Document> docIterator = documents.iterator();
        int documentIndex = 0;
View Full Code Here

TOP

Related Classes of com.carrotsearch.hppc.ShortArrayList$ValueIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.