Package: com.carrotsearch.hppc

Examples of com.carrotsearch.hppc.BitSet


    public static void before()
    {
        final int MB = 1024 * 1024;
        final int bits = 128 * MB * 4;

        hppc = new BitSet(bits);
        jre = new java.util.BitSet(bits);

        // Randomly fill every bits (this is fairly dense distribution).
        for (int i = 0; i < bits; i += 1 + rnd.nextInt(10))
        {
View Full Code Here


                // Add label and score
                cluster.addPhrases(labelFormatter.format(context, labelFeature));
                cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);

                // Add documents
                final BitSet bs = clusterDocuments[i];
                for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1))
                {
                    cluster.addDocuments(documents.get(bit));
                }

                // Add cluster
View Full Code Here

        final BitSet [] labelsDocumentIndices = new BitSet [labelsFeatureIndex.length];

        for (int i = 0; i < labelsFeatureIndex.length; i++)
        {
            final BitSet documentIndices = new BitSet(documentCount);

            final int featureIndex = labelsFeatureIndex[i];
            if (featureIndex < wordCount)
            {
                addTfByDocumentToBitSet(documentIndices,
                    stemsTfByDocument[wordsStemIndex[featureIndex]]);
            }
            else
            {
                final int phraseIndex = featureIndex - wordCount;
                if (exactPhraseAssignment)
                {
                    addTfByDocumentToBitSet(documentIndices,
                        phrasesTfByDocument[phraseIndex]);
                }
                else
                {
                    final int [] wordIndices = phrasesWordIndices[phraseIndex];
                    boolean firstAdded = false;

                    for (int j = 0; j < wordIndices.length; j++)
                    {
                        final int wordIndex = wordIndices[j];
                        if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex]))
                        {
                            if (!firstAdded)
                            {
                                addTfByDocumentToBitSet(documentIndices,
                                    stemsTfByDocument[wordsStemIndex[wordIndex]]);
                                firstAdded = true;
                            }
                            else
                            {
                                final BitSet temp = new BitSet(documentCount);
                                addTfByDocumentToBitSet(temp,
                                    stemsTfByDocument[wordsStemIndex[wordIndex]]);
                                // .retainAll == set intersection
                                documentIndices.and(temp);
                            }
View Full Code Here

        final int [][] phrasesWordIndices = context.allPhrases.wordIndices;
        final int wordCount = wordsStemIndex.length;

        final int [][] stemsTfByDocument = context.allStems.tfByDocument;
        int documentCount = context.documents.size();
        final BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);

        for (int i = 0; i < labelsFeatureIndex.length; i++)
        {
            final int featureIndex = labelsFeatureIndex[i];
            if (featureIndex < wordCount)
            {
                addStemIndex(wordsStemIndex, documentCount, stemsTfByDocument,
                    requiredStemIndices, featureIndex);
            }
            else
            {
                final int [] wordIndices = phrasesWordIndices[featureIndex - wordCount];
                for (int j = 0; j < wordIndices.length; j++)
                {
                    final int wordIndex = wordIndices[j];
                    if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex]))
                    {
                        addStemIndex(wordsStemIndex, documentCount, stemsTfByDocument,
                            requiredStemIndices, wordIndex);
                    }
                }
            }
        }

        return requiredStemIndices.asIntLookupContainer().toArray();
    }
View Full Code Here

        int maxTfVariantIndex = tokenImagesOrder[0];
        int totalTf = 1;
        int variantStartIndex = 0;

        // A byte set for word fields tracking
        final BitSet fieldIndices = new BitSet(context.allFields.name.length);

        // A stack for pushing information about the term's documents.
        final IntStack wordDocuments = new IntStack();

        if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
        {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
        }

        // Go through the ordered token images
        for (int i = 0; i < tokenImagesOrder.length - 1; i++)
        {
            final char [] image = tokenImages[tokenImagesOrder[i]];
            final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
            final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
            final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];

            // Reached the end of non-null tokens?
            if (image == null)
            {
                break;
            }

            // Check if we want to index this token at all
            if (isNotIndexed(tokenType))
            {
                variantStartIndex = i + 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];

                resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
                    fieldIndices, wordDocuments, i);
                continue;
            }

            fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);

            // Now check if image case is changing
            final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(image, nextImage) == 0;
            if (sameCase)
View Full Code Here

     * set of clusters in Carrot2 format.
     */
    private void postProcessing(List<ClusterCandidate> clusters)
    {
        // Adapt to Carrot2 classes, counting used documents on the way.
        final BitSet all = new BitSet(documents.size());
        final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
        final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);
        for (ClusterCandidate c : clusters)
        {
            final Cluster c2 = new Cluster();
            c2.addPhrases(collectPhrases(phrases, c));
            c2.addDocuments(collectDocuments(docs, c.documents));
            c2.setScore((double) c.score);
            this.clusters.add(c2);

            all.or(c.documents);
            docs.clear();
            phrases.clear();
        }

        Collections.sort(this.clusters,
View Full Code Here

        }
        else
        {
            // For larger hash maps, use a bitset to sort keys.

            final BitSet bset = new BitSet(map.size());
            for (IntIntCursor c : map)
            {
                bset.set(c.key);
            }

            for (int key = bset.nextSetBit(0); key >= 0; key = bset.nextSetBit(key + 1))
            {
                result[index++] = key;
                result[index++] = map.get(key);
            }
        }
View Full Code Here

        private void countDocs(int level, int state)
        {
            assert !stree.isLeaf(state);

            final BitSet me = getBitSet(level);
            for (int edge = stree.firstEdge(state); edge != NO_EDGE; edge = stree.nextEdge(edge))
            {
                final int childState = stree.getToState(edge);
                if (stree.isLeaf(childState))
                {
                    final int documentIndex = sb.stateOriginDocument.get(childState);
                    me.set(documentIndex);
                }
                else
                {
                    final BitSet child = getBitSet(level + 1);
                    child.clear();
                    edges.push(stree.getStartIndex(edge), stree.getEndIndex(edge));
                    countDocs(level + 1, childState);
                    edges.discard(2);
                    me.or(child);
                }
View Full Code Here

        /**
         * Callback invoked for each suffix-tree state reached during traversal.
         *
         * @param state internal suffix-tree state identifier
         * @param cardinality presumably the number of set bits in {@code documents},
         *        precomputed by the caller — TODO confirm at the call site
         * @param documents set of document indices associated with this state
         * @param path stack describing the path taken to reach this state
         */
        protected abstract void visit(int state, int cardinality, BitSet documents, IntStack path);

        private BitSet getBitSet(int level)
        {
            while (bsets.size() <= level) bsets.add(new BitSet());
            return bsets.get(level);
        }
View Full Code Here

    /* For cluster merging. */
    ClusterCandidate()
    {
        this.phrases = Lists.newArrayList();
        this.documents = new BitSet();
    }
View Full Code Here

TOP

Related Classes of com.carrotsearch.hppc.BitSet

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., which is owned by Oracle Inc. Contact coftware#gmail.com.