Package com.carrotsearch.hppc

Examples of com.carrotsearch.hppc.IntArrayList
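
IntArrayList is HPPC's resizable list of primitive int values: it offers the usual list operations (add, get, set, remove, addAll, toArray) without boxing every element into an Integer. Below is a minimal, self-contained sketch of the core API the project snippets on this page rely on; the class name IntArrayListBasics is ours, and the imports assume an HPPC release in which cursors live under com.carrotsearch.hppc.cursors:

    import com.carrotsearch.hppc.IntArrayList;
    import com.carrotsearch.hppc.cursors.IntCursor;

    public class IntArrayListBasics
    {
        public static void main(String [] args)
        {
            // The capacity hint is optional; the internal buffer grows as needed.
            IntArrayList list = new IntArrayList(16);

            // Append primitive ints; no boxing takes place.
            for (int i = 0; i < 5; i++)
            {
                list.add(i * i);
            }

            // Indexed access and in-place update.
            int third = list.get(2);   // 4
            list.set(2, third + 1);    // list is now [0, 1, 5, 9, 16]

            // Cursor-based iteration: c.index is the position, c.value the element.
            for (IntCursor c : list)
            {
                System.out.println(c.index + " -> " + c.value);
            }

            // Copy the contents out as a plain int[], a pattern used heavily below.
            int [] array = list.toArray();
            System.out.println("size = " + list.size() + ", copied = " + array.length);
        }
    }

The first excerpt below is a breadth-first search that splits a graph into connected components: vertices are pulled from an int deque, appended to the current component's IntArrayList, and their unvisited neighbors (as reported by arcPredicate) are enqueued; one-node components are optionally pruned.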


            if (!nodesChecked[i])
            {
                nodeQueue.clear();
                nodeQueue.addLast(i);
                nodesChecked[i] = true;
                IntArrayList clusterGroup = new IntArrayList();

                while (!nodeQueue.isEmpty())
                {
                    // Get a node from the queue
                    int node = nodeQueue.removeFirst();

                    // Add to the current sub-graph (cluster group)
                    clusterGroup.add(node);

                    // Add all its non-checked neighbors to the queue
                    for (int j = i + 1; j < vertexCount; j++)
                    {
                        if (!nodesChecked[j])
                        {
                            if (arcPredicate.isArcPresent(node, j))
                            {
                                nodeQueue.addLast(j);
                                nodesChecked[j] = true;
                            }
                        }
                    }
                }

                if (clusterGroup.size() > 1 || !pruneOneNodeSubgraphs)
                {
                    clusterGroups.add(clusterGroup);
                }
            }
        }
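
The next, longer fragment implements a clustering algorithm over a vector space model. It collects the indices of non-common, non-query, non-numeric words into an IntArrayList to serve as trivial labels, builds (and optionally reduces) a term-document matrix, produces an initial partitioning of all columns, and then keeps splitting the largest raw cluster, itself an IntArrayList of document indices, until the requested number of clusters is reached or no cluster is large enough to split meaningfully.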


        final PreprocessingContext preprocessingContext =
            preprocessingPipeline.preprocess(documents, null, language);

        // Add trivial AllLabels so that we can reuse the common TD matrix builder
        final int [] stemsMfow = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
        final short [] wordsType = preprocessingContext.allWords.type;
        final IntArrayList featureIndices = new IntArrayList(stemsMfow.length);
        for (int i = 0; i < stemsMfow.length; i++)
        {
            final short flag = wordsType[stemsMfow[i]];
            if ((flag & (ITokenizer.TF_COMMON_WORD | ITokenizer.TF_QUERY_WORD | ITokenizer.TT_NUMERIC)) == 0)
            {
                featureIndices.add(stemsMfow[i]);
            }
        }
        preprocessingContext.allLabels.featureIndex = featureIndices.toArray();
        preprocessingContext.allLabels.firstPhraseIndex = -1;

        // Further processing only if there are words to process
        clusters = Lists.newArrayList();
        if (preprocessingContext.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                preprocessingContext);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);

            matrixBuilder.buildTermDocumentMatrix(vsmContext);
            matrixBuilder.buildTermPhraseMatrix(vsmContext);

            // Prepare rowIndex -> stemIndex mapping for labeling
            final IntIntOpenHashMap rowToStemIndex = new IntIntOpenHashMap();
            for (IntIntCursor c : vsmContext.stemToRowIndex)
            {
                rowToStemIndex.put(c.value, c.key);
            }

            final DoubleMatrix2D tdMatrix;
            if (useDimensionalityReduction && clusterCount * 2 < preprocessingContext.documents.size())
            {
                matrixReducer.reduce(reducedVsmContext, clusterCount * 2);
                tdMatrix = reducedVsmContext.coefficientMatrix.viewDice();
            }
            else
            {
                tdMatrix = vsmContext.termDocumentMatrix;
            }

            // Initial selection containing all columns, initial clustering
            final IntArrayList columns = new IntArrayList(tdMatrix.columns());
            for (int c = 0; c < tdMatrix.columns(); c++)
            {
                columns.add(c);
            }
            final List<IntArrayList> rawClusters = Lists.newArrayList();
            rawClusters.addAll(split(partitionCount, tdMatrix, columns, maxIterations));
            Collections.sort(rawClusters, BY_SIZE_DESCENDING);
           
            int largestIndex = 0;
            while (rawClusters.size() < clusterCount && largestIndex < rawClusters.size())
            {
                // Find largest cluster to split
                IntArrayList largest = rawClusters.get(largestIndex);
                if (largest.size() <= partitionCount * 2)
                {
                    // No cluster is large enough to produce a meaningful
                    // split (i.e. a split into subclusters with more than
                    // 1 member).
                    break;
                }

                final List<IntArrayList> split = split(partitionCount, tdMatrix, largest,
                    maxIterations);
                if (split.size() > 1)
                {
                    rawClusters.remove(largestIndex);
                    rawClusters.addAll(split);
                    Collections.sort(rawClusters, BY_SIZE_DESCENDING);
                    largestIndex = 0;
                }
                else
                {
                    largestIndex++;
                }
            }

            for (int i = 0; i < rawClusters.size(); i++)
            {
                final Cluster cluster = new Cluster();

                final IntArrayList rawCluster = rawClusters.get(i);
                if (rawCluster.size() > 1)
                {
                    cluster.addPhrases(getLabels(rawCluster,
                        vsmContext.termDocumentMatrix, rowToStemIndex,
                        preprocessingContext.allStems.mostFrequentOriginalWordIndex,
                        preprocessingContext.allWords.image));
                    for (int j = 0; j < rawCluster.size(); j++)
                    {
                        cluster.addDocuments(documents.get(rawCluster.get(j)));
                    }
                    clusters.add(cluster);
                }
            }
        }
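
The split(...) helper invoked above appears to be the following fixed-iteration k-means over the selected columns: columns are dealt round-robin into the initial partitions (each an IntArrayList), centroids are recomputed from the current assignment, every column is then reassigned to its most similar centroid, and the loop ends early if an iteration leaves the assignment unchanged. Afterwards, local column indices are mapped back to global indices and empty clusters are removed.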

        // Prepare results holders
        List<IntArrayList> result = Lists.newArrayList();
        List<IntArrayList> previousResult = null;
        for (int i = 0; i < partitions; i++)
        {
            result.add(new IntArrayList(selected.columns()));
        }
        for (int i = 0; i < selected.columns(); i++)
        {
            result.get(i % partitions).add(i);
        }

        // Matrices for centroids and document-centroid similarities
        final DoubleMatrix2D centroids = new DenseDoubleMatrix2D(selected.rows(),
            partitions).assign(selected.viewPart(0, 0, selected.rows(), partitions));
        final DoubleMatrix2D similarities = new DenseDoubleMatrix2D(partitions,
            selected.columns());

        // Run a fixed number of K-means iterations
        for (int it = 0; it < iterations; it++)
        {
            // Update centroids
            for (int i = 0; i < result.size(); i++)
            {
                final IntArrayList cluster = result.get(i);
                for (int k = 0; k < selected.rows(); k++)
                {
                    double sum = 0;
                    for (int j = 0; j < cluster.size(); j++)
                    {
                        sum += selected.get(k, cluster.get(j));
                    }
                    centroids.setQuick(k, i, sum / cluster.size());
                }
            }

            if (it < iterations - 1)
            {
                previousResult = result;
                result = Lists.newArrayList();
                for (int i = 0; i < partitions; i++)
                {
                    result.add(new IntArrayList(selected.columns()));
                }
            }

            // Calculate similarity to centroids
            centroids.zMult(selected, similarities, 1, 0, true, false);

            // Assign documents to the nearest centroid
            for (int c = 0; c < similarities.columns(); c++)
            {
                int maxRow = 0;
                double max = similarities.get(0, c);
                for (int r = 1; r < similarities.rows(); r++)
                {
                    if (max < similarities.get(r, c))
                    {
                        max = similarities.get(r, c);
                        maxRow = r;
                    }
                }

                result.get(maxRow).add(c);
            }

            if (ObjectUtils.equals(previousResult, result))
            {
                // Unchanged result
                break;
            }
        }

        // Map the results back to the global indices
        for (Iterator<IntArrayList> it = result.iterator(); it.hasNext();)
        {
            final IntArrayList cluster = it.next();
            if (cluster.isEmpty())
            {
                it.remove();
            }
            else
            {
                for (int j = 0; j < cluster.size(); j++)
                {
                    cluster.set(j, selectedToInput.get(cluster.get(j)));
                }
            }
        }

        return result;
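
This fragment merges words that share a stem image, walking them in stem-sorted order: for each run of identical stems it accumulates the total term frequency, the most frequent original word form, per-document frequencies, and a bitmask of field indices, flushing these counters into IntArrayList/ByteArrayList buffers whenever the stem changes, and finally converting the buffers into the plain arrays stored on context.allStems.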


        // Lists to accommodate the results
        final ArrayList<char []> stemImages = new ArrayList<char []>(allWordsCount);
        final IntArrayList stemTf = new IntArrayList(allWordsCount);
        final IntArrayList stemMostFrequentWordIndexes = new IntArrayList(allWordsCount);
        final ArrayList<int []> stemTfByDocumentList = new ArrayList<int []>(allWordsCount);
        final ByteArrayList fieldIndexList = new ByteArrayList();

        // Counters
        int totalTf = wordTfArray[stemImagesOrder[0]];
        int mostFrequentWordFrequency = wordTfArray[stemImagesOrder[0]];
        int mostFrequentWordIndex = stemImagesOrder[0];
        int stemIndex = 0;

        // A list of document-term-frequency pairs, by document, for all words with identical stems.
        final ArrayList<int[]> stemTfsByDocument = Lists.newArrayList();
       
        stemTfsByDocument.add(wordTfByDocumentArray[stemImagesOrder[0]]);
        byte fieldIndices = 0;
        fieldIndices |= wordsFieldIndices[stemImagesOrder[0]];

        // For locating query words
        final MutableCharArray buffer = new MutableCharArray(
            wordStemImages[stemImagesOrder[0]]);
        boolean inQuery = queryStems.contains(buffer);

        // Go through all words in the order of stem images
        for (int i = 0; i < stemImagesOrder.length - 1; i++)
        {
            final int orderIndex = stemImagesOrder[i];
            final char [] stem = wordStemImages[orderIndex];
            final int nextInOrderIndex = stemImagesOrder[i + 1];
            final char [] nextStem = wordStemImages[nextInOrderIndex];

            stemIndexesArray[orderIndex] = stemIndex;
            if (inQuery)
            {
                wordsType[orderIndex] |= ITokenizer.TF_QUERY_WORD;
            }

            // Check whether the stem image changes between this word and the next
            final boolean sameStem = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(stem, nextStem) == 0;

            if (sameStem)
            {
                totalTf += wordTfArray[nextInOrderIndex];
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];
                if (mostFrequentWordFrequency < wordTfArray[nextInOrderIndex])
                {
                    mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                    mostFrequentWordIndex = nextInOrderIndex;
                }
            }
            else
            {
                stemImages.add(stem);
                stemTf.add(totalTf);
                stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
                storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
                fieldIndexList.add(fieldIndices);

                stemIndex++;
                totalTf = wordTfArray[nextInOrderIndex];
                mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                mostFrequentWordIndex = nextInOrderIndex;
                fieldIndices = 0;
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];

                stemTfsByDocument.clear();
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);

                buffer.reset(wordStemImages[nextInOrderIndex]);
                inQuery = queryStems.contains(buffer);
            }
        }

        // Store tf for the last stem in the array
        stemImages.add(wordStemImages[stemImagesOrder[stemImagesOrder.length - 1]]);
        stemTf.add(totalTf);
        stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
        stemIndexesArray[stemImagesOrder[stemImagesOrder.length - 1]] = stemIndex;
        storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
        fieldIndexList.add(fieldIndices);
        if (inQuery)
        {
            wordsType[stemImagesOrder[stemImagesOrder.length - 1]] |= ITokenizer.TF_QUERY_WORD;
        }

        // Convert lists to arrays and store them in allStems
        context.allStems.image = stemImages.toArray(new char [stemImages.size()] []);
        context.allStems.mostFrequentOriginalWordIndex = stemMostFrequentWordIndexes
            .toArray();
        context.allStems.tf = stemTf.toArray();
        context.allStems.tfByDocument = stemTfByDocumentList
            .toArray(new int [stemTfByDocumentList.size()] []);
        context.allStems.fieldIndices = fieldIndexList.toArray();
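
Next, a chain of label filters marks accepted stems and phrases; the surviving features, the most frequent word of each accepted stem followed by accepted phrase indices offset by wordCount, are collected into a single IntArrayList and stored as context.allLabels.featureIndex.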

        stopWordLabelFilter.filter(context, acceptedStems, acceptedPhrases);
        numericLabelFilter.filter(context, acceptedStems, acceptedPhrases);
        stopLabelFilter.filter(context, acceptedStems, acceptedPhrases);
        completeLabelFilter.filter(context, acceptedStems, acceptedPhrases);

        final IntArrayList acceptedFeatures = new IntArrayList(acceptedStems.length
            + acceptedPhrases.length);

        final int [] mostFrequentOriginalWordIndex = context.allStems.mostFrequentOriginalWordIndex;
        for (int i = 0; i < acceptedStems.length; i++)
        {
            if (acceptedStems[i])
            {
                acceptedFeatures.add(mostFrequentOriginalWordIndex[i]);
            }
        }

        for (int i = 0; i < acceptedPhrases.length; i++)
        {
            if (acceptedPhrases[i])
            {
                acceptedFeatures.add(i + wordCount);
            }
        }

        context.allLabels.featureIndex = acceptedFeatures.toArray();
        updateFirstPhraseIndex(context);
    }
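
The remaining examples come from a natural-language parsing codebase. getSubTerminalIdList gathers the terminal ids of all sub-terminals under a constituent tree node: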

 
  public IntArrayList getSubTerminalIdList()
  {
    IntArrayList list = new IntArrayList();
   
    for (CTNode node : getSubTerminals())
      list.add(node.getTerminalId());
   
    return list;
  }
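
projectivize() makes a dependency tree projective: starting from the candidate node ids, it repeatedly finds the smallest non-projective arc and re-attaches that node to its grandparent with a non-projective label: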

 
  // --------------------------------- projectivize ---------------------------------
 
  public void projectivize()
  {
    IntArrayList ids = new IntArrayList();
    int i, size = size();
    DEPNode nonProj;
   
    // Collect candidate node ids; index 0 (the artificial root) is skipped.
    for (i=1; i<size; i++)
      ids.add(i);
   
    // Re-attach the smallest non-projective arc to its grandparent until
    // no non-projective arc remains.
    while ((nonProj = getSmallestNonProjectiveArc(ids)) != null)
      nonProj.setHead(nonProj.getHead().getHead(), DEPLib.DEP_NON_PROJ);
  }
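
This constructor of a training space keeps what appear to be class labels in an IntArrayList (a_ys), with feature vectors in a_xs and, when hasWeight is set, per-instance weight vectors in a_vs: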

  public AbstractTrainSpace(AbstractModel model, boolean hasWeight)
  {
    m_model  = model;
    b_weight = hasWeight;
    a_ys     = new IntArrayList();
    a_xs     = new ArrayList<int[]>();
    if (hasWeight)  a_vs = new ArrayList<double[]>();
  }
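
A micro-benchmark: one million rounds of creating a fresh IntArrayList, appending ten elements, and removing the last one. The declared IntDeque is unused here because the matching deque measurement falls outside this excerpt: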

 
  void measureTime()
  {
    int i, j, len = 10, size = 1000000;
    IntArrayList list;
    IntDeque deque;
    long st, et;
   
    st = System.currentTimeMillis();
   
    for (i=0; i<size; i++)
    {
      list = new IntArrayList();
     
      for (j=0; j<len; j++)
        list.add(j);
     
      list.remove(list.size()-1);
    }
   
    et = System.currentTimeMillis();
    System.out.println(et-st);
   
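
Finally, two arrays of id lists are merged element-wise: each groups[i] collects the contents of both lhs[i] and rhs[i] via addAll: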

   
    IntArrayList[] groups = new IntArrayList[size];
   
    for (i=0; i<size; i++)
    {
      groups[i] = new IntArrayList();
      groups[i].addAll(lhs[i]);
      groups[i].addAll(rhs[i]);
    }
   
    return groups;
