Examples of org.carrot2.text.preprocessing.PreprocessingContext$AllStems

org.carrot2.util.resource.ClassLoaderLocator
Information about all unique stems found in the input {@link PreprocessingContext#documents}. Each entry in each array corresponds to one base form different words can be transformed to by the {@link IStemmer} used while processing. E.g. the English mining and mine will be aggregated to one entry in the arrays, while they will have separate entries in {@link AllWords}.
All arrays in this class have the same length and values across different arrays correspond to each other for the same index.

     * Builds a term document matrix from data provided in the <code>context</code>,
     * stores the result in there.
     */
    public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext)
    {
        final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;


        final int documentCount = preprocessingContext.documents.size();
        final int [] stemsTf = preprocessingContext.allStems.tf;
        final int [][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        final byte [] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

View Full Code Here

     * the processing context contains no phrases,
     * {@link VectorSpaceModelContext#termPhraseMatrix} will remain <code>null</code>.
     */
    public void buildTermPhraseMatrix(VectorSpaceModelContext context)
    {
        final PreprocessingContext preprocessingContext = context.preprocessingContext;
        final IntIntOpenHashMap stemToRowIndex = context.stemToRowIndex;
        final int [] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
        final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;


        if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0)

View Full Code Here

        }


        final DoubleMatrix2D phraseMatrix = new SparseDoubleMatrix2D(stemToRowIndex
            .size(), featureIndex.length);


        final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
        final int [] wordsStemIndex = preprocessingContext.allWords.stemIndex;
        final int [] stemsTf = preprocessingContext.allStems.tf;
        final int [][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        final int [][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
        final int documentCount = preprocessingContext.documents.size();

View Full Code Here

     * Perform clustering for a given language.
     */
    protected void cluster(LanguageCode language)
    {
        // Preprocessing of documents
        final PreprocessingContext preprocessingContext = 
            preprocessingPipeline.preprocess(documents, null, language);


        // Add trivial AllLabels so that we can reuse the common TD matrix builder
        final int [] stemsMfow = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
        final short [] wordsType = preprocessingContext.allWords.type;
        final IntArrayList featureIndices = new IntArrayList(stemsMfow.length);
        for (int i = 0; i < stemsMfow.length; i++)
        {
            final short flag = wordsType[stemsMfow[i]];
            if ((flag & (ITokenizer.TF_COMMON_WORD | ITokenizer.TF_QUERY_WORD | ITokenizer.TT_NUMERIC)) == 0)
            {
                featureIndices.add(stemsMfow[i]);
            }
        }
        preprocessingContext.allLabels.featureIndex = featureIndices.toArray();
        preprocessingContext.allLabels.firstPhraseIndex = -1;


        // Further processing only if there are words to process
        clusters = Lists.newArrayList();
        if (preprocessingContext.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                preprocessingContext);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(

View Full Code Here

    @Test
    public void testCarrot905()
    {
        createDocuments("", "aa . bb", "", "bb . cc", "", "aa . cc . cc");


        PreprocessingContext context = preprocessingPipeline.preprocess(
            this.context.documents, 
            this.context.query, 
            this.context.language.getLanguageCode());


        // The preprocessing pipeline will produce increasing indices in tfByDocument,

View Full Code Here


      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;


      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);


        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);

View Full Code Here

    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);


    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }

View Full Code Here

        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(
                reducedVsmContext);


            matrixBuilder.buildTermDocumentMatrix(vsmContext);

View Full Code Here

        // Further processing only if there are words to process
        clusters = Lists.newArrayList();
        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(
                reducedVsmContext);

View Full Code Here


          }
        },
        
        // Using the class loader directly because this time we want to omit the prefix 
        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
    
    this.controller.init(initAttributes);
    this.idFieldName = core.getSchema().getUniqueKeyField().getName();


    // Make sure the requested Carrot2 clustering algorithm class is available

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.carrot2.text.preprocessing.PreprocessingContext$AllStems

com.carrotsearch.hppc.BitSet

com.carrotsearch.hppc.IntArrayList

com.carrotsearch.hppc.IntIntOpenHashMap

com.carrotsearch.hppc.IntStack

com.tamingtext.carrot2.Carrot2ExampleTest

org.apache.http.message.BasicNameValuePair

org.apache.lucene.search.IndexSearcher

org.apache.mahout.math.matrix.DoubleMatrix2D

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithmTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.