Examples of org.carrot2.text.preprocessing.PreprocessingContext.AllPhrases

Package org.carrot2.text.preprocessing.PreprocessingContext

Examples of org.carrot2.text.preprocessing.PreprocessingContext.AllPhrases

org.carrot2.util.resource.DirLocator
Represents an occurrence of a {@link SpriteReferenceDirective} in a specific CSS file.

    /**
     * Discovers labels for clusters.
     */
    void buildLabels(LingoProcessingContext context, ITermWeighting termWeighting)
    {
        final PreprocessingContext preprocessingContext = context.preprocessingContext;
        final VectorSpaceModelContext vsmContext = context.vsmContext;
        final DoubleMatrix2D reducedTdMatrix = context.reducedVsmContext.baseMatrix;
        final int [] wordsStemIndex = preprocessingContext.allWords.stemIndex;
        final int [] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
        final int [] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex;

View Full Code Here

     * Make sure term frequencies and per-document term frequencies of phrases are correct.
     */
    public void phraseTfsCorrect()
    {
        // for each discovered phrase, do manual count and verify if tf and tfByDocument are correct.
        AllPhrases allPhrases = context.allPhrases;
        for (int index = 0; index < allPhrases.size(); index++)
        {
            IntIntOpenHashMap realTfByDocuments = countManually(context, allPhrases.wordIndices[index]);
            final int realTf = realTfByDocuments.forEach(new IntIntProcedure()
            {
                int tf;
                public void apply(int key, int value)
                {
                    tf += value;
                }
            }).tf;


            Assertions.assertThat(allPhrases.tf[index]).as("Phrase: " + allPhrases.getPhrase(index))
                .isEqualTo(realTf);
            
            // Phrase extractor does not sort the byDocumentTf, so we need to addAllFromFlattened
            // to a map and then flatten with sorting.
            Assertions
                .assertThat(
                    IntMapUtils.flattenSortedByKey(IntMapUtils.addAllFromFlattened(
                        new IntIntOpenHashMap(), allPhrases.tfByDocument[index])))
                .as("Phrase: " + allPhrases.getPhrase(index))
                .isEqualTo(IntMapUtils.flattenSortedByKey(realTfByDocuments));
        }
    }

View Full Code Here


      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;


      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);


        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);

View Full Code Here

    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);


    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }

View Full Code Here

        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(
                reducedVsmContext);


            matrixBuilder.buildTermDocumentMatrix(vsmContext);

View Full Code Here

        // Further processing only if there are words to process
        clusters = Lists.newArrayList();
        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(
                reducedVsmContext);

View Full Code Here

     * Discovers labels for clusters.
     */
    void buildLabels(LingoProcessingContext context, ITermWeighting termWeighting)
    {
        final PreprocessingContext preprocessingContext = context.preprocessingContext;
        final VectorSpaceModelContext vsmContext = context.vsmContext;
        final DoubleMatrix2D reducedTdMatrix = context.reducedVsmContext.baseMatrix;
        final int [] wordsStemIndex = preprocessingContext.allWords.stemIndex;
        final int [] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
        final int [] mostFrequentOriginalWordIndex = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
        final int [][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;

View Full Code Here


          }
        },
        
        // Using the class loader directly because this time we want to omit the prefix 
        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
    
    this.controller.init(initAttributes);
    this.idFieldName = core.getSchema().getUniqueKeyField().getName();


    // Make sure the requested Carrot2 clustering algorithm class is available

View Full Code Here


    @Override
    protected IResource getXsltResource()
    {
        if (solrXsltAdapter == null) {
            return new ClassResource(SolrDocumentSource.class, "solr-to-c2.xsl");
        } else {
            return solrXsltAdapter; 
        }
    }

View Full Code Here

                    .defaultLanguage(LanguageCode.ENGLISH);




                    File resourcesDir = new File(environment.configFile(), "carrot2/resources");


                    ResourceLookup resourceLookup = new ResourceLookup(new DirLocator(resourcesDir));


                    DefaultLexicalDataFactoryDescriptor.attributeBuilder(attributes)
                    .mergeResources(true);
                    LexicalDataLoaderDescriptor.attributeBuilder(attributes)
                    .resourceLookup(resourceLookup);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.carrot2.text.preprocessing.PreprocessingContext.AllPhrases

com.carrotsearch.hppc.BitSet

com.carrotsearch.hppc.IntArrayList

com.carrotsearch.hppc.IntIntOpenHashMap

com.carrotsearch.hppc.IntStack

com.tamingtext.carrot2.Carrot2ExampleTest

org.apache.http.message.BasicNameValuePair

org.apache.lucene.search.IndexSearcher

org.apache.mahout.math.matrix.DoubleMatrix2D

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithmTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.