Package org.carrot2.util

Examples of org.carrot2.util.IntArrayPredicateIterator


      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
View Full Code Here


    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
View Full Code Here

        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(
                reducedVsmContext);

            matrixBuilder.buildTermDocumentMatrix(vsmContext);
View Full Code Here

        // Further processing only if there are words to process
        clusters = Lists.newArrayList();
        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(
                vsmContext);
            LingoProcessingContext lingoContext = new LingoProcessingContext(
                reducedVsmContext);
View Full Code Here

        /*
         * Recursively iterate through documents, fields and sentences. This can be
         * implemented a bit faster (without iterators), but I guess the overhead here is
         * minimal anyway.
         */
        final IntArrayPredicateIterator docIterator = new IntArrayPredicateIterator(
            context.allTokens.type, 0, context.allTokens.type.length - 1,
            ON_DOCUMENT_SEPARATOR);

        while (docIterator.hasNext())
        {
            final int docStart = docIterator.next();
            final int docLength = docIterator.getLength();

            document(context, docStart, docLength);
        }
    }
View Full Code Here

    /**
     * Invoked for each document. Splits further into fields.
     */
    protected void document(PreprocessingContext context, int start, int length)
    {
        final IntArrayPredicateIterator fieldIterator = new IntArrayPredicateIterator(
            context.allTokens.type, start, length,
            ON_FIELD_SEPARATOR);

        while (fieldIterator.hasNext())
        {
            final int fieldStart = fieldIterator.next();
            final int fieldLength = fieldIterator.getLength();

            field(context, fieldStart, fieldLength);
        }
    }
View Full Code Here

    /**
     * Invoked for each document's field. Splits further into sentences.
     */
    protected void field(PreprocessingContext context, int start, int length)
    {
        final IntArrayPredicateIterator sentenceIterator = new IntArrayPredicateIterator(
            context.allTokens.type, start, length,
            ON_SENTENCE_SEPARATOR);

        while (sentenceIterator.hasNext())
        {
            final int sentenceStart = sentenceIterator.next();
            final int sentenceLength = sentenceIterator.getLength();

            sentence(context, sentenceStart, sentenceLength);
        }
    }
View Full Code Here

          }
        },
       
        // Using the class loader directly because this time we want to omit the prefix
        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
   
    this.controller.init(initAttributes);
    this.idFieldName = core.getSchema().getUniqueKeyField().getName();

    // Make sure the requested Carrot2 clustering algorithm class is available
View Full Code Here

    @Override
    protected IResource getXsltResource()
    {
        if (solrXsltAdapter == null) {
            return new ClassResource(SolrDocumentSource.class, "solr-to-c2.xsl");
        } else {
            return solrXsltAdapter;
        }
    }
View Full Code Here

                    .defaultLanguage(LanguageCode.ENGLISH);


                    File resourcesDir = new File(environment.configFile(), "carrot2/resources");

                    ResourceLookup resourceLookup = new ResourceLookup(new DirLocator(resourcesDir));

                    DefaultLexicalDataFactoryDescriptor.attributeBuilder(attributes)
                    .mergeResources(true);
                    LexicalDataLoaderDescriptor.attributeBuilder(attributes)
                    .resourceLookup(resourceLookup);
View Full Code Here

TOP

Related Classes of org.carrot2.util.IntArrayPredicateIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.