Examples of org.carrot2.text.util.MutableCharArray

org.carrot2.text.util.MutableCharArray
Implements {@link CharSequence} over a mutable char[] buffer.
This class implements proper content-based {@link #hashCode()} and {@link #equals(Object)} against other {@link MutableCharArray} objects, assuming the underlying character buffer does not change. If the buffer is changed, the resulting behavior is unpredictable.

       */
    case ARABIC:
      // Intentional fall-through.


    default:
      return new ExtendedWhitespaceTokenizer();
    }
  }

View Full Code Here


    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        return new ExtendedWhitespaceTokenizer();
      }
    }

View Full Code Here

    }
    return solrStopWords.get(fieldName);
  }


  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);


    return new ILexicalData() {
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }


      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }

View Full Code Here

      return;
    }


    // Test with Maltese so that the English clustering performed in other tests
    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);


    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }

View Full Code Here

     * one <code>language</code>.
     */
    private void cluster(LanguageCode language)
    {
        // Preprocessing of documents
        final PreprocessingContext context = preprocessingPipeline.preprocess(documents,
            query, language);


        // Further processing only if there are words to process
        clusters = Lists.newArrayList();
        if (context.hasLabels())
        {
            // Term-document matrix building and reduction
            final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
                context);
            final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(

View Full Code Here


      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;


      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);


        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);

View Full Code Here

    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);


    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }

View Full Code Here

    public void mark(PreprocessingContext context)
    {
        final char [][] wordImages = context.allWords.image;
        final short [] types = context.allWords.type;


        final MutableCharArray mutableCharArray = new MutableCharArray("");
        char [] buffer = new char [128];
        final ILexicalData lexData = context.language.getLexicalData();


        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];


            CharArrayUtils.toLowerCase(word, buffer);
            mutableCharArray.reset(buffer, 0, word.length);
            if (lexData.isCommonWord(mutableCharArray))
            {
                types[i] |= ITokenizer.TF_COMMON_WORD;
            }
        }

View Full Code Here


        for (String entry : input)
        {
            char [] chars = entry.toCharArray();
            CharArrayUtils.toLowerCaseInPlace(chars);
            cloned.add(new MutableCharArray(chars));
        }


        return cloned;
    }

View Full Code Here

        int newLen = normalizer.normalize(buffer, word.length());
        newLen = delegate.stem(buffer, newLen);


        if (newLen != word.length() || !equals(buffer, newLen, word))
        {
            return new MutableCharArray(Arrays.copyOf(buffer, newLen));
        }


        // Same-same.
        return null;
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.carrot2.text.util.MutableCharArray

com.carrotsearch.hppc.IntArrayList

com.carrotsearch.hppc.IntIntOpenHashMap

com.carrotsearch.hppc.IntStack

com.tamingtext.carrot2.Carrot2ExampleTest

org.apache.http.message.BasicNameValuePair

org.apache.lucene.search.IndexSearcher

org.apache.mahout.math.matrix.DoubleMatrix2D

org.carrot2.cli.batch.BatchApp

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithmTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.