Examples of org.carrot2.text.util.MutableCharArray

org.carrot2.text.util.MutableCharArray
Implements {@link CharSequence} over a mutable char[] buffer.
This class implements proper content-based {@link #hashCode()} and {@link #equals(Object)} against other {@link MutableCharArray} objects, assuming the underlying character buffer does not change. If the buffer is changed, the resulting behavior is unpredictable.


    private final MutableCharArray tempCharSequence;


    public ThaiTokenizerAdapter()
    {
        this.tempCharSequence = new MutableCharArray(new char [0]);
        if (!platformSupportsThai()) {
            throw new RuntimeException("Thai segmentation not supported on this platform.");
        }
    }

View Full Code Here

                buffer[i] = word.charAt(i);
            snowballStemmer.setCurrent(buffer, len);


            if (snowballStemmer.stem())
            {
                return new MutableCharArray(Arrays.copyOf(
                    snowballStemmer.getCurrentBuffer(), snowballStemmer.getCurrentBufferLength()));
            }
            else
            {
                return null;

View Full Code Here


    private final MutableCharArray tempCharSequence;


    public ChineseTokenizerAdapter()
    {
        this.tempCharSequence = new MutableCharArray(new char [0]);
        this.sentenceTokenizer = new SentenceTokenizer(new StringReader(""));
    }

View Full Code Here

        len = hindiNormalizer.normalize(buffer, len);
        len = hindiStemmer.stem(buffer, len);


        if (!equals(word, buffer, len))
        {
            return new MutableCharArray(Arrays.copyOf(buffer, len));
        }
        else
        {
            return word;
        }

View Full Code Here

        fieldIndices = new ByteArrayList();


        final Iterator<Document> docIterator = documents.iterator();
        int documentIndex = 0;
        final ITokenizer ts = context.language.getTokenizer();
        final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);


        while (docIterator.hasNext())
        {
            final Document doc = docIterator.next();

View Full Code Here

        }
        else
        {
            final char [] tokenImage = new char [chs.length()];
            System.arraycopy(chs.getBuffer(), chs.getStart(), tokenImage, 0, chs.length());
            tokenCache.add(new MutableCharArray(tokenImage));
            return tokenImage;
        }
    }

View Full Code Here

        final IStemmer stemmer = context.language.getStemmer();


        final char [][] wordImages = context.allWords.image;
        final char [][] stemImages = new char [wordImages.length] [];


        final MutableCharArray mutableCharArray = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);
        char [] buffer = new char [128];


        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];


            final boolean different = CharArrayUtils.toLowerCase(word, buffer);


            mutableCharArray.reset(buffer, 0, word.length);
            final CharSequence stemmed = stemmer.stem(mutableCharArray);
            if (stemmed != null)
            {
                mutableCharArray.reset(stemmed);
                stemImages[i] = context.intern(mutableCharArray);
            }
            else
            {
                // We need to put the original word here, otherwise, we wouldn't be able

View Full Code Here

        stemTfsByDocument.add(wordTfByDocumentArray[stemImagesOrder[0]]);
        byte fieldIndices = 0;
        fieldIndices |= wordsFieldIndices[0];


        // For locating query words
        final MutableCharArray buffer = new MutableCharArray(
            wordStemImages[stemImagesOrder[0]]);
        boolean inQuery = queryStems.contains(buffer);


        // Go through all words in the order of stem images
        for (int i = 0; i < stemImagesOrder.length - 1; i++)
        {
            final int orderIndex = stemImagesOrder[i];
            final char [] stem = wordStemImages[orderIndex];
            final int nextInOrderIndex = stemImagesOrder[i + 1];
            final char [] nextStem = wordStemImages[nextInOrderIndex];


            stemIndexesArray[orderIndex] = stemIndex;
            if (inQuery)
            {
                wordsType[orderIndex] |= ITokenizer.TF_QUERY_WORD;
            }


            // Now check if token image is changing
            final boolean sameStem = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(stem, nextStem) == 0;


            if (sameStem)
            {
                totalTf += wordTfArray[nextInOrderIndex];
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];
                if (mostFrequentWordFrequency < wordTfArray[nextInOrderIndex])
                {
                    mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                    mostFrequentWordIndex = nextInOrderIndex;
                }
            }
            else
            {
                stemImages.add(stem);
                stemTf.add(totalTf);
                stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
                storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
                fieldIndexList.add(fieldIndices);


                stemIndex++;
                totalTf = wordTfArray[nextInOrderIndex];
                mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                mostFrequentWordIndex = nextInOrderIndex;
                fieldIndices = 0;
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];


                stemTfsByDocument.clear();
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);


                buffer.reset(wordStemImages[nextInOrderIndex]);
                inQuery = queryStems.contains(buffer);
            }
        }


        // Store tf for the last stem in the array

View Full Code Here

            for (int i = 0; i < split.length; i++)
            {
                final CharSequence stem = stemmer.stem(split[i]);
                if (stem != null)
                {
                    queryWords.add(new MutableCharArray(stem));
                }
                else
                {
                    queryWords.add(new MutableCharArray(split[i]));
                }
            }
        }


        return queryWords;

View Full Code Here


            final ProcessingResult result = ctrl1.process(
                Collections.<String, Object> emptyMap(), TestComponent.class);


            data1 = result.getAttribute("english");
            assertTrue(data1.isCommonWord(new MutableCharArray("uniquea")));
        }


        // Create another pooling controller, same folder, but different resource lookup.
        final Controller ctrl2 = ControllerFactory.createPooling();
        final ILexicalData data2;
        {
            ctrl2.init(ImmutableMap.<String, Object> of(
                resourceLookupKey, 
                new ResourceLookup(new DirLocator(tempDir1), classpathLocator)));


            final ProcessingResult result = ctrl2.process(
                Collections.<String, Object> emptyMap(), TestComponent.class);


            data2 = result.getAttribute("english");
            assertTrue(data2.isCommonWord(new MutableCharArray("uniquea")));


            assertSame(data1, data2);
        }


        /*
         * Now force reloading of resources from that path on ctrl1. The new stop word resource
         * should contain 'uniqueb'.
         */
        FileUtils.writeStringToFile(new File(tempDir1, "stopwords.en"), "uniqueb");


        final ILexicalData data3 = ctrl1.process(
            ImmutableMap.<String, Object> of(reloadResourcesKey, true), TestComponent.class)
                .getAttribute("english");


        assertNotSame(data1, data3);
        assertFalse(data3.isCommonWord(new MutableCharArray("uniquea")));
        assertTrue(data3.isCommonWord(new MutableCharArray("uniqueb")));


        /*
         * But since it's the same location, all other controllers should now see updated resources
         * (and share the same lexical data).
         */

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.carrot2.text.util.MutableCharArray

com.carrotsearch.hppc.IntArrayList

com.carrotsearch.hppc.IntIntOpenHashMap

com.carrotsearch.hppc.IntStack

com.tamingtext.carrot2.Carrot2ExampleTest

org.apache.http.message.BasicNameValuePair

org.apache.lucene.search.IndexSearcher

org.apache.mahout.math.matrix.DoubleMatrix2D

org.carrot2.cli.batch.BatchApp

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm

org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithmTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.