Package org.carrot2.text.util

Examples of org.carrot2.text.util.MutableCharArray


    private final MutableCharArray tempCharSequence;

    public ThaiTokenizerAdapter()
    {
        this.tempCharSequence = new MutableCharArray(new char [0]);
        if (!platformSupportsThai()) {
            throw new RuntimeException("Thai segmentation not supported on this platform.");
        }
    }
View Full Code Here


                buffer[i] = word.charAt(i);
            snowballStemmer.setCurrent(buffer, len);

            if (snowballStemmer.stem())
            {
                return new MutableCharArray(Arrays.copyOf(
                    snowballStemmer.getCurrentBuffer(), snowballStemmer.getCurrentBufferLength()));
            }
            else
            {
                return null;
View Full Code Here

    private final MutableCharArray tempCharSequence;

    public ChineseTokenizerAdapter()
    {
        this.tempCharSequence = new MutableCharArray(new char [0]);
        this.sentenceTokenizer = new SentenceTokenizer(new StringReader(""));
    }
View Full Code Here

        len = hindiNormalizer.normalize(buffer, len);
        len = hindiStemmer.stem(buffer, len);

        if (!equals(word, buffer, len))
        {
            return new MutableCharArray(Arrays.copyOf(buffer, len));
        }
        else
        {
            return word;
        }
View Full Code Here

        fieldIndices = new ByteArrayList();

        final Iterator<Document> docIterator = documents.iterator();
        int documentIndex = 0;
        final ITokenizer ts = context.language.getTokenizer();
        final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);

        while (docIterator.hasNext())
        {
            final Document doc = docIterator.next();
View Full Code Here

        }
        else
        {
            final char [] tokenImage = new char [chs.length()];
            System.arraycopy(chs.getBuffer(), chs.getStart(), tokenImage, 0, chs.length());
            tokenCache.add(new MutableCharArray(tokenImage));
            return tokenImage;
        }
    }
View Full Code Here

        final IStemmer stemmer = context.language.getStemmer();

        final char [][] wordImages = context.allWords.image;
        final char [][] stemImages = new char [wordImages.length] [];

        final MutableCharArray mutableCharArray = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);
        char [] buffer = new char [128];

        for (int i = 0; i < wordImages.length; i++)
        {
            final char [] word = wordImages[i];
            if (buffer.length < word.length) buffer = new char [word.length];

            final boolean different = CharArrayUtils.toLowerCase(word, buffer);

            mutableCharArray.reset(buffer, 0, word.length);
            final CharSequence stemmed = stemmer.stem(mutableCharArray);
            if (stemmed != null)
            {
                mutableCharArray.reset(stemmed);
                stemImages[i] = context.intern(mutableCharArray);
            }
            else
            {
                // We need to put the original word here, otherwise, we wouldn't be able
View Full Code Here

        stemTfsByDocument.add(wordTfByDocumentArray[stemImagesOrder[0]]);
        byte fieldIndices = 0;
        fieldIndices |= wordsFieldIndices[0];

        // For locating query words
        final MutableCharArray buffer = new MutableCharArray(
            wordStemImages[stemImagesOrder[0]]);
        boolean inQuery = queryStems.contains(buffer);

        // Go through all words in the order of stem images
        for (int i = 0; i < stemImagesOrder.length - 1; i++)
        {
            final int orderIndex = stemImagesOrder[i];
            final char [] stem = wordStemImages[orderIndex];
            final int nextInOrderIndex = stemImagesOrder[i + 1];
            final char [] nextStem = wordStemImages[nextInOrderIndex];

            stemIndexesArray[orderIndex] = stemIndex;
            if (inQuery)
            {
                wordsType[orderIndex] |= ITokenizer.TF_QUERY_WORD;
            }

            // Now check if token image is changing
            final boolean sameStem = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
                .compare(stem, nextStem) == 0;

            if (sameStem)
            {
                totalTf += wordTfArray[nextInOrderIndex];
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];
                if (mostFrequentWordFrequency < wordTfArray[nextInOrderIndex])
                {
                    mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                    mostFrequentWordIndex = nextInOrderIndex;
                }
            }
            else
            {
                stemImages.add(stem);
                stemTf.add(totalTf);
                stemMostFrequentWordIndexes.add(mostFrequentWordIndex);
                storeTfByDocument(stemTfByDocumentList, stemTfsByDocument);
                fieldIndexList.add(fieldIndices);

                stemIndex++;
                totalTf = wordTfArray[nextInOrderIndex];
                mostFrequentWordFrequency = wordTfArray[nextInOrderIndex];
                mostFrequentWordIndex = nextInOrderIndex;
                fieldIndices = 0;
                fieldIndices |= wordsFieldIndices[nextInOrderIndex];

                stemTfsByDocument.clear();
                stemTfsByDocument.add(wordTfByDocumentArray[nextInOrderIndex]);

                buffer.reset(wordStemImages[nextInOrderIndex]);
                inQuery = queryStems.contains(buffer);
            }
        }

        // Store tf for the last stem in the array
View Full Code Here

            for (int i = 0; i < split.length; i++)
            {
                final CharSequence stem = stemmer.stem(split[i]);
                if (stem != null)
                {
                    queryWords.add(new MutableCharArray(stem));
                }
                else
                {
                    queryWords.add(new MutableCharArray(split[i]));
                }
            }
        }

        return queryWords;
View Full Code Here

            final ProcessingResult result = ctrl1.process(
                Collections.<String, Object> emptyMap(), TestComponent.class);

            data1 = result.getAttribute("english");
            assertTrue(data1.isCommonWord(new MutableCharArray("uniquea")));
        }

        // Create another pooling controller, same folder, but different resource lookup.
        final Controller ctrl2 = ControllerFactory.createPooling();
        final ILexicalData data2;
        {
            ctrl2.init(ImmutableMap.<String, Object> of(
                resourceLookupKey,
                new ResourceLookup(new DirLocator(tempDir1), classpathLocator)));

            final ProcessingResult result = ctrl2.process(
                Collections.<String, Object> emptyMap(), TestComponent.class);

            data2 = result.getAttribute("english");
            assertTrue(data2.isCommonWord(new MutableCharArray("uniquea")));

            assertSame(data1, data2);
        }

        /*
         * Now force reloading of resources from that path on ctrl1. The new stop word resource
         * should contain 'uniqueb'.
         */
        FileUtils.writeStringToFile(new File(tempDir1, "stopwords.en"), "uniqueb");

        final ILexicalData data3 = ctrl1.process(
            ImmutableMap.<String, Object> of(reloadResourcesKey, true), TestComponent.class)
                .getAttribute("english");

        assertNotSame(data1, data3);
        assertFalse(data3.isCommonWord(new MutableCharArray("uniquea")));
        assertTrue(data3.isCommonWord(new MutableCharArray("uniqueb")));

        /*
         * But since it's the same location, all other controllers should now see updated resources
         * (and share the same lexical data).
         */
 
View Full Code Here

TOP

Related Classes of org.carrot2.text.util.MutableCharArray

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.