/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.text.preprocessing;

import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.preprocessing.PreprocessingContext.AllPhrases;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.IntMapUtils;
import org.fest.assertions.Assertions;
import org.fest.util.Strings;

import com.carrotsearch.hppc.IntIntOpenHashMap;
import com.carrotsearch.hppc.procedures.IntIntProcedure;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;

/**
* Fest-style assertions on the content of {@link PreprocessingContext}.
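 * <p>
 * A hypothetical usage sketch (assuming {@code ctx} is a fully preprocessed
 * {@link PreprocessingContext}):
 *
 * <pre>
 * PreprocessingContextAssert.assertThat(ctx)
 *     .containsWord("mining").withTf(3).withDocumentTf(0, 2);
 * PreprocessingContextAssert.assertThat(ctx)
 *     .containsPhrase("data", "mining").withTf(2);
 * </pre>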
*/
class PreprocessingContextAssert
{
    /** missing word constant. */
    public final static String MW = "<MW>";
    /** document separator constant. */
    public final static String DS = "<DS>";
    /** field separator constant. */
    public final static String FS = "<FS>";
    /** end of stream constant. */
    public final static String EOS = "<EOS>";

    final PreprocessingContext context;

    final class PreprocessingContextPhraseAssert
    {
        private int phraseIndex;
       
        PreprocessingContextPhraseAssert(int index)
        {
            assert index >= 0;
            this.phraseIndex = index;
        }

        public PreprocessingContextPhraseAssert withDocumentTf(int documentIndex, int expectedTf)
        {
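            // tfByDocument stores flat (documentIndex, tf) pairs: even positions
            // hold document indices, odd positions the corresponding frequencies.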
            int [] byDocTf = context.allPhrases.tfByDocument[phraseIndex];
            for (int i = 0; i < byDocTf.length; i += 2)
            {
                if (byDocTf[i] == documentIndex) {
                    Assertions.assertThat(byDocTf[i + 1]).isEqualTo(expectedTf);
                    return this;
                }
            }

            org.junit.Assert.fail("No document " + documentIndex + " for this phrase: "
                + context.allPhrases.getPhrase(phraseIndex) + "\n" + context.allPhrases);
            return this;
        }

        /**
         * Asserts an exact mapping of document-tf pairs (the number of mappings and
         * their values, regardless of order).
         */
        public PreprocessingContextPhraseAssert withExactDocumentTfs(int [][] docTfPairs)
        {
            for (int [] docTf : docTfPairs)
            {
                Assertions.assertThat(docTf.length).isEqualTo(2);
                withDocumentTf(docTf[0], docTf[1]);
            }

            Assertions.assertThat(context.allPhrases.tfByDocument[phraseIndex].length / 2)
                .describedAs("tfByDocument array size for phrase: '" + context.allPhrases.getPhrase(phraseIndex) + "'")
                .isEqualTo(docTfPairs.length);

            return this;
        }

        public PreprocessingContextPhraseAssert withTf(int expectedTf)
        {
            Assertions.assertThat(context.allPhrases.tf[phraseIndex])
                .describedAs("tf different for phrase '" + context.allPhrases.getPhrase(phraseIndex) + "'")
                .isEqualTo(expectedTf);
            return this;
        }
    }

    PreprocessingContextAssert(PreprocessingContext context)
    {
        this.context = context;
    }

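    /**
     * Return a list of random-ordered word images.
     */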
    public List<String> wordImages()
    {
        Assertions.assertThat(context.allWords.image)
            .describedAs("the context's allWords is not properly initialized.").isNotNull();

        List<String> result = Lists.newArrayList();
        for (int i = context.allWords.image.length; --i >= 0;)
        {
            result.add(new String(context.allWords.image[i]));
        }
        Collections.shuffle(result);
        return result;
    }

    /**
     * Return a list of random-ordered, space-separated phrase images.
     */
    public List<String> phraseImages()
    {
        Assertions.assertThat(context.allPhrases.wordIndices)
            .describedAs("the context's allPhrases is not properly initialized.").isNotNull();

        List<String> result = Lists.newArrayList();
        for (int i = context.allPhrases.wordIndices.length; --i >= 0;)
        {
            result.add(context.allPhrases.getPhrase(i).toString());
        }
        Collections.shuffle(result);
        return result;
    }

    /** Assert the context contains a phrase consisting of these exact images. */
    public PreprocessingContextPhraseAssert containsPhrase(List<String> processedTermImages)
    {
        return containsPhrase(processedTermImages.toArray(
            new String [processedTermImages.size()]));
    }

    /** Assert the context contains a phrase consisting of these exact images. */
    public PreprocessingContextPhraseAssert containsPhrase(String... processedTermImages)
    {
        Assertions.assertThat(processedTermImages).isNotEmpty();
        Assertions.assertThat(context.allPhrases.wordIndices)
            .describedAs("the context's allPhrases is not properly initialized.").isNotNull();

        // Naive scan over the set of extracted phrases.
        final String phraseImage = Strings.join(processedTermImages).with(" ");
        int foundAt = -1;
        for (int i = context.allPhrases.wordIndices.length; --i >= 0;)
        {
            if (phraseImage.equals(context.allPhrases.getPhrase(i).toString()))
            {
                if (foundAt >= 0) org.junit.Assert.fail("More than one phrase with an identical image '"
                    + phraseImage + "'?\n\n" + context.allPhrases);
                foundAt = i;
            }
        }

        if (foundAt < 0)
            org.junit.Assert.fail("No phrase '" + phraseImage + "' in allPhrases:\n" + context.allPhrases);

        return new PreprocessingContextPhraseAssert(foundAt);
    }

    /**
     * Looks up a phrase that matches the given list of stem images. The stem images
     * are preprocessed in this method: underscore ("_") characters are removed, which
     * allows clearer test input in conjunction with {@link TestStemmerFactory}.
     */
    public PreprocessingContextPhraseAssert containsPhraseStemmedAs(String... stemImages)
    {
        Assertions.assertThat(stemImages).isNotEmpty();
        Assertions.assertThat(context.allPhrases.wordIndices)
            .describedAs("the context's allPhrases is not properly initialized.").isNotNull();

        // Work on a copy so the caller's array is not mutated.
        stemImages = stemImages.clone();
        for (int i = 0; i < stemImages.length; i++)
            stemImages[i] = stemImages[i].replaceAll("_", "");

        // Naive scan over the set of extracted phrases.
        Comparator<char[]> comp = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR;
        int foundAt = -1;
nextPhrase:
        for (int i = context.allPhrases.wordIndices.length; --i >= 0;)
        {
            int [] wordIdxs = context.allPhrases.wordIndices[i];

            if (wordIdxs.length == stemImages.length)
            {
                for (int j = 0; j < wordIdxs.length; j++)
                {
                    if (comp.compare(
                        context.allStems.image[context.allWords.stemIndex[wordIdxs[j]]],
                        stemImages[j].toCharArray()) != 0)
                    {
                        continue nextPhrase;
                    }
                }

                if (foundAt >= 0)
                {
                    org.junit.Assert.fail("More than one phrase corresponds to stem sequence '" +
                        Arrays.toString(stemImages) + "':\n" + context.allPhrases);
                }
                foundAt = i;
            }
        }

        if (foundAt < 0)
            org.junit.Assert.fail("No phrase corresponding to stem sequence '" +
                Arrays.toString(stemImages) + "' in allPhrases:\n" + context.allPhrases);

        return new PreprocessingContextPhraseAssert(foundAt);
    }

    public static PreprocessingContextAssert assertThat(PreprocessingContext context)
    {
        return new PreprocessingContextAssert(context);
    }
   
    public static List<TokenEntry> tokens(PreprocessingContext context)
    {
        return new PreprocessingContextAssert(context).tokens();
    }

    final class StemAssert
    {
        private final int stemIndex;
        private final String stemImage;

        public StemAssert(int stemIndex)
        {
            this.stemIndex = stemIndex;
            this.stemImage = new String(context.allStems.image[stemIndex]);
        }

        public StemAssert withTf(int expectedTf)
        {
            Assertions.assertThat(context.allStems.tf[stemIndex])
                .describedAs("tf different for stem " + stemImage)
                    .isEqualTo(expectedTf);
            return this;
        }

        public StemAssert withDocumentTf(int documentIndex, int expectedTf)
        {
            int [] byDocTf = context.allStems.tfByDocument[stemIndex];
            for (int i = 0; i < byDocTf.length; i += 2)
            {
                if (byDocTf[i] == documentIndex) {
                    Assertions.assertThat(byDocTf[i + 1]).isEqualTo(expectedTf);
                    return this;
                }
            }

            org.junit.Assert.fail("No document " + documentIndex + " for this stem: "
                + stemImage + "\n" + context.allPhrases);
            return this;
        }

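        /**
         * Asserts an exact mapping of document-tf pairs (the number of mappings and
         * their values, regardless of order).
         */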
        public StemAssert withExactDocumentTfs(int [][] docTfPairs)
        {
            for (int [] docTf : docTfPairs)
            {
                Assertions.assertThat(docTf.length).isEqualTo(2);
                withDocumentTf(docTf[0], docTf[1]);
            }

            Assertions.assertThat(context.allStems.tfByDocument[stemIndex].length / 2)
                .describedAs("tfByDocument array size for stem: '" + stemImage + "'")
                .isEqualTo(docTfPairs.length);

            return this;
        }

        public StemAssert withFieldIndices(int... expectedIndices)
        {
            int [] indices = PreprocessingContext.toFieldIndexes(context.allStems.fieldIndices[stemIndex]);
            Assertions.assertThat(indices).as("field indices of stem '" + stemImage + "'")
                .isEqualTo(expectedIndices);
            return this;
        }
    }

    StemAssert containsStem(String stemImage)
    {
        Assertions.assertThat(stemImage).isNotEmpty();
        Assertions.assertThat(context.allStems.image)
            .describedAs("the context's allStems is not properly initialized.").isNotNull();

        Comparator<char[]> comp = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR;
        int found = -1;
        for (int i = 0; i < context.allStems.image.length; i++)
        {
            if (comp.compare(context.allStems.image[i], stemImage.toCharArray()) == 0)
            {
                if (found >= 0)
                    org.junit.Assert.fail("Duplicate stem with image '" + stemImage + "' in stems:\n"
                        + context.allStems);
                found = i;
            }
        }
       
        if (found == -1)
            org.junit.Assert.fail("No stem with image '" + stemImage + "' in stems:\n"
                + context.allStems);
       
        return new StemAssert(found);
    }

    final class WordAssert
    {
        private final int wordIndex;
        private final String wordImage;

        public WordAssert(int wordIndex)
        {
            this.wordIndex = wordIndex;
            this.wordImage = new String(context.allWords.image[wordIndex]);
        }

        public WordAssert withTf(int expectedTf)
        {
            Assertions.assertThat(context.allWords.tf[wordIndex])
                .describedAs("tf different for word " + wordImage)
                    .isEqualTo(expectedTf);
            return this;
        }

        public WordAssert withDocumentTf(int documentIndex, int expectedTf)
        {
            int [] byDocTf = context.allWords.tfByDocument[wordIndex];
            for (int i = 0; i < byDocTf.length; i += 2)
            {
                if (byDocTf[i] == documentIndex) {
                    Assertions.assertThat(byDocTf[i + 1]).isEqualTo(expectedTf);
                    return this;
                }
            }

            org.junit.Assert.fail("No document " + documentIndex + " for this word: "
                + wordImage + "\n" + context.allPhrases);
            return this;
        }

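        /**
         * Asserts an exact mapping of document-tf pairs (the number of mappings and
         * their values, regardless of order).
         */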
        public WordAssert withExactDocumentTfs(int [][] docTfPairs)
        {
            for (int [] docTf : docTfPairs)
            {
                Assertions.assertThat(docTf.length).isEqualTo(2);
                withDocumentTf(docTf[0], docTf[1]);
            }

            Assertions.assertThat(context.allWords.tfByDocument[wordIndex].length / 2)
                .describedAs("tfByDocument array size for word: '" + wordImage + "'")
                .isEqualTo(docTfPairs.length);

            return this;
        }

        public WordAssert withFieldIndices(int... expectedIndices)
        {
            int [] indices = PreprocessingContext.toFieldIndexes(context.allWords.fieldIndices[wordIndex]);
            Assertions.assertThat(indices).as("field indices of word '" + wordImage + "'")
                .isEqualTo(expectedIndices);
            return this;
        }

        /** type masked to token type only. */
        public void withTokenType(int tokenType)
        {
            Assertions.assertThat(TokenTypeUtils.maskType(context.allWords.type[wordIndex]))
                .as("token type (masked) of word '" + wordImage + "'")
                .isEqualTo(tokenType);
        }

        /** raw value of token type field. */
        public void withExactTokenType(int tokenType)
        {
            Assertions.assertThat((int) context.allWords.type[wordIndex])
                .as("token type of word '" + wordImage + "'")
                .isEqualTo(tokenType);
        }
    }

    public WordAssert containsWord(String wordImage)
    {
        Assertions.assertThat(wordImage).isNotEmpty();
        Assertions.assertThat(context.allWords.image)
            .describedAs("the context's allWords is not properly initialized.").isNotNull();

        Comparator<char[]> comp = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR;
        int found = -1;
        for (int i = 0; i < context.allWords.image.length; i++)
        {
            if (comp.compare(context.allWords.image[i], wordImage.toCharArray()) == 0)
            {
                if (found >= 0)
                    org.junit.Assert.fail("Duplicate word with image '" + wordImage + "' in words:\n"
                        + context.allWords);
                found = i;
            }
        }
       
        if (found == -1)
            org.junit.Assert.fail("No word with image '" + wordImage + "' in words:\n"
                + context.allWords);

        return new WordAssert(found);
    }

    public final class TokenEntry
    {
        final int tokenIndex;
       
        TokenEntry(int tokenIndex)
        {
            this.tokenIndex = tokenIndex;
        }
       
        public String getTokenImage()
        {
            if (context.allTokens.image[tokenIndex] == null)
                return null;
            return new String(context.allTokens.image[tokenIndex]);
        }

        public String getWordImage()
        {
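            // Separator and terminator tokens have null images; render them
            // using the symbolic constants declared at the top of this class.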
            if (context.allTokens.image[tokenIndex] == null)
            {
                if (TokenTypeUtils.isDocumentSeparator(context.allTokens.type[tokenIndex]))
                    return DS;
                if (TokenTypeUtils.isFieldSeparator(context.allTokens.type[tokenIndex]))
                    return FS;
                if (TokenTypeUtils.isTerminator(context.allTokens.type[tokenIndex]))
                    return EOS;
                throw new RuntimeException("Token has a null image but is neither a separator"
                    + " nor a terminator, type: " + context.allTokens.type[tokenIndex]);
            }
            int wordIndex = context.allTokens.wordIndex[tokenIndex];
            if (wordIndex < 0)
                return MW;
            return new String(context.allWords.image[wordIndex]);
        }

        public String getStemImage()
        {
            if (getTokenImage() == null)
                return null;

            int wordIndex = context.allTokens.wordIndex[tokenIndex];
            int stemIndex = context.allWords.stemIndex[wordIndex];
            return new String(context.allStems.image[stemIndex]);
        }

        public Integer getWordType()
        {
            if (getTokenImage() == null)
                return null;

            return (int) context.allWords.type[context.allTokens.wordIndex[tokenIndex]];
        }
    }

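    /**
     * Return one {@link TokenEntry} for each token in {@link AllTokens}, in input order.
     */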
    public List<TokenEntry> tokens()
    {
        List<TokenEntry> result = Lists.newArrayList();
        for (int i = 0; i < context.allTokens.image.length; i++)
            result.add(new TokenEntry(i));
        return result;
    }

    final class TokenAssert
    {
        private final int tokenIndex;
        private final String tokenImage;

        public TokenAssert(int tokenIndex)
        {
            this.tokenIndex = tokenIndex;
            this.tokenImage = tokenIndex + ":"
                + (context.allTokens.image[tokenIndex] != null ? new String(context.allTokens.image[tokenIndex]) : "<null>");
        }

        /** type masked to token type only. */
        public TokenAssert hasTokenType(int tokenType)
        {
            Assertions.assertThat(TokenTypeUtils.maskType(context.allTokens.type[tokenIndex]))
                .as("token type (masked) of token '" + tokenImage + "'")
                .isEqualTo(tokenType);
            return this;
        }

        /** raw value of token type field. */
        public TokenAssert hasExactTokenType(int tokenType)
        {
            Assertions.assertThat((int) context.allTokens.type[tokenIndex])
                .as("token type of token '" + tokenImage + "'")
                .isEqualTo(tokenType);
            return this;
        }

        public TokenAssert hasImage(String image)
        {
            Assertions.assertThat(
                CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR.compare(
                    image != null ? image.toCharArray() : null,
                    context.allTokens.image[tokenIndex]) == 0)
                    .as("token image equality: " + image + " vs. " +
                    new String(Objects.firstNonNull(context.allTokens.image[tokenIndex], "<null>".toCharArray())))
                    .isTrue();
            return this;
        }

        public TokenAssert hasDocIndex(int expectedDocIndex)
        {
            Assertions.assertThat(context.allTokens.documentIndex[tokenIndex])
                .as("documentIndex")
                .isEqualTo(expectedDocIndex);
            return this;
        }

        public TokenAssert hasFieldIndex(int expectedFieldIndex)
        {
            Assertions.assertThat(context.allTokens.fieldIndex[tokenIndex])
                .as("fieldIndex")
                .isEqualTo((byte) expectedFieldIndex);
            return this;
        }
    }
   
    public TokenAssert tokenAt(int tokenIndex)
    {
        return new TokenAssert(tokenIndex);
    }


    /**
     * Make sure the term frequencies and by-document term frequencies of all
     * phrases agree with naive manual counts over the token stream.
     */
    public void phraseTfsCorrect()
    {
        // for each discovered phrase, do manual count and verify if tf and tfByDocument are correct.
        AllPhrases allPhrases = context.allPhrases;
        for (int index = 0; index < allPhrases.size(); index++)
        {
            IntIntOpenHashMap realTfByDocuments = countManually(context, allPhrases.wordIndices[index]);
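            // HPPC's forEach returns the procedure instance itself, so an anonymous
            // IntIntProcedure with an accumulator field can sum the per-document
            // frequencies into a single total in one expression.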
            final int realTf = realTfByDocuments.forEach(new IntIntProcedure()
            {
                int tf;
                public void apply(int key, int value)
                {
                    tf += value;
                }
            }).tf;

            Assertions.assertThat(allPhrases.tf[index]).as("Phrase: " + allPhrases.getPhrase(index))
                .isEqualTo(realTf);
           
            // The phrase extractor does not sort tfByDocument entries, so rebuild
            // a map from the flattened array and re-flatten it sorted by key.
            Assertions
                .assertThat(
                    IntMapUtils.flattenSortedByKey(IntMapUtils.addAllFromFlattened(
                        new IntIntOpenHashMap(), allPhrases.tfByDocument[index])))
                .as("Phrase: " + allPhrases.getPhrase(index))
                .isEqualTo(IntMapUtils.flattenSortedByKey(realTfByDocuments));
        }
    }

    /**
     * Manually and naively count doc->tf for the given word sequence.
     */
    private IntIntOpenHashMap countManually(PreprocessingContext context, int [] phraseWordIndices)
    {
        IntIntOpenHashMap tfByDoc = new IntIntOpenHashMap();
        AllTokens allTokens = context.allTokens;
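        // Slide a window of the phrase's length over the token stream and count a
        // match per occurrence, keyed by the document index of the window start.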
outer:
        for (int i = allTokens.wordIndex.length - phraseWordIndices.length; --i >= 0;)
        {
            for (int j = 0; j < phraseWordIndices.length; j++)
            {
                int wordInPhrase = phraseWordIndices[j];
                int wordInTokens = allTokens.wordIndex[i + j];
                if (wordInPhrase != wordInTokens)
                    continue outer;
            }
            tfByDoc.putOrAdd(allTokens.documentIndex[i], 1, 1);
        }
        return tfByDoc;
    }
}