Package de.jungblut.nlp

Source Code of de.jungblut.nlp.VectorizerUtilsTest

package de.jungblut.nlp;

import static org.junit.Assert.assertEquals;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.junit.Test;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset.Entry;

import de.jungblut.math.DoubleVector;
import de.jungblut.math.sparse.SparseDoubleVector;

public class VectorizerUtilsTest {

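  // Fixture: a tiny four-document corpus. The instance initializer below
  // tokenizes each document once, so every test operates on the same token
  // arrays.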
  List<String> documents = Lists.newArrayList("this is doc 1", // 0
      "this doc 2", // 1
      "that doc is totally unrelated", // 2
      "i dont think that is a document"); // 3
  List<String[]> tokenizedDocuments = new ArrayList<>(documents.size());
  {
    Tokenizer tkn = new StandardTokenizer();
    for (String doc : documents)
      tokenizedDocuments.add(tkn.tokenize(doc));
  }

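  // buildDictionary is expected to return a lexicographically sorted token
  // array that includes the OUT_OF_VOCABULARY marker. The overload with a
  // 0.5f threshold prunes high-document-frequency tokens: "doc" and "is"
  // each occur in three of the four documents and are dropped (the trailing
  // 0 is presumably a minimum-occurrence cutoff that is disabled here).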
  @Test
  public void testBuildDictionary() {
    String[] expectedResults = new String[] { "1", "2",
        VectorizerUtils.OUT_OF_VOCABULARY, "a", "doc", "document", "dont", "i",
        "is", "that", "think", "this", "totally", "unrelated" };
    String[] dict = VectorizerUtils.buildDictionary(tokenizedDocuments);
    assertArrayEquals(expectedResults, dict);

    // test with spam detector and 50% threshold
    expectedResults = new String[] { "1", "2",
        VectorizerUtils.OUT_OF_VOCABULARY, "a", "document", "dont", "i",
        "that", "think", "this", "totally", "unrelated" };
    dict = VectorizerUtils.buildDictionary(tokenizedDocuments, 0.5f, 0);
    assertArrayEquals(expectedResults, dict);
  }

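  // The inverted index maps each token to the set of document ids it occurs
  // in. The loop below removes every expected id from the posting set and
  // then asserts that nothing unexpected is left over.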
  @Test
  public void testBuildInvertedIndexMap() {
    String[] tokens = new String[] { "is", "think", "unrelated", "a", "i", "2",
        "that", "1", "document", "dont", "doc", "totally", "this" };
    int[][] docs = new int[][] { { 0, 2, 3 }, { 3 }, { 2 }, { 3 }, { 3 },
        { 1 }, { 2, 3 }, { 0 }, { 3 }, { 3 }, { 0, 1, 2 }, { 2 }, { 0, 1 }, };
    HashMultimap<String, Integer> invertedIndex = VectorizerUtils
        .buildInvertedIndexMap(tokenizedDocuments,
            VectorizerUtils.buildDictionary(tokenizedDocuments));

    for (int i = 0; i < tokens.length; i++) {
      Set<Integer> set = invertedIndex.get(tokens[i]);
      for (int doc : docs[i]) {
        set.remove(doc);
      }
      assertEquals(0, set.size());
    }

  }

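  // Array-based variant of the inverted index: row i holds the document ids
  // for dictionary entry i, so Arrays.binarySearch on the sorted dictionary
  // recovers the posting list of a token. Both sides are sorted before
  // comparison because the order within a posting list is not asserted.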
  @Test
  public void testBuildInvertedIndexArray() {
    String[] tokens = new String[] { "is", "think", "unrelated", "a", "i", "2",
        "that", "1", "document", "dont", "doc", "totally", "this" };
    int[][] docs = new int[][] { { 0, 2, 3 }, { 3 }, { 2 }, { 3 }, { 3 },
        { 1 }, { 2, 3 }, { 0 }, { 3 }, { 3 }, { 0, 1, 2 }, { 2 }, { 0, 1 }, };
    String[] dict = VectorizerUtils.buildDictionary(tokenizedDocuments);
    int[][] dictDocs = VectorizerUtils.buildInvertedIndexArray(
        tokenizedDocuments, dict);

    for (int i = 0; i < tokens.length; i++) {
      int[] tokenDocs = dictDocs[Arrays.binarySearch(dict, tokens[i])];
      Arrays.sort(docs[i]);
      Arrays.sort(tokenDocs);
      assertArrayEquals(docs[i], tokenDocs);
    }

  }

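  // Vectorizes every document into tf-idf weights and compares the result
  // against hand-computed sparse vectors; the element-wise difference of
  // each pair must sum to (approximately) zero.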
  @Test
  public void testTfIdfVectorize() {

    String[] dict = VectorizerUtils.buildDictionary(tokenizedDocuments);
    assertEquals(14, dict.length);
    int[] docCount = VectorizerUtils.buildInvertedIndexDocumentCount(
        tokenizedDocuments, dict);

    List<DoubleVector> tfIdfVectorize = VectorizerUtils.tfIdfVectorize(
        tokenizedDocuments, dict, docCount);

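    // Every token occurs at most once per document, so the tf component is 1
    // and the expected weights match pure idf values, ln(N / df) with N = 4:
    // ln(4/1) = 1.3862943611198906 (token unique to one document),
    // ln(4/2) = 0.6931471805599453 (df = 2), and
    // ln(4/3) = 0.28768207245178085 (df = 3).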
    // {10=0.6931471805599453, 7=0.28768207245178085, 3=0.28768207245178085,
    // 0=1.3862943611198906}
    DoubleVector v1 = new SparseDoubleVector(13);
    v1.set(10, 0.6931471805599453);
    v1.set(7, 0.28768207245178085);
    v1.set(3, 0.28768207245178085);
    v1.set(0, 1.3862943611198906);
    // {10=0.6931471805599453, 3=0.28768207245178085, 1=1.3862943611198906}
    DoubleVector v2 = new SparseDoubleVector(13);
    v2.set(10, 0.6931471805599453);
    v2.set(3, 0.28768207245178085);
    v2.set(1, 1.3862943611198906);
    // {12=1.3862943611198906, 11=1.3862943611198906, 8=0.6931471805599453,
    // 7=0.28768207245178085, 3=0.28768207245178085}
    DoubleVector v3 = new SparseDoubleVector(13);
    v3.set(12, 1.3862943611198906);
    v3.set(11, 1.3862943611198906);
    v3.set(8, 0.6931471805599453);
    v3.set(7, 0.28768207245178085);
    v3.set(3, 0.28768207245178085);

    // {9=1.3862943611198906, 8=0.6931471805599453, 7=0.28768207245178085,
    // 6=1.3862943611198906, 5=1.3862943611198906, 4=1.3862943611198906,
    // 2=1.3862943611198906}
    DoubleVector v4 = new SparseDoubleVector(13);
    v4.set(9, 1.3862943611198906);
    v4.set(8, 0.6931471805599453);
    v4.set(7, 0.28768207245178085);
    v4.set(6, 1.3862943611198906);
    v4.set(5, 1.3862943611198906);
    v4.set(4, 1.3862943611198906);
    v4.set(2, 1.3862943611198906);

    assertEquals(4, tfIdfVectorize.size());

    assertEquals(0d, tfIdfVectorize.get(0).subtract(v1).sum(), 1e-5);
    assertEquals(0d, tfIdfVectorize.get(1).subtract(v2).sum(), 1e-5);
    assertEquals(0d, tfIdfVectorize.get(2).subtract(v3).sum(), 1e-5);
    assertEquals(0d, tfIdfVectorize.get(3).subtract(v4).sum(), 1e-5);

  }

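  // getMostFrequentItems should return every multiset entry along with its
  // count; the expectation map is drained as entries are verified, so an
  // empty map at the end means no entry was missing or duplicated.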
  @Test
  public void testGetMostFrequentItems() {
    Map<String, Integer> expectedResults = new HashMap<>();
    String[] expectedResultsArray = new String[] { "is", "doc", "that", "this",
        "think", "unrelated", "a", "i", "2", "1", "document", "dont", "totally" };
    int[] expectedResultCounts = new int[] { 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1,
        1, 1 };
    for (int i = 0; i < expectedResultCounts.length; i++) {
      expectedResults.put(expectedResultsArray[i], expectedResultCounts[i]);
    }

    HashMultiset<String> set = HashMultiset.create();
    for (String[] s : tokenizedDocuments)
      set.addAll(Arrays.asList(s));
    ArrayList<Entry<String>> mostFrequentItems = VectorizerUtils
        .getMostFrequentItems(set);

    for (Entry<String> entry : mostFrequentItems) {
      assertEquals(expectedResults.get(entry.getElement()).intValue(),
          entry.getCount());
      expectedResults.remove(entry.getElement());
    }
    assertEquals(0, expectedResults.size());
  }

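  // The transition vector maps the tokens of document 0, "this is doc 1",
  // onto their indices in the sorted 14-entry dictionary:
  // this=11, is=8, doc=4, 1=0.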
  @Test
  public void testBuildTransitionVector() {
    String[] dict = VectorizerUtils.buildDictionary(tokenizedDocuments);
    int[] transitionVector = VectorizerUtils.buildTransitionVector(dict,
        tokenizedDocuments.get(0));
    int[] expected = new int[] { 11, 8, 4, 0 };
    assertArrayEquals(expected, transitionVector);
  }

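  // Local array-equality helpers; JUnit's org.junit.Assert.assertArrayEquals
  // would serve the same purpose, these just keep the static imports minimal.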
  static void assertArrayEquals(int[] expected, int[] actual) {
    assertEquals(expected.length, actual.length);
    for (int i = 0; i < expected.length; i++) {
      assertEquals(expected[i], actual[i]);
    }
  }

  static void assertArrayEquals(String[] expected, String[] actual) {
    assertEquals(expected.length, actual.length);
    for (int i = 0; i < expected.length; i++) {
      assertEquals(expected[i], actual[i]);
    }
  }

}