Package de.jungblut.nlp

Source Code of de.jungblut.nlp.MinHashTest

package de.jungblut.nlp;

import static org.junit.Assert.assertEquals;

import org.junit.Test;

import de.jungblut.math.sparse.SparseDoubleVector;
import de.jungblut.nlp.MinHash.HashType;

public class MinHashTest {

  @Test
  public void testMinHashing() throws Exception {

    SparseDoubleVector vec1 = new SparseDoubleVector(5);
    vec1.set(0, 2d);
    vec1.set(3, 311d);
    vec1.set(4, 2d);
    SparseDoubleVector vec2 = new SparseDoubleVector(5);
    vec2.set(0, 2d);
    vec1.set(2, 2d);
    vec2.set(3, 311d);
    vec2.set(4, 2d);

    MinHash minHash = MinHash.create(4);

    int[] minHashVector = minHash.minHashVector(vec1);
    int[] minHashVector2 = minHash.minHashVector(vec2);

    // we just differ by a single place, so it should still be 1.0 similarity
    double similarity = minHash
        .measureSimilarity(minHashVector, minHashVector2);
    assertEquals(1.0d, similarity, 1e-5);
  }

  @Test
  public void testMinHashingMurmur() throws Exception {

    SparseDoubleVector vec1 = new SparseDoubleVector(5);
    vec1.set(0, 2d);
    vec1.set(3, 311d);
    vec1.set(4, 2d);
    SparseDoubleVector vec2 = new SparseDoubleVector(5);
    vec2.set(0, 2d);
    vec1.set(2, 2d);
    vec2.set(3, 311d);
    vec2.set(4, 2d);

    MinHash minHash = MinHash.create(4, HashType.MURMUR128);

    int[] minHashVector = minHash.minHashVector(vec1);
    int[] minHashVector2 = minHash.minHashVector(vec2);

    // we just differ by a single place, so it should still be 1.0 similarity
    double similarity = minHash
        .measureSimilarity(minHashVector, minHashVector2);
    assertEquals(1.0d, similarity, 1e-5);
  }

  @Test
  public void testMinHashingMd5() throws Exception {

    SparseDoubleVector vec1 = new SparseDoubleVector(5);
    vec1.set(0, 2d);
    vec1.set(3, 311d);
    vec1.set(4, 2d);
    SparseDoubleVector vec2 = new SparseDoubleVector(5);
    vec2.set(0, 2d);
    vec1.set(2, 2d);
    vec2.set(3, 311d);
    vec2.set(4, 2d);

    MinHash minHash = MinHash.create(4, HashType.MD5);

    int[] minHashVector = minHash.minHashVector(vec1);
    int[] minHashVector2 = minHash.minHashVector(vec2);

    // we just differ by a single place, so it should still be 1.0 similarity
    double similarity = minHash
        .measureSimilarity(minHashVector, minHashVector2);
    assertEquals(1.0d, similarity, 1e-5);
  }

}
TOP

Related Classes of de.jungblut.nlp.MinHashTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.