/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.similarity;
import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.VarIntWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix.MatrixEntryWritable;
import org.apache.mahout.math.hadoop.MathHelper;
import org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.EntriesToVectorsReducer;
import org.apache.mahout.math.hadoop.similarity.RowSimilarityJob.SimilarityReducer;
import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
import org.apache.mahout.math.hadoop.similarity.vector.DistributedVectorSimilarity;
import org.easymock.IArgumentMatcher;
import org.easymock.classextension.EasyMock;
import org.junit.Test;
/**
 * Unit and integration tests for {@link RowSimilarityJob}.
 */
public final class TestRowSimilarityJob extends MahoutTestCase {
/**
 * Tests {@link RowSimilarityJob.RowWeightMapper}: mapping row 123, which has non-zero entries at
 * indices 456 and 789, must produce exactly one write per entry, keyed by that index.
 */
@Test
public void testRowWeightMapper() throws Exception {
  // the input row: two non-zero entries in an otherwise empty sparse vector
  Vector row = new RandomAccessSparseVector(Integer.MAX_VALUE);
  row.set(456, 0.5);
  row.set(789, 0.1);

  RowSimilarityJob.RowWeightMapper rowWeightMapper = new RowSimilarityJob.RowWeightMapper();
  setField(rowWeightMapper, "similarity", new DistributedTanimotoCoefficientVectorSimilarity());

  // record phase: the two writes the mapper is expected to perform
  Mapper<IntWritable,VectorWritable,VarIntWritable,WeightedOccurrence>.Context ctx =
      EasyMock.createMock(Mapper.Context.class);
  ctx.write(new VarIntWritable(456), new WeightedOccurrence(123, 0.5, 2.0));
  ctx.write(new VarIntWritable(789), new WeightedOccurrence(123, 0.1, 2.0));
  EasyMock.replay(ctx);

  rowWeightMapper.map(new IntWritable(123), new VectorWritable(row), ctx);

  EasyMock.verify(ctx);
}
/**
 * Tests {@link RowSimilarityJob.WeightedOccurrencesPerColumnReducer}: the occurrences passed in
 * for key 123 must be written out under that key as one {@link WeightedOccurrenceArray} holding
 * the same occurrences.
 */
@Test
public void testWeightedOccurrencesPerColumnReducer() throws Exception {
  List<WeightedOccurrence> occurrences = Arrays.asList(
      new WeightedOccurrence(45, 0.5, 1.0),
      new WeightedOccurrence(78, 3.0, 9.0));

  // record phase: a single write whose value is checked by the custom array matcher
  Reducer<VarIntWritable,WeightedOccurrence,VarIntWritable,WeightedOccurrenceArray>.Context ctx =
      EasyMock.createMock(Reducer.Context.class);
  ctx.write(EasyMock.eq(new VarIntWritable(123)), weightedOccurrenceArrayMatches(occurrences));
  EasyMock.replay(ctx);

  RowSimilarityJob.WeightedOccurrencesPerColumnReducer reducer =
      new RowSimilarityJob.WeightedOccurrencesPerColumnReducer();
  reducer.reduce(new VarIntWritable(123), occurrences, ctx);

  EasyMock.verify(ctx);
}
/**
 * Registers an {@link IArgumentMatcher} with EasyMock that accepts a {@link WeightedOccurrenceArray}
 * argument when it holds as many occurrences as {@code occurrencesToMatch} and each of them is
 * contained in {@code occurrencesToMatch}.
 *
 * <p>Note that the check is not a strict multiset comparison: an array that repeats one expected
 * occurrence in place of another would still be accepted.</p>
 *
 * @param occurrencesToMatch the occurrences the matched array is compared against
 * @return always {@code null}; the method is called only for the side effect of reporting the
 *         matcher, and is meant to be used in argument position of a recorded mock call
 */
static WeightedOccurrenceArray weightedOccurrenceArrayMatches(
    final Collection<WeightedOccurrence> occurrencesToMatch) {
  EasyMock.reportMatcher(new IArgumentMatcher() {
    @Override
    public boolean matches(Object argument) {
      if (!(argument instanceof WeightedOccurrenceArray)) {
        return false;
      }
      WeightedOccurrence[] occurrences = ((WeightedOccurrenceArray) argument).getWeightedOccurrences();
      if (occurrences.length != occurrencesToMatch.size()) {
        return false;
      }
      for (WeightedOccurrence occurrence : occurrences) {
        if (!occurrencesToMatch.contains(occurrence)) {
          return false;
        }
      }
      return true;
    }

    @Override
    public void appendTo(StringBuffer buffer) {
      // describe the expectation so that a failed verification names what was expected
      buffer.append("weightedOccurrenceArrayMatches(").append(occurrencesToMatch).append(')');
    }
  });
  return null;
}
/**
 * Tests {@link RowSimilarityJob.CooccurrencesMapper}: two occurrences (rows 34 and 56) under key 12
 * must lead to three writes (34/34, 34/56 and 56/56) and to the
 * {@link RowSimilarityJob.Counter#COOCCURRENCES} counter being incremented by 3.
 */
@Test
public void testCooccurrencesMapper() throws Exception {
  Mapper<VarIntWritable,WeightedOccurrenceArray,WeightedRowPair,Cooccurrence>.Context ctx =
      EasyMock.createMock(Mapper.Context.class);
  Counter cooccurrenceCounter = EasyMock.createMock(Counter.class);

  // record phase: one write per row pair, then the counter lookup and its increment
  ctx.write(new WeightedRowPair(34, 34, 1.0, 1.0), new Cooccurrence(12, 0.5, 0.5));
  ctx.write(new WeightedRowPair(34, 56, 1.0, 3.0), new Cooccurrence(12, 0.5, 1.0));
  ctx.write(new WeightedRowPair(56, 56, 3.0, 3.0), new Cooccurrence(12, 1.0, 1.0));
  EasyMock.expect(ctx.getCounter(RowSimilarityJob.Counter.COOCCURRENCES)).andReturn(cooccurrenceCounter);
  cooccurrenceCounter.increment(3);
  EasyMock.replay(ctx, cooccurrenceCounter);

  WeightedOccurrence[] occurrences = {
      new WeightedOccurrence(34, 0.5, 1.0),
      new WeightedOccurrence(56, 1.0, 3.0) };
  WeightedOccurrenceArray input = new WeightedOccurrenceArray(occurrences);

  RowSimilarityJob.CooccurrencesMapper cooccurrencesMapper = new RowSimilarityJob.CooccurrencesMapper();
  cooccurrencesMapper.map(new VarIntWritable(12), input, ctx);

  EasyMock.verify(ctx, cooccurrenceCounter);
}
/**
 * Tests {@link RowSimilarityJob.CooccurrencesMapper} with the occurrences supplied in descending
 * row order (56 before 34). The expected writes are the same as for ascending input: the pair of
 * distinct rows is expected as 34/56, i.e. with the smaller row first.
 */
@Test // without the annotation a JUnit 4 runner silently skips this method
public void testCooccurrencesMapperOrdering() throws Exception {
  Mapper<VarIntWritable,WeightedOccurrenceArray,WeightedRowPair,Cooccurrence>.Context context =
      EasyMock.createMock(Mapper.Context.class);
  Counter counter = EasyMock.createMock(Counter.class);

  // record phase: one write per row pair, then the counter lookup and its increment
  context.write(new WeightedRowPair(34, 34, 1.0, 1.0), new Cooccurrence(12, 0.5, 0.5));
  context.write(new WeightedRowPair(34, 56, 1.0, 3.0), new Cooccurrence(12, 0.5, 1.0));
  context.write(new WeightedRowPair(56, 56, 3.0, 3.0), new Cooccurrence(12, 1.0, 1.0));
  EasyMock.expect(context.getCounter(RowSimilarityJob.Counter.COOCCURRENCES)).andReturn(counter);
  counter.increment(3);
  EasyMock.replay(context, counter);

  // same occurrences as in testCooccurrencesMapper, but with row 56 listed before row 34
  WeightedOccurrenceArray weightedOccurrences = new WeightedOccurrenceArray(new WeightedOccurrence[] {
      new WeightedOccurrence(56, 1.0, 3.0), new WeightedOccurrence(34, 0.5, 1.0) });

  new RowSimilarityJob.CooccurrencesMapper().map(new VarIntWritable(12), weightedOccurrences, context);

  EasyMock.verify(context, counter);
}
/**
 * Tests {@link SimilarityReducer}: reducing the row pair 12/34 must write two matrix entries, one
 * for 12/34 and one for 34/12, both with similarity 0.5, and must increment the
 * {@link RowSimilarityJob.Counter#SIMILAR_ROWS} counter by 1.
 */
@Test
public void testSimilarityReducer() throws Exception {
  Reducer<WeightedRowPair,Cooccurrence,SimilarityMatrixEntryKey,MatrixEntryWritable>.Context ctx =
      EasyMock.createMock(Reducer.Context.class);
  Counter similarRowsCounter = EasyMock.createMock(Counter.class);

  // record phase: the entry is expected in both directions
  ctx.write(EasyMock.eq(new SimilarityMatrixEntryKey(12, 0.5)), MathHelper.matrixEntryMatches(12, 34, 0.5));
  ctx.write(EasyMock.eq(new SimilarityMatrixEntryKey(34, 0.5)), MathHelper.matrixEntryMatches(34, 12, 0.5));
  EasyMock.expect(ctx.getCounter(RowSimilarityJob.Counter.SIMILAR_ROWS)).andReturn(similarRowsCounter);
  similarRowsCounter.increment(1);
  EasyMock.replay(ctx, similarRowsCounter);

  SimilarityReducer similarityReducer = new SimilarityReducer();
  setField(similarityReducer, "similarity", new DistributedTanimotoCoefficientVectorSimilarity());

  List<Cooccurrence> cooccurrences = Arrays.asList(
      new Cooccurrence(56, 1.0, 2.0),
      new Cooccurrence(78, 3.0, 6.0));
  similarityReducer.reduce(new WeightedRowPair(12, 34, 3.0, 3.0), cooccurrences, ctx);

  EasyMock.verify(ctx, similarRowsCounter);
}
/**
 * Tests {@link SimilarityReducer} for a row paired with itself (90/90): exactly one matrix entry
 * with similarity 1.0 is expected, and the {@link RowSimilarityJob.Counter#SIMILAR_ROWS} counter
 * must be incremented by 1.
 */
@Test
public void testSimilarityReducerSelfSimilarity() throws Exception {
  Reducer<WeightedRowPair,Cooccurrence,SimilarityMatrixEntryKey,MatrixEntryWritable>.Context ctx =
      EasyMock.createMock(Reducer.Context.class);
  Counter similarRowsCounter = EasyMock.createMock(Counter.class);

  // record phase: a single write, as there is no second direction for a self-pair
  ctx.write(EasyMock.eq(new SimilarityMatrixEntryKey(90, 1.0)), MathHelper.matrixEntryMatches(90, 90, 1.0));
  EasyMock.expect(ctx.getCounter(RowSimilarityJob.Counter.SIMILAR_ROWS)).andReturn(similarRowsCounter);
  similarRowsCounter.increment(1);
  EasyMock.replay(ctx, similarRowsCounter);

  SimilarityReducer similarityReducer = new SimilarityReducer();
  setField(similarityReducer, "similarity", new DistributedTanimotoCoefficientVectorSimilarity());

  List<Cooccurrence> cooccurrences = Arrays.asList(
      new Cooccurrence(56, 1.0, 2.0),
      new Cooccurrence(78, 3.0, 6.0));
  similarityReducer.reduce(new WeightedRowPair(90, 90, 2.0, 2.0), cooccurrences, ctx);

  EasyMock.verify(ctx, similarRowsCounter);
}
/**
 * Tests {@link EntriesToVectorsReducer}: with {@code maxSimilaritiesPerRow} set to 1 and two
 * entries (0.8 and 0.7) supplied for row 12, the written vector is expected to hold only the
 * 0.8 entry at index 34.
 */
@Test
public void testEntriesToVectorsReducer() throws Exception {
  // record phase: one vector for row 12 containing just the element (34, 0.8)
  Reducer<SimilarityMatrixEntryKey,MatrixEntryWritable,IntWritable,VectorWritable>.Context ctx =
      EasyMock.createMock(Reducer.Context.class);
  ctx.write(EasyMock.eq(new IntWritable(12)), MathHelper.vectorMatches(MathHelper.elem(34, 0.8)));
  EasyMock.replay(ctx);

  EntriesToVectorsReducer entriesToVectorsReducer = new EntriesToVectorsReducer();
  setField(entriesToVectorsReducer, "maxSimilaritiesPerRow", 1);

  entriesToVectorsReducer.reduce(
      new SimilarityMatrixEntryKey(12, 1.0),
      Arrays.asList(MathHelper.matrixEntry(12, 34, 0.8), MathHelper.matrixEntry(12, 56, 0.7)),
      ctx);

  EasyMock.verify(ctx);
}
/**
 * Integration test that runs the complete {@link RowSimilarityJob} on a tiny data set and checks
 * every cell of the resulting similarity matrix.
 *
 * <pre>
 *
 * input matrix:
 *
 * 1, 0, 1, 1, 0
 * 0, 0, 1, 1, 0
 * 0, 0, 0, 0, 1
 *
 * similarity matrix (via tanimoto):
 *
 * 1,     0.666, 0
 * 0.666, 1,     0
 * 0,     0,     1
 * </pre>
 */
@Test
public void testSmallSampleMatrix() throws Exception {
  File inputFile = getTestTempFile("rows");
  File outputDir = getTestTempDir("output");
  // only the path is needed; the directory itself is removed again before the job runs
  outputDir.delete();
  File tmpDir = getTestTempDir("tmp");

  Configuration conf = new Configuration();
  String inputLocation = inputFile.getAbsolutePath();
  String outputLocation = outputDir.getAbsolutePath();
  Path inputPath = new Path(inputLocation);
  FileSystem fs = FileSystem.get(inputPath.toUri(), conf);

  double[][] rows = {
      { 1, 0, 1, 1, 0 },
      { 0, 0, 1, 1, 0 },
      { 0, 0, 0, 0, 1 } };
  MathHelper.writeEntries(rows, fs, conf, inputPath);

  conf.set("mapred.input.dir", inputLocation);
  conf.set("mapred.output.dir", outputLocation);
  conf.setBoolean("mapred.output.compress", false);

  RowSimilarityJob job = new RowSimilarityJob();
  job.setConf(conf);
  // NOTE(review): --numberOfColumns is 3 although each input row has 5 entries - confirm this is intended
  String[] args = {
      "--numberOfColumns", "3",
      "--similarityClassname", DistributedTanimotoCoefficientVectorSimilarity.class.getName(),
      "--tempDir", tmpDir.getAbsolutePath() };
  job.run(args);

  Path resultPath = new Path(outputLocation, "part-r-00000");
  Matrix similarities = MathHelper.readEntries(conf, resultPath, 3, 3);

  assertNotNull(similarities);
  assertEquals(3, similarities.numCols());
  assertEquals(3, similarities.numRows());

  // diagonal: every row is fully similar to itself
  for (int row = 0; row < 3; row++) {
    assertEquals(1.0, similarities.get(row, row), EPSILON);
  }

  // the third row shares no column with the other two
  assertEquals(0.0, similarities.get(2, 0), EPSILON);
  assertEquals(0.0, similarities.get(2, 1), EPSILON);
  assertEquals(0.0, similarities.get(0, 2), EPSILON);
  assertEquals(0.0, similarities.get(1, 2), EPSILON);

  // first and second row: 2/3, compared with a coarser tolerance
  assertEquals(0.6666, similarities.get(0, 1), 0.0001);
  assertEquals(0.6666, similarities.get(1, 0), 0.0001);
}
/**
 * A {@link DistributedVectorSimilarity} that delegates to
 * {@link DistributedTanimotoCoefficientVectorSimilarity} for everything except the similarity of a
 * row to itself, for which it returns {@link Double#NaN}.
 */
static class DistributedTanimotoCoefficientExcludeIdentityVectorSimilarity implements DistributedVectorSimilarity {

  /** the tanimoto implementation all real work is forwarded to */
  private static final DistributedVectorSimilarity TANIMOTO = new DistributedTanimotoCoefficientVectorSimilarity();

  /**
   * @return {@link Double#NaN} if {@code rowA} equals {@code rowB}, otherwise the delegate's similarity
   */
  @Override
  public double similarity(int rowA, int rowB, Iterable<Cooccurrence> cooccurrences, double weightOfVectorA,
      double weightOfVectorB, int numberOfRows) {
    return rowA == rowB
        ? Double.NaN
        : TANIMOTO.similarity(rowA, rowB, cooccurrences, weightOfVectorA, weightOfVectorB, numberOfRows);
  }

  /**
   * @return the weight the delegate computes for {@code v}
   */
  @Override
  public double weight(Vector v) {
    return TANIMOTO.weight(v);
  }
}
/**
 * Integration test for limiting the number of entries per row of the similarity matrix. It uses a
 * similarity that yields NaN for a row compared with itself.
 *
 * <pre>
 *      c1  c2  c3  c4  c5  c6
 * r1    1   0   1   1   0   1
 * r2    0   1   1   1   1   1
 * r3    1   1   0   1   0   0
 *
 * tanimoto(r1,r2) = 0.5
 * tanimoto(r2,r3) = 0.333
 * tanimoto(r3,r1) = 0.4
 *
 * When we set maxSimilaritiesPerRow to 1 the following pairs should be found:
 *
 * r1 --> r2
 * r2 --> r1
 * r3 --> r1
 * </pre>
 */
@Test
public void testLimitEntriesInSimilarityMatrix() throws Exception {
  File inputFile = getTestTempFile("rows");
  File outputDir = getTestTempDir("output");
  // only the path is needed; the directory itself is removed again before the job runs
  outputDir.delete();
  File tmpDir = getTestTempDir("tmp");

  Configuration conf = new Configuration();
  String inputLocation = inputFile.getAbsolutePath();
  String outputLocation = outputDir.getAbsolutePath();
  Path inputPath = new Path(inputLocation);
  FileSystem fs = FileSystem.get(inputPath.toUri(), conf);

  double[][] rows = {
      { 1, 0, 1, 1, 0, 1 },
      { 0, 1, 1, 1, 1, 1 },
      { 1, 1, 0, 1, 0, 0 } };
  MathHelper.writeEntries(rows, fs, conf, inputPath);

  conf.set("mapred.input.dir", inputLocation);
  conf.set("mapred.output.dir", outputLocation);
  conf.setBoolean("mapred.output.compress", false);

  RowSimilarityJob job = new RowSimilarityJob();
  job.setConf(conf);
  // NOTE(review): --numberOfColumns is 3 although each input row has 6 entries - confirm this is intended
  String[] args = {
      "--numberOfColumns", "3",
      "--maxSimilaritiesPerRow", "1",
      "--similarityClassname", DistributedTanimotoCoefficientExcludeIdentityVectorSimilarity.class.getName(),
      "--tempDir", tmpDir.getAbsolutePath() };
  job.run(args);

  Path resultPath = new Path(outputLocation, "part-r-00000");
  Matrix similarities = MathHelper.readEntries(conf, resultPath, 3, 3);

  assertNotNull(similarities);
  assertEquals(3, similarities.numCols());
  assertEquals(3, similarities.numRows());

  // one surviving entry per row; all other cells, including the diagonal, are expected to be 0
  double[][] expected = {
      { 0.0, 0.5, 0.0 },
      { 0.5, 0.0, 0.0 },
      { 0.4, 0.0, 0.0 } };
  for (int row = 0; row < expected.length; row++) {
    for (int col = 0; col < expected[row].length; col++) {
      assertEquals(expected[row][col], similarities.get(row, col), EPSILON);
    }
  }
}
}