/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.core.test;
import static org.carrot2.core.test.SampleDocumentData.DOCUMENTS_DATA_MINING;
import static org.carrot2.core.test.assertions.Carrot2CoreAssertions.assertThatClusters;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.Platform;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.BindableMetadata;
import org.fest.assertions.Assertions;
import org.junit.Assume;
import org.junit.Test;
import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.google.common.base.Strings;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
/**
* Simple baseline tests that apply to all clustering algorithms.
*/
public abstract class ClusteringAlgorithmTestBase<T extends IClusteringAlgorithm>
extends ProcessingComponentTestBase<T>
{
/**
 * Checks that bindable metadata, including its attribute metadata, can be loaded
 * for the component class under test. The test is skipped (through a JUnit
 * assumption) when the component class does not carry the {@link Bindable}
 * annotation.
 */
@Test
public void testMetadataAvailable()
{
    final Class<? extends IClusteringAlgorithm> componentClass = getComponentClass();

    // Only bindable components are expected to expose metadata.
    final boolean bindable = componentClass.isAnnotationPresent(Bindable.class);
    Assume.assumeTrue(bindable);

    final BindableMetadata bindableMetadata =
        BindableMetadata.forClassWithParents(componentClass);
    assertNotNull(bindableMetadata);
    assertNotNull(bindableMetadata.getAttributeMetadata());
}
/**
 * Clusters an empty list of documents and expects the algorithm to complete,
 * returning a non-null collection that contains no clusters.
 */
@Test
public void testNoDocuments()
{
    final List<Document> noDocuments = Collections.<Document> emptyList();
    final ProcessingResult result = cluster(noDocuments);

    final Collection<Cluster> produced = result.getClusters();
    assertNotNull(produced);
    assertEquals(0, produced.size());
}
/**
 * Clusters a random number (1 to 100) of documents created with the no-argument
 * {@link Document} constructor and expects exactly one cluster whose
 * {@code size()} equals the number of input documents.
 *
 * @see "http://issues.carrot2.org/browse/CARROT-400"
 */
@Test
public void testEmptyDocuments()
{
    final List<Document> blankDocuments = Lists.newArrayList();
    final int documentCount = randomIntBetween(1, 100);
    while (blankDocuments.size() < documentCount)
    {
        blankDocuments.add(new Document());
    }

    final ProcessingResult result = cluster(blankDocuments);
    final List<Cluster> produced = result.getClusters();
    assertNotNull(produced);
    assertEquals(1, produced.size());

    final Cluster onlyCluster = produced.get(0);
    assertThat(onlyCluster.size()).isEqualTo(documentCount);
}
/**
 * Clusters the {@code DOCUMENTS_DATA_MINING} sample and expects at least one
 * cluster to be produced.
 */
@Test
public void testClusteringDataMining()
{
    final Collection<Cluster> produced = cluster(DOCUMENTS_DATA_MINING).getClusters();
    final int clusterCount = produced.size();
    assertThat(clusterCount).isGreaterThan(0);
}
/**
 * Processes the same request twice through a controller obtained from
 * {@code getCachingController(initAttributes, IClusteringAlgorithm.class)} and
 * expects both calls to complete without throwing. The controller is disposed
 * even when one of the calls fails.
 */
@SuppressWarnings("unchecked")
@Test
@ThreadLeakLingering(linger = 5000)
public void testRepeatedClusteringWithCache()
{
    // Caching controller is not available for .NET at the moment.
    assumeTrue("Java test only.", Platform.getPlatform() == Platform.JAVA);

    final Controller controller = getCachingController(initAttributes, IClusteringAlgorithm.class);
    try
    {
        // Named differently from the inherited processingAttributes field to avoid shadowing it.
        final Map<String, Object> requestAttributes = ImmutableMap.of(
            AttributeNames.DOCUMENTS, (Object) DOCUMENTS_DATA_MINING);

        // Identical request issued twice.
        controller.process(requestAttributes, getComponentClass());
        controller.process(requestAttributes, getComponentClass());
    }
    finally
    {
        // Release the controller even if processing threw.
        controller.dispose();
    }
}
/**
 * Performs a very simple stress test using a pooling {@link Controller}. The
 * test is performed with default init attributes.
 * <p>
 * A random number of threads (1 to 10) submits clustering requests that cycle
 * through {@code SampleDocumentData.ALL}. Each request is tagged with the index
 * of the data set it used; results are then grouped by that index and every
 * clustering in a group must be equivalent to the first one of the group.
 *
 * @throws InterruptedException if the test thread is interrupted while waiting
 *         for the submitted tasks.
 * @throws ExecutionException if any of the clustering tasks threw an exception.
 */
@Nightly @Test
@ThreadLeakLingering(linger = 5000)
public void testStress() throws InterruptedException, ExecutionException
{
    final int numberOfThreads = randomIntBetween(1, 10);
    final int queriesPerThread = scaledRandomIntBetween(5, 25);

    /*
     * This yields a pooling controller effectively, because no cache interfaces are passed.
     */
    @SuppressWarnings("unchecked")
    final Controller controller = getCachingController(initAttributes);

    final int dataSetCount = SampleDocumentData.ALL.size();
    final ExecutorService executorService = Executors.newFixedThreadPool(numberOfThreads);
    final List<Callable<ProcessingResult>> callables = Lists.newArrayList();
    for (int i = 0; i < numberOfThreads * queriesPerThread; i++)
    {
        // Index of the data set this request clusters. It must be the wrapped
        // index (not the raw task number), otherwise every result would land in
        // its own group and nothing would ever be compared below.
        final int dataSetIndex = i % dataSetCount;
        callables.add(new Callable<ProcessingResult>()
        {
            public ProcessingResult call() throws Exception
            {
                Map<String, Object> localAttributes = Maps.newHashMap();
                localAttributes.put(AttributeNames.DOCUMENTS,
                    SampleDocumentData.ALL.get(dataSetIndex));
                // Tag the request; the tag is read back from the result attributes.
                localAttributes.put("dataSetIndex", dataSetIndex);
                return controller.process(localAttributes, getComponentClass());
            }
        });
    }

    try
    {
        List<Future<ProcessingResult>> results = executorService.invokeAll(callables);
        Multimap<Integer, List<Cluster>> clusterings = ArrayListMultimap.create();

        // Group results by data set
        for (Future<ProcessingResult> future : results)
        {
            final ProcessingResult processingResult = future.get();
            final Integer dataSetIndex = (Integer) processingResult.getAttributes().get("dataSetIndex");
            clusterings.put(dataSetIndex, processingResult.getClusters());
        }

        // Make sure results are the same within each data set
        for (Integer dataSetIndex : clusterings.keySet())
        {
            Collection<List<Cluster>> clustering = clusterings.get(dataSetIndex);
            Iterator<List<Cluster>> iterator = clustering.iterator();
            if (!iterator.hasNext())
            {
                continue;
            }

            final List<Cluster> firstClusterList = iterator.next();
            Assertions.assertThat(firstClusterList).isNotEmpty();
            while (iterator.hasNext())
            {
                assertThatClusters(firstClusterList).isEquivalentTo(iterator.next());
            }
        }
    }
    finally
    {
        executorService.shutdown();
    }
}
/**
 * Clusters the given documents with the component under test, using a
 * controller obtained from {@code getSimpleController(initAttributes)}.
 * <p>
 * The documents are stored in the shared {@code processingAttributes} map under
 * {@link AttributeNames#DOCUMENTS} before processing. The controller is always
 * disposed afterwards and the inherited {@code simpleController} field is reset
 * to {@code null} (presumably so that a fresh controller is created for the
 * next call; confirm against the base class).
 *
 * @param documents Documents to be clustered.
 * @return {@link ProcessingResult} returned from the controller.
 */
public ProcessingResult cluster(Collection<Document> documents)
{
    processingAttributes.put(AttributeNames.DOCUMENTS, documents);

    final Controller simple = getSimpleController(initAttributes);
    try
    {
        return simple.process(processingAttributes, getComponentClass());
    }
    finally
    {
        simple.dispose();
        super.simpleController = null;
    }
}
/**
 * Recursively collects documents from the given clusters and all of their
 * subclusters.
 *
 * @param clusters Clusters to collect documents from.
 * @return A set-backed collection with the documents of the provided clusters
 *         and of every nested subcluster.
 */
public Collection<Document> collectDocuments(Collection<Cluster> clusters)
{
    return collectDocuments(clusters, new HashSet<Document>());
}

/**
 * Adds the documents of the given clusters, and of their subclusters at any
 * depth, to the provided accumulator.
 *
 * @param clusters Clusters to traverse.
 * @param documents Accumulator receiving the documents.
 * @return The same accumulator instance that was passed in.
 */
private Collection<Document> collectDocuments(Collection<Cluster> clusters,
    Collection<Document> documents)
{
    for (final Cluster cluster : clusters)
    {
        documents.addAll(cluster.getDocuments());

        // Recurse with the shared accumulator. Calling the one-argument overload
        // here would fill a throw-away set and lose the nested documents.
        final List<Cluster> subclusters = cluster.getSubclusters();
        if (subclusters != null)
        {
            collectDocuments(subclusters, documents);
        }
    }
    return documents;
}
/**
 * Gathers the labels of all clusters in the processing result, including the
 * labels of subclusters at any depth.
 *
 * @param pr Processing result whose clusters are inspected.
 * @return A new set with the distinct cluster labels.
 */
public static Set<String> collectClusterLabels(ProcessingResult pr)
{
    final Set<String> labels = Sets.newHashSet();
    addLabelsRecursively(pr.getClusters(), labels);
    return labels;
}

/**
 * Adds the label of every given cluster, then of its subclusters, to the sink.
 */
private static void addLabelsRecursively(List<Cluster> clusters, Set<String> sink)
{
    for (Cluster current : clusters)
    {
        sink.add(current.getLabel());

        final List<Cluster> children = current.getSubclusters();
        if (children != null)
        {
            addLabelsRecursively(children, sink);
        }
    }
}
/**
 * Prints the labels of all clusters in the processing result to standard
 * output, one label per line. Each label is prefixed with one space per level
 * of nesting, and subclusters are printed right after their parent.
 *
 * @param pr Processing result whose clusters are printed.
 */
public static void dumpClusterLabels(ProcessingResult pr)
{
    printLabelsRecursively(pr.getClusters(), 0);
}

/**
 * Prints the labels of the given clusters at the given nesting depth, descending
 * into subclusters with the depth increased by one.
 */
private static void printLabelsRecursively(List<Cluster> clusters, int depth)
{
    final String prefix = Strings.repeat(" ", depth);
    for (Cluster current : clusters)
    {
        System.out.println(prefix + current.getLabel());

        final List<Cluster> children = current.getSubclusters();
        if (children != null)
        {
            printLabelsRecursively(children, depth + 1);
        }
    }
}
}