Package org.carrot2.core.test

Source Code of org.carrot2.core.test.ClusteringAlgorithmTestBase

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.core.test;

import static org.carrot2.core.test.SampleDocumentData.DOCUMENTS_DATA_MINING;
import static org.carrot2.core.test.assertions.Carrot2CoreAssertions.assertThatClusters;

import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.Platform;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.BindableMetadata;
import org.fest.assertions.Assertions;
import org.junit.Assume;
import org.junit.Test;

import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
import com.google.common.base.Strings;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;

/**
* Simple baseline tests that apply to all clustering algorithms.
*/
public abstract class ClusteringAlgorithmTestBase<T extends IClusteringAlgorithm>
    extends ProcessingComponentTestBase<T>
{
    /**
     * Algorithms are bindable, so their metadata should always be available.
     */
    @Test
    public void testMetadataAvailable()
    {
        Class<? extends IClusteringAlgorithm> c = getComponentClass();
        Assume.assumeTrue(c.getAnnotation(Bindable.class) != null);
       
        BindableMetadata metadata = BindableMetadata.forClassWithParents(c);
        assertNotNull(metadata);
        assertNotNull(metadata.getAttributeMetadata());
    }

    /**
     * A test to check if the algorithm does not fail with no documents.
     */
    @Test
    public void testNoDocuments()
    {
        final Collection<Cluster> clusters =
            cluster(Collections.<Document> emptyList()).getClusters();

        assertNotNull(clusters);
        assertEquals(0, clusters.size());
    }

    /**
     * @see "http://issues.carrot2.org/browse/CARROT-400"
     */
    @Test
    public void testEmptyDocuments()
    {
        final List<Document> documents = Lists.newArrayList();
        final int documentCount = randomIntBetween(1, 100);
        for (int i = 0; i < documentCount; i++)
        {
            documents.add(new Document());
        }

        final List<Cluster> clusters = cluster(documents).getClusters();

        assertNotNull(clusters);
        assertEquals(1, clusters.size());
        assertThat(clusters.get(0).size()).isEqualTo(documentCount);
    }

    @Test
    public void testClusteringDataMining()
    {
        final ProcessingResult processingResult = cluster(DOCUMENTS_DATA_MINING);
        final Collection<Cluster> clusters = processingResult.getClusters();

        assertThat(clusters.size()).isGreaterThan(0);
    }

    @SuppressWarnings("unchecked")
    @Test
    @ThreadLeakLingering(linger = 5000)
    public void testRepeatedClusteringWithCache()
    {
        // Caching controller is not available for .NET at the moment.
        assumeTrue("Java test only.", Platform.getPlatform() == Platform.JAVA);

        final Controller controller = getCachingController(initAttributes, IClusteringAlgorithm.class);

        final Map<String, Object> processingAttributes = ImmutableMap.of(
            AttributeNames.DOCUMENTS, (Object) DOCUMENTS_DATA_MINING);

        controller.process(processingAttributes, getComponentClass());
        controller.process(processingAttributes, getComponentClass());

        controller.dispose();
    }

    /**
     * Performs a very simple stress test using a pooling {@link Controller}. The
     * test is performed with default init attributes.
     */
    @Nightly @Test
    @ThreadLeakLingering(linger = 5000)
    public void testStress() throws InterruptedException, ExecutionException
    {
        final int numberOfThreads = randomIntBetween(1, 10);
        final int queriesPerThread = scaledRandomIntBetween(5, 25);

        /*
         * This yields a pooling controller effectively, because no cache interfaces are passed.
         */
        @SuppressWarnings("unchecked")
        final Controller controller = getCachingController(initAttributes);

        ExecutorService executorService = Executors.newFixedThreadPool(numberOfThreads);
        List<Callable<ProcessingResult>> callables = Lists.newArrayList();
        for (int i = 0; i < numberOfThreads * queriesPerThread; i++)
        {
            final int dataSetIndex = i;
            callables.add(new Callable<ProcessingResult>()
            {
                public ProcessingResult call() throws Exception
                {
                    Map<String, Object> localAttributes = Maps.newHashMap();
                    localAttributes.put(AttributeNames.DOCUMENTS, SampleDocumentData.ALL
                        .get(dataSetIndex % SampleDocumentData.ALL.size()));
                    localAttributes.put("dataSetIndex", dataSetIndex);
                    return controller.process(localAttributes, getComponentClass());
                }
            });
        }

        try
        {
            List<Future<ProcessingResult>> results = executorService.invokeAll(callables);
            Multimap<Integer, List<Cluster>> clusterings = ArrayListMultimap.create();

            // Group results by query
            for (Future<ProcessingResult> future : results)
            {
                final ProcessingResult processingResult = future.get();
                final Integer dataSetIndex = (Integer) processingResult.getAttributes().get("dataSetIndex");
                clusterings.put(dataSetIndex, processingResult.getClusters());
            }

            // Make sure results are the same within each data set
            for (Integer dataSetIndex : clusterings.keySet())
            {
                Collection<List<Cluster>> clustering = clusterings.get(dataSetIndex);
                Iterator<List<Cluster>> iterator = clustering.iterator();
                if (!iterator.hasNext())
                {
                    continue;
                }

                final List<Cluster> firstClusterList = iterator.next();
                Assertions.assertThat(firstClusterList).isNotEmpty();
                while (iterator.hasNext())
                {
                    assertThatClusters(firstClusterList).isEquivalentTo(iterator.next());
                }
            }
        }
        finally
        {
            executorService.shutdown();
        }
    }

    /**
     * Performs clustering using {@link Controller}.
     *
     * @param documents Documents to be clustered.
     * @return {@link ProcessingResult} returned from the controller.
     */
    public ProcessingResult cluster(Collection<Document> documents)
    {
        processingAttributes.put(AttributeNames.DOCUMENTS, documents);
        Controller controller = getSimpleController(initAttributes);
        try {
            ProcessingResult process = controller.process(processingAttributes, getComponentClass());
            return process;
        } finally {
            controller.dispose();
            super.simpleController = null;
        }
    }

    /**
     * Recursively collects documents from clusters.
     */
    public Collection<Document> collectDocuments(Collection<Cluster> clusters)
    {
        return collectDocuments(clusters, new HashSet<Document>());
    }

    /*
     *
     */
    private Collection<Document> collectDocuments(Collection<Cluster> clusters,
        Collection<Document> documents)
    {
        for (final Cluster cluster : clusters)
        {
            documents.addAll(cluster.getDocuments());
            collectDocuments(cluster.getSubclusters());
        }

        return documents;
    }

    public static Set<String> collectClusterLabels(ProcessingResult pr)
    {
        final Set<String> clusterLabels = Sets.newHashSet();
        new Cloneable()
        {
            public void dumpClusters(List<Cluster> clusters, int depth)
            {
                for (Cluster c : clusters) {
                    clusterLabels.add(c.getLabel());
                    if (c.getSubclusters() != null) {
                        dumpClusters(c.getSubclusters(), depth + 1);
                    }
                }
            }
        }.dumpClusters(pr.getClusters(), 0);

        return clusterLabels;
    }
   
    public static void dumpClusterLabels(ProcessingResult pr)
    {
        new Cloneable()
        {
            public void dumpClusters(List<Cluster> clusters, int depth)
            {
                String indent = Strings.repeat("  ", depth);
                for (Cluster c : clusters) {
                    System.out.println(indent + c.getLabel());
                    if (c.getSubclusters() != null) {
                        dumpClusters(c.getSubclusters(), depth + 1);
                    }
                }
            }
        }.dumpClusters(pr.getClusters(), 0);
    }   
}
TOP

Related Classes of org.carrot2.core.test.ClusteringAlgorithmTestBase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.