Package org.carrot2.core

Examples of org.carrot2.core.Controller


        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
View Full Code Here


  @Test
  public void testSimple() throws Exception {
    //<start id="crt2.simple"/>
    //... setup some documents elsewhere
    final Controller controller =
            ControllerFactory.createSimple();//<co id="crt2.controller.creation"/>
    documents = new ArrayList<Document>();
    for (int i = 0; i < titles.length; i++) {
      Document doc = new Document(titles[i], snippets[i],
              "file://foo_" + i + ".txt");
      documents.add(doc);
    }
    final ProcessingResult result = controller.process(documents,
            "red fox",
            LingoClusteringAlgorithm.class);//<co id="crt2.process"/>
    displayResults(result);//<co id="crt2.print"/>

    /*
 
View Full Code Here

         * [[[end:using-attributes-raw-map-intro]]]
         */
        {
            // [[[start:using-attributes-raw-map]]]
            /* A controller to manage the processing pipeline. */
            final Controller controller = ControllerFactory.createSimple();
           
            /* Prepare attribute map */
            final Map<String, Object> attributes = new HashMap<String, Object>();

            /* Put attribute values using direct keys. */
            attributes.put(CommonAttributesDescriptor.Keys.QUERY, "data mining");
            attributes.put(CommonAttributesDescriptor.Keys.RESULTS, 100);
            attributes.put("LingoClusteringAlgorithm.desiredClusterCountBase", 15);

            /* Put your own API key here! */
            attributes.put(Bing3WebDocumentSourceDescriptor.Keys.APPID, BingKeyAccess.getKey());
           
            /* Perform processing */
            final ProcessingResult result = controller.process(attributes,
                Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
   
            /* Documents fetched from the document source, clusters created by Carrot2. */
            final List<Document> documents = result.getDocuments();
            final List<Cluster> clusters = result.getClusters();
            // [[[end:using-attributes-raw-map]]]
           
            ConsoleFormatter.displayResults(result);
        }
       
        /* [[[start:using-attributes-builders-intro]]]
         *
         * <div>
         * <p>
         * As an alternative to the raw attribute map used in the previous example, you
         * can use attribute map builders. Attribute map builders have a number of advantages:
         * </p>
         *
         * <ul>
         * <li>Type-safety: the correct type of the value will be enforced at compile time</li>
         * <li>Error prevention: unexpected results caused by typos in attribute name strings are avoided</li>
         * <li>Early error detection: in case an attribute's key changes, your compiler will detect that</li>
         * <li>IDE support: your IDE will suggest the right method names and parameters</li>
         * </ul>
         *
         * <p>
         * A possible disadvantage of attribute builders is that one algorithm's attributes can
         * be divided into a number of builders and hence not readily available in your IDE's auto
         * complete window. Please consult attribute documentation in Carrot2 manual for pointers to
         * the appropriate builder classes and methods.
         * </p>
         *
         * <p>
         * The code shown below fetches 100 results for query <em>data mining</em> from
         * {@link org.carrot2.source.microsoft.Bing3WebDocumentSource} and clusters them using
         * the {@link org.carrot2.clustering.lingo.LingoClusteringAlgorithm} tuned to create slightly
         * fewer clusters than by default. Please note how the API key is passed and use your own
         * key in production deployments.
         * </p>
         * </div>
         *
         * [[[end:using-attributes-builders-intro]]]
         */
        {
            /// [[[start:using-attributes-builders]]]
            /* A controller to manage the processing pipeline. */
            final Controller controller = ControllerFactory.createSimple();
           
            /* Prepare attribute map */
            final Map<String, Object> attributes = new HashMap<String, Object>();

            /* Put values using attribute builders */
            CommonAttributesDescriptor
                .attributeBuilder(attributes)
                    .query("data mining")
                    .results(100);
            LingoClusteringAlgorithmDescriptor
                .attributeBuilder(attributes)
                    .desiredClusterCountBase(15)
                    .matrixReducer()
                        .factorizationQuality(FactorizationQuality.HIGH);
                       
            Bing3WebDocumentSourceDescriptor
                .attributeBuilder(attributes)
                    .appid(BingKeyAccess.getKey()); // use your own key here
           
            /* Perform processing */
            final ProcessingResult result = controller.process(attributes,
                Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
   
            /* Documents fetched from the document source, clusters created by Carrot2. */
            final List<Document> documents = result.getDocuments();
            final List<Cluster> clusters = result.getClusters();
            /// [[[end:using-attributes-builders]]]
           
            ConsoleFormatter.displayResults(result);
        }
       
        /* [[[start:using-attributes-output-intro]]]
         * <div>
         * <p>
         * Some algorithms apart from clusters can produce additional, usually
         * diagnostic, output. The output is present in the attributes map contained
         * in the {@link org.carrot2.core.ProcessingResult}. You can read the contents
         * of that map directly or through the attribute map builders. Carrot2 manual
         * lists and describes in detail the output attributes of each component.
         * </p>
         * <p>
         * The code shown below clusters an example collection of
         * {@link org.carrot2.core.Document}s using the Lingo algorithm. Lingo can
         * optionally use native platform-specific matrix computation libraries. The
         * example code reads an attribute to find out whether such libraries were
         * successfully loaded and used.
         * </p>
         * </div>
         * [[[end:using-attributes-output-intro]]]
         */
        {
            /// [[[start:using-attributes-output]]]
            /* A controller to manage the processing pipeline. */
            final Controller controller = ControllerFactory.createSimple();
           
            /* Prepare attribute map */
            final Map<String, Object> attributes = new HashMap<String, Object>();
            CommonAttributesDescriptor
                .attributeBuilder(attributes)
                    .documents(SampleDocumentData.DOCUMENTS_DATA_MINING);
            LingoClusteringAlgorithmDescriptor
                .attributeBuilder(attributes)
                    .desiredClusterCountBase(15)
                    .matrixReducer()
                        .factorizationQuality(FactorizationQuality.HIGH);

            /* Perform processing */
            final ProcessingResult result = controller.process(attributes,
                LingoClusteringAlgorithm.class);
           
            /* Clusters created by Carrot2, read processing time */
            final List<Cluster> clusters = result.getClusters();
            final Long clusteringTime = CommonAttributesDescriptor.attributeBuilder(
View Full Code Here

         * [[[end:clustering-data-from-document-sources-simple-intro]]]
         */
        {
            /// [[[start:clustering-data-from-document-sources-simple]]]
            /* A controller to manage the processing pipeline. */
            final Controller controller = ControllerFactory.createSimple();

            /* Perform processing */
            final ProcessingResult result = controller.process("data mining", 100,
                EToolsDocumentSource.class, LingoClusteringAlgorithm.class);
   
            /* Documents fetched from the document source, clusters created by Carrot2. */
            final List<Document> documents = result.getDocuments();
            final List<Cluster> clusters = result.getClusters();
            /// [[[end:clustering-data-from-document-sources-simple]]]
           
            ConsoleFormatter.displayResults(result);
        }
       
        /* [[[start:clustering-data-from-document-sources-advanced-intro]]]
         *
         * If your production code needs to fetch documents from popular search engines,
         * it is very important that you generate and use your own API key rather than Carrot2's
         * default one. You can pass the API key along with the query and the requested
         * number of results in an attribute map. Carrot2 manual lists all supported attributes
         * along with their keys, types and allowed values. The code shown below fetches and clusters
         * 50 results from {@link org.carrot2.source.microsoft.Bing3WebDocumentSource}.
         *
         * [[[end:clustering-data-from-document-sources-advanced-intro]]]
         */
        {
            /// [[[start:clustering-data-from-document-sources-advanced]]]
            /* A controller to manage the processing pipeline. */
            final Controller controller = ControllerFactory.createSimple();
   
            /* Prepare attributes */
            final Map<String, Object> attributes = new HashMap<String, Object>();
           
            /* Put your own API key here! */
            Bing3WebDocumentSourceDescriptor.attributeBuilder(attributes)
                .appid(BingKeyAccess.getKey());

            /* Query and the required number of results */
            attributes.put(CommonAttributesDescriptor.Keys.QUERY, "clustering");
            attributes.put(CommonAttributesDescriptor.Keys.RESULTS, 50);
   
            /* Perform processing */
            final ProcessingResult result = controller.process(attributes,
                Bing3WebDocumentSource.class, STCClusteringAlgorithm.class);

            /* Documents fetched from the document source, clusters created by Carrot2. */
            final List<Document> documents = result.getDocuments();
            final List<Cluster> clusters = result.getClusters();
View Full Code Here

        // [[[start:clustering-non-english-content]]]
        /*
         * We use a Controller that reuses instances of Carrot2 processing components
         * and caches results produced by document sources.
         */
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        /*
         * In the first call, we'll cluster a document list, setting the language for each
         * document separately.
         */
        final List<Document> documents = Lists.newArrayList();
        for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING)
        {
            documents.add(new Document(document.getTitle(), document.getSummary(),
                document.getContentUrl(), LanguageCode.ENGLISH));
        }

        final Map<String, Object> attributes = Maps.newHashMap();
        CommonAttributesDescriptor.attributeBuilder(attributes)
            .documents(documents);
        final ProcessingResult englishResult = controller.process(
            attributes, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayResults(englishResult);

        /*
         * In the second call, we will fetch results for a Chinese query from Bing,
         * explicitly setting Bing's specific language attribute. Based on that
         * attribute, the document source will set the appropriate language for each
         * document.
         */
        attributes.clear();
       
        CommonAttributesDescriptor.attributeBuilder(attributes)
            .query("聚类" /* clustering? */)
            .results(100);

        Bing3WebDocumentSourceDescriptor.attributeBuilder(attributes)
            .market(MarketOption.CHINESE_CHINA);
        Bing3WebDocumentSourceDescriptor
            .attributeBuilder(attributes)
                .appid(BingKeyAccess.getKey()); // use your own ID here!

        final ProcessingResult chineseResult = controller.process(attributes,
            Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayResults(chineseResult);

        /*
         * In the third call, we will fetch results for the same Chinese query from
         * Google. As Google document source does not have its specific attribute for
         * setting the language, it will not set the documents' language for us. To make
         * sure the right lexical resources are used, we will need to set the
         * MultilingualClustering.defaultLanguage attribute to Chinese on our own.
         */
        attributes.clear();
       
        CommonAttributesDescriptor.attributeBuilder(attributes)
            .query("聚类" /* clustering? */)
            .results(100);

        MultilingualClusteringDescriptor.attributeBuilder(attributes)
            .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

        final ProcessingResult chineseResult2 = controller.process(attributes,
            GoogleDocumentSource.class, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayResults(chineseResult2);
        // [[[end:clustering-non-english-content]]]
    }
View Full Code Here

         * Create a caching controller that will reuse processing component instances, but
         * will not perform any caching of results produced by components. We will leave
         * caching of documents from Lucene index to Lucene and the operating system
         * caches.
         */
        final Controller controller = ControllerFactory.createPooling();

        /*
         * Prepare a map with component-specific attributes. Here, this map will contain
         * the index location and names of fields to be used to fetch document title and
         * summary.
         */
        final Map<String, Object> luceneGlobalAttributes = new HashMap<String, Object>();

        String indexPath = "put your index path here or pass as the first argument";
        if (args.length == 1)
        {
            indexPath = args[0];
        }

        // Sanity check.
        if (!new File(indexPath).isDirectory()) {
            System.err.println("Index directory does not exist: " + indexPath);
            return;
        }

        LuceneDocumentSourceDescriptor
            .attributeBuilder(luceneGlobalAttributes)
            .directory(FSDirectory.open(new File(indexPath)));

        /*
         * In ClusteringDataFromLucene we used a simple configuration of
         * LuceneDocumentSource whereby we only provided the names of Lucene fields to be
         * used for titles and summaries. If more advanced mapping of Lucene documents is
         * required, you can implement your own version of IFieldMapper as below.
         *
         * Note that we could also provide here an instance of the mapper rather than
         * its class. The differences are summarized below:
         *
         * > Class: Class has to have a no-parameter constructor. Instances of the
         *   class will not be shared between processing threads, which means the
         *   implementation does not have to be thread-safe. Recommended in most
         *   situations unless the instances are expensive to create.
         *  
         * > Instance: The provided instance will be shared across processing threads,
         *   which means the implementation MUST be thread-safe.
         */
        LuceneDocumentSourceDescriptor
            .attributeBuilder(luceneGlobalAttributes)
            .fieldMapper(new CustomFieldMapper());

        /*
         * The Analyzer used by Lucene while searching can also be provided via factory
         * because it does not have a parameterless constructor.
         */
        LuceneDocumentSourceDescriptor
            .attributeBuilder(luceneGlobalAttributes)
            .analyzer(StandardAnalyzerFactory.class);

        /*
         * Initialize the controller passing the above attributes as component-specific
         * for Lucene. The global attributes map will be empty. Note that we've provided
         * an identifier for our specially-configured Lucene component, we'll need to use
         * this identifier when performing processing.
         */
        controller.init(
            new HashMap<String, Object>(),
            new ProcessingComponentConfiguration(
                LuceneDocumentSource.class, "lucene", luceneGlobalAttributes));

        /*
         * Perform processing.
         */
        final String query = "mining";
        final Map<String, Object> processingAttributes = Maps.newHashMap();
        CommonAttributesDescriptor.attributeBuilder(processingAttributes)
            .query(query);

        /*
         * We need to refer to the Lucene component by its identifier we set during
         * initialization. As we've not assigned any identifier to the
         * LingoClusteringAlgorithm we want to use, we can use its fully qualified class name.
         */
        ProcessingResult process = controller.process(
            processingAttributes, "lucene", LingoClusteringAlgorithm.class.getName());

        ConsoleFormatter.displayResults(process);
    }
View Full Code Here

*/
public class UsingComponentSuites
{
    public static void main(String [] args) throws Exception
    {
        @SuppressWarnings("unchecked")
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        // Initialization-time attributes that will apply to all components.
        final Map<String, Object> initAttributes = Maps.newHashMap();

        // Prepare resource lookup facade. We will use the suites directory
        // and class path resources.
        final ResourceLookup resourceLookup = new ResourceLookup(
            new DirLocator(new File("suites")),
            new ContextClassLoaderLocator());

        // We know we'll be using Bing so set up its access key.
        // use your own ID here!
        Bing3WebDocumentSourceDescriptor
            .attributeBuilder(initAttributes)
                .appid(BingKeyAccess.getKey());
       
        // We'll read the component suite definition from an XML stream.
        // IResource is an abstraction layer over resources in Carrot2.
        IResource suiteXml = resourceLookup.getFirst("suite-examples.xml");

        // Deserialize the component suite definition.
        final ProcessingComponentSuite suite =
            ProcessingComponentSuite.deserialize(suiteXml, resourceLookup);

        // Initialize the controller with the suite. All components from the suite
        // will be available for processing within this controller.
        controller.init(initAttributes, suite.getComponentConfigurations());

        // From the suite definition, you can get the document sources and clustering
        // algorithm descriptors.
        final List<DocumentSourceDescriptor> sources = suite.getSources();
        final List<String> sourceIds = Lists.transform(sources,
            ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE);
        System.out.println("Found " + sourceIds.size() + " document sources: "
            + sourceIds);

        final List<ProcessingComponentDescriptor> algorithms = suite.getAlgorithms();
        final List<String> algorithmIds = Lists.transform(algorithms,
            ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE);
        System.out.println("Found " + algorithmIds.size() + " clutering algorithms: "
            + algorithmIds + "\n\n");

        // Run not more than two algorithms on not more than two sources
        for (int s = 0; s < Math.min(sourceIds.size(), 2); s++)
        {
            for (int a = 0; a < Math.min(algorithmIds.size(), 2); a++)
            {
                // You can retrieve some metadata about the components, such as
                // human-readable label, from their descriptors.
                System.out.println("Querying " + sources.get(s).getLabel()
                    + ", clustering with " + algorithms.get(a).getLabel());

                // As usual, we pass attributes for processing
                final Map<String, Object> attributes = Maps.newHashMap();
                CommonAttributesDescriptor.attributeBuilder(attributes)
                    .query("data mining");

                // Pass component ids to the controller to perform processing
                final ProcessingResult result = controller.process(attributes,
                    sourceIds.get(s), algorithmIds.get(a));
                ConsoleFormatter.displayClusters(result.getClusters());
                System.out.println();
            }
        }
View Full Code Here

   
    @UsesExternalServices
    @Test
    public void testRequestIndependence()
    {
        @SuppressWarnings("unchecked")
        final Controller controller = ControllerFactory.createCachingPooling(
            org.carrot2.core.IDocumentSource.class);
        closeAfterTest(controller);

        final Map<String,Object> attrs = Maps.newHashMap();

        CommonAttributesDescriptor.attributeBuilder(attrs)
            .results(50)
            .query("data mining");
       
        controller.process(attrs,
            org.carrot2.webapp.source.WebDocumentSource.class, LingoClusteringAlgorithm.class);              

        attrs.clear();
        CommonAttributesDescriptor.attributeBuilder(attrs)
            .results(50)
            .query(WebDocumentSource.QUERY_FAILURE);

        try {
            controller.process(attrs,
                org.carrot2.webapp.source.WebDocumentSource.class, LingoClusteringAlgorithm.class);
            fail();
        } catch (ProcessingException e) {
            assertThat(e.getCause().getMessage()).contains("Synthetic failure");
        }
View Full Code Here

*/
public class UsingCustomLexicalResources
{
    public static void main(String [] args)
    {
        @SuppressWarnings("unchecked")
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        // We will pass our custom resource locator at initialization time. There is a
        // variety of implementations of IResourceLocator interface, we will use
        // an explicit filesystem folder in the current working directory.
        File resourcesDir = new File("resources");
        ResourceLookup resourceLookup = new ResourceLookup(new DirLocator(resourcesDir));

        Map<String, Object> attrs = Maps.newHashMap();

        // Note that we tell the linguistic component to merge all lexical resources,
        // this is the default setting and it usually helps with multi-lingual content.
        DefaultLexicalDataFactoryDescriptor.attributeBuilder(attrs)
            .mergeResources(true);
        LexicalDataLoaderDescriptor.attributeBuilder(attrs)
            .resourceLookup(resourceLookup);

        controller.init(attrs);

        // Cluster some data with Lingo and STC.
        clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
        clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
    }
View Full Code Here

            {
                documents.add(new Document(row[1], row[2], row[0]));
            }

            /* A controller to manage the processing pipeline. */
            final Controller controller = ControllerFactory.createSimple();

            /*
             * Perform clustering by topic using the Lingo algorithm. Lingo can
             * take advantage of the original query, so we provide it along with the documents.
             */
            final ProcessingResult byTopicClusters = controller.process(documents, "data mining",
                LingoClusteringAlgorithm.class);
            final List<Cluster> clustersByTopic = byTopicClusters.getClusters();
           
            /* Perform clustering by domain. In this case query is not useful, hence it is null. */
            final ProcessingResult byDomainClusters = controller.process(documents, null,
                ByUrlClusteringAlgorithm.class);
            final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
            // [[[end:clustering-document-list]]]
           
            ConsoleFormatter.displayClusters(clustersByTopic);
View Full Code Here

TOP

Related Classes of org.carrot2.core.Controller

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.