Package org.apache.nutch.clustering.carrot2

Source Code of org.apache.nutch.clustering.carrot2.Clusterer

/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.clustering.carrot2;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.List;
import java.util.Iterator;

import org.apache.nutch.clustering.HitsCluster;
import org.apache.nutch.clustering.OnlineClusterer;
import org.apache.nutch.searcher.HitDetails;
import com.dawidweiss.carrot.core.local.*;
import com.dawidweiss.carrot.core.local.clustering.RawCluster;
import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
import com.dawidweiss.carrot.util.tokenizer.SnippetTokenizerLocalFilterComponent;
import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;

import com.dawidweiss.carrot.util.tokenizer.languages.dutch.Dutch;
import com.dawidweiss.carrot.util.tokenizer.languages.english.English;
import com.dawidweiss.carrot.util.tokenizer.languages.french.French;
import com.dawidweiss.carrot.util.tokenizer.languages.german.German;
import com.dawidweiss.carrot.util.tokenizer.languages.italian.Italian;
import com.dawidweiss.carrot.util.tokenizer.languages.spanish.Spanish;
import com.dawidweiss.carrot.core.local.linguistic.Language;


/**
* A plugin providing an implementation of the {@link OnlineClusterer} extension
* using clustering components of the Carrot2 project
* (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
*
* @author Dawid Weiss
* @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
*/
public class Clusterer implements OnlineClusterer {
  private final LocalController controller;

  /**
   * An empty public constructor for making new instances
   * of the clusterer.
   */
  public Clusterer() {
    controller = new LocalControllerBase();
    addComponentFactories();
    addProcesses();
  }

  /** Adds the required component factories to a local Carrot2 controller. */
  private void addComponentFactories() {
    // Local nutch input component
    LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
      public LocalComponent getInstance() {
        return new LocalNutchInputComponent();
      }
    };
    controller.addLocalComponentFactory("input.localnutch", nutchInputFactory);
   
    // Cluster consumer output component
    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
      public LocalComponent getInstance() {
        return new ClustersConsumerOutputComponent();
      }
    };
    controller.addLocalComponentFactory("output.cluster-consumer",
      clusterConsumerOutputFactory);
   
    // Clustering component here.
    LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
      public LocalComponent getInstance() {
        HashMap defaults = new HashMap();
       
        // These are adjustments settings for the clustering algorithm...
        // You can play with them, but the values below are our 'best guess'
        // settings that we acquired experimentally.
        defaults.put("lsi.threshold.clusterAssignment", "0.150");
        defaults.put("lsi.threshold.candidateCluster""0.775");

        // TODO: this should be eventually replaced with documents from Nutch
        // tagged with a language tag. There is no need to again determine
        // the language of a document.
        return new LingoLocalFilterComponent(
          // If you want to include Polish in the list of supported languages,
          // you have to download a separate Carrot2-component called
          // carrot2-stemmer-lametyzator.jar, put it in classpath
          // and add new Polish() below.
          new Language[]
          {
            new English(),
            new Dutch(),
            new French(),
            new German(),
            new Italian(),
            new Spanish()
          }, defaults);
      }
    };
    controller.addLocalComponentFactory("filter.lingo-old", lingoFactory);     
  }

  /** Adds a clustering process to the local controller */ 
  private void addProcesses() {
    LocalProcessBase lingoNMFKM3
      = new LocalProcessBase(
        "input.localnutch",
        "output.cluster-consumer",
        new String [] {"filter.lingo-old"},
        "Example the Lingo clustering algorithm.",
        "");

    try {
      controller.addProcess("lingo-nmf-km-3", lingoNMFKM3);
    } catch (Exception e) {
      throw new RuntimeException("Could not assemble clustering process.", e);
    }
  }
 
  /**
   * See {@link OnlineClusterer} for documentation.
   */
  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
    Map requestParams = new HashMap();
    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
      hitDetails);
    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
      descriptions);
    try {
      ProcessingResult result =
        controller.query("lingo-nmf-km-3", "pseudo-query", requestParams);

      ClustersConsumerOutputComponent.Result output =
        (ClustersConsumerOutputComponent.Result) result.getQueryResult();

      List outputClusters = output.clusters;
      HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];

      int j = 0;
      for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
        RawCluster rcluster = (RawCluster) i.next();
        clusters[j] = new HitsClusterAdapter(rcluster, hitDetails);
      }

      // invoke Carrot2 process here.
      return clusters;
    } catch (MissingProcessException e) {
      throw new RuntimeException("Missing clustering process.", e);
    } catch (Exception e) {
      throw new RuntimeException("Unidentified problems with the clustering.", e);
    }
  }
}
TOP

Related Classes of org.apache.nutch.clustering.carrot2.Clusterer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.