/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.clustering.carrot2;
import java.util.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.clustering.HitsCluster;
import org.apache.nutch.clustering.OnlineClusterer;
import org.apache.nutch.searcher.HitDetails;
import com.dawidweiss.carrot.core.local.*;
import com.dawidweiss.carrot.core.local.clustering.RawCluster;
import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
import com.dawidweiss.carrot.core.local.linguistic.Language;
import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages;
import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
/**
* A plugin providing an implementation of the {@link OnlineClusterer}
* extension using clustering components of the Carrot2 project
* (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
*
* We hardcode the following Carrot2 process:
* <pre><![CDATA[
* <local-process id="nutch-lingo">
* <name>The Lingo clustering algorithm (www.carrot2.org).</name>
*
* <input component-key="input-localnutch" />
* <filter component-key="filter-lingo" />
* <output component-key="output-clustersConsumer" />
* </local-process>
* ]]></pre>
*
* @author Dawid Weiss
* @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
*/
public class Clusterer implements OnlineClusterer, Configurable {
  /** Default language property name. */
  private static final String CONF_PROP_DEFAULT_LANGUAGE =
    "extension.clustering.carrot2.defaultLanguage";

  /** Recognizable languages property name. */
  private static final String CONF_PROP_LANGUAGES =
    "extension.clustering.carrot2.languages";

  /** Internal clustering process ID in Carrot2 LocalController. */
  private static final String PROCESS_ID = "nutch-lingo";

  public static final Log logger = LogFactory.getLog(Clusterer.class);

  /** The LocalController instance used for clustering. */
  private LocalController controller;

  /** Nutch configuration. */
  private Configuration conf;

  /**
   * Default language for hits. English by default, but may be changed
   * via a property in Nutch configuration.
   */
  private String defaultLanguage = "en";

  /**
   * A list of recognizable languages (ISO codes).
   * English only by default, but configurable via Nutch configuration.
   */
  private String [] languages = new String [] {defaultLanguage};

  /**
   * An empty public constructor for making new instances
   * of the clusterer. The clusterer is immediately usable with the
   * built-in defaults; {@link #setConf(Configuration)} rebuilds it with
   * the configured languages.
   */
  public Clusterer() {
    initialize();
  }

  /**
   * (Re)creates the local controller and registers the component
   * factories and the clustering process on it.
   */
  private synchronized void initialize() {
    controller = new LocalControllerBase();
    addComponentFactories();
    addProcesses();
  }

  /** Adds the required component factories to a local Carrot2 controller. */
  private void addComponentFactories() {
    // * <input component-key="input-localnutch" />
    LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
      public LocalComponent getInstance() {
        return new LocalNutchInputComponent(defaultLanguage);
      }
    };
    controller.addLocalComponentFactory("input-localnutch", nutchInputFactory);

    // * <filter component-key="filter-lingo" />
    LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
      public LocalComponent getInstance() {
        HashMap defaults = new HashMap();
        // These are adjustments settings for the clustering algorithm.
        // If you try the live WebStart demo of Carrot2 you can see how they affect
        // the final clustering: http://www.carrot2.org/webstart
        defaults.put("lsi.threshold.clusterAssignment", "0.150");
        defaults.put("lsi.threshold.candidateCluster", "0.775");

        // Initialize a new Lingo clustering component.
        return new LingoLocalFilterComponent(loadLanguages(), defaults);
      }
    };
    controller.addLocalComponentFactory("filter-lingo", lingoFactory);

    // * <output component-key="output-clustersConsumer" />
    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
      public LocalComponent getInstance() {
        return new ClustersConsumerOutputComponent();
      }
    };
    controller.addLocalComponentFactory("output-clustersConsumer",
      clusterConsumerOutputFactory);
  }

  /**
   * Resolves the currently enabled language codes to Carrot2
   * {@link Language} instances.
   *
   * Loading is best-effort: a code for which Carrot2 returns no language,
   * or whose lookup throws, is logged as a warning and skipped so that the
   * remaining languages are still used.
   *
   * @return the languages that could be loaded; may be empty.
   */
  private Language [] loadLanguages() {
    final String [] codes = languages;
    final ArrayList languageList = new ArrayList(codes.length);
    for (int i = 0; i < codes.length; i++) {
      final String lcode = codes[i];
      try {
        Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
        if (lang == null) {
          if (logger.isWarnEnabled()) {
            logger.warn("Language not supported in Carrot2: " + lcode);
          }
        } else {
          languageList.add(lang);
          if (logger.isDebugEnabled()) {
            logger.debug("Language loaded: " + lcode);
          }
        }
      } catch (Throwable t) {
        // Deliberately broad: a failure to load one language must not
        // prevent the others from being loaded.
        if (logger.isWarnEnabled()) {
          logger.warn("Language could not be loaded: " + lcode, t);
        }
      }
    }
    return (Language []) languageList.toArray(new Language [languageList.size()]);
  }

  /**
   * Adds a hardcoded clustering process to the local controller.
   *
   * @throws RuntimeException if the controller rejects the process.
   */
  private void addProcesses() {
    LocalProcessBase process = new LocalProcessBase(
      "input-localnutch",                                   // input
      "output-clustersConsumer",                            // output
      new String [] {"filter-lingo"},                       // filters
      "The Lingo clustering algorithm (www.carrot2.org).",
      "");
    try {
      controller.addProcess(PROCESS_ID, process);
    } catch (Exception e) {
      throw new RuntimeException("Could not assemble clustering process.", e);
    }
  }

  /**
   * See {@link OnlineClusterer} for documentation.
   *
   * @param hitDetails details of the hits to cluster.
   * @param descriptions summaries passed to the input component together
   *        with <code>hitDetails</code>.
   * @return one {@link HitsCluster} per cluster produced by Carrot2.
   * @throws RuntimeException if the clustering process is missing or the
   *         clustering fails for any other reason (the cause is preserved).
   */
  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
    Map requestParams = new HashMap();
    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
      hitDetails);
    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
      descriptions);

    try {
      // Invoke the Carrot2 process. The input component takes Nutch's
      // results so we don't need the query argument.
      final ProcessingResult result =
        controller.query(PROCESS_ID, "no-query", requestParams);

      final ClustersConsumerOutputComponent.Result output =
        (ClustersConsumerOutputComponent.Result) result.getQueryResult();

      // Wrap every raw Carrot2 cluster in a Nutch-facing adapter.
      final List outputClusters = output.clusters;
      final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];

      int j = 0;
      for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
        RawCluster rcluster = (RawCluster) i.next();
        clusters[j] = new HitsClusterAdapter(rcluster, hitDetails);
      }

      return clusters;
    } catch (MissingProcessException e) {
      throw new RuntimeException("Missing clustering process.", e);
    } catch (Exception e) {
      throw new RuntimeException("Unidentified problems with the clustering.", e);
    }
  }

  /**
   * Implementation of {@link Configurable}.
   *
   * Reads the default language and the list of enabled languages from the
   * given configuration and rebuilds the clustering controller. Configured
   * values are trimmed; blank values are ignored so that the previous
   * (initially built-in) settings stay in effect.
   *
   * @param conf the Nutch configuration to read the settings from.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    // Configure default language and other component settings.
    final String configuredDefault = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
    if (configuredDefault != null) {
      final String trimmedDefault = configuredDefault.trim();
      if (trimmedDefault.length() > 0) {
        // Change the default language.
        this.defaultLanguage = trimmedDefault;
      }
    }

    // A configured list such as "en, de" may yield codes with surrounding
    // whitespace, which would not resolve to a language later on.
    final String [] configuredLanguages =
      cleanLanguageCodes(conf.getStrings(CONF_PROP_LANGUAGES));
    if (configuredLanguages.length > 0) {
      this.languages = configuredLanguages;
    }

    if (logger.isInfoEnabled()) {
      logger.info("Default language: " + defaultLanguage);
      logger.info("Enabled languages: " + Arrays.asList(languages));
    }

    initialize();
  }

  /**
   * Trims the given language codes and drops <code>null</code> or blank
   * entries.
   *
   * @param codes raw codes, possibly <code>null</code>.
   * @return the cleaned codes in their original order; never
   *         <code>null</code>, empty if nothing usable was given.
   */
  private static String [] cleanLanguageCodes(String [] codes) {
    if (codes == null) {
      return new String [0];
    }
    final ArrayList cleaned = new ArrayList(codes.length);
    for (int i = 0; i < codes.length; i++) {
      if (codes[i] == null) {
        continue;
      }
      final String code = codes[i].trim();
      if (code.length() > 0) {
        cleaned.add(code);
      }
    }
    return (String []) cleaned.toArray(new String [cleaned.size()]);
  }

  /**
   * Implementation of {@link Configurable}.
   *
   * @return the configuration last passed to
   *         {@link #setConf(Configuration)}, or <code>null</code> if none.
   */
  public Configuration getConf() {
    return conf;
  }
}