/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.source.solr;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.source.xml.RemoteXmlSimpleSearchEngineBase;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Output;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.resource.ClassLoaderResource;
import org.carrot2.util.resource.ClassResource;
import org.carrot2.util.resource.FileResource;
import org.carrot2.util.resource.IResource;
import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
 * Fetches documents from an instance of Solr.
 *
 * @see <a href="http://lucene.apache.org/solr/">Apache SOLR</a>
 */
@Bindable(prefix = "SolrDocumentSource")
public class SolrDocumentSource extends RemoteXmlSimpleSearchEngineBase
{
    /** Name of the attribute group used by the field mapping attributes of this class. */
    protected static final String FIELD_MAPPING = "Index field mapping";

    /**
     * Solr service URL base. The URL base can contain additional Solr parameters,
     * for example: <tt>http://localhost:8983/solr/select?fq=timestamp:[NOW-24HOUR TO NOW]</tt>
     */
    @Input
    @Processing
    @Attribute
    @Label("Service URL")
    @Level(AttributeLevel.ADVANCED)
    @Group(SERVICE)
    public String serviceUrlBase = "http://localhost:8983/solr/select";

    /**
     * Filter query appended to {@link #serviceUrlBase}.
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Filter query")
    @Level(AttributeLevel.MEDIUM)
    @Group(SERVICE)
    public String solrFilterQuery = "";

    /**
     * Title field name. Name of the Solr field that will provide document titles.
     */
    @Input
    @Processing
    @Attribute
    @Label("Title field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrTitleFieldName = "title";

    /**
     * Summary field name. Name of the Solr field that will provide document summary.
     */
    @Input
    @Processing
    @Attribute
    @Label("Summary field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrSummaryFieldName = "description";

    /**
     * URL field name. Name of the Solr field that will provide document URLs.
     */
    @Input
    @Processing
    @Attribute
    @Label("URL field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrUrlFieldName = "url";

    /**
     * Document identifier field name (specified in Solr schema). This field is necessary
     * to connect Solr-side clusters or highlighter output to documents.
     */
    @Input
    @Processing
    @Attribute
    @Label("ID field name")
    @Level(AttributeLevel.MEDIUM)
    @Group(FIELD_MAPPING)
    public String solrIdFieldName;

    /**
     * Provides a custom XSLT stylesheet for converting from Solr's output to
     * an XML format <a href="http://download.carrot2.org/head/manual/index.html#section.architecture.xml-formats">
     * parsed by Carrot2</a>. For performance reasons this attribute
     * can be provided at initialization time only (no processing-time overrides).
     */
    @Input
    @Init
    @Attribute
    @Label("Custom XSLT adapter from Solr to Carrot2 format")
    @Level(AttributeLevel.ADVANCED)
    @Group(FIELD_MAPPING)
    @ImplementingClasses(classes =
    {
        ClassLoaderResource.class,
        FileResource.class
    }, strict = false)
    public IResource solrXsltAdapter;

    /**
     * If clusters are present in the Solr output they will be read and exposed to components
     * further down the processing chain. Note that {@link #solrIdFieldName} is required to match
     * document references.
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Read Solr clusters if present")
    @Level(AttributeLevel.BASIC)
    @Group(FIELD_MAPPING)
    public boolean readClusters = false;

    /**
     * If highlighter fragments are present in the Solr output they will be used (and preferred) over full
     * field content. This may be used to decrease the memory required for clustering. In general if highlighter
     * is used the contents of full fields won't be emitted from Solr though (because it makes little sense).
     *
     * <p>Setting this option to <code>false</code> will disable using the highlighter output
     * entirely.</p>
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Use highlighter output if present")
    @Level(AttributeLevel.BASIC)
    @Group(FIELD_MAPPING)
    public boolean useHighlighterOutput = true;

    /**
     * Copy Solr fields from the search result to Carrot2 {@link Document} instances (as fields).
     */
    @Input
    @Init
    @Processing
    @Attribute
    @Label("Copy Solr document fields")
    @Level(AttributeLevel.ADVANCED)
    @Group(FIELD_MAPPING)
    public boolean copyFields = false;

    /**
     * If {@link #readClusters} is <code>true</code> and clusters are present in the input
     * XML, they will be deserialized and exposed to components further down the processing
     * chain.
     */
    @Processing
    @Input @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS)
    @Label("Clusters")
    @Level(AttributeLevel.BASIC)
    @Group(DefaultGroups.RESULT_INFO)
    public List<Cluster> clusters;

    /**
     * Publishes the clusters found in the processing result through {@link #clusters}.
     * Nothing happens unless {@link #readClusters} is <code>true</code> and the processing
     * result actually carries clusters. The published clusters are copies in which only
     * documents whose string identifier occurs among the fetched documents are kept.
     *
     * @param response the search engine response (not used by this implementation)
     * @param processingResult the result from which documents and clusters are read
     */
    @Override
    protected void afterFetch(SearchEngineResponse response,
        ProcessingResult processingResult)
    {
        if (!readClusters)
        {
            return;
        }

        List<Document> documents = processingResult.getDocuments();
        if (documents == null)
        {
            // A missing document list is treated as "no documents fetched".
            documents = Collections.emptyList();
        }

        final List<Cluster> fetchedClusters = processingResult.getClusters();
        if (fetchedClusters == null)
        {
            return;
        }

        // Identifiers of all fetched documents; cluster members are matched against these.
        final Set<String> ids = Sets.newHashSet();
        for (Document doc : documents)
        {
            ids.add(doc.getStringId());
        }

        final Predicate<Document> docFilter = new Predicate<Document>()
        {
            @Override
            public boolean apply(Document input)
            {
                return input != null && ids.contains(input.getStringId());
            }
        };

        this.clusters = sanityCheck(fetchedClusters, docFilter);
    }

    /**
     * Builds the URL of the Solr request. The URL consists of {@link #serviceUrlBase}
     * followed by the URL-encoded query (<code>q</code>), the URL-encoded
     * {@link #solrFilterQuery} (<code>fq</code>, only when it is neither <code>null</code>
     * nor empty), and the <code>start</code>, <code>rows</code> and <code>indent=off</code>
     * parameters.
     *
     * @return the complete request URL
     */
    @Override
    protected String buildServiceUrl()
    {
        // Continue an existing query string if the base URL already has one.
        final String separator = serviceUrlBase.contains("?") ? "&" : "?";

        final String filterQueryPart;
        if (Strings.isNullOrEmpty(solrFilterQuery))
        {
            filterQueryPart = "";
        }
        else
        {
            filterQueryPart = "&fq=" + urlEncode(solrFilterQuery);
        }

        return serviceUrlBase
            + separator
            + "q=" + urlEncode(query)
            + filterQueryPart
            + "&start=" + start
            + "&rows=" + results
            + "&indent=off";
    }

    /**
     * Returns the XSLT stylesheet used to convert Solr's output.
     *
     * @return {@link #solrXsltAdapter} if one has been provided, otherwise the
     *         <code>solr-to-c2.xsl</code> resource referenced through this class
     */
    @Override
    protected IResource getXsltResource()
    {
        if (solrXsltAdapter != null)
        {
            return solrXsltAdapter;
        }
        return new ClassResource(SolrDocumentSource.class, "solr-to-c2.xsl");
    }

    /**
     * Collects the parameters handed over to the XSLT stylesheet: the configured field
     * names and the highlighter / field copying switches.
     *
     * @return a new, mutable map of stylesheet parameter names to their values
     */
    @Override
    protected Map<String, String> getXsltParameters()
    {
        final Map<String, String> parameters = Maps.newHashMap();
        parameters.put("solr.title-field", solrTitleFieldName);
        parameters.put("solr.summary-field", solrSummaryFieldName);
        parameters.put("solr.url-field", solrUrlFieldName);
        // The identifier field is optional; an empty string is passed when it is not set.
        parameters.put("solr.id-field", Strings.nullToEmpty(solrIdFieldName));
        parameters.put("solr.use-highlighter-output", Boolean.toString(useHighlighterOutput));
        parameters.put("solr.copy-fields", Boolean.toString(copyFields));
        return parameters;
    }

    /**
     * Creates copies of the given clusters, recursively including their subclusters.
     * Each copy receives the phrases of its source cluster and those documents of the
     * source cluster that are accepted by <code>docFilter</code>. Only phrases, documents
     * and subclusters are carried over to the copies.
     *
     * @param in clusters to copy, must not be <code>null</code>
     * @param docFilter decides which documents are kept in the copies
     * @return a new list with one copy for each input cluster, in the same order
     */
    private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter)
    {
        final List<Cluster> cloned = Lists.newArrayListWithCapacity(in.size());
        for (Cluster source : in)
        {
            final Cluster copy = new Cluster();
            copy.addPhrases(source.getPhrases());
            copy.addDocuments(Iterables.filter(source.getDocuments(), docFilter));
            copy.addSubclusters(sanityCheck(source.getSubclusters(), docFilter));
            cloned.add(copy);
        }
        return cloned;
    }
}