package io.crate.metadata;
import com.google.common.collect.ImmutableSet;
import org.apache.lucene.analysis.Analyzer;
import io.crate.Constants;
import io.crate.exceptions.AnalyzerInvalidException;
import io.crate.exceptions.AnalyzerUnknownException;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.loader.JsonSettingsLoader;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
 * Service to get built-in and custom analyzers, tokenizers, token filters and char filters.
 */
public class FulltextAnalyzerResolver {
// Source of the persistent cluster settings that hold custom analysis definitions.
private final ClusterService clusterService;
// Registry queried for built-in analyzers, tokenizers, token filters and char filters.
private final IndicesAnalysisService indicesAnalysisService;

// Redefined list of extended token filters that are not available outside of
// a concrete index (see AnalyzerModule.ExtendedProcessor);
// stripped of the PreBuilt<Thingy> entries (e.g. PreBuiltTokenFilters).
private static final ImmutableSet<String> EXTENDED_BUILTIN_TOKEN_FILTERS = ImmutableSet.of(
        "limit", "delimited_payload_filter", "synonym",
        "keep", "pattern_capture", "pattern_replace",
        "dictionary_decompounder", "hyphenation_decompounder",
        "keyword_marker", "stemmer_override",
        "hunspell", "cjk_bigram", "cjk_width");

// Extended char filters, the counterpart of the token filter list above.
private static final ImmutableSet<String> EXTENDED_BUILTIN_CHAR_FILTERS = ImmutableSet
        .of("mapping", "pattern_replace");

// Settings key suffix used for saving the creation statement.
public static final String SQL_STATEMENT_KEY = "_sql_stmt";

// One shared, immutable logger for the class instead of a reassignable per-instance field.
private static final ESLogger logger = Loggers.getLogger(FulltextAnalyzerResolver.class);
// Kinds of custom analysis components. Each constant carries a name that is used as the
// type segment when settings keys are built via getName() elsewhere in this class.
// NOTE(review): the constant declarations themselves (ANALYZER, TOKENIZER, TOKEN_FILTER and
// CHAR_FILTER are referenced in this class) do not appear before the field below — confirm.
public enum CustomType {
// settings-key segment for this type, set once by the constructor
private String name;
private CustomType(String name) {
this.name = name;
// returns the name passed to the constructor
public String getName() {
return this.name;
/**
 * Creates a resolver that reads custom definitions through the given cluster service
 * and built-in components through the given indices analysis service.
 *
 * @param clusterService         stored and used for cluster state lookups
 * @param indicesAnalysisService stored and used for built-in component lookups
 */
public FulltextAnalyzerResolver(ClusterService clusterService,
                                IndicesAnalysisService indicesAnalysisService) {
    this.indicesAnalysisService = indicesAnalysisService;
    this.clusterService = clusterService;
}
public boolean hasAnalyzer(String name) {
return hasBuiltInAnalyzer(name) || hasCustomAnalyzer(name);
public boolean hasBuiltInAnalyzer(String name) {
return indicesAnalysisService.hasAnalyzer(name);
public Analyzer getBuiltInAnalyzer(String name) {
return indicesAnalysisService.analyzer(name);
/**
 * Get the names of all built-in analyzers defined in Crate.
 *
 * @return a Set of Strings
 */
// NOTE(review): the statement below appears incomplete — nothing is added to the builder
// and build() is never called; confirm where the analyzer names are supposed to come from.
public Set<String> getBuiltInAnalyzers() {
return new ImmutableSet.Builder<String>()
* get the custom analyzer created by the CREATE ANALYZER command.
* This does not include definitions for custom tokenizers, token-filters or char-filters
* @param name the name of the analyzer
* @return Settings defining a custom Analyzer
public Settings getCustomAnalyzer(String name) {
return getCustomThingy(name, CustomType.ANALYZER);
* get the source of the custom analyzer with name ``name``.
* This is the statement it was created with.
* @param name the name of the custom analyzer
* @return the source as String or null if no source exists)
public String getCustomAnalyzerSource(String name) {
return clusterService.state().metaData().persistentSettings().get(
String.format("%s.%s.%s.%s", Constants.CUSTOM_ANALYSIS_SETTINGS_PREFIX,
CustomType.ANALYZER.getName(), name, SQL_STATEMENT_KEY)
public Map<String, Settings> getCustomAnalyzers() throws IOException {
Map<String, Settings> result = new HashMap<>();
for (Map.Entry<String, String> entry : getCustomThingies(CustomType.ANALYZER)
.getAsMap().entrySet()) {
if (!entry.getKey().endsWith("." + SQL_STATEMENT_KEY)) {
result.put(entry.getKey(), decodeSettings(entry.getValue()));
return result;
public boolean hasCustomAnalyzer(String name) {
return hasCustomThingy(name, CustomType.ANALYZER);
public boolean hasTokenizer(String name) {
return hasBuiltInTokenizer(name) || hasCustomTokenizer(name);
public boolean hasBuiltInTokenizer(String name) {
return indicesAnalysisService.hasTokenizer(name);
// NOTE(review): the statement below appears incomplete — nothing is added to the builder
// and build() is never called; confirm where the tokenizer names are supposed to come from.
public Set<String> getBuiltInTokenizers() {
return new ImmutableSet.Builder<String>()
public boolean hasCustomTokenizer(String name) {
return hasCustomThingy(name, CustomType.TOKENIZER);
public Map<String, Settings> getCustomTokenizers() throws IOException {
Map<String, Settings> result = new HashMap<>();
for (Map.Entry<String, String> entry : getCustomThingies(CustomType.TOKENIZER).getAsMap
().entrySet()) {
result.put(entry.getKey(), decodeSettings(entry.getValue()));
return result;
public boolean hasCharFilter(String name) {
return hasBuiltInCharFilter(name) || hasCustomCharFilter(name);
public boolean hasBuiltInCharFilter(String name) {
return EXTENDED_BUILTIN_CHAR_FILTERS.contains(name) || indicesAnalysisService.hasCharFilter(name);
public boolean hasCustomCharFilter(String name) {
return hasCustomThingy(name, CustomType.CHAR_FILTER);
// Starts a set builder seeded with EXTENDED_BUILTIN_CHAR_FILTERS.
// NOTE(review): the statement below appears incomplete — build() is never called and any
// further sources of char filter names are missing; confirm the rest of the chain.
public Set<String> getBuiltInCharFilters() {
return new ImmutableSet.Builder<String>().addAll(EXTENDED_BUILTIN_CHAR_FILTERS)
public Map<String, Settings> getCustomCharFilters() throws IOException {
Map<String, Settings> result = new HashMap<>();
for (Map.Entry<String, String> entry : getCustomThingies(CustomType.CHAR_FILTER).getAsMap
().entrySet()) {
result.put(entry.getKey(), decodeSettings(entry.getValue()));
return result;
public boolean hasTokenFilter(String name) {
return hasBuiltInTokenFilter(name) || hasCustomTokenFilter(name);
public boolean hasBuiltInTokenFilter(String name) {
return EXTENDED_BUILTIN_TOKEN_FILTERS.contains(name) || indicesAnalysisService.hasTokenFilter(name);
// NOTE(review): the statement below appears incomplete — nothing is added to the builder
// and build() is never called; confirm where the token filter names are supposed to come from.
public Set<String> getBuiltInTokenFilters() {
return new ImmutableSet.Builder<String>()
public boolean hasCustomTokenFilter(String name) {
return hasCustomThingy(name, CustomType.TOKEN_FILTER);
public Map<String, Settings> getCustomTokenFilters() throws IOException {
Map<String, Settings> result = new HashMap<>();
for (Map.Entry<String, String> entry : getCustomThingies(CustomType.TOKEN_FILTER).getAsMap
().entrySet()) {
result.put(entry.getKey(), decodeSettings(entry.getValue()));
return result;
public static BytesReference encodeSettings(Settings settings) throws IOException {
BytesStreamOutput bso = new BytesStreamOutput();
XContentBuilder builder = XContentFactory.jsonBuilder(bso);
for (Map.Entry<String, String> entry : settings.getAsMap().entrySet()) {
builder.field(entry.getKey(), entry.getValue());
return bso.bytes();
public static Settings decodeSettings(String encodedSettings) throws IOException {
Map<String, String> loaded = new JsonSettingsLoader().load(encodedSettings);
return ImmutableSettings.builder().put(loaded).build();
* used to get custom analyzers, tokenizers, token-filters or char-filters with name ``name``
* from crate-cluster-settings
* @param name
* @param type
* @return a full settings instance for the thingy with given name and type or null if it does not exists
private Settings getCustomThingy(String name, CustomType type) {
if (name == null) {
return null;
String encodedSettings = clusterService.state().metaData().persistentSettings().get(
String.format("%s.%s.%s", Constants.CUSTOM_ANALYSIS_SETTINGS_PREFIX, type.getName(), name)
Settings decoded = null;
if (encodedSettings != null) {
try {
decoded = decodeSettings(encodedSettings);
} catch (IOException e) {
logger.warn("Could not decode settings for {} '{}'.", e, type.getName(), name);
return decoded;
// Returns the settings stored under type.getName() in settingsMap, or
// ImmutableSettings.EMPTY when there is no such entry.
// NOTE(review): the initializer of settingsMap looks truncated (persistentSettings is
// referenced without a call or any grouping step) — confirm how the per-type map is derived.
private Settings getCustomThingies(CustomType type) {
Map<String, Settings> settingsMap = clusterService.state().metaData().persistentSettings
Settings result = settingsMap.get(type.getName());
// never null, so callers can chain getAsMap() on the result directly
return result != null ? result : ImmutableSettings.EMPTY;
* used to check if custom analyzer, tokenizer, token-filter or char-filter with name ``name`` exists
* @param name
* @param type
* @return true if exists, false otherwise
private boolean hasCustomThingy(String name, CustomType type) {
return clusterService.state().metaData().persistentSettings().getAsMap().containsKey(
String.format(Locale.ROOT, "%s.%s.%s", Constants.CUSTOM_ANALYSIS_SETTINGS_PREFIX, type.getName(), name));
* resolve the full settings necessary for the custom analyzer with name ``name``
* to be included in index-settings to get applied on an index.
* Resolves all custom tokenizer, token-filter and char-filter settings and includes them
* @param name the name of the analyzer to resolve
* @return Settings ready for inclusion into a CreateIndexRequest
* @throws AnalyzerInvalidException if no custom analyzer with name ``name`` could be found
public Settings resolveFullCustomAnalyzerSettings(String name) throws AnalyzerInvalidException {
ImmutableSettings.Builder builder = ImmutableSettings.builder();
Settings analyzerSettings = getCustomAnalyzer(name);
if (analyzerSettings != null) {
String tokenizerName = analyzerSettings.get(String.format("index.analysis.analyzer.%s.tokenizer", name));
if (tokenizerName != null) {
Settings customTokenizerSettings = getCustomTokenizer(tokenizerName);
if (customTokenizerSettings != null) {
} else if (!hasBuiltInTokenizer(tokenizerName)) {
throw new AnalyzerInvalidException(String.format("Invalid Analyzer: could not resolve tokenizer '%s'", tokenizerName));
String[] tokenFilterNames = analyzerSettings.getAsArray(String.format("index.analysis.analyzer.%s.filter", name));
for (int i=0; i<tokenFilterNames.length; i++) {
Settings customTokenFilterSettings = getCustomTokenFilter(tokenFilterNames[i]);
if (customTokenFilterSettings != null) {
} else if (!hasBuiltInTokenFilter(tokenFilterNames[i])) {
throw new AnalyzerInvalidException(String.format("Invalid Analyzer: could not resolve token-filter '%s'", tokenFilterNames[i]));
String[] charFilterNames = analyzerSettings.getAsArray(String.format("index.analysis.analyzer.%s.char_filter", name));
for (int i=0; i<charFilterNames.length; i++) {
Settings customCharFilterSettings = getCustomCharFilter(charFilterNames[i]);
if (customCharFilterSettings != null) {
} else if (!hasBuiltInCharFilter(charFilterNames[i])) {
throw new AnalyzerInvalidException(String.format("Invalid Analyzer: could not resolve char-filter '%s'", charFilterNames[i]));
} else {
throw new AnalyzerUnknownException(name);
return builder.build();
public TokenizerFactory getBuiltinTokenizer(String name) {
return indicesAnalysisService.tokenizerFactoryFactory(name).create(null, null); // arguments do not matter here
public Settings getCustomTokenizer(String name) {
return getCustomThingy(name, CustomType.TOKENIZER);
public Settings getCustomTokenFilter(String name) {
return getCustomThingy(name, CustomType.TOKEN_FILTER);
public Settings getCustomCharFilter(String name) {
return getCustomThingy(name, CustomType.CHAR_FILTER);