/*
* Licensed to CRATE Technology GmbH ("Crate") under one or more contributor
* license agreements. See the NOTICE file distributed with this work for
* additional information regarding copyright ownership. Crate licenses
* this file to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* However, if you have executed another commercial license agreement
* with Crate these terms will supersede the license and you may use the
* software solely pursuant to the terms of the relevant commercial agreement.
*/
package io.crate.integrationtests;

import com.google.common.base.Joiner;
import io.crate.action.sql.SQLActionException;
import io.crate.action.sql.SQLResponse;
import io.crate.metadata.FulltextAnalyzerResolver;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.common.settings.Settings;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.hamcrest.Matchers.*;
import static org.hamcrest.collection.IsMapContaining.hasEntry;
import static org.hamcrest.collection.IsMapContaining.hasKey;
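
/**
 * Integration tests for {@link FulltextAnalyzerResolver}: verify that
 * {@code CREATE ANALYZER} statements resolve to the expected Elasticsearch
 * index settings and are persisted as encoded cluster settings.
 */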
public class FulltextAnalyzerResolverTest extends SQLTransportIntegrationTest {
@Rule
public ExpectedException expectedException = ExpectedException.none();
private static FulltextAnalyzerResolver fulltextAnalyzerResolver;
@Before
    public void setUpAnalyzerService() {
fulltextAnalyzerResolver = cluster().getInstance(FulltextAnalyzerResolver.class);
}
@AfterClass
public static void tearDownClass() {
synchronized (FulltextAnalyzerResolverTest.class) {
fulltextAnalyzerResolver = null;
}
}
public Settings getPersistentClusterSettings() {
ClusterStateResponse response = client().admin().cluster().prepareState().execute().actionGet();
return response.getState().metaData().persistentSettings();
}
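
    // A minimal analyzer with just a built-in tokenizer resolves to exactly two
    // settings entries: the analyzer type ("custom") and its tokenizer.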
@Test
public void resolveSimpleAnalyzerSettings() throws Exception {
execute("CREATE ANALYZER a1 (tokenizer lowercase)");
Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a1");
assertThat(fullAnalyzerSettings.getAsMap().size(), is(2));
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a1.type", "custom")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a1.tokenizer", "lowercase")
);
}
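
    // A tokenizer defined inline is registered under an analyzer-prefixed name
    // (a2_tok2); its WITH options become the tokenizer's settings.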
@Test
public void resolveAnalyzerWithCustomTokenizer() throws Exception {
execute("CREATE ANALYZER a2" +
"(" +
" tokenizer tok2 with (" +
" type='ngram'," +
" \"min_ngram\"=2," +
" \"token_chars\"=['letter', 'digits']" +
" )" +
")");
Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a2");
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a2.type", "custom")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a2.tokenizer", "a2_tok2")
);
        assertThat(
            fullAnalyzerSettings.getAsMap(),
            allOf(
                hasEntry("index.analysis.tokenizer.a2_tok2.type", "ngram"),
                hasEntry("index.analysis.tokenizer.a2_tok2.min_gram", "2"),
                hasEntry("index.analysis.tokenizer.a2_tok2.token_chars.0", "letter"),
                hasEntry("index.analysis.tokenizer.a2_tok2.token_chars.1", "digit")
            )
        );
}
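
    // Built-in char filters are referenced by name as-is, while custom ones are
    // registered under an analyzer-prefixed name together with their settings.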
@Test
public void resolveAnalyzerWithCharFilters() throws Exception {
execute("CREATE ANALYZER a3" +
"(" +
" tokenizer lowercase," +
" char_filters (" +
" \"html_strip\"," +
" my_mapping WITH (" +
" type='mapping'," +
" mappings=['ph=>f', 'ß=>ss', 'ö=>oe']" +
" )" +
" )" +
")");
Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a3");
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a3.type", "custom")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a3.tokenizer", "lowercase")
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a3.char_filter"),
arrayContainingInAnyOrder("html_strip", "a3_my_mapping")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.char_filter.a3_my_mapping.type", "mapping")
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.char_filter.a3_my_mapping" +
".mappings"),
arrayContainingInAnyOrder("ph=>f", "ß=>ss", "ö=>oe")
);
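        // the analyzer must now be usable in a fulltext index definition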
execute("CREATE TABLE t1(content " +
"string index using fulltext with (analyzer='a3'))");
}
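
    // EXTENDS on a built-in analyzer reuses its type and applies the overridden
    // settings; this also works transitively across a chain of extensions.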
@Test
public void resolveAnalyzerExtendingBuiltin() throws Exception {
execute("CREATE ANALYZER a4 EXTENDS " +
"german WITH (" +
" \"stop_words\"=['der', 'die', 'das']" +
")");
Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a4");
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a4.type", "german")
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a4.stop_words"),
arrayContainingInAnyOrder("der", "die", "das")
);
        // extend an analyzer that itself extends a built-in analyzer (the chain can be longer than one)
execute("CREATE ANALYZER a4e EXTENDS " +
"a4 WITH (" +
" \"stop_words\"=['der', 'die', 'das', 'wer', 'wie', 'was']" +
")");
fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a4e");
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a4e.type", "german")
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a4e.stop_words"),
arrayContainingInAnyOrder("der", "die", "das", "wer", "wie", "was")
);
}
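
    // EXTENDS on a custom analyzer inherits everything that is not explicitly
    // overridden: a5e replaces the tokenizer and adds char filters, but keeps
    // a5's token filters.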
@Test
public void resolveAnalyzerExtendingCustom() throws Exception {
execute("CREATE ANALYZER a5 (" +
" tokenizer whitespace," +
" token_filters (" +
" lowercase," +
" germanstemmer WITH (" +
" type='stemmer'," +
" language='german'" +
" )" +
" )" +
")");
Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a5");
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a5.type", "custom")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a5.tokenizer", "whitespace")
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a5.filter"),
arrayContainingInAnyOrder("lowercase", "a5_germanstemmer")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
allOf(
hasEntry("index.analysis.filter.a5_germanstemmer.type", "stemmer"),
hasEntry("index.analysis.filter.a5_germanstemmer.language", "german")
)
);
execute("CREATE ANALYZER a5e EXTENDS a5 (" +
" tokenizer letter," +
" char_filters (" +
" \"html_strip\"," +
" mymapping WITH (" +
" type='mapping'," +
" mappings=['ph=>f', 'ß=>ss', 'ö=>oe']" +
" )" +
" )" +
")");
fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a5e");
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a5e.type", "custom")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a5e.tokenizer", "letter")
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a5e.filter"),
arrayContainingInAnyOrder("lowercase", "a5_germanstemmer")
);
assertThat(
fullAnalyzerSettings.getAsMap(),
allOf(
hasEntry("index.analysis.filter.a5_germanstemmer.type", "stemmer"),
hasEntry("index.analysis.filter.a5_germanstemmer.language", "german")
)
);
assertThat(
fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a5e.char_filter"),
arrayContainingInAnyOrder("html_strip", "a5e_mymapping")
);
}
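
    // The following tests pin down the complete sets of built-in analyzers,
    // tokenizers, token filters and char filters exposed by the underlying
    // Elasticsearch/Lucene version.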
@Test
public void testBuiltInAnalyzers() throws Exception {
List<String> analyzers = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInAnalyzers());
Collections.sort(analyzers);
assertThat(Joiner.on(", ").join(analyzers),
is("arabic, armenian, basque, brazilian, bulgarian, catalan, chinese, cjk, " +
"classic, czech, danish, default, dutch, english, finnish, french, " +
"galician, german, greek, hindi, hungarian, indonesian, irish, " +
"italian, keyword, latvian, norwegian, pattern, persian, portuguese, " +
"romanian, russian, simple, snowball, sorani, spanish, standard, " +
"standard_html_strip, stop, swedish, thai, turkish, whitespace"));
}
@Test
public void testBuiltInTokenizers() throws Exception {
List<String> tokenizers = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInTokenizers());
Collections.sort(tokenizers);
assertThat(Joiner.on(", ").join(tokenizers),
is("classic, edgeNGram, edge_ngram, keyword, letter, lowercase, " +
"nGram, ngram, path_hierarchy, pattern, standard, thai, " +
"uax_url_email, whitespace"));
}
@Test
public void testBuiltInTokenFilters() throws Exception {
List<String> tokenFilters = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInTokenFilters());
Collections.sort(tokenFilters);
assertThat(Joiner.on(", ").join(tokenFilters),
is("apostrophe, arabic_normalization, arabic_stem, asciifolding, brazilian_stem, " +
"cjk_bigram, cjk_width, classic, common_grams, czech_stem, " +
"delimited_payload_filter, dictionary_decompounder, dutch_stem, " +
"edgeNGram, edge_ngram, elision, french_stem, german_normalization, " +
"german_stem, hindi_normalization, hunspell, " +
"hyphenation_decompounder, indic_normalization, keep, " +
"keyword_marker, keyword_repeat, " +
"kstem, length, limit, lowercase, nGram, ngram, pattern_capture, " +
"pattern_replace, persian_normalization, porter_stem, reverse, " +
"russian_stem, scandinavian_folding, scandinavian_normalization, " +
"shingle, snowball, sorani_normalization, standard, stemmer, stemmer_override, " +
"stop, synonym, trim, truncate, type_as_payload, unique, uppercase, word_delimiter"));
}
@Test
public void testBuiltInCharFilters() throws Exception {
List<String> charFilters = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInCharFilters());
Collections.sort(charFilters);
assertThat(Joiner.on(", ").join(charFilters),
is("htmlStrip, html_strip, mapping, pattern_replace"));
}
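
    // Every custom element is persisted as a separate encoded entry under
    // crate.analysis.custom.*; an extending analyzer gets its own entry but
    // keeps referencing the parent's tokenizer and char filters.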
@Test
public void createAndExtendFullCustomAnalyzer() throws IOException {
execute("CREATE ANALYZER a7 (" +
" char_filters (" +
" mypattern WITH (" +
" type='pattern_replace'," +
" \"pattern\" ='sample(.*)',\n" +
" \"replacement\" = 'replacedSample $1'" +
" )," +
" \"html_strip\"" +
" )," +
" tokenizer mytok WITH (" +
" type='edgeNGram'," +
" \"min_gram\" = 2," +
" \"max_gram\" = 5," +
" \"token_chars\" = [ 'letter', 'digit' ]" +
" )," +
" token_filters (" +
" myshingle WITH (" +
" type='shingle'," +
" \"output_unigrams\"=false," +
" \"max_shingle_size\"=10" +
" )," +
" lowercase," +
" \"my_stemmer\" WITH (" +
" type='stemmer'," +
" language='german'" +
" )" +
" )" +
")");
Settings settings = getPersistentClusterSettings();
assertThat(
settings.getAsMap(),
allOf(
hasKey("crate.analysis.custom.analyzer.a7"),
hasKey("crate.analysis.custom.tokenizer.a7_mytok"),
hasKey("crate.analysis.custom.char_filter.a7_mypattern"),
hasKey("crate.analysis.custom.filter.a7_myshingle"),
hasKey("crate.analysis.custom.filter.a7_my_stemmer")
)
);
Settings analyzerSettings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.analyzer.a7"));
assertThat(
analyzerSettings.getAsArray("index.analysis.analyzer.a7.char_filter"),
arrayContainingInAnyOrder("a7_mypattern", "html_strip")
);
assertThat(
analyzerSettings.getAsArray("index.analysis.analyzer.a7.filter"),
arrayContainingInAnyOrder("a7_myshingle", "lowercase", "a7_my_stemmer")
);
assertThat(
analyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a7.tokenizer", "a7_mytok")
);
execute("CREATE ANALYZER a8 EXTENDS a7 (" +
" token_filters (" +
" lowercase," +
" kstem" +
" )" +
")");
Settings extendedSettings = getPersistentClusterSettings();
assertThat(
extendedSettings.getAsMap(),
allOf(
hasKey("crate.analysis.custom.analyzer.a8"),
hasKey("crate.analysis.custom.tokenizer.a7_mytok")
)
);
Settings extendedAnalyzerSettings = FulltextAnalyzerResolver.decodeSettings(extendedSettings.get("crate.analysis.custom.analyzer.a8"));
assertThat(
extendedAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a8.type", "custom")
);
assertThat(
extendedAnalyzerSettings.getAsMap(),
hasEntry("index.analysis.analyzer.a8.tokenizer", "a7_mytok")
);
assertThat(
extendedAnalyzerSettings.getAsArray("index.analysis.analyzer.a8.filter"),
arrayContainingInAnyOrder("lowercase", "kstem")
);
assertThat(
extendedAnalyzerSettings.getAsArray("index.analysis.analyzer.a8.char_filter"),
arrayContainingInAnyOrder("a7_mypattern", "html_strip")
);
}
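
    // A tokenizer defined inline by one analyzer cannot be referenced from
    // another one; inline definitions are stored under an analyzer-prefixed
    // name, so the bare name does not resolve.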
@Test
public void reuseExistingTokenizer() throws Exception {
execute("CREATE ANALYZER a9 (" +
" TOKENIZER a9tok WITH (" +
" type='nGram'," +
" \"token_chars\"=['letter', 'digit']" +
" )" +
")");
        expectedException.expect(SQLActionException.class);
        expectedException.expectMessage("Non-existing tokenizer 'a9tok'");
        execute("CREATE ANALYZER a10 (" +
                "  TOKENIZER a9tok" +
                ")");
/*
* NOT SUPPORTED UNTIL A CONSISTENT SOLUTION IS FOUND
* FOR IMPLICITLY CREATING TOKENIZERS ETC. WITHIN ANALYZER-DEFINITIONS
Settings settings = getPersistentClusterSettings();
    Settings a10Settings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.analyzer.a10"));
assertThat(
a10Settings.getAsMap(),
hasEntry("index.analysis.analyzer.a10.tokenizer", "a9tok")
);
*/
}
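
    // End-to-end check: a custom analyzer assigned to a fulltext index must be
    // applied both at index time and when querying via match().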
@Test
public void useAnalyzerForIndexSettings() throws Exception {
execute("CREATE ANALYZER a11 (" +
" TOKENIZER standard," +
" TOKEN_FILTERS (" +
" lowercase," +
" mystop WITH (" +
" type='stop'," +
" stopword=['the', 'over']" +
" )" +
" )" +
")");
Settings settings = getPersistentClusterSettings();
assertThat(
settings.getAsMap(),
allOf(
hasKey("crate.analysis.custom.analyzer.a11"),
hasKey("crate.analysis.custom.filter.a11_mystop")
)
);
        // the decoded entries must contain the fully-qualified index settings
        Settings analyzerSettings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.analyzer.a11"));
        Settings tokenFilterSettings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.filter.a11_mystop"));
        assertThat(
            analyzerSettings.getAsMap(),
            hasEntry("index.analysis.analyzer.a11.tokenizer", "standard")
        );
        assertThat(
            tokenFilterSettings.getAsMap(),
            hasEntry("index.analysis.filter.a11_mystop.type", "stop")
        );
execute("create table test (" +
" id integer primary key," +
" name string," +
" content string index using fulltext with (analyzer='a11')" +
")");
ensureGreen();
execute("insert into test (id, name, content) values (?, ?, ?)", new Object[]{
1, "phrase", "The quick brown fox jumps over the lazy dog."
});
execute("insert into test (id, name, content) values (?, ?, ?)", new Object[]{
2, "another phrase", "Don't panic!"
});
refresh();
SQLResponse response = execute("select id from test where match(content, 'brown jump')");
assertEquals(1L, response.rowCount());
        assertEquals(1, response.rows()[0][0]);
    }
}