/*
* Licensed to CRATE Technology GmbH ("Crate") under one or more contributor
* license agreements.  See the NOTICE file distributed with this work for
* additional information regarding copyright ownership.  Crate licenses
* this file to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.  You may
* obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
* License for the specific language governing permissions and limitations
* under the License.
*
* However, if you have executed another commercial license agreement
* with Crate these terms will supersede the license and you may use the
* software solely pursuant to the terms of the relevant commercial agreement.
*/

package io.crate.integrationtests;

import com.google.common.base.Joiner;
import io.crate.action.sql.SQLActionException;
import io.crate.action.sql.SQLResponse;
import io.crate.metadata.FulltextAnalyzerResolver;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.hamcrest.Matchers.*;
import static org.hamcrest.collection.IsMapContaining.hasEntry;
import static org.hamcrest.collection.IsMapContaining.hasKey;

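/**
 * Integration tests verifying that custom analyzer definitions created via
 * {@code CREATE ANALYZER} are resolved by {@link FulltextAnalyzerResolver}
 * into the full set of Elasticsearch index analysis settings.
 */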
public class FulltextAnalyzerResolverTest extends SQLTransportIntegrationTest {

    @Rule
    public ExpectedException expectedException = ExpectedException.none();

    private static FulltextAnalyzerResolver fulltextAnalyzerResolver;

    @Before
    public void analyzerServiceSetup() {
        fulltextAnalyzerResolver = cluster().getInstance(FulltextAnalyzerResolver.class);
    }

    @AfterClass
    public static void tearDownClass() {
        synchronized (FulltextAnalyzerResolverTest.class) {
            fulltextAnalyzerResolver = null;
        }
    }

    public Settings getPersistentClusterSettings() {
        ClusterStateResponse response = client().admin().cluster().prepareState().execute().actionGet();
        return response.getState().metaData().persistentSettings();
    }

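    // An analyzer defining only a built-in tokenizer resolves to exactly two
    // settings: the analyzer type ("custom") and the tokenizer name.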
    @Test
    public void resolveSimpleAnalyzerSettings() throws Exception {
        execute("CREATE ANALYZER a1 (tokenizer lowercase)");
        Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a1");
        assertThat(fullAnalyzerSettings.getAsMap().size(), is(2));
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a1.type", "custom")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a1.tokenizer", "lowercase")
        );
    }

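    // An inline tokenizer definition is stored under a name prefixed with the
    // analyzer name (a2_tok2), and its parameters are flattened into settings.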
    @Test
    public void resolveAnalyzerWithCustomTokenizer() throws Exception {
        execute("CREATE ANALYZER a2" +
                "(" +
                "   tokenizer tok2 with (" +
                "       type='ngram'," +
                "       \"min_ngram\"=2," +
                "       \"token_chars\"=['letter', 'digits']" +
                "   )" +
                ")");
        Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a2");
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a2.type", "custom")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a2.tokenizer", "a2_tok2")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                allOf(
                        hasEntry("index.analysis.tokenizer.a2_tok2.type", "ngram"),
                        hasEntry("index.analysis.tokenizer.a2_tok2.min_ngram", "2"),
                        hasEntry("index.analysis.tokenizer.a2_tok2.token_chars.0", "letter"),
                        hasEntry("index.analysis.tokenizer.a2_tok2.token_chars.1", "digits")
                )
        );
    }

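    // Built-in char filters are referenced by name as-is, while custom ones
    // are prefixed with the analyzer name (a3_my_mapping).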
    @Test
    public void resolveAnalyzerWithCharFilters() throws Exception {
        execute("CREATE ANALYZER a3" +
                "(" +
                "   tokenizer lowercase," +
                "   char_filters (" +
                "       \"html_strip\"," +
                "       my_mapping WITH (" +
                "           type='mapping'," +
                "           mappings=['ph=>f', 'ß=>ss', 'ö=>oe']" +
                "       )" +
                "   )" +
                ")");
        Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a3");
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a3.type", "custom")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a3.tokenizer", "lowercase")
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a3.char_filter"),
                arrayContainingInAnyOrder("html_strip", "a3_my_mapping")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.char_filter.a3_my_mapping.type", "mapping")
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.char_filter.a3_my_mapping" +
                        ".mappings"),
                arrayContainingInAnyOrder("ph=>f", "ß=>ss", "ö=>oe")
        );
        execute("CREATE TABLE t1(content " +
                "string index using fulltext with (analyzer='a3'))");
    }

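    // Extending a built-in analyzer keeps its type (here "german") and merely
    // overrides the given parameters; extension chains may be longer than one.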
    @Test
    public void resolveAnalyzerExtendingBuiltin() throws Exception {
        execute("CREATE ANALYZER a4 EXTENDS " +
                "german WITH (" +
                "   \"stop_words\"=['der', 'die', 'das']" +
                ")");
        Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a4");
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a4.type", "german")
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a4.stop_words"),
                arrayContainingInAnyOrder("der", "die", "das")
        );

        // extend analyzer who extends builtin analyzer (chain can be longer than 1)
        execute("CREATE ANALYZER a4e EXTENDS " +
                "a4 WITH (" +
                "   \"stop_words\"=['der', 'die', 'das', 'wer', 'wie', 'was']" +
                ")");
        fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a4e");
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a4e.type", "german")
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a4e.stop_words"),
                arrayContainingInAnyOrder("der", "die", "das", "wer", "wie", "was")
        );
    }

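    // Extending a custom analyzer inherits its token filters; the newly
    // declared tokenizer and char filters override or extend the parent's.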
    @Test
    public void resolveAnalyzerExtendingCustom() throws Exception {
        execute("CREATE ANALYZER a5 (" +
                "   tokenizer whitespace," +
                "   token_filters (" +
                "       lowercase," +
                "       germanstemmer WITH (" +
                "           type='stemmer'," +
                "           language='german'" +
                "       )" +
                "   )" +
                ")");
        Settings fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a5");
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a5.type", "custom")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a5.tokenizer", "whitespace")
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a5.filter"),
                arrayContainingInAnyOrder("lowercase", "a5_germanstemmer")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                allOf(
                    hasEntry("index.analysis.filter.a5_germanstemmer.type", "stemmer"),
                    hasEntry("index.analysis.filter.a5_germanstemmer.language", "german")
                )
        );

         execute("CREATE ANALYZER a5e EXTENDS a5 (" +
                "   tokenizer letter," +
                "   char_filters (" +
                "       \"html_strip\"," +
                "       mymapping WITH (" +
                "           type='mapping'," +
                "           mappings=['ph=>f', 'ß=>ss', 'ö=>oe']" +
                "       )" +
                "   )" +
                ")");

        fullAnalyzerSettings = fulltextAnalyzerResolver.resolveFullCustomAnalyzerSettings("a5e");
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a5e.type", "custom")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a5e.tokenizer", "letter")
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a5e.filter"),
                arrayContainingInAnyOrder("lowercase", "a5_germanstemmer")
        );
        assertThat(
                fullAnalyzerSettings.getAsMap(),
                allOf(
                        hasEntry("index.analysis.filter.a5_germanstemmer.type", "stemmer"),
                        hasEntry("index.analysis.filter.a5_germanstemmer.language", "german")
                )
        );
        assertThat(
                fullAnalyzerSettings.getAsArray("index.analysis.analyzer.a5e.char_filter"),
                arrayContainingInAnyOrder("html_strip", "a5e_mymapping")
        );
    }

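    // The following tests pin the expected lists of built-in analysis
    // components; these depend on the Elasticsearch version bundled with Crate.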
    @Test
    public void testBuiltInAnalyzers() throws Exception {
        List<String> analyzers = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInAnalyzers());
        Collections.sort(analyzers);
        assertThat(Joiner.on(", ").join(analyzers),
                is("arabic, armenian, basque, brazilian, bulgarian, catalan, chinese, cjk, " +
                        "classic, czech, danish, default, dutch, english, finnish, french, " +
                        "galician, german, greek, hindi, hungarian, indonesian, irish, " +
                        "italian, keyword, latvian, norwegian, pattern, persian, portuguese, " +
                        "romanian, russian, simple, snowball, sorani, spanish, standard, " +
                        "standard_html_strip, stop, swedish, thai, turkish, whitespace"));
    }

    @Test
    public void testBuiltInTokenizers() throws Exception {
        List<String> tokenizers = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInTokenizers());
        Collections.sort(tokenizers);
        assertThat(Joiner.on(", ").join(tokenizers),
                is("classic, edgeNGram, edge_ngram, keyword, letter, lowercase, " +
                        "nGram, ngram, path_hierarchy, pattern, standard, thai, " +
                        "uax_url_email, whitespace"));
    }

    @Test
    public void testBuiltInTokenFilters() throws Exception {
        List<String> tokenFilters = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInTokenFilters());
        Collections.sort(tokenFilters);
        assertThat(Joiner.on(", ").join(tokenFilters),
                is("apostrophe, arabic_normalization, arabic_stem, asciifolding, brazilian_stem, " +
                        "cjk_bigram, cjk_width, classic, common_grams, czech_stem, " +
                        "delimited_payload_filter, dictionary_decompounder, dutch_stem, " +
                        "edgeNGram, edge_ngram, elision, french_stem, german_normalization, " +
                        "german_stem, hindi_normalization, hunspell, " +
                        "hyphenation_decompounder, indic_normalization, keep, " +
                        "keyword_marker, keyword_repeat, " +
                        "kstem, length, limit, lowercase, nGram, ngram, pattern_capture, " +
                        "pattern_replace, persian_normalization, porter_stem, reverse, " +
                        "russian_stem, scandinavian_folding, scandinavian_normalization, " +
                        "shingle, snowball, sorani_normalization, standard, stemmer, stemmer_override, " +
                        "stop, synonym, trim, truncate, type_as_payload, unique, uppercase, word_delimiter"));
    }

    @Test
    public void testBuiltInCharFilters() throws Exception {
        List<String> charFilters = new ArrayList<>(fulltextAnalyzerResolver.getBuiltInCharFilters());
        Collections.sort(charFilters);
        assertThat(Joiner.on(", ").join(charFilters),
                is("htmlStrip, html_strip, mapping, pattern_replace"));
    }

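    // Full round trip: analyzer, tokenizer, char filter and token filter
    // definitions are each persisted in the cluster settings under the
    // "crate.analysis.custom.*" namespace and can be extended afterwards.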
    @Test
    public void createAndExtendFullCustomAnalyzer() throws IOException {
        execute("CREATE ANALYZER a7 (" +
                "  char_filters (" +
                "     mypattern WITH (" +
                "       type='pattern_replace'," +
                "      \"pattern\" ='sample(.*)',\n" +
                "      \"replacement\" = 'replacedSample $1'" +
                "     )," +
                "     \"html_strip\"" +
                "  )," +
                "  tokenizer mytok WITH (" +
                "    type='edgeNGram'," +
                "    \"min_gram\" = 2," +
                "    \"max_gram\" = 5," +
                "    \"token_chars\" = [ 'letter', 'digit' ]" +
                "  )," +
                "  token_filters (" +
                "    myshingle WITH (" +
                "      type='shingle'," +
                "      \"output_unigrams\"=false," +
                "      \"max_shingle_size\"=10" +
                "    )," +
                "    lowercase," +
                "    \"my_stemmer\" WITH (" +
                "      type='stemmer'," +
                "      language='german'" +
                "    )" +
                "  )" +
                ")");
        Settings settings = getPersistentClusterSettings();

        assertThat(
                settings.getAsMap(),
                allOf(
                        hasKey("crate.analysis.custom.analyzer.a7"),
                        hasKey("crate.analysis.custom.tokenizer.a7_mytok"),
                        hasKey("crate.analysis.custom.char_filter.a7_mypattern"),
                        hasKey("crate.analysis.custom.filter.a7_myshingle"),
                        hasKey("crate.analysis.custom.filter.a7_my_stemmer")
                )
        );
        Settings analyzerSettings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.analyzer.a7"));
        assertThat(
                analyzerSettings.getAsArray("index.analysis.analyzer.a7.char_filter"),
                arrayContainingInAnyOrder("a7_mypattern", "html_strip")
        );
        assertThat(
                analyzerSettings.getAsArray("index.analysis.analyzer.a7.filter"),
                arrayContainingInAnyOrder("a7_myshingle", "lowercase", "a7_my_stemmer")
        );
        assertThat(
                analyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a7.tokenizer", "a7_mytok")
        );
        execute("CREATE ANALYZER a8 EXTENDS a7 (" +
                "  token_filters (" +
                "    lowercase," +
                "    kstem" +
                "  )" +
                ")");
        Settings extendedSettings = getPersistentClusterSettings();
        assertThat(
                extendedSettings.getAsMap(),
                allOf(
                        hasKey("crate.analysis.custom.analyzer.a8"),
                        hasKey("crate.analysis.custom.tokenizer.a7_mytok")
                )
        );
        Settings extendedAnalyzerSettings = FulltextAnalyzerResolver.decodeSettings(extendedSettings.get("crate.analysis.custom.analyzer.a8"));
        assertThat(
                extendedAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a8.type", "custom")
        );
        assertThat(
                extendedAnalyzerSettings.getAsMap(),
                hasEntry("index.analysis.analyzer.a8.tokenizer", "a7_mytok")
        );
        assertThat(
                extendedAnalyzerSettings.getAsArray("index.analysis.analyzer.a8.filter"),
                arrayContainingInAnyOrder("lowercase", "kstem")
        );
        assertThat(
                extendedAnalyzerSettings.getAsArray("index.analysis.analyzer.a8.char_filter"),
                arrayContainingInAnyOrder("a7_mypattern", "html_strip")
        );
    }

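    // Referencing a tokenizer that was defined inline by another analyzer is
    // not supported and must be rejected as non-existing.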
    @Test
    public void reuseExistingTokenizer() throws Exception {
        execute("CREATE ANALYZER a9 (" +
                "  TOKENIZER a9tok WITH (" +
                "    type='nGram'," +
                "    \"token_chars\"=['letter', 'digit']" +
                "  )" +
                ")");
        try {
            execute("CREATE ANALYZER a10 (" +
                    "  TOKENIZER a9tok" +
                    ")");
            fail("Reusing existing tokenizer worked");
        } catch (SQLActionException e) {
            assertThat(e.getMessage(), is("Non-existing tokenizer 'a9tok'"));
        }
        /*
         * NOT SUPPORTED UNTIL A CONSISTENT SOLUTION IS FOUND
         * FOR IMPLICITLY CREATING TOKENIZERS ETC. WITHIN ANALYZER-DEFINITIONS

        Settings settings = getPersistentClusterSettings();
        Settings a10Settings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.analyzer.a10"));
        assertThat(
                a10Settings.getAsMap(),
                hasEntry("index.analysis.analyzer.a10.tokenizer", "a9tok")
        );
        */
    }

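    // End-to-end check: a custom analyzer can back a fulltext index and is
    // applied when matching queries against the indexed content.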
    @Test
    public void useAnalyzerForIndexSettings() throws Exception {
        execute("CREATE ANALYZER a11 (" +
                "  TOKENIZER standard," +
                "  TOKEN_FILTERS (" +
                "    lowercase," +
                "    mystop WITH (" +
                "      type='stop'," +
                "      stopword=['the', 'over']" +
                "    )" +
                "  )" +
                ")");
        Settings settings = getPersistentClusterSettings();
        assertThat(
                settings.getAsMap(),
                allOf(
                        hasKey("crate.analysis.custom.analyzer.a11"),
                        hasKey("crate.analysis.custom.filter.a11_mystop")
                )
        );
        Settings analyzerSettings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate.analysis.custom.analyzer.a11"));
        Settings tokenFilterSettings = FulltextAnalyzerResolver.decodeSettings(settings.get("crate" +
                ".analysis.custom.filter.a11_mystop"));
        ImmutableSettings.Builder builder = ImmutableSettings.builder();
        builder.put(analyzerSettings);
        builder.put(tokenFilterSettings);

        execute("create table test (" +
                " id integer primary key," +
                " name string," +
                " content string index using fulltext with (analyzer='a11')" +
                ")");
        ensureGreen();
        execute("insert into test (id, name, content) values (?, ?, ?)", new Object[]{
                1, "phrase", "The quick brown fox jumps over the lazy dog."
        });
        execute("insert into test (id, name, content) values (?, ?, ?)", new Object[]{
                2, "another phrase", "Don't panic!"
        });
        refresh();
        SQLResponse response = execute("select id from test where match(content, 'brown jump')");
        assertEquals(1L, response.rowCount());
        assertEquals(1, response.rows()[0][0]);
    }

}