Package de.jetwick.es

Source Code of de.jetwick.es.JetwickAnalyzer

/*
* Copyright 2011 Peter Karich, jetwick_@_pannous_._info.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.es;

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
*
* @author Peter Karich, jetwick_@_pannous_._info
*/
public class JetwickAnalyzer extends ReusableAnalyzerBase {

    /**
     * Default maximum allowed token length
     */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
    /**
     * Specifies whether deprecated acronyms should be replaced with HOST type.
     * See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"}
     */
    private final boolean replaceInvalidAcronym;
    protected final Version matchVersion;
    private CharArraySet protectedWords = null;
    private int generateWordParts = 1;
    private int generateNumberParts = 1;
    private int catenateWords = 0;
    private int catenateNumbers = 0;
    private int catenateAll = 0;
    private int splitOnCaseChange = 0;
    private int splitOnNumerics = 1;
    private int preserveOriginal = 1;
    private int stemEnglishPossessive = 0;
    private String handleAsChar = "";
    private String handleAsDigit = "@#$€₱č₤";

    public JetwickAnalyzer() {
        matchVersion = Version.LUCENE_31;
        replaceInvalidAcronym = true;
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);
        src.setReplaceInvalidAcronym(replaceInvalidAcronym);
        TokenStream tok = JetwickFilterFactory.myCreate(src, handleAsChar, handleAsDigit,
                generateWordParts, generateNumberParts,
                catenateWords, catenateNumbers, catenateAll,
                splitOnCaseChange, preserveOriginal,
                splitOnNumerics, stemEnglishPossessive, protectedWords);
        tok = new LowerCaseFilter(matchVersion, tok);
        return new TokenStreamComponents(src, tok) {

            @Override
            protected boolean reset(final Reader reader) throws IOException {
                src.setMaxTokenLength(JetwickAnalyzer.this.maxTokenLength);
                return super.reset(reader);
            }
        };
    }
}
TOP

Related Classes of de.jetwick.es.JetwickAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.