/*
* Copyright 2011 Peter Karich, jetwick_@_pannous_._info.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.es;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
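/**
* Analyzer that tokenizes input with a {@link StandardTokenizer}, passes the
* tokens through the filter built by {@code JetwickFilterFactory.myCreate} and
* finally lower-cases them with a {@link LowerCaseFilter}.
*
* @author Peter Karich, jetwick_@_pannous_._info
*/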
public class JetwickAnalyzer extends ReusableAnalyzerBase {

    /**
     * Default maximum allowed token length
     */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /**
     * Maximum token length handed to the {@link StandardTokenizer}; starts out
     * as {@link #DEFAULT_MAX_TOKEN_LENGTH}.
     */
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * Specifies whether deprecated acronyms should be replaced with HOST type.
     * See <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>
     */
    private final boolean replaceInvalidAcronym;

    /**
     * Lucene compatibility version passed to the tokenizer and the lower-case
     * filter.
     */
    protected final Version matchVersion;

    // Everything below is forwarded as-is to JetwickFilterFactory.myCreate.
    // NOTE(review): the int options look like 0/1 switches of a
    // word-delimiter style filter - confirm against JetwickFilterFactory.
    private CharArraySet protectedWords = null;
    private int generateWordParts = 1;
    private int generateNumberParts = 1;
    private int catenateWords = 0;
    private int catenateNumbers = 0;
    private int catenateAll = 0;
    private int splitOnCaseChange = 0;
    private int splitOnNumerics = 1;
    private int preserveOriginal = 1;
    private int stemEnglishPossessive = 0;
    private String handleAsChar = "";
    private String handleAsDigit = "@#$€₱č₤";

    /**
     * Creates an analyzer that matches Lucene 3.1 behaviour and replaces
     * invalid acronyms.
     */
    public JetwickAnalyzer() {
        this.replaceInvalidAcronym = true;
        this.matchVersion = Version.LUCENE_31;
    }

    /**
     * Builds the analysis chain: standard tokenizer, then the filter produced
     * by {@code JetwickFilterFactory.myCreate}, then lower-casing.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return components whose source is the tokenizer and whose sink is the
     *         lower-casing filter
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        final StandardTokenizer tokenizer = newTokenizer(reader);
        final TokenStream lowerCased = new LowerCaseFilter(matchVersion, applyJetwickFilter(tokenizer));

        return new TokenStreamComponents(tokenizer, lowerCased) {

            @Override
            protected boolean reset(final Reader freshReader) throws IOException {
                // Re-apply the analyzer's current limit each time the components are reused.
                tokenizer.setMaxTokenLength(JetwickAnalyzer.this.maxTokenLength);
                return super.reset(freshReader);
            }
        };
    }

    /** Creates the standard tokenizer configured with this analyzer's settings. */
    private StandardTokenizer newTokenizer(final Reader reader) {
        final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        tokenizer.setMaxTokenLength(maxTokenLength);
        tokenizer.setReplaceInvalidAcronym(replaceInvalidAcronym);
        return tokenizer;
    }

    /** Wraps the given stream in the filter built from this analyzer's option fields. */
    private TokenStream applyJetwickFilter(final TokenStream input) {
        return JetwickFilterFactory.myCreate(
                input,
                handleAsChar,
                handleAsDigit,
                generateWordParts,
                generateNumberParts,
                catenateWords,
                catenateNumbers,
                catenateAll,
                splitOnCaseChange,
                preserveOriginal,
                splitOnNumerics,
                stemEnglishPossessive,
                protectedWords);
    }
}