/**
* Copyright 2010 Bernhard Haslhofer
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.xbib.elasticsearch.index.analysis.skos;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngine;
import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngineFactory;
import org.xbib.elasticsearch.index.analysis.skos.tokenattributes.SKOSTypeAttribute;
import org.xbib.elasticsearch.plugin.analysis.SKOSAnalysisPlugin;
/**
* An analyzer for expanding fields that contain either (i) URI references to
* SKOS concepts OR (ii) SKOS concept prefLabels as values.
*/
public class SKOSAnalyzer extends StopwordAnalyzerBase {

    /**
     * The supported expansion types.
     */
    public enum ExpansionType {
        URI, LABEL
    }

    /**
     * Default expansion type.
     */
    public static final ExpansionType DEFAULT_EXPANSION_TYPE = ExpansionType.LABEL;

    /**
     * Default SKOS types to expand to.
     */
    public static final SKOSTypeAttribute.SKOSType[] DEFAULT_SKOS_TYPES = new SKOSTypeAttribute.SKOSType[]{
            SKOSTypeAttribute.SKOSType.PREF, SKOSTypeAttribute.SKOSType.ALT, SKOSTypeAttribute.SKOSType.BROADER,
            SKOSTypeAttribute.SKOSType.BROADERTRANSITIVE, SKOSTypeAttribute.SKOSType.NARROWER,
            SKOSTypeAttribute.SKOSType.NARROWERTRANSITIVE};

    /**
     * Default maximum allowed token length.
     */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /**
     * An unmodifiable set containing some common English words that are usually
     * not useful for searching.
     */
    public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    private ExpansionType expansionType = DEFAULT_EXPANSION_TYPE;

    // SKOS relation types that expansions are generated for; defensively
    // copied on get/set so callers cannot mutate this configuration in place.
    private SKOSTypeAttribute.SKOSType[] types = DEFAULT_SKOS_TYPES;

    /**
     * A SKOS Engine instance used to look up concept expansions.
     */
    private SKOSEngine skosEngine;

    /**
     * The size of the buffer used for multi-term prediction.
     */
    private int bufferSize = SKOSLabelFilter.DEFAULT_BUFFER_SIZE;

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * Creates an analyzer backed by a pre-built SKOS engine.
     *
     * @param stopWords     the stop words to remove from label-expanded streams
     * @param skosEngine    the engine used to resolve SKOS concept expansions
     * @param expansionType whether field values are concept URIs or prefLabels
     */
    public SKOSAnalyzer(CharArraySet stopWords,
                        SKOSEngine skosEngine, ExpansionType expansionType) {
        super(SKOSAnalysisPlugin.getLuceneVersion(), stopWords);
        this.skosEngine = skosEngine;
        this.expansionType = expansionType;
    }

    /**
     * Creates an analyzer backed by a pre-built SKOS engine, using the
     * default English stop word set.
     *
     * @param skosEngine    the engine used to resolve SKOS concept expansions
     * @param expansionType whether field values are concept URIs or prefLabels
     */
    public SKOSAnalyzer(SKOSEngine skosEngine,
                        ExpansionType expansionType) {
        this(STOP_WORDS_SET, skosEngine, expansionType);
    }

    /**
     * Creates an analyzer backed by a pre-built SKOS engine, reading the stop
     * word set from the given reader.
     *
     * @param stopwords     reader supplying the stop word list
     * @param skosEngine    the engine used to resolve SKOS concept expansions
     * @param expansionType whether field values are concept URIs or prefLabels
     * @throws IOException if the stop word set cannot be read
     */
    public SKOSAnalyzer(Reader stopwords,
                        SKOSEngine skosEngine, ExpansionType expansionType) throws IOException {
        this(loadStopwordSet(stopwords, SKOSAnalysisPlugin.getLuceneVersion()), skosEngine, expansionType);
    }

    /**
     * Creates an analyzer whose SKOS engine is built from a thesaurus file.
     *
     * @param stopWords     the stop words to remove from label-expanded streams
     * @param indexPath     path where the engine stores its concept index
     * @param skosFile      the SKOS thesaurus file to load
     * @param expansionType whether field values are concept URIs or prefLabels
     * @param bufferSize    lookahead buffer size for multi-term label matching
     * @param languages     languages to consider, or empty/null for all
     * @throws IOException if the SKOS engine cannot be created
     */
    public SKOSAnalyzer(CharArraySet stopWords,
                        String indexPath, String skosFile,
                        ExpansionType expansionType, int bufferSize, String... languages)
            throws IOException {
        super(SKOSAnalysisPlugin.getLuceneVersion(), stopWords);
        this.skosEngine = SKOSEngineFactory.getSKOSEngine(indexPath, skosFile, languages);
        this.expansionType = expansionType;
        this.bufferSize = bufferSize;
    }

    /**
     * Creates an analyzer whose SKOS engine is built from a thesaurus file,
     * using the default English stop word set.
     *
     * @param indexPath     path where the engine stores its concept index
     * @param skosFile      the SKOS thesaurus file to load
     * @param expansionType whether field values are concept URIs or prefLabels
     * @param bufferSize    lookahead buffer size for multi-term label matching
     * @param languages     languages to consider, or empty/null for all
     * @throws IOException if the SKOS engine cannot be created
     */
    public SKOSAnalyzer(String indexPath, String skosFile,
                        ExpansionType expansionType, int bufferSize, String... languages)
            throws IOException {
        this(STOP_WORDS_SET, indexPath, skosFile, expansionType, bufferSize, languages);
    }

    /**
     * Creates an analyzer whose SKOS engine is built from a thesaurus file,
     * considering all languages.
     *
     * @param indexPath     path where the engine stores its concept index
     * @param skosFile      the SKOS thesaurus file to load
     * @param expansionType whether field values are concept URIs or prefLabels
     * @param bufferSize    lookahead buffer size for multi-term label matching
     * @throws IOException if the SKOS engine cannot be created
     */
    public SKOSAnalyzer(String indexPath, String skosFile,
                        ExpansionType expansionType, int bufferSize) throws IOException {
        this(indexPath, skosFile, expansionType, bufferSize, (String[]) null);
    }

    /**
     * Creates an analyzer whose SKOS engine is built from a thesaurus file,
     * with the default buffer size and all languages.
     *
     * @param indexPath     path where the engine stores its concept index
     * @param skosFile      the SKOS thesaurus file to load
     * @param expansionType whether field values are concept URIs or prefLabels
     * @throws IOException if the SKOS engine cannot be created
     */
    public SKOSAnalyzer(String indexPath, String skosFile,
                        ExpansionType expansionType) throws IOException {
        this(indexPath, skosFile, expansionType, SKOSLabelFilter.DEFAULT_BUFFER_SIZE);
    }

    /**
     * Creates an analyzer whose SKOS engine is built from a thesaurus file,
     * reading the stop word set from the given reader.
     *
     * @param stopwords     reader supplying the stop word list
     * @param indexPath     path where the engine stores its concept index
     * @param skosFile      the SKOS thesaurus file to load
     * @param expansionType whether field values are concept URIs or prefLabels
     * @param bufferSize    lookahead buffer size for multi-term label matching
     * @param languages     languages to consider, or empty/null for all
     * @throws IOException if the stop words or the SKOS engine cannot be loaded
     */
    public SKOSAnalyzer(Reader stopwords,
                        String indexPath, String skosFile,
                        ExpansionType expansionType, int bufferSize, String... languages)
            throws IOException {
        this(loadStopwordSet(stopwords, SKOSAnalysisPlugin.getLuceneVersion()), indexPath, skosFile,
                expansionType, bufferSize, languages);
    }

    /**
     * Returns a copy of the SKOS types that expansions are generated for.
     *
     * @return the configured SKOS types (defensive copy), or null if explicitly unset
     */
    public SKOSTypeAttribute.SKOSType[] getTypes() {
        // defensive copy: callers must not be able to mutate our configuration
        return types == null ? null : types.clone();
    }

    /**
     * Sets the SKOS types that expansions are generated for. Takes effect the
     * next time a token stream is created.
     *
     * @param types the SKOS types to expand to
     */
    public void setTypes(SKOSTypeAttribute.SKOSType... types) {
        // defensive copy: later changes to the caller's array must not leak in
        this.types = types == null ? null : types.clone();
    }

    /**
     * Set the maximum allowed token length. If a token is seen that exceeds
     * this length then it is discarded. This setting only takes effect the
     * next time a token stream is created.
     *
     * @param length maximum token length in characters
     */
    public void setMaxTokenLength(int length) {
        maxTokenLength = length;
    }

    /**
     * Returns the maximum allowed token length.
     *
     * @see #setMaxTokenLength
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Builds the token stream chain for a field. For {@link ExpansionType#URI}
     * the whole value is treated as one keyword token and expanded by URI; for
     * {@link ExpansionType#LABEL} the value is tokenized, label-expanded,
     * lowercased, stop-filtered and de-duplicated.
     *
     * @param fieldName the field the components are created for (unused here)
     * @param reader    the reader supplying the field value
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
                                                     Reader reader) {
        // NOTE(review): a StandardAnalyzer is allocated per call and handed to
        // the SKOS filter; whether the filter closes it is not visible here —
        // confirm ownership in SKOSURIFilter/SKOSLabelFilter before changing.
        if (expansionType == ExpansionType.URI) {
            final KeywordTokenizer src = new KeywordTokenizer(reader);
            TokenStream tok = new SKOSURIFilter(src, skosEngine,
                    new StandardAnalyzer(matchVersion), types);
            tok = new LowerCaseFilter(matchVersion, tok);
            return new TokenStreamComponents(src, tok);
        } else {
            final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
            src.setMaxTokenLength(maxTokenLength);
            TokenStream tok = new StandardFilter(matchVersion, src);
            // prior to this we get the classic behavior, standardfilter does it for
            // us.
            tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(
                    matchVersion), bufferSize, types);
            tok = new LowerCaseFilter(matchVersion, tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            tok = new RemoveDuplicatesTokenFilter(tok);
            return new TokenStreamComponents(src, tok) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    // re-apply the (possibly updated) max token length before
                    // the tokenizer is reused with a new reader
                    src.setMaxTokenLength(maxTokenLength);
                    super.setReader(reader);
                }
            };
        }
    }
}