/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing;
import it.unibz.instasearch.indexing.tokenizers.CamelCaseTokenizer;
import it.unibz.instasearch.indexing.tokenizers.DotSplitTokenizer;
import it.unibz.instasearch.indexing.tokenizers.WordSplitTokenizer;
import it.unibz.instasearch.indexing.tokenizers.standard.StandardTokenizer;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LengthFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
 * Analyzer that turns file contents into index terms.
 *
 * <p>The reader is first tokenized by {@link StandardTokenizer}; the resulting
 * tokens are then split further (on non-alphanumerics, on dots/hyphens and on
 * camel-case boundaries), filtered by length and finally lower-cased.
 */
public class FileAnalyzer extends Analyzer {

    /** Maximum token length handed to the {@link LengthFilter}. */
    private static final int MAX_WORD_LENGTH = 128;

    /** Minimum token length handed to the {@link LengthFilter}. */
    private final int minWordLength;

    /**
     * Creates an analyzer with the given lower length limit for tokens.
     *
     * @param minWordLength minimum length passed to the length filter
     */
    public FileAnalyzer(int minWordLength) {
        super();
        this.minWordLength = minWordLength;
    }

    /**
     * Builds the token pipeline for the given reader.
     *
     * <p>Every stage wraps the previous one, so tokens flow through them in
     * the order in which they are declared below.
     *
     * @param reader source of the text to analyze
     * @return the outermost stream of the pipeline (the lower-casing filter)
     */
    public TokenStream tokenStream(Reader reader) {
        // Initial tokenization: splits at ". ", etc.
        final TokenStream standardTokens = new StandardTokenizer(reader);
        // When debugging, a SysoFilter can be wrapped around a stage here to print its terms.

        // Split at non-alphanumerics.
        final TokenStream wordTokens = new WordSplitTokenizer(standardTokens);
        // Split all.package.names and hyphen-separated-words.
        final TokenStream dotSplitTokens = new DotSplitTokenizer(wordTokens);
        // Split CamelCaseIdentifiers.
        final TokenStream camelCaseTokens = new CamelCaseTokenizer(dotSplitTokens);

        // Apply the length limits, then normalize case.
        final TokenStream sizedTokens =
                new LengthFilter(camelCaseTokens, minWordLength, MAX_WORD_LENGTH);
        return new LowerCaseFilter(sizedTokens);
    }

    /**
     * Builds the token pipeline for a field. The field name is not used: every
     * field goes through the same pipeline as {@link #tokenStream(Reader)}.
     *
     * @param fieldName name of the field being analyzed (ignored)
     * @param reader source of the text to analyze
     * @return the token stream produced by {@link #tokenStream(Reader)}
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return tokenStream(reader);
    }

    /**
     * Debugging aid: a pass-through filter that prints each term it sees to
     * standard output.
     */
    public static class SysoFilter extends TokenFilter {

        /** Term attribute of the wrapped stream, read when printing. */
        private final TermAttribute termAtt;

        /**
         * Wraps the given stream.
         *
         * @param input stream whose terms should be printed
         */
        public SysoFilter(TokenStream input) {
            super(input);
            this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        }

        /**
         * Advances the wrapped stream and, if a token is available, prints its term.
         *
         * @return {@code true} if the wrapped stream produced a token, {@code false} otherwise
         * @throws IOException if the wrapped stream fails to advance
         */
        @Override
        public boolean incrementToken() throws IOException {
            final boolean hasToken = input.incrementToken();
            if (hasToken) {
                System.out.println("TERM: " + termAtt.term());
            }
            return hasToken;
        }
    }
}