package org.sf.mustru.utils;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import com.aliasi.tokenizer.Tokenizer;
/**
 * Adapts a Lucene {@link StandardBgramAnalyzer} to the LingPipe
 * {@link com.aliasi.tokenizer.Tokenizer} interface: the text is analyzed
 * eagerly in the constructor and the resulting Lucene tokens are then
 * served one at a time through <code>nextToken</code>.
 */
public class StandardBgramTokenizer extends Tokenizer
{
 StandardBgramAnalyzer analyzer = null; //*-- A Lucene based analyzer
 Token[] tokens = null;                 //*-- list of Lucene tokens produced up-front by the analyzer
 int itoken = -1;                       //*-- index of the NEXT token to return (0-based)
 /**
  * Generate a list of Lucene tokens from a character range.
  * @param ch Characters to tokenize.
  * @param offset Index of the first character to tokenize.
  * @param length Number of characters to tokenize, starting at offset.
  * @param extractEntities Whether the analyzer should also extract entities.
  */
 public StandardBgramTokenizer(char[] ch, int offset, int length, boolean extractEntities)
 { analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(extractEntities);
   //*-- build the text directly from the requested range; the previous
   //*-- substring(offset, length) call wrongly used length as an END index
   String text = new String(ch, offset, length);
   tokens = tokensFromAnalysis(analyzer, text);
   itoken = 0;
 }
 /**
  * Convenience constructor: tokenize a String range with entity extraction enabled.
  * @param text Text to tokenize.
  * @param offset Index of the first character to tokenize.
  * @param length Number of characters to tokenize, starting at offset.
  */
 public StandardBgramTokenizer(String text, int offset, int length)
 { this(text.toCharArray(), offset, length, true); }
 /**
  * Returns the offset of the first character of the most recently
  * returned token, or -1 if no token has been returned yet.
  *
  * @return The character offset of the first character of the most
  * recently returned token, or -1 if none has been returned.
  */
 public int lastTokenStartPosition()
 { //*-- nextToken post-increments itoken, so the last returned token is at itoken - 1;
   //*-- the old code read tokens[itoken].endOffset(): wrong token, wrong end of it,
   //*-- and an ArrayIndexOutOfBoundsException once the stream was exhausted
   if (itoken <= 0) return(-1);
   return(tokens[itoken - 1].startOffset());
 }
 /**
  * Returns the next whitespace. Returns the same result for
  * subsequent calls without a call to <code>nextToken</code>.
  * @return The next space.
  */
 public String nextWhitespace()
 { return (" "); }
 /**
  * Returns the next token in the stream, or <code>null</code> if
  * there are no more tokens. Flushes any whitespace that has
  * not been returned.
  *
  * @return The next token, or <code>null</code> if there are no
  * more tokens.
  */
 public String nextToken()
 { if (itoken < tokens.length) return (tokens[itoken++].termText());
   return(null);
 }
 /**
  * Returns a tokenized version of the specified string.
  *
  * @param phrase Characters to tokenize.
  * @return Array of tokens generated by characters.
  * @throws IOException never thrown in practice; retained for interface compatibility.
  */
 public static String[] tokenize(String phrase) throws IOException
 { return new StandardBgramTokenizer(phrase, 0, phrase.length()).tokenize(); }
 /**
  * Use the passed analyzer to get a list of tokens from the text.
  * IOExceptions from the analyzer are reported on stderr and tokenization
  * stops at that point, returning the tokens collected so far (best-effort).
  * @param analyzer Lucene analyzer used to produce the token stream.
  * @param text Text to analyze.
  * @return Array of tokens extracted from the text, possibly empty.
  */
 private static Token[] tokensFromAnalysis(Analyzer analyzer, String text)
 {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  ArrayList<Token> tokenList = new ArrayList<Token>();
  try
  { Token token;
    while ( (token = stream.next()) != null) tokenList.add(token);
    stream.close(); //*-- release any resources held by the token stream
  }
  catch (IOException ie) { System.err.println("Tokenizer problem: " + ie.getMessage()); }
  return (tokenList.toArray(new Token[tokenList.size()]));
 }
}