package org.sf.mustru.utils;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import com.aliasi.tokenizer.Tokenizer;
/**
 * Adapts a Lucene {@link StandardBgramAnalyzer} to the LingPipe
 * {@link com.aliasi.tokenizer.Tokenizer} interface: the text is analyzed
 * eagerly in the constructor and the resulting Lucene tokens are then
 * served one at a time through <code>nextToken</code>.
 */
public class StandardBgramTokenizer extends Tokenizer
{
 StandardBgramAnalyzer analyzer = null; //*-- A Lucene based analyzer
 Token[] tokens = null;                 //*-- list of Lucene tokens produced up-front by the analyzer
 int itoken = -1;                       //*-- index of the NEXT token to return (0-based)
 /**
  * Generate a list of Lucene tokens from a character range.
  * @param ch Characters to tokenize.
  * @param offset Index of the first character to tokenize.
  * @param length Number of characters to tokenize, starting at offset.
  * @param extractEntities Whether the analyzer should also extract entities.
  */
 public StandardBgramTokenizer(char[] ch, int offset, int length, boolean extractEntities)
 { analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(extractEntities);
   //*-- build the text directly from the requested range; the previous
   //*-- substring(offset, length) call wrongly used length as an END index
   String text = new String(ch, offset, length);
   tokens = tokensFromAnalysis(analyzer, text);
   itoken = 0;
 }
 /**
  * Convenience constructor: tokenize a String range with entity extraction enabled.
  * @param text Text to tokenize.
  * @param offset Index of the first character to tokenize.
  * @param length Number of characters to tokenize, starting at offset.
  */
 public StandardBgramTokenizer(String text, int offset, int length)
 { this(text.toCharArray(), offset, length, true); }
 /**
  * Returns the offset of the first character of the most recently
  * returned token, or -1 if no token has been returned yet.
  *
  * @return The character offset of the first character of the most
  * recently returned token, or -1 if none has been returned.
  */
 public int lastTokenStartPosition()
 { //*-- nextToken post-increments itoken, so the last returned token is at itoken - 1;
   //*-- the old code read tokens[itoken].endOffset(): wrong token, wrong end of it,
   //*-- and an ArrayIndexOutOfBoundsException once the stream was exhausted
   if (itoken <= 0) return(-1);
   return(tokens[itoken - 1].startOffset());
 }
 /**
  * Returns the next whitespace. Returns the same result for
  * subsequent calls without a call to <code>nextToken</code>.
  * @return The next space.
  */
 public String nextWhitespace()
 { return (" "); }
 /**
  * Returns the next token in the stream, or <code>null</code> if
  * there are no more tokens. Flushes any whitespace that has
  * not been returned.
  *
  * @return The next token, or <code>null</code> if there are no
  * more tokens.
  */
 public String nextToken()
 { if (itoken < tokens.length) return (tokens[itoken++].termText());
   return(null);
 }
 /**
  * Returns a tokenized version of the specified string.
  *
  * @param phrase Characters to tokenize.
  * @return Array of tokens generated by characters.
  * @throws IOException never thrown in practice; retained for interface compatibility.
  */
 public static String[] tokenize(String phrase) throws IOException
 { return new StandardBgramTokenizer(phrase, 0, phrase.length()).tokenize(); }
 /**
  * Use the passed analyzer to get a list of tokens from the text.
  * IOExceptions from the analyzer are reported on stderr and tokenization
  * stops at that point, returning the tokens collected so far (best-effort).
  * @param analyzer Lucene analyzer used to produce the token stream.
  * @param text Text to analyze.
  * @return Array of tokens extracted from the text, possibly empty.
  */
 private static Token[] tokensFromAnalysis(Analyzer analyzer, String text)
 {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  ArrayList<Token> tokenList = new ArrayList<Token>();
  try
  { Token token;
    while ( (token = stream.next()) != null) tokenList.add(token);
    stream.close(); //*-- release any resources held by the token stream
  }
  catch (IOException ie) { System.err.println("Tokenizer problem: " + ie.getMessage()); }
  return (tokenList.toArray(new Token[tokenList.size()]));
 }
}