Package org.sf.mustru.utils

Source Code of org.sf.mustru.utils.StandardBgramTokenizer

package org.sf.mustru.utils;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import com.aliasi.tokenizer.Tokenizer;

/**
*
*/
public class StandardBgramTokenizer extends Tokenizer
{
   StandardBgramAnalyzer analyzer = null//*-- A Lucene based analyzer
   Token[] tokens = null;      //*-- list of Lucene tokens
   int itoken = -1;        //*-- the current token position
  
    /**
     * Generate a list of Lucene tokens
     * @param ch Characters to tokenize.
     */
    public StandardBgramTokenizer(char[] ch, int offset, int length, boolean extractEntities)
    { analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(extractEntities);
      String text = new String(ch);
      if ( (offset != 0) || (length != ch.length) ) text = text.substring(offset, length);
      tokens = tokensFromAnalysis(analyzer, text);
      itoken = 0;
    }
   
    public StandardBgramTokenizer(String text, int offset, int length)
    { this(text.toCharArray(), offset, length, true); }
   
    /**
     * Returns the offset of the first character of the most recently
     * returned token, or the first character if no token or space has
     * been returned.
     *
     * @return The character offset of the first character of the most
     * recently returned token.
     */
    public int lastTokenStartPosition()
    { if (itoken < 0) return(-1);
      return(tokens[itoken].endOffset());
    }


    /**
     * Returns the next whitespace.  Returns the same result for
     * subsequent calls without a call to <code>nextToken</code>.
     * @return The next space.
     */
    public String nextWhitespace()
    {  return (" "); }

    /**
     * Returns the next token in the stream, or <code>null</code> if
     * there are no more tokens.  Flushes any whitespace that has
     * not been returned.
     *
     * @return The next token, or <code>null</code> if there are no
     * more tokens.
     */
    public String nextToken() 
    { if (itoken < tokens.length) return (tokens[itoken++].termText());
      return(null);
    }
   
    /**
     * Returns a tokenized version of the specified string.
     *
     * @param phrase Characters to tokenize.
     * @return Array of tokens generated by characters.
     */
    public static String[] tokenize(String phrase) throws IOException
    { return new StandardBgramTokenizer(phrase, 0, phrase.length()).tokenize(); }
   
    /**
     * Use the passed analyzer to get a list of tokens from the text
     */
    private static Token[] tokensFromAnalysis(Analyzer analyzer, String text
    {
      TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
      ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
      try { while ( (token = stream.next()) != null) tokenList.add(token); }
      catch (IOException ie) { System.err.println("Tokenizer problem: " + ie.getMessage()); }
      Token[] tokens = new Token[tokenList.size()];
      for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
      return (tokens);
    }
   
}
TOP

Related Classes of org.sf.mustru.utils.StandardBgramTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.