Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Token


  // overridden from TokenStream: return one Token per string in the backing iterator, or null at end of stream
  public Token next() throws IOException {
    if (mIter.hasNext()) {
      String text = (String) mIter.next();
      return new Token(text, 0, text.length());
    } else {
      return null;
    }
  }
View Full Code Here
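
In context, this next() override belongs to a TokenStream subclass that wraps a list of pre-split strings. A minimal sketch of such a wrapper, against the same pre-2.9 Lucene API used throughout this page (the class name, field, and constructor are assumptions, not from the source above):

import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class IteratorTokenStream extends TokenStream {
  private final Iterator<String> mIter;

  public IteratorTokenStream(Iterator<String> iter) { mIter = iter; }

  // overridden: emit one Token per string; offsets run from 0 to the
  // string length because the original character positions are unknown
  public Token next() throws IOException {
    if (mIter.hasNext()) {
      String text = mIter.next();
      return new Token(text, 0, text.length());
    }
    return null;
  }
}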


    Token[] tokens = tokensFromAnalysis(analyzer, text);

    int position = 0;
    for (int i = 0; i < tokens.length; i++)
    {
      Token token = tokens[i];
      int increment = token.getPositionIncrement();
      if (increment > 0)
      { position = position + increment;
        System.out.println();
        System.out.print(position + ": ");
      }
      System.out.print("\t [" + token.termText() + ": " + token.type() + "] "
                       + token.startOffset() + ":" + token.endOffset());
    } //*-- end of for
    System.out.println();
  }
View Full Code Here
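
The loop prints one line per token position; tokens with a position increment of 0 are appended to the current line, so stacked tokens (synonyms, bigrams) stay visually grouped. A small driver, assuming the loop above is wrapped in a method named displayTokens (the method name and the choice of StandardAnalyzer are assumptions):

  displayTokens(new StandardAnalyzer(), "The quick brown fox");
  //*-- prints something like (StandardAnalyzer drops the stop word "The"):
  //*-- 1:   [quick: <ALPHANUM>] 4:9
  //*-- 2:   [brown: <ALPHANUM>] 10:15
  //*-- 3:   [fox: <ALPHANUM>] 16:19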

  /**
   * Use the passed analyzer to get a list of tokens from the text
   */
  private static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException
  {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    ArrayList<Token> tokenList = new ArrayList<Token>();
    Token token = null;
    while ((token = stream.next()) != null) tokenList.add(token);
    Token[] tokens = new Token[tokenList.size()];
    for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
    return tokens;
  }
View Full Code Here
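
The manual copy loop at the end can be replaced by the standard collections idiom with identical behavior:

    return tokenList.toArray(new Token[tokenList.size()]);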

  String[] synonyms = synBuffer.toString().trim().split(" ");
*/
  //*-- tokenize the question
  //*-- tokenize the question
  StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer();
  analyzer.setExtractEntities(true);
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
  ArrayList<Token> tokenList = new ArrayList<Token>();
  Token token = null;
  entities = new ArrayList<String>();    //*-- list of entities in the question
  while ((token = stream.next()) != null)
   { tokenList.add(token);
     if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
  //*-------------------------------------------------------------------
  //*-- build the query with the five components
  //*--
  //*-- 1. First identify the entity types for the query
View Full Code Here
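
The snippet ends just as the query is being assembled, so the five components are not shown. As a purely hypothetical sketch of step 1 with the classic BooleanQuery API (the field name "etype" and the SHOULD clauses are invented for illustration, not recovered from the source):

  BooleanQuery query = new BooleanQuery();
  for (String entityType : entities)
    query.add(new TermQuery(new Term("etype", entityType)), BooleanClause.Occur.SHOULD);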

     int beginIndex = (prevSentence.length() > addChars) ? prevSentence.length() - addChars: 0;
     sentence = prevSentence.substring(beginIndex, prevSentence.length()) + sentence;
   }
   //*-- build a list of tokens from the sentence
   TokenStream stream = analyzer.tokenStream("contents", new StringReader(sentence));
   ArrayList<Token> tokenList = new ArrayList<Token>();
   Token token = null;
   while ((token = stream.next()) != null) tokenList.add(token);
  
   //*-- initialize the boolean arrays and scores
   boolean[] foundWords = new boolean[tokenList.size()];
   boolean[] foundNouns = new boolean[nouns.length];   //*-- one flag per query noun
   for (int i = 0; i < nouns.length; i++) foundNouns[i] = false;
View Full Code Here
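
The fragment breaks off before the matching logic. A plausible continuation that marks which query nouns occur in the sentence, written as an assumption about the elided code rather than the original:

   //*-- mark the query nouns that appear among the sentence tokens
   for (Token t : tokenList) {
     String word = t.termText();
     for (int j = 0; j < nouns.length; j++)
       if (nouns[j].equals(word)) foundNouns[j] = true;
   }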

     /**
      * Use the passed analyzer to get a list of tokens from the text
      */
     private static Token[] tokensFromAnalysis(Analyzer analyzer, String text)
     {
       TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
       ArrayList<Token> tokenList = new ArrayList<Token>();
       Token token = null;
       try { while ((token = stream.next()) != null) tokenList.add(token); }
       catch (IOException ie) { System.err.println("Tokenizer problem: " + ie.getMessage()); }
       Token[] tokens = new Token[tokenList.size()];
       for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
       return tokens;
     }
View Full Code Here

  public final Token next() throws IOException
  {
  
   //*-- check for a saved token
   if (stoken != null)
    { Token tempToken = stoken; stoken = null;
      tempToken.setPositionIncrement(0); return(tempToken); }
  
   Token ctoken = input.next();
   if (ctoken == null) return (null);
   String ctext = ctoken.termText();
  
   //*-- check for a unigram entity
   String etype = ehash.get(ctext);
   if (etype != null) stoken = new Token(etype, ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
  
   //*-- check for a bigram entity
   if (ptoken != null)   
   { String bigram = ptoken.termText() + " " + ctext;
     etype = ehash.get(bigram);
     if (etype != null)
      stoken = new Token(etype, ptoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
   }
  
   if (number.matcher(ctext).matches())
     stoken = new Token("enumber", ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
  
   ptoken = ctoken;
   return (ctoken);
  }
View Full Code Here
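
This filter depends on an entity lookup table (ehash) and a numeric pattern (number) initialized elsewhere in the class. A hypothetical setup, with invented entity names and type codes:

   HashMap<String, String> ehash = new HashMap<String, String>();
   ehash.put("paris", "eplace");      //*-- unigram entity
   ehash.put("new york", "eplace");   //*-- bigram entity, matched against ptoken + " " + ctext
   Pattern number = Pattern.compile("\\d+");

Note that the filter always returns the original token first and emits the saved entity token on the following call, at position increment 0, so the entity stacks on the same position as the word that triggered it.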

     */
    public final Token next() throws IOException
    {
     //*-- if a token was saved, return it
     if (stoken != null) //*-- no. 1
     { Token tempToken = stoken; stoken = null;
       return(tempToken); }
     
     //*-- get the next token and check if it is a stop word
     ctoken = input.next();
     if (ctoken == null) return (null);    //*-- nos. 7 and 8
     String ctokenText = ctoken.termText();
     //*-- stopword status: true if in the stopwords list, or if the token does NOT contain an alphabetic character
     cstop = stopWords.contains(ctokenText) || !wordPattern.matcher(ctokenText).matches();
    
     //*-- if there was no previous token
     if (ptoken == null)
     { //*-- continue if the current token is a stop word
       if (cstop)   //*-- no. 5
       { ptoken = ctoken; pstop = cstop;
       ctoken = input.next(); cstop = (ctoken != null) ? stopWords.contains(ctoken.termText()): false;
       } //*-- otherwise, return the current token
       else     //*-- no. 6
       { ptoken = ctoken; pstop = cstop; return(ctoken); }
     }
    
     //*-- if the current token is a stop word, return a
     //*-- bigram from the previous and current tokens
     if (cstop)
      { Token tempToken = createBigram(ptoken, ctoken);
        if (!stopWords.contains(ptoken.termText()) ) tempToken.setPositionIncrement(0); //*-- no. 3
        ptoken = ctoken; pstop = cstop; return(tempToken); }    //*-- nos. 3 and 4
    
     //*-- if the previous token was a stop word, first save
     //*-- the current token and return a bigram from previous
     //*-- and current tokens. In next call, return the saved token
     if (pstop)    //*-- no. 1
     { if (ctoken != null)
      { stoken = ctoken; stoken.setPositionIncrement(0);
        Token tempToken = createBigram(ptoken, ctoken);
        ptoken = ctoken; pstop = cstop; return(tempToken);
      }
     }
    
     ptoken = ctoken; pstop = cstop;
View Full Code Here
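
Taken together, the cases above fuse stop words with their neighbors into bigram tokens that occupy the same positions as the original words, similar in spirit to Lucene's later CommonGrams approach. A worked trace for the input "out of africa", assuming "of" is the only stop word:

   //*-- call 1: "out" is not a stop word             -> return out       (increment 1)
   //*-- call 2: "of" is a stop word                  -> return out_of    (increment 0)
   //*-- call 3: previous token "of" was a stop word  -> return of_africa (increment 1), save africa
   //*-- call 4: a token was saved                    -> return africa    (increment 0)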

    }
  
    //*-- return a bigram token from the passed tokens
   protected Token createBigram(Token aToken, Token bToken)
   { String bigram = aToken.termText() + "_" + bToken.termText();
     return ( new Token(bigram, aToken.startOffset(), bToken.endOffset(), "<BIGRAM>" ) );
   }
View Full Code Here
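
The bigram's offsets span both constituents: for "out" at offsets 10:13 followed by "of" at 14:16, createBigram returns new Token("out_of", 10, 16, "<BIGRAM>").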
