/**
 * Emits the next buffered {@code Token}, copying its term text and offsets
 * into this stream's attributes.
 *
 * @return {@code true} if a token was produced, {@code false} once the
 *         buffered array is exhausted
 * @throws IOException declared by the TokenStream contract (not thrown here)
 */
public boolean incrementToken() throws IOException {
boolean hasMore = currentToken < tokens.length;
if (hasMore) {
// Reset attribute state before populating it from the buffered token.
clearAttributes();
Token next = tokens[currentToken];
currentToken++;
termAtt.setTermBuffer(next.term());
offsetAtt.setOffset(next.startOffset(), next.endOffset());
}
return hasMore;
}
}
//code to reconstruct the original sequence of Tokens
// NOTE(review): this span is the interior of a larger method; `tpv` (a term
// position vector) and `tokenPositionsGuaranteedContiguous` come from the
// enclosing scope not visible here.
String[] terms=tpv.getTerms();
int[] freq=tpv.getTermFrequencies();
// Total number of token occurrences = sum of each term's frequency; this
// sizes the position-indexed output array.
int totalTokens=0;
for (int t = 0; t < freq.length; t++)
{
totalTokens+=freq[t];
}
Token tokensInOriginalOrder[]=new Token[totalTokens];
// Lazily created only when position info is unavailable and we must sort later.
ArrayList<Token> unsortedTokens = null;
for (int t = 0; t < freq.length; t++)
{
TermVectorOffsetInfo[] offsets=tpv.getOffsets(t);
// Offsets are mandatory for reconstruction; bail out entirely if any term
// was indexed without them. NOTE(review): presumably the enclosing method
// returns a nullable TokenStream — confirm callers handle null.
if(offsets==null)
{
return null;
}
int[] pos=null;
if(tokenPositionsGuaranteedContiguous)
{
//try get the token position info to speed up assembly of tokens into sorted sequence
pos=tpv.getTermPositions(t);
}
if(pos==null)
{
//tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
if(unsortedTokens==null)
{
unsortedTokens=new ArrayList<Token>();
}
// One Token per occurrence of this term, carrying its start/end offsets.
for (int tp = 0; tp < offsets.length; tp++)
{
Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
token.setTermBuffer(terms[t]);
unsortedTokens.add(token);
}
}
else
{
//We have positions stored and a guarantee that the token position information is contiguous
// This may be fast BUT wont work if Tokenizers used which create >1 token in same position or
// creates jumps in position numbers - this code would fail under those circumstances
//tokens stored with positions - can use this to index straight into sorted array
// NOTE(review): this loop assumes offsets.length == pos.length and that every
// pos[tp] < totalTokens; a tokenizer emitting position increments != 1 would
// break both assumptions — verify against the contiguity guarantee above.
for (int tp = 0; tp < pos.length; tp++)
{
Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
tokensInOriginalOrder[pos[tp]] = token;
}
}
}
//If the field has been stored without position data we must perform a sort