Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Token

The excerpts below come from Lucene 2.9/3.0-era sources, where Token is both the unit of text produced by a TokenStream and an AttributeImpl that implements TermAttribute, OffsetAttribute, PositionIncrementAttribute, TypeAttribute, FlagsAttribute, and PayloadAttribute in a single class.


    // compute the hyphenation points for the current token
    Hyphenation hyphens = hyphenator.hyphenate(token.termBuffer(), 0, token.termLength(), 1, 1);
    if (hyphens == null) {
      return;    // no hyphenation points found
    }
    final int[] hyp = hyphens.getHyphenationPoints();
    char[] lowerCaseTermBuffer = makeLowerCaseCopy(token.termBuffer());

    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
      Token longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;

        // if the part is longer than maxSubwordSize we
        // are done with this round
        if (partLength > this.maxSubwordSize) {
          break;
        }

        // we only put subwords into the token stream
        // that are at least minSubwordSize characters long
        if (partLength < this.minSubwordSize) {
          continue;
        }

        // check the dictionary
        if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < partLength) {
                longestMatchToken = createToken(start, partLength, token);
              }
            } else {
              longestMatchToken = createToken(start, partLength, token);
            }
          } else {
            tokens.add(createToken(start, partLength, token));
          }
        } else if (dictionary.contains(lowerCaseTermBuffer, start,
            partLength - 1)) {
          // check the dictionary again with a word that is one character
          // shorter, to avoid problems with genitive 's' characters and
          // other binding characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < partLength - 1) {
                longestMatchToken = createToken(start, partLength - 1, token);
              }
            } else {
              longestMatchToken = createToken(start, partLength - 1, token);
            }
          } else {
            tokens.add(createToken(start, partLength - 1, token));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
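This loop is the heart of the hyphenation-based compound splitter (HyphenationCompoundWordTokenFilter): each hyphenation point is tried as the start of a subword, and only dictionary-confirmed parts between minSubwordSize and maxSubwordSize are emitted alongside the original compound. A minimal usage sketch, assuming the Lucene 2.9 contrib-analyzers API; the grammar file name is hypothetical and the constructor overloads vary between versions:

    import java.io.StringReader;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;

    // "de.xml" stands in for a real FOP hyphenation grammar file
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree("de.xml");
    Set dictionary = new HashSet(Arrays.asList(new String[] { "kaffee", "maschine" }));
    TokenStream ts = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("kaffeemaschine")),
        hyphenator, dictionary);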


  /** Clones a subword out of the prototype token: the clone keeps the
   *  prototype's offsets shifted by the subword position and is stacked
   *  on the same position (increment 0) as the original compound. */
  protected final Token createToken(final int offset, final int length,
      final Token prototype) {
    int newStart = prototype.startOffset() + offset;
    Token t = prototype.clone(prototype.termBuffer(), offset, length, newStart, newStart + length);
    t.setPositionIncrement(0);
    return t;
  }
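The prototype/clone pattern above can be exercised directly with the public Token API. A small sketch (term text and offsets invented for illustration) showing the same clone call and the stacking that setPositionIncrement(0) buys:

    import org.apache.lucene.analysis.Token;

    // a prototype token covering "kaffeemaschine" at offsets 10..24
    Token prototype = new Token(10, 24);
    prototype.setTermBuffer("kaffeemaschine");

    // clone the first six characters, as createToken(0, 6, prototype) would
    int newStart = prototype.startOffset() + 0;
    Token sub = prototype.clone(prototype.termBuffer(), 0, 6, newStart, newStart + 6);
    sub.setPositionIncrement(0);      // stacked on the compound's position
    System.out.println(sub.term());   // -> "kaffee"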

    // only words longer than minWordSize get decomposed at all
    if (token.termLength() < this.minWordSize) {
      return;
    }
   
    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.termBuffer());
   
    for (int i=0;i<token.termLength()-this.minSubwordSize;++i) {
        Token longestMatchToken=null;
        for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
            if(i+j>token.termLength()) {
                break;
            }
            if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
                if (this.onlyLongestMatch) {
                   if (longestMatchToken!=null) {
                     if (longestMatchToken.termLength()<j) {
                       longestMatchToken=createToken(i,j,token);
                     }
                   } else {
                     longestMatchToken=createToken(i,j,token);
                   }
                } else {
                    tokens.add(createToken(i,j,token));
                }
            }
        }
        if (this.onlyLongestMatch && longestMatchToken!=null) {
            tokens.add(longestMatchToken);
        }
    }
  }
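This is the dictionary-only variant, DictionaryCompoundWordTokenFilter: with no hyphenation points available, every offset is tried as a subword start. A usage sketch, assuming the 2.9 contrib constructor that accepts a String[] dictionary (overloads differ across versions):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    String[] dict = { "kaffee", "maschine" };
    TokenStream ts = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("kaffeemaschine")), dict);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term() + " +" + posIncr.getPositionIncrement());
    }
    // expected with this dictionary: kaffeemaschine +1, kaffee +0, maschine +0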

  private boolean prefixExhausted;

  @Override
  public final boolean incrementToken() throws IOException {
    if (!prefixExhausted) {
      Token nextToken = getNextPrefixInputToken(reusableToken);
      if (nextToken == null) {
        prefixExhausted = true;
      } else {
        previousPrefixToken.reinit(nextToken);
        // Make it a deep copy
        Payload p = previousPrefixToken.getPayload();
        if (p != null) {
          previousPrefixToken.setPayload((Payload) p.clone());
        }
        setCurrentToken(nextToken);
        return true;
      }
    }

    Token nextToken = getNextSuffixInputToken(reusableToken);
    if (nextToken == null) {
      return false;
    }

    nextToken = updateSuffixToken(nextToken, previousPrefixToken);
    setCurrentToken(nextToken);
    return true;
  }
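This incrementToken() belongs to contrib's PrefixAwareTokenFilter: it drains the prefix stream first, remembers the last prefix token (deep-copying its payload), then rewrites each suffix token relative to it. A sketch of the typical use, prepending a marker token; the marker text is made up:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.PrefixAwareTokenFilter;
    import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;

    Token marker = new Token(0, 0);
    marker.setTermBuffer("_start_");
    TokenStream ts = new PrefixAwareTokenFilter(
        new SingleTokenTokenStream(marker),
        new WhitespaceTokenizer(new StringReader("hello world")));
    // emits: _start_, hello, world (suffix offsets shifted past the prefix)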

          // reset the TokenStream to the first token         
          tokenStream.reset();

          while (tokenStream.incrementToken()) {
            // TODO: this is a simple workaround to keep working with whole Tokens;
            // not very efficient, but as far as I know this writer should be
            // removed soon anyway:
            final Token token = new Token();
            for (Iterator<AttributeImpl> atts = tokenStream.getAttributeImplsIterator(); atts.hasNext();) {
              final AttributeImpl att = atts.next();
              try {
                att.copyTo(token);
              } catch (Exception e) {
                // ignore unsupported attributes: copyTo may fail when a combined
                // AttributeImpl is used that implements the basic attributes
                // supported by Token together with other custom ones in a single
                // class.
              }
            }
            tokens.add(token); // the vector will be built on commit.
            fieldSetting.fieldLength++;
            if (fieldSetting.fieldLength > maxFieldLength) {
              break;
            }
          }
          tokenStream.end();
          tokenStream.close();
        } else {
          // untokenized
          String fieldVal = field.stringValue();
          Token token = new Token(0, fieldVal.length(), "untokenized");
          token.setTermBuffer(fieldVal);
          tokens.add(token);
          fieldSetting.fieldLength++;
        }
      }

      if (!field.isStored()) {
        it.remove();
      }
    }


    Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting =
        new HashMap<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>();
    termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);

    // build term vector, term positions and term offsets
    for (Map.Entry<Fieldable, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
      FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());

      Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
      if (termDocumentInformationFactoryByTermText == null) {
        termDocumentInformationFactoryByTermText = new HashMap<String /*text*/, TermDocumentInformationFactory>();
        termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
      }

      int lastOffset = 0;

      // when a field occurs more than once, advance the position by the
      // analyzer's position increment gap between the occurrences
      if (fieldSetting.position > 0) {
        // todo: what if no analyzer is set, or multiple fields share a name
        // and are indexed without tokenization?
        fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName);
      }

      for (Token token : eField_Tokens.getValue()) {

        TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term());
        if (termDocumentInformationFactory == null) {
          termDocumentInformationFactory = new TermDocumentInformationFactory();
          termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory);
        }
        //termDocumentInformationFactory.termFrequency++;

        fieldSetting.position += (token.getPositionIncrement() - 1);
        termDocumentInformationFactory.termPositions.add(fieldSetting.position++);

        if (token.getPayload() != null && token.getPayload().length() > 0) {
          termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
          fieldSetting.storePayloads = true;
        } else {
          termDocumentInformationFactory.payloads.add(null);
        }

        if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {

          termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset()));
          lastOffset = fieldSetting.offset + token.endOffset();
        }


      }

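The copy loop above exists only because this writer still wants whole Token objects. For comparison, a minimal sketch of the plain 2.9-era consumption pattern it works around; the analyzer, field name, and text are placeholders:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    TokenStream stream = analyzer.tokenStream("body", new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.term() + " [" + offset.startOffset() + ".." + offset.endOffset() + "]");
    }
    stream.end();    // records the end-of-stream offset state
    stream.close();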

    assertTrue("Iterator should have 1 attributes left", it.hasNext());
    assertSame("Second AttributeImpl from iterator should be typeAtt", typeAtt, it.next());
    assertFalse("Iterator should have 0 attributes left", it.hasNext());

    src = new AttributeSource();
    src.addAttributeImpl(new Token());
    // this should not add a new attribute as Token implements TermAttribute, too
    termAtt = src.addAttribute(TermAttribute.class);
    assertTrue("TermAttribute should be implemented by Token", termAtt instanceof Token);
    // get the Token attribute and check, that it is the only one
    it = src.getAttributeImplsIterator();
    Token tok = (Token) it.next();
    assertFalse("There should be only one attribute implementation instance", it.hasNext());
   
    termAtt.setTermBuffer("TestTerm");
    assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString());
  }
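The test hinges on AttributeSource registering one implementation object for every attribute interface it implements. A small sketch of that behavior (2.9-era API):

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.AttributeSource;

    AttributeSource src = new AttributeSource();
    src.addAttributeImpl(new Token());
    // both lookups resolve to the same Token instance
    TermAttribute term = src.addAttribute(TermAttribute.class);
    OffsetAttribute offset = src.addAttribute(OffsetAttribute.class);
    System.out.println(term == offset);   // true: one impl backs all interfaces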

            Field f = new Field("e", i + " Heres Johnny!", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            f.setOmitNorms(true);
            document.add(f);
            if (i > 4) {
              final List<Token> tokens = new ArrayList<Token>(2);
              Token t = createToken("the", 0, 2, "text");
              t.setPayload(new Payload(new byte[]{1, 2, 3}));
              tokens.add(t);
              t = createToken("end", 3, 5, "text");
              t.setPayload(new Payload(new byte[]{2}));
              tokens.add(t);
              tokens.add(createToken("fin", 7, 9));
              TokenStream ts = new TokenStream(Token.TOKEN_ATTRIBUTE_FACTORY) {
                final AttributeImpl reusableToken = (AttributeImpl) addAttribute(TermAttribute.class);
                Iterator<Token> it = tokens.iterator();
                @Override
                public boolean incrementToken() {
                  if (!it.hasNext()) {
                    return false;
                  }
                  // copy every attribute of the prepared Token into the
                  // stream's single Token-backed attribute instance
                  it.next().copyTo(reusableToken);
                  return true;
                }
              };
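Payloads attached like this travel into the postings and can be read back per position. A sketch using the 2.9-era TermPositions API; the reader variable and the field/term are placeholders:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermPositions;

    TermPositions tp = reader.termPositions(new Term("text", "the"));
    while (tp.next()) {
      for (int i = 0; i < tp.freq(); i++) {
        tp.nextPosition();
        if (tp.isPayloadAvailable()) {
          byte[] payload = tp.getPayload(new byte[tp.getPayloadLength()], 0);
          // payload now holds e.g. {1, 2, 3} for the token created above
        }
      }
    }
    tp.close();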

      if (expansions == null) {
        return true;
      }
      st = new StringTokenizer(expansions, ",");
      if (st.hasMoreTokens()) {
        currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
        currentRealToken.setTermBuffer(realTermAtt.term());
      }
     
      return true;
    } else {
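The elided else branch presumably walks the StringTokenizer and emits one expansion per incrementToken() call. A sketch of the usual synonym-injection pattern under that assumption, reusing the snippet's variables:

    Token syn = new Token(currentRealToken.startOffset(), currentRealToken.endOffset());
    syn.setTermBuffer(st.nextToken());
    syn.setPositionIncrement(0);   // stack the expansion on the real token's position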

    testReader.close();
  }

  private static Token createToken(String term, int start, int offset)
  {
    // 'offset' is the end offset: Token(int, int) takes (startOffset, endOffset)
    Token token = new Token(start, offset);
    token.setTermBuffer(term);
    return token;
  }

  private static Token createToken(String term, int start, int offset, String type)
  {
    // as above, 'offset' is the end offset; 'type' becomes the token's type
    Token token = new Token(start, offset, type);
    token.setTermBuffer(term);
    return token;
  }
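Both helpers just wrap the offset-taking Token constructors. A usage sketch:

    Token the = createToken("the", 0, 3);           // term "the", offsets [0..3), default type "word"
    Token end = createToken("end", 4, 7, "text");   // explicit token type "text"
    System.out.println(end.term() + " " + end.startOffset() + ".." + end.endOffset() + " " + end.type());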
