Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Token


   
          public boolean incrementToken() throws IOException {
            if (currentToken >= tokens.length) {
              return false;
            }
            Token token = tokens[currentToken++];
            termAtt.setTermBuffer(token.term());
            offsetAtt.setOffset(token.startOffset(), token.endOffset());
            return true;
          }
        }     
        //code to reconstruct the original sequence of Tokens
        String[] terms=tpv.getTerms();         
        int[] freq=tpv.getTermFrequencies();
        int totalTokens=0;

        for (int t = 0; t < freq.length; t++)
        {
            totalTokens+=freq[t];
        }
        Token tokensInOriginalOrder[]=new Token[totalTokens];
        ArrayList unsortedTokens = null;
        for (int t = 0; t < freq.length; t++)
        {
            TermVectorOffsetInfo[] offsets=tpv.getOffsets(t);
            if(offsets==null)
            {
                return null;
            }
           
            int[] pos=null;
            if(tokenPositionsGuaranteedContiguous)
            {
                //try to get the token position info to speed up assembly of tokens into a sorted sequence
                pos=tpv.getTermPositions(t);
            }
            if(pos==null)
            { 
                //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                if(unsortedTokens==null)
                {
                    unsortedTokens=new ArrayList();
                }
                for (int tp = 0; tp < offsets.length; tp++)
                {
                  Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
                  token.setTermBuffer(terms[t]);
                  unsortedTokens.add(token);
                }
            }
            else
            {
                //We have positions stored and a guarantee that the token position information is contiguous
               
                // This may be fast BUT won't work if Tokenizers are used which create >1 token in the same position
                // or create jumps in position numbers - this code would fail under those circumstances
               
                //tokens stored with positions - can use this to index straight into sorted array
                for (int tp = 0; tp < pos.length; tp++)
                {
                  Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
                  tokensInOriginalOrder[pos[tp]] = token;
                }               
            }
        }
        //If the field has been stored without position data we must perform a sort       
        if(unsortedTokens!=null)
        {
            tokensInOriginalOrder=(Token[]) unsortedTokens.toArray(new Token[unsortedTokens.size()]);
            Arrays.sort(tokensInOriginalOrder, new Comparator(){
                public int compare(Object o1, Object o2)
                {
                    Token t1=(Token) o1;
                    Token t2=(Token) o2;
                    if(t1.startOffset()>t2.startOffset())
                        return 1;
                    if(t1.startOffset()<t2.startOffset())
                        return -1;
                    return 0;
                }});
        }
        return new StoredTokenStream(tokensInOriginalOrder);
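
The snippet above rebuilds the original token sequence from stored term-vector data (terms, frequencies, offsets and, where available, positions) and wraps it in a TokenStream. Below is a minimal consumer sketch, assuming the Lucene 2.9/3.x attribute API used above; the method name consumeReconstructedStream and the stream argument are illustrative, not part of the original code.

          import java.io.IOException;
          import org.apache.lucene.analysis.TokenStream;
          import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
          import org.apache.lucene.analysis.tokenattributes.TermAttribute;

          // Hedged sketch: iterate a reconstructed stream (e.g. the StoredTokenStream returned above)
          // and print each token's term text and character offsets.
          static void consumeReconstructedStream(TokenStream stream) throws IOException {
            TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            while (stream.incrementToken()) {
              System.out.println(termAtt.term() + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
            }
          }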


   

    return tokenFactory(text, 1, 1f, startOffset, endOffset);
  }


  private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
    Token token = new Token(startOffset, endOffset);
    token.setTermBuffer(text);
    token.setPositionIncrement(posIncr);
    return token;
  }
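
In the factory above, the second argument is the position increment: 1 starts a new position, while 0 stacks the token onto the previous position (e.g. for a synonym). A hedged usage sketch with illustrative values:

    // Illustrative only: a short token sequence built with the helper above.
    Token first = tokenFactory("quick", 1, 0, 5);    // new position, offsets 0-5
    Token synonym = tokenFactory("fast", 0, 0, 5);   // posIncr 0: same position as "quick"
    Token second = tokenFactory("fox", 1, 6, 9);     // next position, offsets 6-9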

  private Token tokenFactory(String text, int posIncr, float weight) {
    return tokenFactory(text, posIncr, weight, 0, 0);
  }

  private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
    Token token = new Token(startOffset, endOffset);
    token.setTermBuffer(text);
    token.setPositionIncrement(posIncr);
    ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
    return token;
  }
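
This weighted variant additionally stores a per-token weight through ShingleMatrixFilter's default settings codec. A hedged usage sketch with illustrative values:

    // Illustrative only: a shingle token "hello_world" spanning offsets 0-11 with a weight of 2.0f.
    Token shingle = tokenFactory("hello_world", 1, 2.0f, 0, 11);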

    ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
    return token;
  }

  private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
    Token token = new Token(startOffset, endOffset);
    token.setTermBuffer(text);
    token.setPositionIncrement(posIncr);
    ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
    ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
    return token;
  }

    assertEquals(endOffset, offsetAtt.endOffset());
  }

  private static Token createToken(String term, int start, int offset)
  {
    Token token = new Token(start, offset);
    token.setTermBuffer(term);
    return token;
  }
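
In createToken above, the second and third arguments are the token's start and end character offsets. A hedged usage sketch with illustrative values:

    // Illustrative only: term text "lucene" covering characters 0-6 of the input.
    Token t = createToken("lucene", 0, 6);
    // t.term() == "lucene", t.startOffset() == 0, t.endOffset() == 6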

            Field f = new Field("e", i + " Heres Johnny!", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            f.setOmitNorms(true);
            document.add(f);
            if (i > 4) {
              final List<Token> tokens = new ArrayList<Token>(2);
              Token t = createToken("the", 0, 2, "text");
              t.setPayload(new Payload(new byte[]{1, 2, 3}));
              tokens.add(t);
              t = createToken("end", 3, 5, "text");
              t.setPayload(new Payload(new byte[]{2}));
              tokens.add(t);
              tokens.add(createToken("fin", 7, 9));
              final Token reusableToken = new Token();
              TokenStream ts = new TokenStream() {
                Iterator<Token> it = tokens.iterator();
               
                public final boolean incrementToken() throws IOException {
                  if (!it.hasNext()) {
                    return false;
                  }

                  reusableToken.reinit(it.next());
                  return true;
                }

                public void reset() throws IOException {
                  it = tokens.iterator();
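
The test above attaches Payload objects to individual Tokens before they are indexed. Below is a minimal sketch of reading such payloads back through the attribute API; the method name dumpPayloads and the assumption that the stream exposes a PayloadAttribute are illustrative, not part of the original test.

              import java.io.IOException;
              import org.apache.lucene.analysis.TokenStream;
              import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
              import org.apache.lucene.index.Payload;

              // Hedged sketch: iterate a stream and inspect each token's payload, if any.
              static void dumpPayloads(TokenStream stream) throws IOException {
                PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
                while (stream.incrementToken()) {
                  Payload payload = payloadAtt.getPayload();   // null for tokens without a payload
                  if (payload != null) {
                    byte[] data = payload.getData();           // e.g. {1, 2, 3} for "the" above
                    System.out.println("payload of " + data.length + " byte(s)");
                  }
                }
              }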


    return token;
  }

  private static Token createToken(String term, int start, int offset, String type)
  {
    Token token = new Token(start, offset, type);
    token.setTermBuffer(term);
    return token;
  }
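
This overload uses the Token(start, end, type) constructor, so the resulting token also carries a lexical type that can be read back with type(). A hedged usage sketch with illustrative values:

    // Illustrative only: a token typed as a number, covering characters 0-2.
    Token t = createToken("42", 0, 2, "<NUM>");
    // t.type() == "<NUM>", t.term() == "42"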

    assertTrue("Iterator should have 1 attributes left", it.hasNext());
    assertSame("Second AttributeImpl from iterator should be typeAtt", typeAtt, it.next());
    assertFalse("Iterator should have 0 attributes left", it.hasNext());

    src = new AttributeSource();
    src.addAttributeImpl(new Token());
    // this should not add a new attribute as Token implements CharTermAttribute, too
    termAtt = src.addAttribute(CharTermAttribute.class);
    assertTrue("CharTermAttribute should be implemented by Token", termAtt instanceof Token);
    // get the Token attribute and check that it is the only one
    it = src.getAttributeImplsIterator();
    Token tok = (Token) it.next();
    assertFalse("There should be only one attribute implementation instance", it.hasNext());
   
    termAtt.setEmpty().append("TestTerm");
    assertEquals("Token should only printed once", "("+tok.toString()+")", src.toString());
  }
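
The test works because Token itself implements the standard attribute interfaces, so a single Token registered with addAttributeImpl backs several attributes at once. A minimal sketch of that pattern, assuming the usual org.apache.lucene.util.AttributeSource and tokenattributes imports; the appended values are illustrative:

    AttributeSource src = new AttributeSource();
    Token tok = new Token();
    src.addAttributeImpl(tok);                                              // register the Token as the impl
    CharTermAttribute termAtt = src.addAttribute(CharTermAttribute.class);  // resolves to 'tok'
    OffsetAttribute offsetAtt = src.addAttribute(OffsetAttribute.class);    // same Token instance again
    termAtt.setEmpty().append("example");                                   // writes into the Token's term buffer
    offsetAtt.setOffset(0, 7);                                              // stored on the same Token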
