Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Token


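   * @param type the token type; the default should be "word"
   * @return a Token built from the segment's characters, with offsets shifted by sentenceStartOffset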
   */
  public Token convertSegToken(SegToken st, String sentence,
      int sentenceStartOffset, String type) {
    Token result;
    switch (st.wordType) {
      case STRING:
      case NUMBER:
      case FULLWIDTH_NUMBER:
      case FULLWIDTH_STRING:
        st.charArray = sentence.substring(st.startOffset, st.endOffset)
            .toCharArray();
        break;
      default:
        break;
    }

    st = tokenFilter.filter(st);

    result = new Token(st.charArray, 0, st.charArray.length, st.startOffset
        + sentenceStartOffset, st.endOffset + sentenceStartOffset);
    return result;
  }
View Full Code Here


    @Override
    public void reset() {
      this.i = -1;
      this.tokens = new Token[] {
          new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
          new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
          new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
          new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
          new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
          new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
      this.tokens[1].setPositionIncrement(0);
    }
View Full Code Here

      @Override
      public boolean incrementToken() throws IOException {
        if (currentToken >= tokens.length) {
          return false;
        }
        Token token = tokens[currentToken++];
        clearAttributes();
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        posincAtt
            .setPositionIncrement(currentToken <= 1
                || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
                    .startOffset() ? 1 : 0);
        return true;
      }
    }
    // Reconstruct the original sequence of Tokens from the term vector (tpv).
    String[] terms = tpv.getTerms();
    int[] freq = tpv.getTermFrequencies();
    // The total number of tokens is the sum of the per-term frequencies.
    int totalTokens = 0;
    for (int t = 0; t < freq.length; t++) {
      totalTokens += freq[t];
    }
    // Filled by position when positions are usable (see the else branch below).
    Token tokensInOriginalOrder[] = new Token[totalTokens];
    // Lazily created; collects tokens that could not be placed by position.
    ArrayList<Token> unsortedTokens = null;
    for (int t = 0; t < freq.length; t++) {
      // Offsets are mandatory: without them no Token can be built for this term.
      TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
      if (offsets == null) {
        throw new IllegalArgumentException(
            "Required TermVector Offset information was not found");
      }

      int[] pos = null;
      if (tokenPositionsGuaranteedContiguous) {
        // Try to get the token position info to speed up assembly of tokens
        // into a sorted sequence.
        pos = tpv.getTermPositions(t);
      }
      if (pos == null) {
        // Tokens were NOT stored with positions, or positions are not
        // guaranteed contiguous - they must be added to a list and sorted later.
        if (unsortedTokens == null) {
          unsortedTokens = new ArrayList<Token>();
        }
        // One Token per stored offset entry of this term.
        for (int tp = 0; tp < offsets.length; tp++) {
          Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp]
              .getEndOffset());
          unsortedTokens.add(token);
        }
      } else {
        // Positions are stored and the caller has guaranteed that the token
        // position information is contiguous.

        // This may be fast BUT it won't work if the Tokenizers used create
        // more than one token in the same position or create jumps in
        // position numbers - this code would fail under those circumstances.

        // Tokens are stored with positions, so each position can be used to
        // index straight into the sorted array.
        for (int tp = 0; tp < pos.length; tp++) {
          Token token = new Token(terms[t], offsets[tp].getStartOffset(),
              offsets[tp].getEndOffset());
          tokensInOriginalOrder[pos[tp]] = token;
        }
      }
    }
View Full Code Here

      // Emits the next token from the tokens array into the term, offset and
      // position-increment attributes; returns false once all tokens are used.
      // (This excerpt is cut off before the method's closing brace.)
      @Override
      public boolean incrementToken() throws IOException {
        if (currentToken >= tokens.length) {
          return false;
        }
        // Fetch the token at the cursor, then advance the cursor.
        Token token = tokens[currentToken++];
        clearAttributes();
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        // currentToken now points one past the emitted token. The increment is
        // 1 for the first token, or when the emitted token starts later than
        // the previous one; otherwise it is 0.
        posincAtt
            .setPositionIncrement(currentToken <= 1
                || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
                    .startOffset() ? 1 : 0);
        return true;
View Full Code Here

      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
      {
        lst = new ArrayList<Token>();
        Token t;
        t = createToken("hi", 0, 2);
        t.setPositionIncrement(1);
        lst.add(t);
        t = createToken("hispeed", 0, 8);
        t.setPositionIncrement(1);
        lst.add(t);
        t = createToken("speed", 3, 8);
        t.setPositionIncrement(0);
        lst.add(t);
        t = createToken("10", 8, 10);
        t.setPositionIncrement(1);
        lst.add(t);
        t = createToken("foo", 11, 14);
        t.setPositionIncrement(1);
        lst.add(t);
        iter = lst.iterator();
      }

      @Override
      public boolean incrementToken() throws IOException {
        if(iter.hasNext()) {
          Token token =  iter.next();
          clearAttributes();
          termAtt.setEmpty().append(token);
          posIncrAtt.setPositionIncrement(token.getPositionIncrement());
          offsetAtt.setOffset(token.startOffset(), token.endOffset());
          return true;
        }
        return false;
      }
    
View Full Code Here

      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
      {
        lst = new ArrayList<Token>();
        Token t;
        t = createToken("hispeed", 0, 8);
        t.setPositionIncrement(1);
        lst.add(t);
        t = createToken("hi", 0, 2);
        t.setPositionIncrement(0);
        lst.add(t);
        t = createToken("speed", 3, 8);
        t.setPositionIncrement(1);
        lst.add(t);
        t = createToken("10", 8, 10);
        t.setPositionIncrement(1);
        lst.add(t);
        t = createToken("foo", 11, 14);
        t.setPositionIncrement(1);
        lst.add(t);
        iter = lst.iterator();
      }

      @Override
      public boolean incrementToken() throws IOException {
        if(iter.hasNext()) {
          Token token = iter.next();
          clearAttributes();
          termAtt.setEmpty().append(token);
          posIncrAtt.setPositionIncrement(token.getPositionIncrement());
          offsetAtt.setOffset(token.startOffset(), token.endOffset());
          return true;
        }
        return false;
      }
    };
View Full Code Here

  }

  private static Token createToken(String term, int start, int offset)
  {
    return new Token(term, start, offset);
  }
View Full Code Here

    }

    // This loop exists to avoid recursive calls to the next method: with a
    // large matrix the recursion depth would require a multi-gigabyte stack.
    // produceNextToken is retried for as long as it hands back the
    // request_next_token sentinel (compared by identity).
    Token token;
    do {
      token = produceNextToken(reusableToken);
    } while (token == request_next_token);
    // A null result ends the stream.
    if (token == null) return false;

    // Copy every aspect of the produced token into this stream's attributes.
    clearAttributes();
    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }
View Full Code Here

        // Running total of the characters needed for the shingle's text.
        int termLength = 0;

        List<Token> shingle = new ArrayList<Token>(currentShingleLength);

        // Collect the tokens of this shingle, starting at the current start
        // offset within the permutation's token list.
        for (int i = 0; i < currentShingleLength; i++) {
          Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
          termLength += shingleToken.termLength();
          shingle.add(shingleToken);
        }
        // One spacer character goes between each pair of adjacent tokens.
        if (spacerCharacter != null) {
          termLength += currentShingleLength - 1;
        }

        // Only produce shingles that have not already been created: when
        // add() returns false, hand back the sentinel so the caller asks for
        // the next token instead.
        if (!shinglesSeen.add(shingle)) {
          return request_next_token;
        }

        // shingle token factory
        // The builder is sized to the computed length plus 10 characters of headroom.
        StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
        for (Token shingleToken : shingle) {
          // Insert the spacer before every token except the first.
          if (spacerCharacter != null && sb.length() > 0) {
            sb.append(spacerCharacter);
          }
          sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
        }
        // Store the joined text in the reusable token, then let updateToken
        // fill in the rest from the shingle and the current permutation state.
        reusableToken.setTermBuffer(sb.toString());
        updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);

        return reusableToken;
View Full Code Here

   * @return true if one more column could be read from the input token stream
   * @throws IOException if the matrix source input stream throws an exception
   */
  private boolean readColumn() throws IOException {
    // (This excerpt is cut off before the end of the method.)

    // Start with the token left in readColumnBuf, if any; otherwise read a
    // fresh one from the input.
    Token token;
    if (readColumnBuf != null) {
      token = readColumnBuf;
      readColumnBuf = null;
    } else {
      token = getNextInputToken(new Token());
    }

    // No token available: there is no further column to read.
    if (token == null) {
      return false;
    }

    // A new column always begins with one row holding the first token.
    Matrix.Column currentReaderColumn = matrix.new Column();
    Matrix.Column.Row currentReaderRow = currentReaderColumn.new Row();

    currentReaderRow.getTokens().add(token);
    TokenPositioner tokenPositioner;
    // Keep reading tokens into this column until the input ends or a token
    // is positioned as newColumn. That token stays in readColumnBuf.
    while ((readColumnBuf = getNextInputToken(new Token())) != null
        && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {

      if (tokenPositioner == TokenPositioner.sameRow) {
        // Same row: append to the row currently being filled.
        currentReaderRow.getTokens().add(readColumnBuf);
      } else /*if (tokenPositioner == TokenPositioner.newRow)*/ {
        // Any other positioner: open a new row in this column for the token.
        currentReaderRow = currentReaderColumn.new Row();
        currentReaderRow.getTokens().add(readColumnBuf);
      }
      // The buffered token has been consumed.
      readColumnBuf = null;

    }

    // If nothing is buffered, try one more read; if that also yields nothing,
    // mark this column as the last one.
    if (readColumnBuf == null) {
      readColumnBuf = getNextInputToken(new Token());
      if (readColumnBuf == null) {
        currentReaderColumn.setLast(true);
      }
    }

View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.Token

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.