Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
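PositionIncrementAttribute reports the gap between the current token and the previous one: 1 for an ordinary successor, 0 for a token stacked at the same position (a synonym, for example), and more than 1 where tokens such as stop words were removed. Before the collected snippets (drawn from Lucene and Solr sources), here is a minimal self-contained sketch of reading the attribute from a TokenStream; the WhitespaceAnalyzer, field name, and Lucene 4.x-style APIs are illustrative assumptions, not taken from the snippets below.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class PositionIncrementDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream("body", "a quick brown fox");
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    int position = -1;  // summing the increments recovers absolute positions
    while (ts.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      System.out.println(position + ": " + termAtt);
    }
    ts.end();
    ts.close();
  }
}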


 
  private void doTestStopPositions(StopFilter stpf, boolean enableIncrements) throws IOException {
    log("---> test with enable-increments-" + (enableIncrements ? "enabled" : "disabled"));
    stpf.setEnablePositionIncrements(enableIncrements);
    CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
    stpf.reset();
    for (int i=0; i<20; i+=3) {
      assertTrue(stpf.incrementToken());
      log("Token "+i+": "+stpf);
      String w = English.intToEnglish(i).trim();
      assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
      assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
    }
    assertFalse(stpf.incrementToken());
    stpf.end();
    stpf.close();
  }
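
The StopFilter under test is constructed elsewhere; for context, here is a hedged sketch of observing the gaps such a filter leaves (the Lucene 4.x-style constructors and the English stop set are assumptions):

    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("the quick fox"));
    TokenStream stop = new StopFilter(Version.LUCENE_46, tok, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    PositionIncrementAttribute posIncr = stop.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute term = stop.addAttribute(CharTermAttribute.class);
    stop.reset();
    while (stop.incrementToken()) {
      // with increments enabled, "quick" reports an increment of 2
      // because the stop word "the" was removed before it
      System.out.println(term + " +" + posIncr.getPositionIncrement());
    }
    stop.end();
    stop.close();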


    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
     
      BytesRef[] lastTokens = new BytesRef[grams];
      //System.out.println("lookup: key='" + key + "'");
     
      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      BytesRef tokenBytes = termBytesAtt.getBytesRef();
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while(ts.incrementToken()) {
        termBytesAtt.fillBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();
       
        assert gramCount <= grams;
       
        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
      }
      ts.end();
     
      if (!sawRealToken) {
        throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
      }
     
      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();
     
      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef

    private int upto = 0;
    public boolean incrementToken() {
      if (upto == 4) {
        return false;
      }
      PositionIncrementAttribute posIncr = addAttribute(PositionIncrementAttribute.class);
      TermAttribute term = addAttribute(TermAttribute.class);
      if (upto == 0) {
        posIncr.setPositionIncrement(1);
        term.setTermBuffer("a");
      } else if (upto == 1) {
        posIncr.setPositionIncrement(1);
        term.setTermBuffer("b");
      } else if (upto == 2) {
        posIncr.setPositionIncrement(0);
        term.setTermBuffer("c");
      } else {
        posIncr.setPositionIncrement(0);
        term.setTermBuffer("d");
      }
      upto++;
      return true;
    }
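
Consuming this stream and summing the increments places the tokens at a=0, b=1, c=1, d=1: "c" and "d" are stacked on "b" because their increment is 0. A hedged consumption sketch (StackedTokenStream is a hypothetical name for whatever class declares the incrementToken() above):

    TokenStream ts = new StackedTokenStream();
    PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    int position = -1;
    while (ts.incrementToken()) {
      position += posIncr.getPositionIncrement();
      System.out.println(term.term() + " @ " + position);  // a @ 0, b @ 1, c @ 1, d @ 1
    }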

    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
   
    assertTrue(filter.incrementToken());

    assertEquals("accent", termAtt.term());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload());
  }

    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("this sentence"));
    int j = -1;
   
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
   
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.term();
      q.add(new Term("content", termText), j);
    }

    ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
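
Because j sums the increments, any gap a stop filter left behind is preserved in the phrase positions. A hedged equivalent with explicit positions (the terms are illustrative; PhraseQuery.add(Term, int) is the pre-5.0 mutable API):

    PhraseQuery q = new PhraseQuery();
    q.add(new Term("content", "quick"), 0);
    q.add(new Term("content", "fox"), 2);  // gap at position 1 where a stop word was removed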

      } catch (IOException e1) {
        throw new RuntimeException(e1);
      }
      CachingTokenFilter buffer = new CachingTokenFilter(source);

      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;

      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }

      try {

        while (buffer.incrementToken()) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;

          } else {
            severalTokensAtSamePosition = true;
          }

        }

      } catch (IOException e) {
        // ignore
      }

      try {
        // rewind the buffer stream
        buffer.reset();

        // close original stream - all tokens buffered
        source.close();
      } catch (IOException e) {
        // ignore
      }

      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }

      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();

      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext = buffer.incrementToken();
          assert hasNext;
          term = termAtt.toString();

        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }

        fieldNode.setText(term);

        return fieldNode;

      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          // no phrase query:
          LinkedList<QueryNode> children = new LinkedList<QueryNode>();

          for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext;
              term = termAtt.toString();

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            children.add(new FieldQueryNode(field, term, -1, -1));

          }
          if (positionCount == 1)
            return new GroupQueryNode(
              new StandardBooleanQueryNode(children, true));
          else
            return new StandardBooleanQueryNode(children, false);

        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

          List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {

              for (FieldQueryNode termNode : multiTerms) {

                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }

                mpq.add(termNode);

              }

              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;

              multiTerms.clear();

            }

            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));

          }

          for (FieldQueryNode termNode : multiTerms) {

            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);

            } else {
              termNode.setPositionIncrement(termGroupCount);
            }

            mpq.add(termNode);

          }

          return mpq;

        }

      } else {

        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

        int position = -1;

        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;

          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext;
            term = termAtt.toString();

            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }

          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }
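
When several tokens share one position, the MultiPhraseQueryNode built above corresponds to a MultiPhraseQuery in which each position holds an array of alternative terms. A minimal sketch with assumed synonym terms (pre-5.0 mutable API):

    MultiPhraseQuery mpq = new MultiPhraseQuery();
    mpq.add(new Term[] { new Term("content", "fast"), new Term("content", "quick") }, 0);  // stacked synonyms, increment 0
    mpq.add(new Term[] { new Term("content", "fox") }, 1);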

            boolean hasMoreTokens = stream.incrementToken();

            fieldState.attributeSource = stream;

            OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
           
            consumer.start(field);
           
            for(;;) {

              // If we hit an exception in stream.incrementToken() below
              // (which is fairly common, e.g. if the analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID
             
              if (!hasMoreTokens) break;
             
              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr;
              if (fieldState.position > 0) {
                fieldState.position--;
              }
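The next snippet is a later revision of the same indexing loop; it additionally validates the attribute values: a negative increment, or a zero increment at the first position, is rejected, positions are checked for overflow, and offsets must be non-negative and non-decreasing.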

          boolean hasMoreTokens = stream.incrementToken();

          fieldState.attributeSource = stream;

          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

          if (hasMoreTokens) {
            consumer.start(field);

            do {
              // If we hit an exception in stream.incrementToken() below
              // (which is fairly common, e.g. if the analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID

              final int posIncr = posIncrAttribute.getPositionIncrement();
              if (posIncr < 0) {
                throw new IllegalArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.name() + "'");
              }
              if (fieldState.position == 0 && posIncr == 0) {
                throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
              }
              int position = fieldState.position + posIncr;
              if (position > 0) {
                // NOTE: confusing: this "mirrors" the
                // position++ we do below
                position--;
              } else if (position < 0) {
                throw new IllegalArgumentException("position overflow for field '" + field.name() + "'");
              }
             
              // position is legal, we can safely place it in fieldState now.
              // not sure if anything will use fieldState after non-aborting exc...
              fieldState.position = position;

              if (posIncr == 0)
                fieldState.numOverlap++;
             
              if (checkOffsets) {
                int startOffset = fieldState.offset + offsetAttribute.startOffset();
                int endOffset = fieldState.offset + offsetAttribute.endOffset();
                if (startOffset < 0 || endOffset < startOffset) {
                  throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                      + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.name() + "'");
                }
                if (startOffset < lastStartOffset) {
                  throw new IllegalArgumentException("offsets must not go backwards startOffset="
                       + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.name() + "'");
                }
                lastStartOffset = startOffset;
              }

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;

              } finally {
                if (!success) {
                  docState.docWriter.setAborting();
                }
              }
              fieldState.length++;
              fieldState.position++;

            } while (stream.incrementToken());
          }
          // trigger streams to perform end-of-stream operations
          stream.end();
          // TODO: maybe add some safety? then again, it's already checked
          // when we come back around to the field...
          fieldState.position += posIncrAttribute.getPositionIncrement();
          fieldState.offset += offsetAttribute.endOffset();


          if (docState.maxTermPrefix != null) {
            final String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
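
The position bookkeeping above is easy to misread: the increment is added, the result is decremented once, and the position++ at the bottom of the loop restores it, so a token's indexed position works out to the running sum of increments minus one. A standalone arithmetic sketch (not Lucene API):

    int position = 0;
    for (int posIncr : new int[] { 1, 2, 1 }) {  // increment 2 leaves a one-token gap before the second token
      position += posIncr;
      System.out.println("indexed at " + (position - 1));  // prints 0, 2, 3
    }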

   *
   * @return List of tokens produced from the TokenStream
   */
  private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
      tokenStream.reset();
      int position = 0;
      while (tokenStream.incrementToken()) {
        position += posIncrAtt.getPositionIncrement();
        trackerAtt.setActPosition(position);
        tokens.add(tokenStream.cloneAttributes());
      }
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
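
Each cloned AttributeSource snapshots the stream's state at one token, so the returned list can be inspected after the stream is exhausted. A hedged usage sketch (assuming TokenTrackingAttribute exposes a getActPosition() accessor matching the setActPosition() used above):

    for (AttributeSource tok : tokens) {
      CharTermAttribute term = tok.getAttribute(CharTermAttribute.class);
      TokenTrackingAttribute tracker = tok.getAttribute(TokenTrackingAttribute.class);
      System.out.println(term + " @ position " + tracker.getActPosition());
    }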

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
   
    try {
      ts.reset();
      while (ts.incrementToken()){
        Token tok = new Token();
        tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
        tok.setFlags(flagsAtt.getFlags());
        tok.setPayload(payloadAtt.getPayload());
        tok.setPositionIncrement(posIncAtt.getPositionIncrement());
        tok.setType(typeAtt.type());
        result.add(tok);
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
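
The snippet above is truncated; after the loop, callers would typically finish and release the stream so that end-of-stream state, including the trailing position increment, is applied:

    ts.end();   // applies end-of-stream state such as the final position increment
    ts.close();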
