Examples of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
Determines the position of this token relative to the previous Token in a TokenStream, used in phrase searching.
The default value is one.
Some common uses for this are:
- Set it to zero to put multiple terms in the same position. This is useful if, e.g., a word has multiple stems. Searches for phrases including either stem will match. In this case, all but the first stem's increment should be set to zero: the increment of the first instance should be one. Repeating a token with an increment of zero can also be used to boost the scores of matches on that token.
- Set it to values greater than one to inhibit exact phrase matches. If, for example, one does not want phrases to match across removed stop words, then one could build a stop word filter that removes stop words and also sets the increment to the number of stop words removed before each non-stop word. Then exact phrase queries will only match when the terms occur with no intervening stop words.
@see org.apache.lucene.index.DocsAndPositionsEnum

    public CharsRef analyze(String text, CharsRef reuse) throws IOException {
      IOException priorException = null;
      TokenStream ts = analyzer.tokenStream("", text);
      try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        reuse.length = 0;
        while (ts.incrementToken()) {
          int length = termAtt.length();
          if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
          }
          if (posIncAtt.getPositionIncrement() != 1) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
          }
          reuse.grow(reuse.length + length + 1); /* current + word + separator */
          int end = reuse.offset + reuse.length;
          if (reuse.length > 0) {

View Full Code Here

    // Tokenize the input s through newStop (field name "test") and verify both
    // the surviving tokens and their position increments.
    TokenStream stream = newStop.tokenStream("test", s);
    try {
      assertNotNull(stream);
      // Index into expectedIncr; advanced once per token returned by the stream.
      int i = 0;
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);


      // reset() is called before the first incrementToken(), and end() after the last.
      stream.reset();
      while (stream.incrementToken()) {
        String text = termAtt.toString();
        // No token that comes out of the stream may be a member of stopWordsSet.
        assertFalse(stopWordsSet.contains(text));
        // The i-th token's position increment must equal the i-th expected value.
        assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
      }
      stream.end();
    } finally {
      // Runs on every exit path from the try block, including a failed assertion.
      IOUtils.closeWhileHandlingException(stream);
    }

View Full Code Here

    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    CachingTokenFilter buffer = null;
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    boolean hasMoreTokens = false;    
    
    TokenStream source = null;
    try {
      source = analyzer.tokenStream(field, queryText);
      source.reset();
      buffer = new CachingTokenFilter(source);
      buffer.reset();


      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }


      if (termAtt != null) {
        try {
          hasMoreTokens = buffer.incrementToken();
          while (hasMoreTokens) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
            hasMoreTokens = buffer.incrementToken();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Error analyzing query text", e);
    } finally {
      IOUtils.closeWhileHandlingException(source);
    }
    
    // rewind the buffer stream
    buffer.reset();


    BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();


    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        termAtt.fillBytesRef();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }
      return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
    } else {
      if (severalTokensAtSamePosition || (!quoted)) {
        if (positionCount == 1 || (!quoted)) {
          // no phrase query:
          
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            BooleanQuery q = newBooleanQuery(true);
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              Query currentQuery = newTermQuery(
                  new Term(field, BytesRef.deepCopyOf(bytes)));
              q.add(currentQuery, BooleanClause.Occur.SHOULD);
            }
            return q;
          } else {
            // multiple positions
            BooleanQuery q = newBooleanQuery(false);
            Query currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQuery)) {
                  Query t = currentQuery;
                  currentQuery = newBooleanQuery(true);
                  ((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD);
                }
                ((BooleanQuery)currentQuery).add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
              } else {
                if (currentQuery != null) {
                  q.add(currentQuery, operator);
                }
                currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
              }
            }
            q.add(currentQuery, operator);
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQuery mpq = newMultiPhraseQuery();
          mpq.setSlop(phraseSlop);
          List<Term> multiTerms = new ArrayList<Term>();
          int position = -1;
          for (int i = 0; i < numTokens; i++) {
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              termAtt.fillBytesRef();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            if (positionIncrement > 0 && multiTerms.size() > 0) {
              if (enablePositionIncrements) {
                mpq.add(multiTerms.toArray(new Term[0]),position);
              } else {
                mpq.add(multiTerms.toArray(new Term[0]));
              }
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
          }
          if (enablePositionIncrements) {
            mpq.add(multiTerms.toArray(new Term[0]),position);
          } else {
            mpq.add(multiTerms.toArray(new Term[0]));
          }
          return mpq;
        }
      } else {
        PhraseQuery pq = newPhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          int positionIncrement = 1;


          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }

View Full Code Here

      record.put( "isKeyword", narrowedAttr.isKeyword() );
      return record;
    }
    else if (attr instanceof PositionIncrementAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "PositionIncrementAttribute" ) );
      PositionIncrementAttribute narrowedAttr = (PositionIncrementAttribute) attr;
      record.put( "positionIncrement", narrowedAttr.getPositionIncrement() );
      return record;
    }
    else if (attr instanceof FlagsAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "FlagsAttribute" ) );
      FlagsAttribute narrowedAttr = (FlagsAttribute) attr;
      record.put( "flags", narrowedAttr.getFlags() );
      return record;
    }
    else if (attr instanceof TypeAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "TypeAttribute" ) );
      TypeAttribute narrowedAttr = (TypeAttribute) attr;
      record.put( "type", narrowedAttr.type() );
      return record;
    }
    else if (attr instanceof OffsetAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "OffsetAttribute" ) );
      OffsetAttribute narrowedAttr = (OffsetAttribute) attr;
      record.put( "startOffset", narrowedAttr.startOffset() );
      record.put( "endOffset", narrowedAttr.endOffset() );
      return record;
    }
    else if (attr instanceof Serializable) {
      return ByteBuffer.wrap( toByteArray( attr ) );
    }

View Full Code Here

    try {
      Reader reader = new StringReader( sentence );
      stream = queryContext.getQueryAnalyzer().reusableTokenStream( fieldName, reader);


      CharTermAttribute termAttribute = stream.addAttribute( CharTermAttribute.class );
      PositionIncrementAttribute positionAttribute = stream.addAttribute( PositionIncrementAttribute.class );


      stream.reset();
      int position = -1; //start at -1 since we apply at least one increment
      List<Term> termsAtSamePosition = null;
      while ( stream.incrementToken() ) {
        int positionIncrement = 1;
        if ( positionAttribute != null ) {
          positionIncrement = positionAttribute.getPositionIncrement();
        }


        if ( positionIncrement > 0 ) {
          position += positionIncrement;
          termsAtSamePosition = termsPerPosition.get( position );

View Full Code Here

      record.put("isKeyword", narrowedAttr.isKeyword() );
      return record;
    }
    else if (attr instanceof PositionIncrementAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "PositionIncrementAttribute" ) );
      PositionIncrementAttribute narrowedAttr = (PositionIncrementAttribute) attr;
      record.put("positionIncrement", narrowedAttr.getPositionIncrement() );
      return record;
    }
    else if (attr instanceof FlagsAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "FlagsAttribute" ) );
      FlagsAttribute narrowedAttr = (FlagsAttribute) attr;
      record.put("flags", narrowedAttr.getFlags() );
      return record;
    }
    else if (attr instanceof TypeAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "TypeAttribute" ) );
      TypeAttribute narrowedAttr = (TypeAttribute) attr;
      record.put("type", narrowedAttr.type() );
      return record;
    }
    else if (attr instanceof OffsetAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "OffsetAttribute" ) );
      OffsetAttribute narrowedAttr = (OffsetAttribute) attr;
      record.put("startOffset", narrowedAttr.startOffset() );
      record.put("endOffset", narrowedAttr.endOffset() );
      return record;
    }
    else if (attr instanceof Serializable) {
      return ByteBuffer.wrap( toByteArray(attr) );
    }

View Full Code Here

    try {
      Reader reader = new StringReader( sentence );
      stream = queryContext.getQueryAnalyzer().reusableTokenStream( fieldName, reader);


      TermAttribute termAttribute = stream.addAttribute( TermAttribute.class );
      PositionIncrementAttribute positionAttribute = stream.addAttribute( PositionIncrementAttribute.class );


      stream.reset();
      int position = -1; //start at -1 since we apply at least one increment
      List<Term> termsAtSamePosition = null;
      while ( stream.incrementToken() ) {
        int positionIncrement = 1;
        if ( positionAttribute != null ) {
          positionIncrement = positionAttribute.getPositionIncrement();
        }


        if ( positionIncrement > 0 ) {
          position+=positionIncrement;
          termsAtSamePosition = termsPerPosition.get(position);

View Full Code Here

    // Tokenize the input s through newStop (field name "test") and verify both
    // the surviving tokens and their position increments.
    TokenStream stream = newStop.tokenStream("test", s);
    try {
      assertNotNull(stream);
      // Index into expectedIncr; advanced once per token returned by the stream.
      int i = 0;
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);


      // reset() is called before the first incrementToken(), and end() after the last.
      stream.reset();
      while (stream.incrementToken()) {
        String text = termAtt.toString();
        // No token that comes out of the stream may be a member of stopWordsSet.
        assertFalse(stopWordsSet.contains(text));
        // The i-th token's position increment must equal the i-th expected value.
        assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
      }
      stream.end();
    } finally {
      // Runs on every exit path from the try block, including a failed assertion.
      IOUtils.closeWhileHandlingException(stream);
    }

View Full Code Here

      if (!fieldInfos.containsKey(fieldName)) {
        fieldInfos.put(fieldName, 
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false, this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS , null, null, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
      
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {

View Full Code Here

          boolean hasMoreTokens = stream.incrementToken();


          fieldState.attributeSource = stream;


          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);


          if (hasMoreTokens) {
            consumer.start(field);


            do {
              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID


              final int posIncr = posIncrAttribute.getPositionIncrement();
              if (posIncr < 0) {
                throw new IllegalArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.name() + "'");
              }
              if (fieldState.position == 0 && posIncr == 0) {
                throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
              }
              int position = fieldState.position + posIncr;
              if (position > 0) {
                // NOTE: confusing: this "mirrors" the
                // position++ we do below
                position--;
              } else if (position < 0) {
                throw new IllegalArgumentException("position overflow for field '" + field.name() + "'");
              }
              
              // position is legal, we can safely place it in fieldState now.
              // not sure if anything will use fieldState after non-aborting exc...
              fieldState.position = position;


              if (posIncr == 0)
                fieldState.numOverlap++;
              
              if (checkOffsets) {
                int startOffset = fieldState.offset + offsetAttribute.startOffset();
                int endOffset = fieldState.offset + offsetAttribute.endOffset();
                if (startOffset < 0 || endOffset < startOffset) {
                  throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                      + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.name() + "'");
                }
                if (startOffset < lastStartOffset) {
                  throw new IllegalArgumentException("offsets must not go backwards startOffset=" 
                       + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.name() + "'");
                }
                lastStartOffset = startOffset;
              }


              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;


              } finally {
                if (!success) {
                  docState.docWriter.setAborting();
                }
              }
              fieldState.length++;
              fieldState.position++;


            } while (stream.incrementToken());
          }
          // trigger streams to perform end-of-stream operations
          stream.end();
          // TODO: maybe add some safety? then again, its already checked 
          // when we come back around to the field...
          fieldState.position += posIncrAttribute.getPositionIncrement();
          fieldState.offset += offsetAttribute.endOffset();
          succeededInProcessingField = true;
          /* if success was false above there is an exception coming through and we won't get here.*/
          succeededInProcessingField = true;
        } finally {

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

com.flaptor.indextank.query.IndexEngineParser

com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest

com.o19s.RegexPathHierarchyTokenizerTest

com.tamingtext.texttamer.solr.NameFilterTest

com.tamingtext.texttamer.solr.SentenceTokenizerTest

lucli.LuceneMethods

net.sf.logsaw.index.internal.LuceneIndexServiceImpl

org.apache.lucene.analysis.core.TestDuelingAnalyzers

org.apache.lucene.analysis.core.TestStopAnalyzer

org.apache.lucene.analysis.core.TestStopFilter

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.