Examples of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute
Determines the position of this token relative to the previous Token in a TokenStream, used in phrase searching.
The default value is one.
Some common uses for this are:
- Set it to zero to put multiple terms in the same position. This is useful if, e.g., a word has multiple stems. Searches for phrases including either stem will match. In this case, all but the first stem's increment should be set to zero: the increment of the first instance should be one. Repeating a token with an increment of zero can also be used to boost the scores of matches on that token.
- Set it to values greater than one to inhibit exact phrase matches. If, for example, one does not want phrases to match across removed stop words, then one could build a stop word filter that removes stop words and also sets the increment to the number of stop words removed before each non-stop word. Then exact phrase queries will only match when the terms occur with no intervening stop words.
@see org.apache.lucene.index.DocsAndPositionsEnum

      } catch (IOException e1) {
        throw new RuntimeException(e1);
      }
      CachingTokenFilter buffer = new CachingTokenFilter(source);


      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;


      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }


      try {


        while (buffer.incrementToken()) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt
              .getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;


          } else {
            severalTokensAtSamePosition = true;
          }


        }


      } catch (IOException e) {
        // ignore
      }


      try {
        // rewind the buffer stream
        buffer.reset();


        // close original stream - all tokens buffered
        source.close();
      } catch (IOException e) {
        // ignore
      }


      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }


      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);


      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();


      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();


        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }


        fieldNode.setText(term);


        return fieldNode;


      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          // no phrase query:
          LinkedList<QueryNode> children = new LinkedList<QueryNode>();


          for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();


            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            children.add(new FieldQueryNode(field, term, -1, -1));


          }
          if (positionCount == 1)
            return new GroupQueryNode(
              new StandardBooleanQueryNode(children, true));
          else
            return new StandardBooleanQueryNode(children, false);


        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();


          List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }


            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            if (positionIncrement > 0 && multiTerms.size() > 0) {


              for (FieldQueryNode termNode : multiTerms) {


                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }


                mpq.add(termNode);


              }


              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;


              multiTerms.clear();


            }


            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));


          }


          for (FieldQueryNode termNode : multiTerms) {


            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);


            } else {
              termNode.setPositionIncrement(termGroupCount);
            }


            mpq.add(termNode);


          }


          return mpq;


        }


      } else {


        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();


        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;


          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();


            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }


          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }

View Full Code Here


    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("this sentence"));
    int j = -1;
    
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      q.add(new Term("content", termText), j);
    }


    ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;

View Full Code Here

      if (!fieldInfos.containsKey(fieldName)) {
        fieldInfos.put(fieldName, 
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false, this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS , null, null, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
      
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        if (ref.length == 0) continue; // nothing to do
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {

View Full Code Here

  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton a = new Automaton();
    boolean deterministic = true;


    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);


    final BytesRef term = termBytesAtt.getBytesRef();


    in.reset();


    // Only temporarily holds states ahead of our current
    // position:


    final RollingBuffer<Position> positions = new Positions();


    int pos = -1;
    Position posData = null;


    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      assert pos > -1 || posInc > 0;


      if (posInc > 0) {


        // New node:

View Full Code Here

    PhraseQuery q = new PhraseQuery();


    TokenStream ts = analyzer.tokenStream("content", new StringReader("this sentence"));
    int j = -1;
    
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      q.add(new Term("content", termText), j);
    }


    ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;

View Full Code Here

     *  separates by {@link SynonymMap#WORD_SEPARATOR}.
     *  reuse and its chars must not be null. */
    public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
      TokenStream ts = analyzer.tokenStream("", new StringReader(text));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      reuse.length = 0;
      while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
          throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        if (posIncAtt.getPositionIncrement() != 1) {
          throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
        }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {

View Full Code Here

    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    CachingTokenFilter buffer = null;
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    boolean hasMoreTokens = false;    
    
    TokenStream source = null;
    try {
      source = analyzer.tokenStream(field, queryText);
      source.reset();
      buffer = new CachingTokenFilter(source);
      buffer.reset();


      termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
      posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);


      if (termAtt != null) {
        try {
          hasMoreTokens = buffer.incrementToken();
          while (hasMoreTokens) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
            hasMoreTokens = buffer.incrementToken();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Error analyzing query text", e);
    } finally {
      IOUtils.closeWhileHandlingException(source);
    }
    
    // rewind the buffer stream
    buffer.reset();


    BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();


    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        termAtt.fillBytesRef();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }
      return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
    } else {
      if (severalTokensAtSamePosition || (!quoted)) {
        if (positionCount == 1 || (!quoted)) {
          // no phrase query:
          
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            BooleanQuery q = newBooleanQuery(true);
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              Query currentQuery = newTermQuery(
                  new Term(field, BytesRef.deepCopyOf(bytes)));
              q.add(currentQuery, BooleanClause.Occur.SHOULD);
            }
            return q;
          } else {
            // multiple positions
            BooleanQuery q = newBooleanQuery(false);
            Query currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQuery)) {
                  Query t = currentQuery;
                  currentQuery = newBooleanQuery(true);
                  ((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD);
                }
                ((BooleanQuery)currentQuery).add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
              } else {
                if (currentQuery != null) {
                  q.add(currentQuery, operator);
                }
                currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
              }
            }
            q.add(currentQuery, operator);
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQuery mpq = newMultiPhraseQuery();
          mpq.setSlop(phraseSlop);
          List<Term> multiTerms = new ArrayList<>();
          int position = -1;
          for (int i = 0; i < numTokens; i++) {
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              termAtt.fillBytesRef();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            if (positionIncrement > 0 && multiTerms.size() > 0) {
              if (enablePositionIncrements) {
                mpq.add(multiTerms.toArray(new Term[0]),position);
              } else {
                mpq.add(multiTerms.toArray(new Term[0]));
              }
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
          }
          if (enablePositionIncrements) {
            mpq.add(multiTerms.toArray(new Term[0]),position);
          } else {
            mpq.add(multiTerms.toArray(new Term[0]));
          }
          return mpq;
        }
      } else {
        PhraseQuery pq = newPhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          int positionIncrement = 1;


          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens
          }

View Full Code Here

      if (!fieldInfos.containsKey(fieldName)) {
        fieldInfos.put(fieldName, 
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false, this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS , null, null, -1, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
      
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {

View Full Code Here

   * 
   * @throws IOException If an I/O error occurs
   */
  public void end() throws IOException {
    clearAttributes(); // LUCENE-3849: don't consume dirty atts
    PositionIncrementAttribute posIncAtt = getAttribute(PositionIncrementAttribute.class);
    if (posIncAtt != null) {
      posIncAtt.setPositionIncrement(0);
    }
  }

View Full Code Here

      // false: they are entirely different code paths
      // under-the-hood:
      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);


      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);


      for(Token token : tokens) {
        assertTrue(ts.incrementToken());
        assertEquals(token.toString(), termAtt.toString());
        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
        assertEquals(token.getPayload(), payloadAtt.getPayload());
        assertEquals(token.startOffset(), offsetAtt.startOffset());
        assertEquals(token.endOffset(), offsetAtt.endOffset());
      }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute

com.flaptor.indextank.query.IndexEngineParser

com.github.le11.nls.lucene.UIMATypeAwareAnalyzerTest

com.o19s.RegexPathHierarchyTokenizerTest

com.tamingtext.texttamer.solr.NameFilterTest

com.tamingtext.texttamer.solr.SentenceTokenizerTest

lucli.LuceneMethods

net.sf.logsaw.index.internal.LuceneIndexServiceImpl

org.apache.lucene.analysis.core.TestDuelingAnalyzers

org.apache.lucene.analysis.core.TestStopAnalyzer

org.apache.lucene.analysis.core.TestStopFilter

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.