Examples of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute

Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute

org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute
must call termAtt.fillBytesRef() before doing something with the bytes. // this encodes the term value (internally it might be a char[], etc) into the bytes. int hashCode = termAtt.fillBytesRef(); if (isInteresting(bytes)) { // because the bytes are reused by the attribute (like CharTermAttribute's char[] buffer), // you should make a copy if you need persistent access to the bytes, otherwise they will // be rewritten across calls to incrementToken() doSomethingWith(new BytesRef(bytes)); } } ... @lucene.experimental This is a very expert API, please use {@link CharTermAttributeImpl} and its implementation of this method for UTF-8 terms.

    for (int i = 0; i < numTestPoints; i++) {
      String term = TestUtil.randomSimpleString(random());
      IOException priorException = null;
      TokenStream ts = analyzer.tokenStream("fake", term);
      try {
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        ts.reset();
        assertTrue(ts.incrementToken());
        termAtt.fillBytesRef();
        // ensure we make a copy of the actual bytes too
        map.put(term, BytesRef.deepCopyOf(bytes));
        assertFalse(ts.incrementToken());
        ts.end();
      } catch (IOException e) {
        priorException = e;
      } finally {
        IOUtils.closeWhileHandlingException(priorException, ts);
      }
    }
    
    Thread threads[] = new Thread[numThreads];
    for (int i = 0; i < numThreads; i++) {
      threads[i] = new Thread() {
        @Override
        public void run() {
          try {
            for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
              String term = mapping.getKey();
              BytesRef expected = mapping.getValue();
              IOException priorException = null;
              TokenStream ts = analyzer.tokenStream("fake", term);
              try {
                TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
                BytesRef bytes = termAtt.getBytesRef();
                ts.reset();
                assertTrue(ts.incrementToken());
                termAtt.fillBytesRef();
                assertEquals(expected, bytes);
                assertFalse(ts.incrementToken());
                ts.end();
              } catch (IOException e) {
                priorException = e;

View Full Code Here

  protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText, boolean quoted, int phraseSlop) {
    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    CachingTokenFilter buffer = null;
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    boolean hasMoreTokens = false;    
    
    TokenStream source = null;
    try {
      source = analyzer.tokenStream(field, queryText);
      source.reset();
      buffer = new CachingTokenFilter(source);
      buffer.reset();


      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }


      if (termAtt != null) {
        try {
          hasMoreTokens = buffer.incrementToken();
          while (hasMoreTokens) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
            hasMoreTokens = buffer.incrementToken();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Error analyzing query text", e);
    } finally {
      IOUtils.closeWhileHandlingException(source);
    }
    
    // rewind the buffer stream
    buffer.reset();


    BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();


    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        termAtt.fillBytesRef();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }
      return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
    } else {
      if (severalTokensAtSamePosition || (!quoted)) {
        if (positionCount == 1 || (!quoted)) {
          // no phrase query:
          
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            BooleanQuery q = newBooleanQuery(true);
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              Query currentQuery = newTermQuery(
                  new Term(field, BytesRef.deepCopyOf(bytes)));
              q.add(currentQuery, BooleanClause.Occur.SHOULD);
            }
            return q;
          } else {
            // multiple positions
            BooleanQuery q = newBooleanQuery(false);
            Query currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQuery)) {
                  Query t = currentQuery;
                  currentQuery = newBooleanQuery(true);
                  ((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD);
                }
                ((BooleanQuery)currentQuery).add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
              } else {
                if (currentQuery != null) {
                  q.add(currentQuery, operator);
                }
                currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
              }
            }
            q.add(currentQuery, operator);
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQuery mpq = newMultiPhraseQuery();
          mpq.setSlop(phraseSlop);
          List<Term> multiTerms = new ArrayList<>();
          int position = -1;
          for (int i = 0; i < numTokens; i++) {
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              termAtt.fillBytesRef();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            if (positionIncrement > 0 && multiTerms.size() > 0) {
              if (enablePositionIncrements) {
                mpq.add(multiTerms.toArray(new Term[0]),position);
              } else {
                mpq.add(multiTerms.toArray(new Term[0]));
              }
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
          }
          if (enablePositionIncrements) {
            mpq.add(multiTerms.toArray(new Term[0]),position);
          } else {
            mpq.add(multiTerms.toArray(new Term[0]));
          }
          return mpq;
        }
      } else {
        PhraseQuery pq = newPhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          int positionIncrement = 1;


          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens

View Full Code Here

      
      final TokenStream stream = field.tokenStream(analyzer);
      // reset the TokenStream to the first token
      stream.reset();


      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      while(stream.incrementToken()) {
        termAtt.fillBytesRef();
        tokenCount++;
      }
      stream.end();
      stream.close();
    }

View Full Code Here

  static final int ivalue = 123456;


  public void testLongStream() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream().setLongValue(lvalue);
    // use getAttribute to test if attributes really exist, if not an IAE will be thrown
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();
    stream.reset();
    assertEquals(64, numericAtt.getValueSize());
    for (int shift=0; shift<64; shift+=NumericUtils.PRECISION_STEP_DEFAULT) {
      assertTrue("New token is available", stream.incrementToken());
      assertEquals("Shift value wrong", shift, numericAtt.getShift());
      bytesAtt.fillBytesRef();
      assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), NumericUtils.prefixCodedToLong(bytes));
      assertEquals("Term raw value is incorrectly encoded", lvalue & ~((1L << shift) - 1L), numericAtt.getRawValue());
      assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());

View Full Code Here

  }


  public void testIntStream() throws Exception {
    final NumericTokenStream stream=new NumericTokenStream().setIntValue(ivalue);
    // use getAttribute to test if attributes really exist, if not an IAE will be thrown
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();
    stream.reset();
    assertEquals(32, numericAtt.getValueSize());
    for (int shift=0; shift<32; shift+=NumericUtils.PRECISION_STEP_DEFAULT) {
      assertTrue("New token is available", stream.incrementToken());
      assertEquals("Shift value wrong", shift, numericAtt.getShift());
      bytesAtt.fillBytesRef();
      assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), NumericUtils.prefixCodedToInt(bytes));
      assertEquals("Term raw value is incorrectly encoded", ((long) ivalue) & ~((1L << shift) - 1L), numericAtt.getRawValue());
      assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());

View Full Code Here

      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }


    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      
      BytesRef[] lastTokens = new BytesRef[grams];
      //System.out.println("lookup: key='" + key + "'");
      
      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      BytesRef tokenBytes = termBytesAtt.getBytesRef();
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while(ts.incrementToken()) {
        termBytesAtt.fillBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();

View Full Code Here

      throws Exception {
    TokenStream ts1 = a1.tokenStream("bogus", text);
    TokenStream ts2 = a2.tokenStream("bogus", text);
    ts1.reset();
    ts2.reset();
    TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
    TermToBytesRefAttribute termAtt2 = ts2.addAttribute(TermToBytesRefAttribute.class);
    assertTrue(ts1.incrementToken());
    assertTrue(ts2.incrementToken());
    BytesRef bytes1 = termAtt1.getBytesRef();
    BytesRef bytes2 = termAtt2.getBytesRef();
    termAtt1.fillBytesRef();
    termAtt2.fillBytesRef();
    assertEquals(bytes1, bytes2);
    assertFalse(ts1.incrementToken());
    assertFalse(ts2.incrementToken());
    ts1.close();
    ts2.close();

View Full Code Here

        {
            Analyzer analyzer = columnMapper.analyzer();
            source = analyzer.tokenStream(field, value);
            source.reset();


            TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
            BytesRef bytes = termAtt.getBytesRef();


            if (!source.incrementToken())
            {
                return null;
            }
            termAtt.fillBytesRef();
            if (source.incrementToken())
            {
                throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
            }
            source.end();

View Full Code Here

            // Use the analyzer to get all the tokens, and then build a TermQuery,
            // PhraseQuery, or nothing based on the term count
            CachingTokenFilter buffer = new CachingTokenFilter(source);
            buffer.reset();


            TermToBytesRefAttribute termAtt = null;
            int numTokens = 0;
            boolean hasMoreTokens = false;
            termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
            if (termAtt != null) {
                try {
                    hasMoreTokens = buffer.incrementToken();
                    while (hasMoreTokens) {
                        numTokens++;
                        hasMoreTokens = buffer.incrementToken();
                    }
                } catch (IOException e) {
                    // ignore
                }
            }


            // rewind buffer
            buffer.reset();


            BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
            if (numTokens == 0) {
                return null;
            } else if (numTokens == 1) {
                try {
                    boolean hasNext = buffer.incrementToken();
                    assert hasNext == true;
                    termAtt.fillBytesRef();
                } catch (IOException e) {
                    // safe to ignore, because we know the number of tokens
                }
                return new PrefixQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
            } else {
                BooleanQuery bq = new BooleanQuery();
                for (int i = 0; i < numTokens; i++) {
                    try {
                        boolean hasNext = buffer.incrementToken();
                        assert hasNext == true;
                        termAtt.fillBytesRef();
                    } catch (IOException e) {
                        // safe to ignore, because we know the number of tokens
                    }
                    bq.add(new BooleanClause(new PrefixQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD));
                }

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute

com.stratio.cassandra.index.query.Condition

org.apache.lucene.analysis.CollationTestBase

org.apache.lucene.analysis.TestNumericTokenStream

org.apache.lucene.analysis.TokenStreamToAutomaton

org.apache.lucene.benchmark.byTask.tasks.ReadTokensTask

org.apache.lucene.benchmark.byTask.TestPerfTasksLogic

org.apache.lucene.index.memory.MemoryIndex

org.apache.lucene.index.TestLongPostings

org.apache.lucene.queryparser.classic.QueryParserBase

org.apache.lucene.queryparser.xml.builders.SpanOrTermsBuilder

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.