Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute
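
OffsetAttribute exposes the start and end character offsets of the current token relative to the original input text. A minimal consumption sketch (the analyzer, field name and method name here are placeholders; any org.apache.lucene.analysis.Analyzer works the same way):

    static void printOffsets(Analyzer analyzer, String text) throws IOException {
      TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      try {
        ts.reset();                               // required before the first incrementToken()
        while (ts.incrementToken()) {
          // offsets index into the original text, so text.substring(start, end)
          // recovers the raw characters behind the (possibly modified) token
          System.out.println(termAtt + " [" + offsetAtt.startOffset()
              + "," + offsetAtt.endOffset() + ")");
        }
        ts.end();                                 // records the offset at the end of the input
      } finally {
        ts.close();
      }
    }

The examples below, collected from various Lucene sources, all follow this same consume-loop pattern.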


    @Override
    public int hashCode() {
      // combine the start and end offsets into a single hash code
      return start + 31 * end;
    }

    @Override
    public void copyTo(AttributeImpl target) {
      OffsetAttribute t = (OffsetAttribute) target;
      t.setOffset(start, end);
    }
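The fragment above pairs a hashCode() over the two offsets with copyTo(). Any such implementation also needs an equals() consistent with that hash; a sketch of how it typically looks for a start/end pair (OffsetAttributeImpl is the stock Lucene implementation class):

    @Override
    public boolean equals(Object other) {
      if (other == this) {
        return true;
      }
      if (other instanceof OffsetAttributeImpl) {
        OffsetAttributeImpl o = (OffsetAttributeImpl) other;
        // equal exactly when both offsets match, mirroring hashCode()
        return o.start == start && o.end == end;
      }
      return false;
    }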


        fieldInfos.put(fieldName,
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false,
                this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                null, null, -1, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
     
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        postingsWriter.writeInt(pos);             // the position is always written
        if (storeOffsets) {
          // offsets are shifted by the field's accumulated offset
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
      if (numTokens > 0) {
        fields.put(fieldName, new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.endOffset() + offset, sumTotalTermFreq));
        sortedFields = null;    // invalidate sorted view, if any
      }
    } catch (Exception e) { // can never happen
      throw new RuntimeException(e);
    } finally {
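The inversion loop above has the shape of MemoryIndex.addField: each token's position (and optionally its start/end offsets) is appended to a per-term postings slice. Driving the same machinery from the outside takes only a few calls; a hedged sketch, assuming lucene-memory is on the classpath and analyzer/query are supplied by the caller:

    MemoryIndex index = new MemoryIndex(true);   // true = also store offsets per posting
    index.addField("field", "some text to invert", analyzer);
    float score = index.search(query);           // > 0.0f iff the query matches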

      // under-the-hood:
      TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"), i == 0);

      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
      PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);

      for (Token token : tokens) {
        assertTrue(ts.incrementToken());
        assertEquals(token.toString(), termAtt.toString());
        assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
        assertEquals(token.getPayload(), payloadAtt.getPayload());
        assertEquals(token.startOffset(), offsetAtt.startOffset());
        assertEquals(token.endOffset(), offsetAtt.endOffset());
      }

      assertFalse(ts.incrementToken());
    }

   * <p>
   * This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
   */
  static DocsAndPositionsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
    final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
   
    // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
    // but this would have a performance cost for likely little gain in the user experience, it
    // would only serve to make this method less bogus.
    // instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset...
   
    return new DocsAndPositionsEnum() {
      int currentDoc = -1;
      int currentMatch = -1;
      int currentStartOffset = -1;
      int currentEndOffset = -1;
      TokenStream stream = ts;
     
      final BytesRef[] matchDescriptions = new BytesRef[matchers.length];
     
      @Override
      public int nextPosition() throws IOException {
        if (stream != null) {
          while (stream.incrementToken()) {
            for (int i = 0; i < matchers.length; i++) {
              if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
                currentStartOffset = offsetAtt.startOffset();
                currentEndOffset = offsetAtt.endOffset();
                currentMatch = i;
                return 0;
              }
            }
          }
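Each entry of the matchers array is a CharacterRunAutomaton, which can test a term directly against the analyzer's char buffer without materializing a String. A minimal construction sketch (in 4.x the string factory lives in org.apache.lucene.util.automaton.BasicAutomata; newer releases renamed it to Automata):

    CharacterRunAutomaton matcher =
        new CharacterRunAutomaton(BasicAutomata.makeString("lucene"));
    char[] buf = "lucene".toCharArray();
    boolean matches = matcher.run(buf, 0, buf.length);   // true: the buffer spells "lucene"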

      ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));

      //long t0 = System.currentTimeMillis();
      ts.reset();
      final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      String lastToken = null;
      query = new BooleanQuery();
      int maxEndOffset = -1;
      matchedTokens = new HashSet<>();
      while (ts.incrementToken()) {
        if (lastToken != null) { 
          matchedTokens.add(lastToken);
          query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
        }
        lastToken = termAtt.toString();
        if (lastToken != null) {
          maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        }
      }
      ts.end();

      if (lastToken != null) {
        Query lastQuery;
        if (maxEndOffset == offsetAtt.endOffset()) {
          // Use PrefixQuery (or the ngram equivalent) when
          // there were no trailing discarded chars in the
          // string (e.g. whitespace), so that if the query does
          // not end with a space we show prefix matches for
          // that token:
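The snippet is cut off just as it builds the final clause; a hedged sketch of the decision the comment describes (illustrative, not the verbatim continuation): when the last token's end offset reaches the end of the input, the user is presumably still typing that word, so it becomes a prefix clause, otherwise an exact term clause.

    Query lastQuery;
    if (maxEndOffset == offsetAtt.endOffset()) {
      // nothing was discarded after the last token: treat it as a prefix being typed
      lastQuery = new PrefixQuery(new Term(TEXT_FIELD_NAME, lastToken));
    } else {
      // trailing whitespace (or similar) follows the token: match it exactly
      lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
    }
    query.add(lastQuery, occur);
    matchedTokens.add(lastToken);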

   *  LookupResult#highlightKey} member. */
  protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
    TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      StringBuilder sb = new StringBuilder();
      int upto = 0;
      while (ts.incrementToken()) {
        String token = termAtt.toString();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        if (upto < startOffset) {
          addNonMatch(sb, text.substring(upto, startOffset));
          upto = startOffset;
        } else if (upto > startOffset) {
          continue;
        }
       
        if (matchedTokens.contains(token)) {
          // Token matches.
          addWholeMatch(sb, text.substring(startOffset, endOffset), token);
          upto = endOffset;
        } else if (prefixToken != null && token.startsWith(prefixToken)) {
          addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
          upto = endOffset;
        }
      }
      ts.end();
      int endOffset = offsetAtt.endOffset();
      if (upto < endOffset) {
        addNonMatch(sb, text.substring(upto));
      }
      return sb.toString();
    } finally {
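addNonMatch, addWholeMatch and addPrefixMatch are the overridable rendering hooks of this highlighter. A sketch of plausible default bodies (the stock AnalyzingInfixSuggester defaults wrap matches in <b> tags; treat the exact markup here as illustrative):

    protected void addNonMatch(StringBuilder sb, String text) {
      sb.append(text);                            // unmatched text passes through unchanged
    }

    protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
      sb.append("<b>").append(surface).append("</b>");
    }

    protected void addPrefixMatch(StringBuilder sb, String surface,
                                  String analyzed, String prefixToken) {
      // highlight only the prefix the user actually typed
      int n = Math.min(prefixToken.length(), surface.length());
      sb.append("<b>").append(surface, 0, n).append("</b>");
      sb.append(surface.substring(n));
    }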

    }

    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
     
      BytesRef[] lastTokens = new BytesRef[grams];
      //System.out.println("lookup: key='" + key + "'");
     
      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      BytesRef tokenBytes = termBytesAtt.getBytesRef();
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while (ts.incrementToken()) {
        termBytesAtt.fillBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();
       
        assert gramCount <= grams;
       
        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
      }
      ts.end();
     
      if (!sawRealToken) {
        throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
      }
     
      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();
     
      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef
      // below:
      boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
      //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
     
      if (lastTokenEnded) {
        //System.out.println("  lastTokenEnded");
        // If user hit space after the last token, then
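countGrams, used above as a sanity check against posLenAtt, can be recomputed from the token bytes alone: ShingleFilter joins grams with a single separator byte, so the gram count is one more than the separator count. A sketch under that assumption (SEPARATOR stands for whatever byte the suggester reserves):

    private int countGrams(BytesRef token) {
      int grams = 1;
      for (int i = 0; i < token.length; i++) {
        if (token.bytes[token.offset + i] == SEPARATOR) {
          grams++;                                // each separator starts another gram
        }
      }
      return grams;
    }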

          assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
         
          int freq1 = dpEnum1.freq();
          int freq2 = dpEnum2.freq();
          assertEquals(freq1, freq2);
          OffsetAttribute offsetAtt1 = dpEnum1.attributes().getAttribute(OffsetAttribute.class);
          OffsetAttribute offsetAtt2 = dpEnum2.attributes().getAttribute(OffsetAttribute.class);

          if (offsetAtt1 != null) {
            assertNotNull(offsetAtt2);
          } else {
            assertNull(offsetAtt2);
          }

          for (int posUpto = 0; posUpto < freq1; posUpto++) {
            int pos1 = dpEnum1.nextPosition();
            int pos2 = dpEnum2.nextPosition();
            assertEquals(pos1, pos2);
            if (offsetAtt1 != null) {
              assertEquals(offsetAtt1.startOffset(),
                           offsetAtt2.startOffset());
              assertEquals(offsetAtt1.endOffset(),
                           offsetAtt2.endOffset());
            }
          }
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.nextDoc());
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.nextDoc());
        } else {

  public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
    left.reset();
    right.reset();
    CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
    CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
    OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
    OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
    PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
   
    while (left.incrementToken()) {
      assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
      assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
      assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
      assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
      assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    }
    assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
    left.end();
    right.end();
    assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    left.close();
    right.close();
  }
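Typical use of this helper is to check that two independently built analysis chains produce identical terms, position increments and offsets; for example (leftAnalyzer/rightAnalyzer are placeholder names for two Analyzer instances expected to agree):

    String s = "some test input";
    assertEquals(s,
        leftAnalyzer.tokenStream("field", new StringReader(s)),
        rightAnalyzer.tokenStream("field", new StringReader(s)));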

    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomUnicodeString(random());
      TokenStream ts = analyzer.tokenStream("foo", s);
      try {
        ts.reset();
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }
