Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute
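Before the excerpts, here is a minimal, self-contained sketch of the usual OffsetAttribute workflow. It is illustrative only: the class name OffsetAttributeDemo, the field name "body", the sample text, and the Version.LUCENE_46 constant are arbitrary choices, and whether StandardAnalyzer takes a Version argument depends on the Lucene release in use.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class OffsetAttributeDemo {
  public static void main(String[] args) throws IOException {
    String text = "quick brown fox";
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); // version constant is an assumption
    TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
    // Register the attributes once; each incrementToken() call updates them in place.
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    try {
      ts.reset();
      while (ts.incrementToken()) {
        // startOffset()/endOffset() are character positions into the original input,
        // so text.substring(start, end) recovers the raw token text.
        System.out.println(termAtt.toString()
            + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
      }
      ts.end(); // sets the final offset (the end of the input) on the OffsetAttribute
    } finally {
      ts.close();
    }
  }
}

For the sample text above this prints one line per token: quick [0,5), brown [6,11), fox [12,15).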


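The first excerpt, apparently from the middle of Lucene's Highlighter.getBestTextFragments(), reads the OffsetAttribute on every incrementToken() call: the loop stops once startOffset() reaches maxDocCharsToAnalyze, and any token whose startOffset() or endOffset() points past the end of the supplied text triggers an InvalidTokenOffsetsException.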
  {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.reset();
    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }
   
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
      tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try
    {

      String tokenText;
      int startOffset;
      int endOffset;
      int lastEndOffset = 0;
      textFragmenter.start(text, tokenStream);

      TokenGroup tokenGroup = new TokenGroup(tokenStream);

      for (boolean next = tokenStream.incrementToken();
           next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
           next = tokenStream.incrementToken()) {
        if ((offsetAtt.endOffset() > text.length())
            || (offsetAtt.startOffset() > text.length())) {
          throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
              + " exceeds length of provided text sized " + text.length());
        }


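Next, a small unit test for the LUCENE-1441 fix: KeywordAnalyzer emits a single token for "abcd", and the OffsetAttribute must report startOffset 0 and endOffset 4.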
  // LUCENE-1441
  public void testOffsets() throws Exception {
    TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
    try {
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      stream.reset();
      assertTrue(stream.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(4, offsetAtt.endOffset());
      assertFalse(stream.incrementToken());
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }

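The following excerpt appears to come from MemoryIndex.addField(): when offsets are stored, startOffset() and endOffset() are written into the in-memory postings immediately after each position.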
        fieldInfos.put(fieldName,
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false,
                this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                null, null, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
     
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        if (!storeOffsets) {
          postingsWriter.writeInt(pos);
        } else {
          postingsWriter.writeInt(pos);
          postingsWriter.writeInt(offsetAtt.startOffset());
          postingsWriter.writeInt(offsetAtt.endOffset());
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

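Here the offsets drive highlighting, apparently in (or in an override of) AnalyzingInfixSuggester's highlight() method: startOffset() and endOffset() slice the original text into highlighted and plain LookupHighlightFragments, and the final endOffset() read after ts.end() marks where the trailing unhighlighted text begins.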
        @Override
        protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
          TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
          try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
            int upto = 0;
            while (ts.incrementToken()) {
              String token = termAtt.toString();
              int startOffset = offsetAtt.startOffset();
              int endOffset = offsetAtt.endOffset();
              if (upto < startOffset) {
                fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
                upto = startOffset;
              } else if (upto > startOffset) {
                continue;
              }
             
              if (matchedTokens.contains(token)) {
                // Token matches.
                fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
                upto = endOffset;
              } else if (prefixToken != null && token.startsWith(prefixToken)) {
                fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
                if (prefixToken.length() < token.length()) {
                  fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
                }
                upto = endOffset;
              }
            }
            ts.end();
            int endOffset = offsetAtt.endOffset();
            if (upto < endOffset) {
              fragments.add(new LookupHighlightFragment(text.substring(upto), false));
            }
           
            return fragments;

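In the test excerpt below, two DocsAndPositionsEnum instances are compared; if either exposes an OffsetAttribute, both must, and their startOffset()/endOffset() values must match position by position.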
          assertTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
         
          int freq1 = dpEnum1.freq();
          int freq2 = dpEnum2.freq();
          assertEquals(freq1, freq2);
          OffsetAttribute offsetAtt1 = dpEnum1.attributes().hasAttribute(OffsetAttribute.class)
              ? dpEnum1.attributes().getAttribute(OffsetAttribute.class) : null;
          OffsetAttribute offsetAtt2 = dpEnum2.attributes().hasAttribute(OffsetAttribute.class)
              ? dpEnum2.attributes().getAttribute(OffsetAttribute.class) : null;

          if (offsetAtt1 != null) {
            assertNotNull(offsetAtt2);
          } else {
            assertNull(offsetAtt2);
          }

          for (int posUpto = 0; posUpto < freq1; posUpto++) {
            int pos1 = dpEnum1.nextPosition();
            int pos2 = dpEnum2.nextPosition();
            assertEquals(pos1, pos2);
            if (offsetAtt1 != null) {
              assertEquals(offsetAtt1.startOffset(),
                           offsetAtt2.startOffset());
              assertEquals(offsetAtt1.endOffset(),
                           offsetAtt2.endOffset());
            }
          }
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.nextDoc());
          assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.nextDoc());
        } else {

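The next excerpt serializes token attributes into Avro records: when the attribute is an OffsetAttribute, its startOffset and endOffset become fields of a GenericRecord.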
      record.put( "type", narrowedAttr.type() );
      return record;
    }
    else if (attr instanceof OffsetAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "OffsetAttribute" ) );
      OffsetAttribute narrowedAttr = (OffsetAttribute) attr;
      record.put( "startOffset", narrowedAttr.startOffset() );
      record.put( "endOffset", narrowedAttr.endOffset() );
      return record;
    }
    else if (attr instanceof Serializable) {
      return ByteBuffer.wrap( toByteArray( attr ) );
    }


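A second, slightly different MemoryIndex.addField() variant follows: here a per-field base offset is added to startOffset() and endOffset() before they are written, and the final endOffset() (plus that base offset) is recorded in the field's Info once the stream has ended.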
        fieldInfos.put(fieldName,
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false,
                this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                null, null, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
     
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        if (!storeOffsets) {
          postingsWriter.writeInt(pos);
        } else {
          postingsWriter.writeInt(pos);
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
      if (numTokens > 0) {
        fields.put(fieldName, new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.endOffset() + offset, sumTotalTermFreq));
        sortedFields = null;    // invalidate sorted view, if any
      }
    } catch (Exception e) { // can never happen
      throw new RuntimeException(e);
    } finally {

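The excerpt below appears to be from the indexing chain (DocInverterPerField): when offset checking is enabled, each token's startOffset()/endOffset() must be non-negative, properly ordered, and non-decreasing across tokens, and after stream.end() the final endOffset() advances the field's cumulative offset.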
        try {
          boolean hasMoreTokens = stream.incrementToken();

          fieldState.attributeSource = stream;

          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

          if (hasMoreTokens) {
            consumer.start(field);

            do {
              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID

              final int posIncr = posIncrAttribute.getPositionIncrement();
              if (posIncr < 0) {
                throw new IllegalArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.name() + "'");
              }
              if (fieldState.position == 0 && posIncr == 0) {
                throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
              }
              int position = fieldState.position + posIncr;
              if (position > 0) {
                // NOTE: confusing: this "mirrors" the
                // position++ we do below
                position--;
              } else if (position < 0) {
                throw new IllegalArgumentException("position overflow for field '" + field.name() + "'");
              }
             
              // position is legal, we can safely place it in fieldState now.
              // not sure if anything will use fieldState after non-aborting exc...
              fieldState.position = position;

              if (posIncr == 0)
                fieldState.numOverlap++;
             
              if (checkOffsets) {
                int startOffset = fieldState.offset + offsetAttribute.startOffset();
                int endOffset = fieldState.offset + offsetAttribute.endOffset();
                if (startOffset < 0 || endOffset < startOffset) {
                  throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                      + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.name() + "'");
                }
                if (startOffset < lastStartOffset) {
                  throw new IllegalArgumentException("offsets must not go backwards startOffset="
                       + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.name() + "'");
                }
                lastStartOffset = startOffset;
              }

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;

              } finally {
                if (!success) {
                  docState.docWriter.setAborting();
                }
              }
              fieldState.length++;
              fieldState.position++;

            } while (stream.incrementToken());
          }
          // trigger streams to perform end-of-stream operations
          stream.end();
          // TODO: maybe add some safety? then again, it's already checked
          // when we come back around to the field...
          fieldState.position += posIncrAttribute.getPositionIncrement();
          fieldState.offset += offsetAttribute.endOffset();
          /* if success was false above there is an exception coming through and we won't get here.*/
          succeededInProcessingField = true;
        } finally {
          if (!succeededInProcessingField) {

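Finally, an excerpt that appears to be from FreeTextSuggester.lookup(): the largest endOffset() seen during analysis is remembered, and after ts.end() it is compared with the final endOffset() to decide whether the user has finished typing the last token.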
  /** Retrieve suggestions. */
  public List<LookupResult> lookup(final CharSequence key, int num) throws IOException {
    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
     
      BytesRef[] lastTokens = new BytesRef[grams];
      //System.out.println("lookup: key='" + key + "'");
     
      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      BytesRef tokenBytes = termBytesAtt.getBytesRef();
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while(ts.incrementToken()) {
        termBytesAtt.fillBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();
       
        assert gramCount <= grams;
       
        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
      }
      ts.end();
     
      if (!sawRealToken) {
        throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
      }
     
      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();
     
      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef
      // below:
      boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
      //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
     
      if (lastTokenEnded) {
        //System.out.println("  lastTokenEnded");
        // If user hit space after the last token, then
