Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute
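All the snippets below share one pattern: register an OffsetAttribute on the TokenStream before consuming it, then read startOffset()/endOffset() after each successful incrementToken(). As a reference point, here is a minimal, self-contained sketch of that pattern. It assumes a Lucene 3.x-era API; the class name OffsetDemo and the sample text are ours, not from the projects quoted below.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.util.Version;

    public class OffsetDemo {
        public static void main(String[] args) throws Exception {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            TokenStream stream = analyzer.tokenStream("field", new StringReader("The quick brown fox"));
            // register the attributes BEFORE consuming the stream
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // startOffset/endOffset are character positions in the original input
                System.out.println(termAtt.toString()
                        + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
            }
            stream.end();   // records the offset at the very end of the input
            stream.close();
        }
    }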


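The first snippet appears to come from a wildcard-aware query builder: because the tokenizer strips '*' and '?' from the input, the code uses each token's start/end offsets to stitch the wildcard characters back onto the right query terms. Variables such as value, tokenizer, queryElements, lastAdded, lastOffset, next and the Matcher m are defined in the enclosing method: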
        boolean foundWildcard = false;
        //Lucene tokenizers are really low-level ...
        try {
            while(tokenizer.incrementToken()){
                //only interested in the start/end indexes of tokens
                OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
                if(lastAdded < 0){ //reset with this token
                    lastAdded = offset.startOffset();
                }
                if(foundWildcard){ //wildcard present in the current token
                    //two cases: "wildcar? at the end", "wild?ard within the word"
                    // (1) [wildcar,at,the,end] : In this case this is called with
                    //      'at' as active Token and we need write "wildcar?" as
                    //      query term
                    // (2) [wild,ard,within,the,word]: In this case this is called with
                    //      'ard' as active Token and we need write "wild?ard" as
                    //      query term.
                    if(offset.startOffset() > lastOffset+1) {//(1)
                        String queryElement = value.substring(lastAdded,lastOffset+1);
                        if(lowercaseWildcardTokens){
                            queryElement = queryElement.toLowerCase();
                        }
                       
                        queryElements.add(queryElement);
                        lastAdded = offset.startOffset(); //previous token consumed
                        //set to the start of the current token
                        foundWildcard = false;
                    } else if(next != offset.endOffset()){ //(2)
                        String queryElement = value.substring(lastAdded,offset.endOffset());
                        if(lowercaseWildcardTokens){
                            queryElement = queryElement.toLowerCase();
                        }
                        queryElements.add(queryElement);
                        lastAdded = -1; //consume the current token
                        foundWildcard = false;
                    }
                }
                if(next == offset.endOffset()){ //end of current token is '*' or '?'
                    next = m.find()?m.start()+1:-1; //search next '*', '?' in value
                    //we need to write all tokens previous to the current (if any)
                    //NOTE: ignore if foundWildcard is TRUE (multiple wildcards in
                    //      a single word)
                    if(!foundWildcard && lastAdded<lastOffset){
                        String queryElement = value.substring(lastAdded,lastOffset);
                        queryElements.add('"'+queryElement+'"');
                        lastAdded = offset.startOffset();
                    }//else multiple wildcards in a single token
                    foundWildcard = true;
                }
                lastOffset = offset.endOffset();
            }
        } catch (IOException e) {
            //StringReader cannot throw IOExceptions
            throw new IllegalStateException(e);
        }


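The next fragment re-analyzes a text value and walks the gap between the previous token's end and the current token's start, re-inserting characters from fulltextTokens that the analyzer dropped (the loop body is truncated in the original listing):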
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            stream.reset();

            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {
                            if (c == text.charAt(i)) {
                                token.append(c);
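                                // ... (snippet truncated in the original listing)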

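Here a synonym filter clones the attribute state of the first token of a match and, for each replacement token, copies in the synonym's term text while spanning the offset range from the first to the last matched token: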
            for (int i = 0; i < result.synonyms.length; i++) {
                Token repTok = result.synonyms[i];
                AttributeSource newTok = firstTok.cloneAttributes();
                CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
                PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

                OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

                newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                repPos += repTok.getPositionIncrement();
                if (i == 0) repPos = origPos;  // make position of first token equal to original

                // if necessary, insert original tokens and adjust position increment
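                // ... (snippet truncated in the original listing)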

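This loop, which appears to be Lucene's MemoryIndex, inverts a TokenStream into an in-memory postings map; positions are always recorded, and with a stride greater than 1 the start/end offsets are stored next to each position: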
                terms = new HashMap<String, ArrayIntList>();
            }

            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                if (term.length() == 0) continue; // nothing to do
//        if (DEBUG) System.err.println("token='" + term + "'");
                numTokens++;
                final int posIncr = posIncrAttribute.getPositionIncrement();
                if (posIncr == 0)
                    numOverlapTokens++;
                pos += posIncr;

                ArrayIntList positions = terms.get(term);
                if (positions == null) { // term not seen before
                    positions = new ArrayIntList(stride);
                    terms.put(term, positions);
                }
                if (stride == 1) {
                    positions.add(pos);
                } else {
                    positions.add(pos, offsetAtt.startOffset(), offsetAtt.endOffset());
                }
            }
            stream.end();

            // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
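            // ... (snippet truncated in the original listing)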

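An analyze-style endpoint, apparently Elasticsearch's _analyze action: every emitted token becomes a response object carrying its term text, position, start/end offsets and type: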
        try {
            stream = analyzer.reusableTokenStream(field, new FastStringReader(request.text()));
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            int position = 0;
            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    position = position + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
            }
            stream.end();
        } catch (IOException e) {
            throw new ElasticSearchException("failed to analyze", e);
        } finally {
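            // ... (finally block truncated in the original listing)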

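Inside what appears to be Lucene's document-inversion chain, the OffsetAttribute is read only after the stream is exhausted: stream.end() sets the final offset, which is added to fieldState.offset so offsets keep increasing across multiple values of the same field: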
          try {
            boolean hasMoreTokens = stream.incrementToken();

            fieldState.attributeSource = stream;

            OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
           
            consumer.start(field);
           
            for(;;) {

              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID
             
              if (!hasMoreTokens) break;
             
              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr;
              if (fieldState.position > 0) {
                fieldState.position--;
              }

              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }
              fieldState.position++;
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }

              hasMoreTokens = stream.incrementToken();
            }
            // trigger streams to perform end-of-stream operations
            stream.end();
           
            fieldState.offset += offsetAttribute.endOffset();
          } finally {
            stream.close();
          }
        }

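A test of TokenOffsetPayloadTokenFilter: each token's payload is expected to encode its start and end offsets as two 4-byte integers, which the loop decodes and compares against the OffsetAttribute: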
    String test = "The quick red fox jumped over the lazy brown dogs";

    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
    int count = 0;
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
    nptf.reset();
    while (nptf.incrementToken()) {
      Payload pay = payloadAtt.getPayload();
      assertTrue("pay is null and it shouldn't be", pay != null);
      byte [] data = pay.getData();
      int start = PayloadHelper.decodeInt(data, 0);
      assertTrue(start + " does not equal: " + offsetAtt.startOffset(), start == offsetAtt.startOffset());
      int end = PayloadHelper.decodeInt(data, 4);
      assertTrue(end + " does not equal: " + offsetAtt.endOffset(), end == offsetAtt.endOffset());
      count++;
    }
    assertTrue(count + " does not equal: " + 10, count == 10);

  }

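The same in-memory inversion in a later Lucene version: terms are interned as BytesRefs, and when storeOffsets is set the start/end offsets are written into the postings slices right after the position: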
        fieldInfos.put(fieldName,
            new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false, this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS , null, null, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();
     
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        if (ref.length == 0) continue; // nothing to do
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        if (!storeOffsets) {
          postingsWriter.writeInt(pos);
        } else {
          postingsWriter.writeInt(pos);
          postingsWriter.writeInt(offsetAtt.startOffset());
          postingsWriter.writeInt(offsetAtt.endOffset());
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();
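      // ... (snippet truncated in the original listing)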


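Finally, a regression test for LUCENE-1441: KeywordAnalyzer emits the entire input as a single token, so its offsets must span the whole string: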
  // LUCENE-1441
  public void testOffsets() throws Exception {
    TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals(0, offsetAtt.startOffset());
    assertEquals(4, offsetAtt.endOffset());
  }
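
Offsets exist mostly to map analyzed terms back to the original characters, e.g. for highlighting. A hedged sketch of that use (the highlight method, the termsToHighlight set and the <b> tags are ours; it assumes a Lucene 3.x-era StandardAnalyzer and non-overlapping tokens):

    import java.io.StringReader;
    import java.util.Set;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.util.Version;

    public class HighlightDemo {
        // Wraps every analyzed term found in termsToHighlight with <b>...</b>,
        // copying the ORIGINAL characters via the token's offsets.
        public static String highlight(String text, Set<String> termsToHighlight) throws Exception {
            TokenStream stream = new StandardAnalyzer(Version.LUCENE_36)
                    .tokenStream("field", new StringReader(text));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            StringBuilder out = new StringBuilder();
            int last = 0; // end of the last region already copied to out
            stream.reset();
            while (stream.incrementToken()) {
                // match on the analyzed (e.g. lowercased) term ...
                if (termsToHighlight.contains(termAtt.toString())) {
                    // ... but highlight the original, un-analyzed characters
                    out.append(text, last, offsetAtt.startOffset());
                    out.append("<b>")
                       .append(text, offsetAtt.startOffset(), offsetAtt.endOffset())
                       .append("</b>");
                    last = offsetAtt.endOffset();
                }
            }
            stream.end();
            stream.close();
            return out.append(text.substring(last)).toString();
        }
    }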
