Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute


    try {
      UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
              "org.apache.uima.TokenAnnotation", "posTag");
      TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
      PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
      while (ts.incrementToken()) {
        assertNotNull(offsetAtt);
        assertNotNull(termAtt);
View Full Code Here


      if (attClass.equals("CharTermAttribute")) {
        val = ((CharTermAttribute)att).toString();
      } else if (attClass.equals("FlagsAttribute")) {
        val = Integer.toHexString(((FlagsAttribute)att).getFlags());
      } else if (attClass.equals("OffsetAttribute")) {
        OffsetAttribute off = (OffsetAttribute)att;
        val = off.startOffset() + "-" + off.endOffset();
      } else if (attClass.equals("PayloadAttribute")) {
        BytesRef payload = ((PayloadAttribute)att).getPayload();
        if (payload != null) {
          val = Util.bytesToHex(payload.bytes, payload.offset, payload.length, false);
        } else {
          val = "";
        }
      } else if (attClass.equals("PositionIncrementAttribute")) {
        val = ((PositionIncrementAttribute)att).getPositionIncrement() + "";
      } else if (attClass.equals("TypeAttribute")) {
        val = ((TypeAttribute)att).type();
      } else if (attClass.equals("KeywordAttribute")) {
        val = Boolean.toString(((KeywordAttribute)att).isKeyword());
      } else {
        val = att.toString();
      }
      app.setString(cell, "text", val);
    }
    Object inputText = app.find(myUi, "inputText");
    if (as.hasAttribute(OffsetAttribute.class)) {
      OffsetAttribute off = (OffsetAttribute)as.getAttribute(OffsetAttribute.class);
      app.setInteger(inputText, "start", 0);
      app.setInteger(inputText, "end", off.endOffset());
      app.setInteger(inputText, "start", off.startOffset());
      app.requestFocus(inputText);
    }
  }
View Full Code Here

      for (int i=0; i<result.synonyms.length; i++) {
        Token repTok = result.synonyms[i];
        AttributeSource newTok = firstTok.cloneAttributes();
        CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

        newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
        newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
        repPos += repTok.getPositionIncrement();
        if (i==0) repPos=origPos;  // make position of first token equal to original

        // if necessary, insert original tokens and adjust position increment
View Full Code Here

    Tokenizer tok = new SentenceTokenizer(in, detector);
    NameFilter nf = new NameFilter(tok, modelName, finder);

    CharTermAttribute cta;
    PositionIncrementAttribute pta;
    OffsetAttribute oa;
   
    int pass = 0;
   
    while (pass < 2) { // test reuse.
      int pos = 0;
      int lastStart = 0;
      int lastEnd   = 0;
     
      while (nf.incrementToken()) {
        cta = (CharTermAttribute) nf.getAttribute(CharTermAttribute.class);
        pta = (PositionIncrementAttribute) nf.getAttribute(PositionIncrementAttribute.class);
        oa  = (OffsetAttribute) nf.getAttribute(OffsetAttribute.class);
       
        System.err.println("'" + cta.toString() + "'");
        System.err.println(pta.toString());
        System.err.println(oa.toString());
        System.err.println("--- pass: " + pass);
       
        TestCase.assertEquals(tokenStrings[pos], cta.toString());
        TestCase.assertEquals(positionIncrements[pos], pta.getPositionIncrement());
       
        if (pta.getPositionIncrement() == 0) {
          TestCase.assertEquals(lastStart, oa.startOffset());
          TestCase.assertEquals(lastEnd, oa.endOffset());
        }
       
        if (!cta.toString().startsWith("NE_")) {
          TestCase.assertEquals(input.substring(oa.startOffset(), oa.endOffset()), cta.toString());
        }
       
        lastStart = oa.startOffset();
        lastEnd   = oa.endOffset();
       
        pos++;
      }
     
      //if (pass == 1) nf.dumpState();
View Full Code Here

    Collection<Token> result = new ArrayList<Token>();
    TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q));
    ts.reset();
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
   
    while (ts.incrementToken()){
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
View Full Code Here

      int numOverlapTokens = 0;
      int pos = -1;
     
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        String term = termAtt.toString();
        if (term.length() == 0) continue; // nothing to do
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
       
        ArrayIntList positions = terms.get(term);
        if (positions == null) { // term not seen before
          positions = new ArrayIntList(stride);
          terms.put(term, positions);
        }
        if (stride == 1) {
          positions.add(pos);
        } else {
          positions.add(pos, offsetAtt.startOffset(), offsetAtt.endOffset());
        }
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
View Full Code Here

   
    SentenceTokenizer tok = factory.create(new StringReader(inputString));
   
    CharTermAttribute cta;
    PositionIncrementAttribute pta;
    OffsetAttribute oa;

    int pass = 0;
   
    while (pass < 2) { // test reuse
      int pos = 0;
      int offset = 0;
     
      while (tok.incrementToken()) {
        cta = (CharTermAttribute) tok.getAttribute(CharTermAttribute.class);
        pta = (PositionIncrementAttribute) tok.getAttribute(PositionIncrementAttribute.class);
        oa  = (OffsetAttribute) tok.getAttribute(OffsetAttribute.class);
       
        System.err.println("'" + cta.toString() + "'");
        System.err.println(pta.toString());
        System.err.println(oa.toString());
        System.err.println("--- pass: " + pass);
       
        String expected = expectedStrings[pos];
        TestCase.assertEquals("Strings don't match", expected, cta.toString());
        TestCase.assertEquals("Positing increment is incorrect", 1, pta.getPositionIncrement());
        TestCase.assertEquals("Start offset is incorrect", offset, oa.startOffset());
        TestCase.assertEquals("End offset is incorrect",  offset + expected.length(), oa.endOffset());
       
        offset += expected.length() + 1; // space after end of sentence
        pos++;
      }
     
View Full Code Here

  }
 
  public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
    TokenStream ts = a.reusableTokenStream("dummy", new StringReader(str));
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
    for (int i = 0; i < out_tokens.length; i++) {
      assertTrue(ts.incrementToken());
      assertEquals(termAtt.term(), out_tokens[i].termText);
      assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
      assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
      assertEquals(typeAtt.type(), out_tokens[i].type);
    }
    assertFalse(ts.incrementToken());
  }
View Full Code Here

  }
 
  public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = (PayloadAttribute) filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = (FlagsAttribute) filter.getAttribute(FlagsAttribute.class);
   
    filter.incrementToken();

    assertEquals("accent", termAtt.term());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload());
  }
View Full Code Here

           
            boolean hasMoreTokens = stream.incrementToken();

            fieldState.attributeSource = stream;

            OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
           
            consumer.start(field);
           
            for(;;) {

              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID
             
              if (!hasMoreTokens) break;
             
              final int posIncr = posIncrAttribute.getPositionIncrement();
              fieldState.position += posIncr;
              if (allowMinus1Position || fieldState.position > 0) {
                fieldState.position--;
              }

              if (posIncr == 0)
                fieldState.numOverlap++;

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success)
                  docState.docWriter.setAborting();
              }
              fieldState.position++;
              offsetEnd = fieldState.offset + offsetAttribute.endOffset();
              if (++fieldState.length >= maxFieldLength) {
                if (docState.infoStream != null)
                  docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
                break;
              }

              hasMoreTokens = stream.incrementToken();
            }
            // trigger streams to perform end-of-stream operations
            stream.end();
           
            fieldState.offset += offsetAttribute.endOffset();
            anyToken = fieldState.length > startLength;
          } finally {
            stream.close();
          }
        }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.