Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute
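All of the examples below share one consumption pattern: obtain the OffsetAttribute from the TokenStream (addAttribute creates it if absent, getAttribute requires it to already exist), call reset(), read startOffset()/endOffset() after each successful incrementToken(), then call end() and close(). A minimal self-contained sketch of that pattern (the field name, analyzer, and text are placeholders; the no-argument StandardAnalyzer constructor assumes Lucene 5 or later):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class OffsetAttributeExample {
      public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        String text = "Offsets map tokens back to the original text";
        try (TokenStream ts = analyzer.tokenStream("body", text)) {
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
          ts.reset();                 // required before the first incrementToken()
          while (ts.incrementToken()) {
            // startOffset()/endOffset() index into `text`, so substring() recovers the raw token
            System.out.printf("%s [%d,%d)%n",
                termAtt, offsetAtt.startOffset(), offsetAtt.endOffset());
          }
          ts.end();                   // records the final offset state
        }
      }
    }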


From a Lucene analysis test (num and analyzer are defined in the truncated surrounding code): for random Unicode input, the substring selected by a token's offsets must consist entirely of letters. The inner loop advances by Character.charCount(cp) so supplementary code points are checked correctly; the end()/close() calls, cut off in the original, are completed here:

    for (int i = 0; i < num; i++) {
      String s = TestUtil.randomUnicodeString(random());
      TokenStream ts = analyzer.tokenStream("foo", s);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      try {
        ts.reset();
        while (ts.incrementToken()) {
          // offsets index into the original input, so substring() recovers the token's text
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }
        ts.end();
      } finally {
        ts.close();
      }
    }


An EdgeNGramTokenFilter test over supplementary characters (s is a random Unicode string and codePointCount its code-point length, both defined in the truncated surrounding method). Every emitted gram keeps the offsets of the whole original keyword token:

    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
      assertTrue(tk.incrementToken());
      // n-gram filters report the offsets of the original token, not of the gram
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int end = Character.offsetByCodePoints(s, 0, i);
      assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
  }

A SnowballFilter test showing that the stemmer rewrites only the term text ("accents" becomes "accent") while every other attribute (offsets, type, position increment, flags, payload) passes through unchanged. getAttribute, rather than addAttribute, is used because the upstream TestTokenStream already declares all of these attributes:

  public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
   
    filter.incrementToken();

    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[]{0,1,2,3}), payloadAtt.getPayload());
  }
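The asserted values above imply a custom TestTokenStream that emits a single token "accents" with every attribute pre-set. The original class is not shown on this page; a minimal reconstruction consistent with the assertions might look like this:

  // Reconstruction (not the original source): one token, all attributes
  // populated to match the assertions in testFilterTokens.
  private static final class TestTokenStream extends TokenStream {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    private boolean done = false;

    @Override
    public boolean incrementToken() {
      if (done) return false;
      done = true;
      clearAttributes();
      termAtt.setEmpty().append("accents");   // SnowballFilter stems this to "accent"
      offsetAtt.setOffset(2, 7);
      typeAtt.setType("wrd");
      posIncAtt.setPositionIncrement(3);
      payloadAtt.setPayload(new BytesRef(new byte[]{0, 1, 2, 3}));
      flagsAtt.setFlags(77);
      return true;
    }
  }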

The NGramTokenFilter counterpart of the edge n-gram test above: the term text walks every code-point window between minGram and maxGram, while each gram again reports the offsets of the whole original token (s and codePointCount come from the truncated surrounding method):

    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        // offsetByCodePoints maps code-point indices back to char indices
        final int startIndex = Character.offsetByCodePoints(s, 0, start);
        final int endIndex = Character.offsetByCodePoints(s, 0, end);
        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
      }
    }
    assertFalse(tk.incrementToken());
  }

A diagnostic helper in the style of Lucene in Action: it prints each token's term, offsets, and type, starting a new line whenever the position advances so that stacked tokens (position increment 0) appear together. reset() and end() calls are added here to satisfy the TokenStream contract:

  private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    final TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    stream.reset();
    while (stream.incrementToken()) {
      final int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        // the position advanced: start a new output line for it
        position = position + increment;
        System.out.println();
        System.out.print(position + ": ");
      }
      System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->"
          + offset.endOffset() + ":" + type.type() + "] ");
    }
    stream.end();
    System.out.println();
  }
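A possible call site for the helper (the analyzer and input are placeholders, not from the original; since the helper now resets and exhausts the stream, the caller only has to close it):

    try (TokenStream stream =
             new StandardAnalyzer().tokenStream("contents", "the quick brown fox")) {
      displayTokensWithFullDetails(stream);
    }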

From an older (Lucene 3.x era) term-vector builder: it re-tokenizes stored text and accumulates one TermVectorOffsetInfo per term occurrence, growing the per-term array by one slot on each hit. TermAttribute is the pre-4.0 predecessor of CharTermAttribute. The leading declaration and the catch body are truncated in the original; the declaration is completed here and a reset() call added:

        Map<String, TermVectorOffsetInfo[]> termMap =
            new TreeMap<String, TermVectorOffsetInfo[]>();
        Reader r = new StringReader(text);
        TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
        try {
            ts.reset();
            while (ts.incrementToken()) {
                OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
                TermAttribute term = ts.getAttribute(TermAttribute.class);
                String termText = term.term();
                TermVectorOffsetInfo[] info = termMap.get(termText);
                if (info == null) {
                    info = new TermVectorOffsetInfo[1];
                } else {
                    // grow the array by one; a List would avoid the repeated copies
                    TermVectorOffsetInfo[] tmp = info;
                    info = new TermVectorOffsetInfo[tmp.length + 1];
                    System.arraycopy(tmp, 0, info, 0, tmp.length);
                }
                info[info.length - 1] = new TermVectorOffsetInfo(
                    offset.startOffset(), offset.endOffset());
                termMap.put(termText, info);
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            // truncated in the original
        }

      record.put( "type", narrowedAttr.type() );
      return record;
    }
    else if ( attr instanceof OffsetAttribute ) {
      GenericRecord record = new GenericData.Record( protocol.getType( "OffsetAttribute" ) );
      OffsetAttribute narrowedAttr = (OffsetAttribute) attr;
      record.put( "startOffset", narrowedAttr.startOffset() );
      record.put( "endOffset", narrowedAttr.endOffset() );
      return record;
    }
    else if ( attr instanceof Serializable ) {
      return ByteBuffer.wrap( toByteArray( (Serializable) attr ) );
    }
View Full Code Here


Inside a MemoryIndex-style indexing loop: when storeOffsets is set, the field is declared with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS and each posting records the position followed by the token's start and end offsets. The surrounding declarations (pos, numTokens, terms, postingsWriter, sliceArray) are truncated in the original:

      fieldInfos.put(fieldName,
          new FieldInfo(fieldName, true, fieldInfos.size(), false, false, false,
              this.storeOffsets
                  ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
              null, null, null));
      }
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();

      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0) {
          numOverlapTokens++;   // stacked token: shares a position with the previous one
        }
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          // term already seen: continue its existing postings slice
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        postingsWriter.writeInt(pos);
        if (storeOffsets) {
          // offsets are appended right after the position in the posting
          postingsWriter.writeInt(offsetAtt.startOffset());
          postingsWriter.writeInt(offsetAtt.endOffset());
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

        String s = "a天b";
        ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

        int correctStartOffset = 0;
        int correctEndOffset = 1;
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        while (tokenizer.incrementToken()) {
          assertEquals(correctStartOffset, offsetAtt.startOffset());
          assertEquals(correctEndOffset, offsetAtt.endOffset());
          correctStartOffset++;
          correctEndOffset++;
        }
    }
View Full Code Here
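Offsets exist chiefly so that consumers such as highlighters and term-vector builders can map tokens back into the original string, as several snippets above do. A minimal offset-based highlighting sketch (the method name, field name, and exact-match condition are illustrative, not taken from any snippet here):

  // Illustrative sketch: wrap every token whose term equals `query`
  // in <b>...</b>, using the token's offsets into the original text.
  static String highlight(Analyzer analyzer, String text, String query) throws IOException {
    StringBuilder out = new StringBuilder();
    int last = 0;
    try (TokenStream ts = analyzer.tokenStream("f", text)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // skip stacked/overlapping tokens that start before the last match ended
        if (termAtt.toString().equals(query) && offsetAtt.startOffset() >= last) {
          out.append(text, last, offsetAtt.startOffset())
             .append("<b>")
             .append(text, offsetAtt.startOffset(), offsetAtt.endOffset())
             .append("</b>");
          last = offsetAtt.endOffset();
        }
      }
      ts.end();
    }
    return out.append(text.substring(last)).toString();
  }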
