Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute


  }
 
  private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
    log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
    stpf.setEnablePositionIncrements(enableIcrements);
    CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
    stpf.reset();
    for (int i=0; i<20; i+=3) {
      assertTrue(stpf.incrementToken());
      log("Token "+i+": "+stpf);
      String w = English.intToEnglish(i).trim();
      assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
      assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
    }
    assertFalse(stpf.incrementToken());
    stpf.end();
    stpf.close();
View Full Code Here


    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
      int tokenCount = 0;
      // for every token
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        if (isNoiseWord(word)) {
View Full Code Here

     *  reuse and its chars must not be null. */
    public CharsRef analyze(String text, CharsRef reuse) throws IOException {
      IOException priorException = null;
      TokenStream ts = analyzer.tokenStream("", text);
      try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        reuse.length = 0;
        while (ts.incrementToken()) {
          int length = termAtt.length();
          if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
          }
          if (posIncAtt.getPositionIncrement() != 1) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
          }
          reuse.grow(reuse.length + length + 1); /* current + word + separator */
          int end = reuse.offset + reuse.length;
          if (reuse.length > 0) {
            reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
            reuse.length++;
          }
          System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
          reuse.length += length;
        }
        ts.end();
      } catch (IOException e) {
        priorException = e;
View Full Code Here

  private String[] tokenizeDoc(String doc) throws IOException {
    Collection<String> result = new LinkedList<String>();
    for (String textFieldName : textFieldNames) {
      TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
      try {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
          result.add(charTermAttribute.toString());
        }
        tokenStream.end();
      } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
      }
View Full Code Here

  public void testDefaults() throws IOException {
    assertTrue(stop != null);
    TokenStream stream = stop.tokenStream("test", "This is a test of the english stop analyzer");
    try {
      assertTrue(stream != null);
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
   
      while (stream.incrementToken()) {
        assertFalse(inValidTokens.contains(termAtt.toString()));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }
View Full Code Here

    CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
    StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
    TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer");
    try {
      assertNotNull(stream);
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
   
      stream.reset();
      while (stream.incrementToken()) {
        String text = termAtt.toString();
        assertFalse(stopWordsSet.contains(text));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
View Full Code Here

    int expectedIncr[] { 1,   1, 1,          3, 11,      1,            2,   1};
    TokenStream stream = newStop.tokenStream("test", s);
    try {
      assertNotNull(stream);
      int i = 0;
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);

      stream.reset();
      while (stream.incrementToken()) {
        String text = termAtt.toString();
        assertFalse(stopWordsSet.contains(text));
        assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
      }
      stream.end();
    } finally {
View Full Code Here

        @Override
        protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
          TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
          try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
            int upto = 0;
            while (ts.incrementToken()) {
              String token = termAtt.toString();
              int startOffset = offsetAtt.startOffset();
              int endOffset = offsetAtt.endOffset();
              if (upto < startOffset) {
                fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
                upto = startOffset;
View Full Code Here

  /**
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce, that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0)
        out.append(' ');
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
  }
View Full Code Here

    PerFieldAnalyzerWrapper analyzer =
              new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);

    TokenStream tokenStream = analyzer.tokenStream("field", text);
    try {
      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.reset();

      assertTrue(tokenStream.incrementToken());
      assertEquals("WhitespaceAnalyzer does not lowercase",
                 "Qwerty",
                 termAtt.toString());
      assertFalse(tokenStream.incrementToken());
      tokenStream.end();
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }

    tokenStream = analyzer.tokenStream("special", text);
    try {
      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.reset();

      assertTrue(tokenStream.incrementToken());
      assertEquals("SimpleAnalyzer lowercases",
                 "qwerty",
                 termAtt.toString());
      assertFalse(tokenStream.incrementToken());
      tokenStream.end();
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.