Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

org.apache.lucene.analysis.tokenattributes.CharTermAttribute
The term text of a Token.

  }
  
  private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
    log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
    stpf.setEnablePositionIncrements(enableIcrements);
    CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
    stpf.reset();
    for (int i=0; i<20; i+=3) {
      assertTrue(stpf.incrementToken());
      log("Token "+i+": "+stpf);
      String w = English.intToEnglish(i).trim();
      assertEquals("expecting token "+i+" to be "+w,w,termAtt.toString());
      assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
    }
    assertFalse(stpf.incrementToken());
    stpf.end();
    stpf.close();

View Full Code Here

    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
      int tokenCount = 0;
      // for every token
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        if (isNoiseWord(word)) {

View Full Code Here

     *  reuse and its chars must not be null. */
    public CharsRef analyze(String text, CharsRef reuse) throws IOException {
      IOException priorException = null;
      TokenStream ts = analyzer.tokenStream("", text);
      try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        reuse.length = 0;
        while (ts.incrementToken()) {
          int length = termAtt.length();
          if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
          }
          if (posIncAtt.getPositionIncrement() != 1) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
          }
          reuse.grow(reuse.length + length + 1); /* current + word + separator */
          int end = reuse.offset + reuse.length;
          if (reuse.length > 0) {
            reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
            reuse.length++;
          }
          System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
          reuse.length += length;
        }
        ts.end();
      } catch (IOException e) {
        priorException = e;

View Full Code Here

  private String[] tokenizeDoc(String doc) throws IOException {
    Collection<String> result = new LinkedList<String>();
    for (String textFieldName : textFieldNames) {
      TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
      try {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
          result.add(charTermAttribute.toString());
        }
        tokenStream.end();
      } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
      }

View Full Code Here

  public void testDefaults() throws IOException {
    assertTrue(stop != null);
    TokenStream stream = stop.tokenStream("test", "This is a test of the english stop analyzer");
    try {
      assertTrue(stream != null);
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
    
      while (stream.incrementToken()) {
        assertFalse(inValidTokens.contains(termAtt.toString()));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);
    }

View Full Code Here

    CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
    StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
    TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer");
    try {
      assertNotNull(stream);
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    
      stream.reset();
      while (stream.incrementToken()) {
        String text = termAtt.toString();
        assertFalse(stopWordsSet.contains(text));
      }
      stream.end();
    } finally {
      IOUtils.closeWhileHandlingException(stream);

View Full Code Here

    int expectedIncr[] =  { 1,   1, 1,          3, 1,  1,      1,            2,   1};
    TokenStream stream = newStop.tokenStream("test", s);
    try {
      assertNotNull(stream);
      int i = 0;
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);


      stream.reset();
      while (stream.incrementToken()) {
        String text = termAtt.toString();
        assertFalse(stopWordsSet.contains(text));
        assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
      }
      stream.end();
    } finally {

View Full Code Here


        @Override
        protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
          TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
          try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
            int upto = 0;
            while (ts.incrementToken()) {
              String token = termAtt.toString();
              int startOffset = offsetAtt.startOffset();
              int endOffset = offsetAtt.endOffset();
              if (upto < startOffset) {
                fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
                upto = startOffset;

View Full Code Here

  /** 
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce, that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0)
        out.append(' ');
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }


    in.close();
    return out.toString();
  }

View Full Code Here

    PerFieldAnalyzerWrapper analyzer =
              new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);


    TokenStream tokenStream = analyzer.tokenStream("field", text);
    try {
      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.reset();


      assertTrue(tokenStream.incrementToken());
      assertEquals("WhitespaceAnalyzer does not lowercase",
                 "Qwerty",
                 termAtt.toString());
      assertFalse(tokenStream.incrementToken());
      tokenStream.end();
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }


    tokenStream = analyzer.tokenStream("special", text);
    try {
      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.reset();


      assertTrue(tokenStream.incrementToken());
      assertEquals("SimpleAnalyzer lowercases",
                 "qwerty",
                 termAtt.toString());
      assertFalse(tokenStream.incrementToken());
      tokenStream.end();
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

com.chenlb.mmseg4j.analysis.TokenUtils

com.code972.elasticsearch.rest.action.RestHebrewAnalyzerCheckWordAction

com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper

com.gentics.cr.lucene.analysis.CustomPatternAnalyzerTest

com.ikanow.infinit.e.api.knowledge.SearchHandler

com.livingsocial.hive.udf.Tokenize

com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer

com.mozilla.grouperfish.pig.eval.text.NGramTokenize

com.mozilla.grouperfish.pig.eval.text.Tokenize

ivory.core.tokenize.LuceneArabicAnalyzer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.