Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.TokenStream


    public MultiAnalyzer() {
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
      result = new TestFilter(result);
      result = new LowerCaseFilter(result);
      return result;
    }
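TestFilter above is a private helper declared elsewhere in the same test class, so its body is not shown. As a rough sketch (illustrative only, not the real TestFilter), a minimal pass-through TokenFilter in this era's attribute-based API looks like this:

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Illustrative stand-in for TestFilter: forwards every token unchanged;
    // a real filter would inspect or rewrite termAtt before returning.
    final class PassThroughFilter extends TokenFilter {
      private final TermAttribute termAtt = addAttribute(TermAttribute.class);

      PassThroughFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;
        }
        // termAtt.term() holds the current token's text at this point
        return true;
      }
    }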


    public PosIncrementAnalyzer() {
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
      result = new TestPosIncrementFilter(result);
      result = new LowerCaseFilter(result);
      return result;
    }
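TestPosIncrementFilter is likewise defined elsewhere in the test. Whatever increments it assigns can be observed by draining the stream; a minimal consumption loop, assuming the usual java.io and org.apache.lucene.analysis.tokenattributes imports and an arbitrary field name:

    TokenStream ts = new PosIncrementAnalyzer()
        .tokenStream("field", new StringReader("some sample text"));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      // an increment of 0 stacks this token on the previous position;
      // values greater than 1 leave gaps, as after removed stopwords
      System.out.println(termAtt.term() + " +" + posIncrAtt.getPositionIncrement());
    }
    ts.close();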

    /**
     * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
     *         {@link GreekLowerCaseFilter} and {@link StopFilter}
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader)
    {
        TokenStream result = new StandardTokenizer(matchVersion, reader);
        result = new GreekLowerCaseFilter(result);
        result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                result, stopSet);
        return result;
    }
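StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion) returns true from Version.LUCENE_29 onward, so removed stopwords leave position gaps rather than closing up. Typical use of the analyzer, as a sketch (the field name and input are arbitrary, and the usual Lucene imports are assumed):

    Analyzer analyzer = new GreekAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("..."));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());
    }
    ts.close();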

   */
  protected Query getFieldQuery(String field, String queryText) throws ParseException {
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count

    TokenStream source;
    try {
      source = analyzer.reusableTokenStream(field, new StringReader(queryText));
      source.reset();
    } catch (IOException e) {
      source = analyzer.tokenStream(field, new StringReader(queryText));
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    TermAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    boolean success = false;
    try {
      buffer.reset();
      success = true;
    } catch (IOException e) {
      // success==false if we hit an exception
    }
    if (success) {
      if (buffer.hasAttribute(TermAttribute.class)) {
        termAtt = buffer.getAttribute(TermAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    boolean hasMoreTokens = false;
    if (termAtt != null) {
      try {
        hasMoreTokens = buffer.incrementToken();
        while (hasMoreTokens) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
          hasMoreTokens = buffer.incrementToken();
        }
      } catch (IOException e) {
        // ignore
      }
    }
    try {
      // rewind the buffer stream
      buffer.reset();

      // close original stream - all tokens buffered
      source.close();
    }
    catch (IOException e) {
      // ignore
    }

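The excerpt ends before the query is assembled, but the comment at the top states the plan: with the tokens now counted and buffered, the method goes on to build a TermQuery for one token, a PhraseQuery for several, or nothing at all. In outline, simplified from the real QueryParser logic:

    if (numTokens == 0) {
      return null;                        // analyzer produced nothing
    } else if (numTokens == 1) {
      // single token: build a TermQuery from the buffered term
    } else if (severalTokensAtSamePosition) {
      // stacked tokens: build a MultiPhraseQuery (or a BooleanQuery
      // of TermQuerys when position increments are disabled)
    } else {
      // several tokens: build a PhraseQuery, using the increments
      // counted above to set term positions
    }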

  /**
   * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
   *          {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter}
   */
  @Override
  public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(matchVersion, reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                            result, stoptable);
    return result;
  }
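Note the order of the chain: LowerCaseFilter runs before StopFilter, so a conventionally lowercase stop set matches "The" as well as "the". Swapping those two filters would let capitalized stopwords slip through into the index.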

  public int doLogic() throws Exception {
    List<Fieldable> fields = doc.getFields();
    Analyzer analyzer = getRunData().getAnalyzer();
    int tokenCount = 0;
    for(final Fieldable field : fields) {
      final TokenStream stream;
      final TokenStream streamValue = field.tokenStreamValue();

      if (streamValue != null)
        stream = streamValue;
      else {
        // the field does not have a TokenStream,
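The excerpt cuts off inside the fallback branch. Presumably it continues by asking the analyzer for a stream over the field's stored value and then counting tokens; a sketch of that continuation, not the verbatim source:

        // (sketch) so obtain one from the analyzer over the stored value
        stream = analyzer.reusableTokenStream(field.name(),
            new StringReader(field.stringValue()));
      }

      stream.reset();
      while (stream.incrementToken()) {
        tokenCount++;
      }
      stream.end();
      stream.close();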

    highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    return new BenchmarkHighlighter(){
      @Override
      public int doHighlight(IndexReader reader, int doc, String field,
          Document document, Analyzer analyzer, String text) throws Exception {
        TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
        return frag != null ? frag.length : 0;
      }
    };
  }
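TokenSources.getAnyTokenStream is the forgiving entry point here: it rebuilds the stream from stored term vectors when the field was indexed with positions and offsets, and only falls back to re-analyzing the document's stored text with the supplied Analyzer. For a benchmark this matters, since the highlighter's cost then depends heavily on how the field was indexed.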

        sampleUnicode =
            new InputStreamReader(
                new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUTF8.htm")),
                "UTF-8");

        TokenStream in = ra.tokenStream("all", inWords);

        RussianLetterTokenizer sample =
            new RussianLetterTokenizer(
                sampleUnicode);

        TermAttribute text = in.getAttribute(TermAttribute.class);
        TermAttribute sampleText = sample.getAttribute(TermAttribute.class);

        for (;;)
        {
            if (in.incrementToken() == false)
                break;

            boolean nextSampleToken = sample.incrementToken();
            // the analyzer's token text should match the raw
            // RussianLetterTokenizer sample, token for token
            assertEquals("Unicode", text.term(), sampleText.term());
        }

   
    public void testDigitsInRussianCharset()
    {
        Reader reader = new StringReader("text 1000");
        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
        TokenStream stream = ra.tokenStream("", reader);

        TermAttribute termText = stream.getAttribute(TermAttribute.class);
        try {
            assertTrue(stream.incrementToken());
            assertEquals("text", termText.term());
            assertTrue(stream.incrementToken());
            assertEquals("RussianAnalyzer's tokenizer skips numbers from input text", "1000", termText.term());
            assertFalse(stream.incrementToken());
        }
        catch (IOException e)
        {
            fail("unexpected IOException");
        }
    }
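The point of the test: RussianLetterTokenizer, which RussianAnalyzer of this vintage uses, accepts the basic Latin digits 0-9 as token characters, so "1000" is expected to come through as its own token. The string in the second assertEquals is just the failure message reported if that ever stops being true, not a claim that numbers are skipped.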

        LinkedList<Token> tokens = new LinkedList<Token>();
        tokensByField.put(field, tokens);

        if (field.isTokenized()) {
          int termCounter = 0;
          final TokenStream tokenStream;
          // todo readerValue(), binaryValue()
          if (field.tokenStreamValue() != null) {
            tokenStream = field.tokenStreamValue();
          } else {
            tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
          }

          // reset the TokenStream to the first token         
          tokenStream.reset();

          while (tokenStream.incrementToken()) {
            // TODO: this is a simple workaround to still work with tokens, not very effective, but as far as I know, this writer should get removed soon:
            final Token token = new Token();
            for (Iterator<AttributeImpl> atts = tokenStream.getAttributeImplsIterator(); atts.hasNext();) {
              final AttributeImpl att = atts.next();
              try {
                att.copyTo(token);
              } catch (Exception e) {
                // ignore unsupported attributes,
                // this may fail to copy some attributes, if a special combined AttributeImpl is used, that
                // implements basic attributes supported by Token and also other customized ones in one class.
              }
            }
            tokens.add(token); // the vector will be built on commit.
            fieldSetting.fieldLength++;
            if (fieldSetting.fieldLength > maxFieldLength) {
              break;
            }
          }
          tokenStream.end();
          tokenStream.close();
        } else {
          // untokenized
          String fieldVal = field.stringValue();
          Token token = new Token(0, fieldVal.length(), "untokenized");
          token.setTermBuffer(fieldVal);
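The excerpt stops right after the untokenized branch builds its single Token. Presumably the branch then stores that token in the per-field list, mirroring the tokenized path (a sketch, not the verbatim source):

          tokens.add(token);   // stored like any analyzed token
        }

Also worth noting on the tokenized path above: end() is called before close(), letting the attributes record the final end-of-stream state (such as the true end offset) before the underlying Reader is released.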
