Package org.apache.lucene.analysis.standard

Examples of org.apache.lucene.analysis.standard.StandardTokenizer
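StandardTokenizer is the grammar-based tokenizer that most of the analyzers below build on. As a baseline before the filter-chain examples, here is a minimal, self-contained sketch of draining one directly (3.x-era API to match the snippets on this page; the input string is made up for illustration):

  import java.io.StringReader;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.util.Version;

  public class StandardTokenizerDemo {
    public static void main(String[] args) throws Exception {
      Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT,
          new StringReader("The quick brown fox, version 2.0!"));
      // attribute-based API: the same CharTermAttribute instance is refilled
      // on every successful incrementToken() call
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term.toString());
      }
      tokenizer.end();
      tokenizer.close();
    }
  }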


  public ThaiAnalyzer() {
    // deprecated back-compat hook: lets reusableTokenStream() fall back to
    // tokenStream() when a subclass overrides only tokenStream()
    setOverridesTokenStreamMethod(ThaiAnalyzer.class);
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream ts = new StandardTokenizer(reader);
    ts = new StandardFilter(ts);   // normalize acronyms and possessives
    ts = new ThaiWordFilter(ts);   // split runs of Thai text into words
    ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return ts;
  }
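Note the ordering: StandardTokenizer keeps a contiguous run of Thai characters together as a single token, ThaiWordFilter then segments that run into individual words, and the English stop set at the end only affects Latin-script tokens embedded in the text (Thai words have no entries in that set).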


  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    if (overridesTokenStreamMethod) {
      // a subclass overrode tokenStream() only; honor that instead of reusing streams
      return tokenStream(fieldName, reader);
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      // first call on this thread: build the chain once and cache it
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(reader);
      streams.result = new StandardFilter(streams.source);
      streams.result = new ThaiWordFilter(streams.result);
      streams.result = new StopFilter(streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      setPreviousTokenStream(streams);
    } else {
      // later calls: re-point the cached tokenizer at the new reader
      streams.source.reset(reader);
    }
    return streams.result;
  }
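The SavedStreams holder used above is not shown in this excerpt; in Lucene's own analyzers it is just a small per-thread struct caching both ends of the chain, roughly:

  private static class SavedStreams {
    Tokenizer source;    // head of the chain, re-targeted via reset(reader)
    TokenStream result;  // tail of the chain, returned to callers
  }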

public class TestElision extends BaseTokenStreamTestCase {

  public void testElision() throws Exception {
    // "l'embrouille" and "M'enfin" carry elided articles; "O'brian" must survive
    String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
    Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(test));
    Set articles = new HashSet();
    articles.add("l");
    articles.add("M");
    TokenFilter filter = new ElisionFilter(tokenizer, articles);
    List tas = filtre(filter);  // helper that drains the stream into a list of terms
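The filtre helper (the test is French-flavored throughout) is not shown in this excerpt; a minimal sketch of what it has to do — drain the stream and collect each term — could look like this, keeping the pre-generics style of the test:

  private List filtre(TokenFilter filter) throws IOException {
    List tas = new ArrayList();
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      tas.add(termAtt.toString());
    }
    return tas;
  }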

    RAMDirectory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new Analyzer() {

      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(new StandardTokenizer(Version.LUCENE_CURRENT, reader)) {
          private int count = 0;

          @Override
          public boolean incrementToken() throws IOException {
            // fault injection: fail while producing the sixth token
            if (count++ == 5) {
              throw new IOException("fake fail");  // illustrative message
            }
            return input.incrementToken();
          }
        };
      }
    }, IndexWriter.MaxFieldLength.LIMITED);  // any 3.x IndexWriter constructor works here
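A hedged sketch of how a test can drive this writer into the injected failure: add a document whose analyzed field yields more than five tokens, and the IOException surfaces from addDocument (field name and text are made up):

      Document doc = new Document();
      doc.add(new Field("content", "one two three four five six seven",
          Field.Store.NO, Field.Index.ANALYZED));
      try {
        writer.addDocument(doc);
        fail("expected the injected IOException");
      } catch (IOException expected) {
        // the failure from incrementToken() propagates out of addDocument
      }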

            TokenStream tok = new SKOSURIFilter(src, skosEngine,
                    new StandardAnalyzer(matchVersion), types);
            tok = new LowerCaseFilter(matchVersion, tok);
            return new TokenStreamComponents(src, tok);
        } else {
            final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
            src.setMaxTokenLength(maxTokenLength);
            TokenStream tok = new StandardFilter(matchVersion, src);
            // prior to this point we get the classic behavior; StandardFilter does it for us
            tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(
                    matchVersion), bufferSize, types);
            tok = new LowerCaseFilter(matchVersion, tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            tok = new RemoveDuplicatesTokenFilter(tok);
            return new TokenStreamComponents(src, tok) {
                @Override
                protected void setReader(final Reader reader) throws IOException {
                    // re-apply the token-length limit every time the components are reused
                    src.setMaxTokenLength(maxTokenLength);
                    super.setReader(reader);
                }
            };
        }
    }
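Every analyzer excerpt on this page is a variation of one pattern: build the chain once in createComponents and let TokenStreamComponents handle per-thread reuse. A minimal, self-contained sketch of that pattern on the 3.1+ API (the class name is made up; ReusableAnalyzerBase was the 3.x base class for this and was later folded into Analyzer):

  import java.io.Reader;
  import org.apache.lucene.analysis.LowerCaseFilter;
  import org.apache.lucene.analysis.ReusableAnalyzerBase;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  import org.apache.lucene.util.Version;

  public final class MinimalStandardAnalyzer extends ReusableAnalyzerBase {
    private final Version matchVersion;

    public MinimalStandardAnalyzer(Version matchVersion) {
      this.matchVersion = matchVersion;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // tokenize, normalize, lowercase: the shared core of the chains above
      Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new LowerCaseFilter(matchVersion, result);
      return new TokenStreamComponents(source, result);
    }
  }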

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader aReader) {
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stoptable);
      if (!excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);  // protect exclusions from stemming
      if (stemdict != null)
        result = new StemmerOverrideFilter(result, stemdict);    // dictionary entries override the stemmer
      result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
      return new TokenStreamComponents(source, result);
    } else {
      // pre-3.1 chain: no lowercasing, and the legacy DutchStemFilter
      final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new StopFilter(matchVersion, result, stoptable);
      if (!excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);
      result = new DutchStemFilter(result, origStemdict);
      return new TokenStreamComponents(source, result);
    }
  }
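Both branches share the StandardTokenizer front end; the matchVersion check only swaps the tail of the chain, so that fields indexed with an older Lucene release keep matching at search time.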

   *         provided and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
      // strip elided articles (l', d', ...) before lowercasing
      result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);

    // textDefault inherits the schema-wide luceneMatchVersion, so the
    // tokenizer gets the fixed (post-2.4) acronym handling
    FieldType type = schema.getFieldType("textDefault");
    TokenizerChain ana = (TokenizerChain) type.getAnalyzer();
    assertEquals(DEFAULT_VERSION, ((BaseTokenizerFactory) ana.getTokenizerFactory()).luceneMatchVersion);
    assertEquals(DEFAULT_VERSION, ((BaseTokenFilterFactory) ana.getTokenFilterFactories()[2]).luceneMatchVersion);
    TokenizerChain.TokenStreamInfo tsi = ana.getStream("textDefault", new StringReader(""));
    StandardTokenizer tok = (StandardTokenizer) tsi.getTokenizer();
    assertTrue(tok.isReplaceInvalidAcronym());

    // text20 pins the tokenizer to 2.0 and a filter to 2.4, preserving the
    // legacy acronym behavior
    type = schema.getFieldType("text20");
    ana = (TokenizerChain) type.getAnalyzer();
    assertEquals(Version.LUCENE_20, ((BaseTokenizerFactory) ana.getTokenizerFactory()).luceneMatchVersion);
    assertEquals(Version.LUCENE_24, ((BaseTokenFilterFactory) ana.getTokenFilterFactories()[2]).luceneMatchVersion);
    tsi = ana.getStream("text20", new StringReader(""));
    tok = (StandardTokenizer) tsi.getTokenizer();
    assertFalse(tok.isReplaceInvalidAcronym());

    // Hack: read StandardAnalyzer's private matchVersion field from its
    // superclass via reflection (there is no getter); this may break in later
    // Lucene versions.
    final Field matchVersionField = StandardAnalyzer.class.getSuperclass().getDeclaredField("matchVersion");
    matchVersionField.setAccessible(true);

     * @return  A TokenStream built from a StandardTokenizer filtered with
     *          StandardFilter, StopFilter, GermanStemFilter
     */
    public TokenStream tokenStream( String fieldName, Reader reader )
    {
        TokenStream result = new StandardTokenizer( reader );
        result = new StandardFilter( result );
        // shouldn't there be a lowercaser before stop word filtering?
        result = new StopFilter( result, stoptable );
        result = new GermanStemFilter( result, excltable );
        return result;
    }
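The question in that inline comment is a fair one: StopFilter compares terms against the stop set case-sensitively by default, so a sentence-initial "Der" would slip past a lowercase German stop list. A hedged sketch of the reordered chain, with a LowerCaseFilter inserted before the stop filter (same classes, different order):

        TokenStream result = new StandardTokenizer( reader );
        result = new StandardFilter( result );
        result = new LowerCaseFilter( result );  // lowercase first so stop words match
        result = new StopFilter( result, stoptable );
        result = new GermanStemFilter( result, excltable );
        return result;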

   *         , and {@link BrazilianStemFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);  // lowercase before stop filtering
    result = new StandardFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(excltable != null && !excltable.isEmpty())
      result = new KeywordMarkerFilter(result, excltable);  // protect exclusions from stemming
    return new TokenStreamComponents(source, new BrazilianStemFilter(result));
  }
