Package org.apache.lucene.analysis.core

Examples of org.apache.lucene.analysis.core.StopFilter
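
StopFilter removes a configurable set of stop words from a TokenStream. The fragments below, drawn from analyzers that ship with Lucene, show where StopFilter typically sits in an analysis chain: after tokenization and (usually) lower-casing, and before stemming.

First, a minimal self-contained sketch of driving a StopFilter by hand, assuming Lucene 4.x on the classpath; the input string and the class name StopFilterDemo are just illustrations:

  import java.io.StringReader;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.core.StopFilter;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.util.Version;

  public class StopFilterDemo {
    public static void main(String[] args) throws Exception {
      Version v = Version.LUCENE_4_8;
      Tokenizer tokenizer = new StandardTokenizer(v, new StringReader("the quick brown fox"));
      // StandardAnalyzer.STOP_WORDS_SET is the default English stop list
      TokenStream stream = new StopFilter(v, tokenizer, StandardAnalyzer.STOP_WORDS_SET);
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();                          // mandatory before incrementToken()
      while (stream.incrementToken()) {
        System.out.println(term.toString());   // prints: quick, brown, fox
      }
      stream.end();
      stream.close();
    }
  }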


From HindiAnalyzer: StopFilter runs after lower-casing and Indic/Hindi normalization but before stemming. The fragment began mid-method; the signature and tokenizer line are reconstructed (recent versions use StandardTokenizer).

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    if (!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new IndicNormalizationFilter(result);
    result = new HindiNormalizationFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new HindiStemFilter(result);
    return new TokenStreamComponents(source, result);
  }


From ThaiAnalyzer: since 4.8 a dedicated ThaiTokenizer handles segmentation and StopFilter follows lower-casing; older versions segmented with ThaiWordFilter and applied StopFilter last.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_4_8)) {
      final Tokenizer source = new ThaiTokenizer(reader);
      TokenStream result = new LowerCaseFilter(matchVersion, source);
      result = new StopFilter(matchVersion, result, stopwords);
      return new TokenStreamComponents(source, result);
    } else {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      TokenStream result = new StandardFilter(matchVersion, source);
      if (matchVersion.onOrAfter(Version.LUCENE_3_1))
        result = new LowerCaseFilter(matchVersion, result);
      result = new ThaiWordFilter(matchVersion, result);
      return new TokenStreamComponents(source, new StopFilter(matchVersion,
        result, stopwords));
    }
  }

From SpanishAnalyzer: StopFilter runs after lower-casing, before the keyword marker and the stemmer.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
      result = new SpanishLightStemFilter(result);
    } else {
      // the fragment was truncated here; pre-3.6 versions fall back to the
      // Snowball Spanish stemmer
      result = new SnowballFilter(result, new org.tartarus.snowball.ext.SpanishStemmer());
    }
    return new TokenStreamComponents(source, result);
  }

From SnowballAnalyzer: the language name selects the stemmer, and Turkish gets its own lower-casing because of the dotted/dotless i. The fragment began mid-method; the signature, tokenizer, and StandardFilter lines are reconstructed minimally.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, tokenizer);
    if (matchVersion.onOrAfter(Version.LUCENE_3_1) && name.equals("Turkish"))
      result = new TurkishLowerCaseFilter(result);
    else
      result = new LowerCaseFilter(matchVersion, result);
    if (stopSet != null)
      result = new StopFilter(matchVersion, result, stopSet);
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
  }
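
The stopSet above is a CharArraySet; StopFilter.makeStopSet is a convenient way to build one. A small sketch (the words are placeholders):

  CharArraySet stopSet = StopFilter.makeStopSet(Version.LUCENE_4_8, "the", "a", "an");
  // the Version argument is forwarded to the underlying CharArraySet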

From CatalanAnalyzer: elision of contracted articles (l', d', ...) was added in 3.6 and runs before lower-casing and stop filtering. The fragment began mid-method; the signature and tokenizer are reconstructed.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
      result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new CatalanStemmer());
    return new TokenStreamComponents(source, result);
  }

From DutchAnalyzer: since 3.1 the chain lower-cases before the StopFilter and supports a stemmer-override dictionary; the pre-3.1 chain ran StopFilter on unlowercased tokens and used the legacy DutchStemFilter. The start of the signature is reconstructed.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader aReader) {
    if (matchVersion.onOrAfter(Version.LUCENE_3_1)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new LowerCaseFilter(matchVersion, result);
      result = new StopFilter(matchVersion, result, stoptable);
      if (!excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);
      if (stemdict != null)
        result = new StemmerOverrideFilter(result, stemdict);
      result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
      return new TokenStreamComponents(source, result);
    } else {
      final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
      TokenStream result = new StandardFilter(matchVersion, source);
      result = new StopFilter(matchVersion, result, stoptable);
      if (!excltable.isEmpty())
        result = new SetKeywordMarkerFilter(result, excltable);
      result = new DutchStemFilter(result, origStemdict);
      return new TokenStreamComponents(source, result);
    }
  }

From QueryAutoStopWordAnalyzer (an AnalyzerWrapper): instead of building a chain from scratch, it appends a StopFilter with a per-field stop set to whatever the wrapped analyzer produced.

  @Override
  protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    if (stopWords == null) {
      return components;
    }
    StopFilter stopFilter = new StopFilter(matchVersion, components.getTokenStream(),
        new CharArraySet(matchVersion, stopWords, false));
    return new TokenStreamComponents(components.getTokenizer(), stopFilter);
  }
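
A hypothetical usage sketch for the wrapper above (indexReader and the threshold of 200 are placeholders): terms occurring in more than 200 documents of a field become that field's stop words, while other fields pass through the delegate untouched.

  Analyzer delegate = new StandardAnalyzer(Version.LUCENE_4_8);
  Analyzer wrapped = new QueryAutoStopWordAnalyzer(
      Version.LUCENE_4_8, delegate, indexReader, 200);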

From IrishAnalyzer: two StopFilters appear in one chain. The first strips hyphenation prefixes before elision and must not leave positional gaps on pre-4.4 indexes; the second applies the regular Irish stop list.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
    if (!matchVersion.onOrAfter(Version.LUCENE_4_4)) {
      s.setEnablePositionIncrements(false);
    }
    result = s;
    result = new ElisionFilter(result, DEFAULT_ARTICLES);
    result = new IrishLowerCaseFilter(result);
    result = new StopFilter(matchVersion, result, stopwords);
    if(!stemExclusionSet.isEmpty())
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new IrishStemmer());
    return new TokenStreamComponents(source, result);
  }
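
By default StopFilter records a positional gap where stop words were removed (phrase queries can then respect the hole); the pre-4.4 toggle above disables that for index compatibility. A sketch of observing the gaps, reusing the stream/term setup from the first example:

  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    // an increment greater than 1 means stop words were dropped just before this token
    System.out.println(term + " +" + posIncr.getPositionIncrement());
  }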

From ArabicAnalyzer: as the inline comment notes, the stop list is not normalized, so StopFilter must run before ArabicNormalizationFilter. The start of the signature is reconstructed; until 3.1 a dedicated ArabicLetterTokenizer was used.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_3_1) ?
        new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
    TokenStream result = new LowerCaseFilter(matchVersion, source);
    // the order here is important: the stopword list is not normalized!
    result = new StopFilter(matchVersion, result, stopwords);
    // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
    result = new ArabicNormalizationFilter(result);
    if (!stemExclusionSet.isEmpty()) {
      result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    // the fragment was truncated here; the chain ends with the Arabic stemmer
    result = new ArabicStemFilter(result);
    return new TokenStreamComponents(source, result);
  }

From CJKAnalyzer: since 3.6, CJK text is tokenized with StandardTokenizer and re-joined into bigrams, with StopFilter applied last; older versions used the legacy CJKTokenizer. The fragment began mid-method; the signature and version check are reconstructed.

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_3_6)) {
      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
      // run the width filter before bigramming; it sometimes combines characters
      TokenStream result = new CJKWidthFilter(source);
      result = new LowerCaseFilter(matchVersion, result);
      result = new CJKBigramFilter(result);
      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
    } else {
      final Tokenizer source = new CJKTokenizer(reader);
      return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
    }
  }
