/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.nterm;

import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
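
/**
 * Analyzer that maps punctuation to a {@code " | "} boundary marker, splits on
 * whitespace, lowercases, builds shingles of two to three tokens, and finally
 * removes shingles according to the supplied {@link NTermStopFilter} rules.
 * <p>
 * A minimal usage sketch (assumes Lucene 4.x; the {@code Version} constant is
 * illustrative, and the {@code rules} string is a placeholder whose format is
 * defined by {@link NTermStopFilter}):
 * <pre>{@code
 * Analyzer analyzer = new NTermAnalyzer(Version.LUCENE_47, rules);
 * TokenStream ts = analyzer.tokenStream("body", "Some text, to analyze.");
 * CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
 * ts.reset();
 * while (ts.incrementToken()) {
 *   System.out.println(term);
 * }
 * ts.end();
 * ts.close();
 * }</pre>
 */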
public final class NTermAnalyzer extends StopwordAnalyzerBase {

  private final String ntermStopFilterRules;

  /**
   * Builds an analyzer with the given n-term stop filter rules.
   *
   * @param matchVersion Lucene compatibility version
   * @param ntermStopFilterRules rules string consumed by {@link NTermStopFilter}
   */
  public NTermAnalyzer(Version matchVersion, String ntermStopFilterRules) {
    super(matchVersion);
    this.ntermStopFilterRules = ntermStopFilterRules;
  }
  /**
   * Creates
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   *
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link WhitespaceTokenizer} (reading through a
   *         {@link MappingCharFilter}) filtered with {@link LowerCaseFilter},
   *         {@link ShingleFilter} and {@link NTermStopFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
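    // Map punctuation to " | " before tokenization: after whitespace
    // tokenization the inserted "|" survives as a standalone token, so any
    // shingle that crosses a punctuation mark carries an explicit "|" marker
    // (presumably matched by the NTermStopFilter rules downstream). Hyphens
    // are mapped only when they adjoin a space; intra-word hyphens survive.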
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("- ", " | ");
    builder.add(" -", " | ");
    builder.add(".", " | ");
    builder.add(",", " | ");
    builder.add(":", " | ");
    builder.add(";", " | ");
builder.add("...", " | ");
builder.add("/", " | ");
builder.add("@", " | ");
builder.add("(", " | ");
builder.add(")", " | ");
builder.add("[", " | ");
builder.add("]", " | ");
builder.add("\u00AB", " | ");
builder.add("\u00BB", " | ");
NormalizeCharMap normMap = builder.build();
    // NormalizeCharMap.Builder#build() never returns null, so the mapping
    // filter is always applied.
    Reader normalized = new MappingCharFilter(normMap, reader);
    final Tokenizer source = new WhitespaceTokenizer(matchVersion, normalized);
    TokenStream tokenStream = new LowerCaseFilter(matchVersion, source);
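    // Emit shingles of 2 to 3 tokens; note that ShingleFilter also passes the
    // original unigrams through by default.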
    tokenStream = new ShingleFilter(tokenStream, 2, 3);
    tokenStream = new NTermStopFilter(matchVersion, tokenStream, ntermStopFilterRules);
    return new TokenStreamComponents(source, tokenStream);
  }
}