Package org.apache.lucene.analysis.nterm

Source Code of org.apache.lucene.analysis.nterm.NTermAnalyzer

package org.apache.lucene.analysis.nterm;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

public final class NTermAnalyzer extends StopwordAnalyzerBase {
 
  private final String ntermStopFilterRules;

  /** Builds an analyzer with the stop words from the given set.
   * @param matchVersion See <a href="#version">above</a>
   * @param stopWords Set of stop words */
  public NTermAnalyzer(Version matchVersion, String ntermStopFilterRules) {
    super(matchVersion);
    this.ntermStopFilterRules = ntermStopFilterRules;
  }

  /**
   * Creates
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * used to tokenize all the text in the provided {@link Reader}.
   *
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link LowerCaseTokenizer} filtered with
   *         {@link StopFilter}
   */
  @Override
//  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
//    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
//  //final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
//    TokenStream tokenStream = new LowerCaseFilter(matchVersion, source);
//    tokenStream = new ShingleFilter(tokenStream, 2, 3);
//    tokenStream = new NTermStopFilter(matchVersion, tokenStream, ntermStopFilterRules);   
//    return new TokenStreamComponents(source, tokenStream);
//  }
 
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
   
      NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
      builder.add("- ", " | ");
      builder.add(" -", " | ");
      builder.add(".", " | ");
      builder.add(",", " | ");
      builder.add(":", " | ");
      builder.add(";", " | ");
      builder.add("...", " | ");
      builder.add("/", " | ");
      builder.add("@", " | ");
      builder.add("(", " | ");
      builder.add(")", " | ");
      builder.add("[", " | ");
      builder.add("]", " | ");
      builder.add("\u00AB", " | ");
      builder.add("\u00BB", " | ");

      NormalizeCharMap normMap = builder.build();

      Reader reader2 = (normMap == null ? reader : new MappingCharFilter(normMap,reader));
         
    final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader2);
      TokenStream tokenStream = new LowerCaseFilter(matchVersion, source);
      tokenStream = new ShingleFilter(tokenStream, 2, 3);
      tokenStream = new NTermStopFilter(matchVersion, tokenStream, ntermStopFilterRules);   
      return new TokenStreamComponents(source, tokenStream);
    }
 
}
TOP

Related Classes of org.apache.lucene.analysis.nterm.NTermAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.