Package org.zanata.hibernate.search

Source Code of org.zanata.hibernate.search.ConfigurableNgramAnalyzer

/*
* Copyright 2012, Red Hat, Inc. and individual contributors as indicated by the
* @author tags. See the copyright.txt file in the distribution for a full
* listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it under the
* terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This software is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the FSF
* site: http://www.fsf.org.
*/
package org.zanata.hibernate.search;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenizer;

/**
* Analyzer that tokenizes into ngrams of a specified length, with or without
* case folding.
*
* @author David Mason, damason@redhat.com
*
*/
public class ConfigurableNgramAnalyzer extends Analyzer {

    private int ngramMinLength;
    private int ngramMaxLength;
    private boolean foldCase;

    /**
     * @param ngramLength
     *            the length of each ngram to generate
     * @param foldCase
     *            true to convert all characters to lowercase, allowing
     *            case-insensitive indexing and searching
     */
    public ConfigurableNgramAnalyzer(int ngramLength, boolean foldCase) {
        this(ngramLength, ngramLength, foldCase);
    }

    /**
     * Create analyzer that will tokenize repeatedly to make ngrams of all sizes
     * from ngramMinLength to ngramMaxLength, inclusive.
     *
     * @param ngramMinLength
     *            length of the shortest ngrams to generate
     * @param ngramMaxLength
     *            length of the longest ngrams to generate
     * @param foldCase
     *            true to convert all characters to lowercase, allowing
     *            case-insensitive indexing and searching
     */
    public ConfigurableNgramAnalyzer(int ngramMinLength, int ngramMaxLength,
            boolean foldCase) {
        this.ngramMinLength = ngramMinLength;
        this.ngramMaxLength = ngramMaxLength;
        this.foldCase = foldCase;
    }

    @SuppressWarnings("resource") // caller should close
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream tokenStream;
        NGramTokenizer ngramTokenizer =
                new NGramTokenizer(reader, ngramMinLength, ngramMaxLength);
        if (foldCase) {
            tokenStream = new ULowerCaseFilter(ngramTokenizer);
        } else {
            tokenStream = ngramTokenizer;
        }
        return tokenStream;
    }

}
TOP

Related Classes of org.zanata.hibernate.search.ConfigurableNgramAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.