Source Code of org.apache.lucene.analysis.hebrew.MorphAnalyzer

/***************************************************************************
 *   Copyright (C) 2010-2013 by                                            *
 *      Itamar Syn-Hershko <itamar at code972 dot com>                     *
 *    Ofer Fort <oferiko at gmail dot com> (initial Java port)           *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Affero General Public License           *
 *   version 3, as published by the Free Software Foundation.              *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU Affero General Public License for more details.                   *
 *                                                                         *
 *   You should have received a copy of the GNU Affero General Public      *
 *   License along with this program; if not, see                          *
 *   <http://www.gnu.org/licenses/>.                                       *
 **************************************************************************/
package org.apache.lucene.analysis.hebrew;


import com.code972.hebmorph.MorphData;
import com.code972.hebmorph.datastructures.DictHebMorph;
import com.code972.hebmorph.datastructures.DictRadix;
import com.code972.hebmorph.hspell.HSpellLoader;
import com.code972.hebmorph.lemmafilters.LemmaFilterBase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SuffixKeywordFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;


import java.io.File;
import java.io.IOException;
import java.io.Reader;


public class MorphAnalyzer extends Analyzer {
    /**
     * An unmodifiable set containing some common Hebrew words that are usually not
     * useful for searching.
     */
    private final CharArraySet commonWords;


    private boolean keepOriginalWord = false;


    private LemmaFilterBase lemmaFilter;


    private static final String DEFAULT_HSPELL_DATA_CLASSPATH = "hspell-data-files";
    private static final String DEFAULT_HSPELL_ENV_VARIABLE = "HSPELL_DATA_FILES_PATH";
    protected final Version matchVersion;
    private DictRadix<Byte> specialTokenizationCases;
    private final DictHebMorph dict;
    private Character suffixForExactMatch;


    public MorphAnalyzer(final Version matchVersion, final DictHebMorph dict) throws IOException {
        this(matchVersion, dict, null, null);
    }


    public MorphAnalyzer(final Version matchVersion, final DictHebMorph dict,
                         final CharArraySet commonWords, final DictRadix<Byte> specialTokenizationCases) throws IOException {
        this.matchVersion = matchVersion;
        this.dict = dict;
        this.specialTokenizationCases = specialTokenizationCases;
        this.commonWords = commonWords;
    }


    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        final StreamLemmasFilter src = new StreamLemmasFilter(reader, dict, specialTokenizationCases, commonWords, lemmaFilter);
        src.setKeepOriginalWord(keepOriginalWord);
        src.setSuffixForExactMatch(suffixForExactMatch);
        TokenStream tok = new SuffixKeywordFilter(src, '$');
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                super.setReader(reader);
            }
        };
    }


    public void setSuffixForExactMatch(Character suffixForExactMatch) {
        this.suffixForExactMatch = suffixForExactMatch;
    }


    /**
     * A filter object to provide flexibility on deciding which lemmas are valid as index terms
     * and which are not.
     */
    public void setLemmaFilter(LemmaFilterBase lemmaFilter) {
        this.lemmaFilter = lemmaFilter;
    }


    /**
     * Set to true to mark tokens with a $ prefix also when there is only one lemma returned
     * from the lemmatizer. This is mainly here to allow the Hebrew-aware SimpleAnalyzer (in this
     * namespace) to perform searches on the same field used for the Morph analyzer. When used this
     * way, make sure to turn this on only while indexing, so searches don't get slower.
     * Default is false to save some index space.
     */
    public void setKeepOriginalWord(boolean keepOriginalWord) {
        this.keepOriginalWord = keepOriginalWord;
    }


    public static SynonymMap buildAcronymsMergingMap() throws IOException {
        SynonymMap.Builder synonymMap = new SynonymMap.Builder(true);
        synonymMap.add(new CharsRef("אף על פי כן"), new CharsRef("אעפ\"כ"), false);
        synonymMap.add(new CharsRef("אף על פי"), new CharsRef("אע\"פ"), false);
        synonymMap.add(new CharsRef("כמו כן"), new CharsRef("כמו\"כ"), false);
        synonymMap.add(new CharsRef("על ידי"), new CharsRef("ע\"י"), false);
        synonymMap.add(new CharsRef("על פי"), new CharsRef("ע\"פ"), false);
        synonymMap.add(new CharsRef("כל כך"), new CharsRef("כ\"כ"), false);
        synonymMap.add(new CharsRef("בדרך כלל"), new CharsRef("בד\"כ"), false);
        synonymMap.add(new CharsRef("תל אביב"), new CharsRef("ת\"א"), false);
        return synonymMap.build();
    }


    static private DictRadix<MorphData> loadFromClasspath(final String pathInClasspath) {
        try {
            HSpellLoader loader = new HSpellLoader(Thread.currentThread().getContextClassLoader(), pathInClasspath, true);
            return loader.loadDictionaryFromHSpellData();
        } catch (IOException ex) {
            try {
                // Try to use environment variable if failed with classpath
                return loadFromEnvVariable();
            } catch (IOException e) {
                throw new IllegalStateException("Failed to read data", ex);
            }
        }
    }


    static private DictRadix<MorphData> loadFromPath(final File path) {
        try {
            HSpellLoader loader = new HSpellLoader(path, true);
            return loader.loadDictionaryFromHSpellData();
        } catch (IOException ex) {
            throw new IllegalStateException("Failed to read data", ex);
        }
    }


    static private DictRadix<MorphData> loadFromEnvVariable() throws IOException {
        String hspellPath = System.getenv(DEFAULT_HSPELL_ENV_VARIABLE);
        if (hspellPath == null) {
            throw new IllegalStateException("Failed to load hspell dictionary files. They should be configured " +
                    "in classpath or by " + DEFAULT_HSPELL_ENV_VARIABLE + " environment variable");
        }
        HSpellLoader loader = new HSpellLoader(new File(hspellPath), true);
        return loader.loadDictionaryFromHSpellData();
    }
}
Source Code of org.apache.lucene.analysis.hebrew.MorphAnalyzer

Related Classes of org.apache.lucene.analysis.hebrew.MorphAnalyzer