Source Code of org.languagetool.tagging.ro.RomanianTagger

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tagging.ro;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;


import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;


import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tagging.BaseTagger;
import org.languagetool.tagging.ManualTagger;


/**
 * Romanian Part-of-speech tagger 
 * 
 * @author Ionuț Păduraru
 */
public class RomanianTagger extends BaseTagger {


  private static final String DEFAULT_BINARY_DICT = "/ro/romanian.dict";
  private static final String DEFAULT_PLAINTEXT_DICT = "/ro/added.txt";
  private static final Locale RO_LOCALE = new Locale("ro");


  private final String binaryDictPath;
  private final String plaintextDictPath;


  private ManualTagger manualTagger;


  public RomanianTagger() {
    this(DEFAULT_BINARY_DICT, DEFAULT_PLAINTEXT_DICT);
  }


  public RomanianTagger(final String dictFileName, final String userDictFileName) {
    binaryDictPath = dictFileName;
    plaintextDictPath = userDictFileName;
    setLocale(RO_LOCALE);
  }


  @Override
  public final String getFileName() {
    return binaryDictPath;
  }


  @Override
  public final List<AnalyzedTokenReadings> tag(
      final List<String> sentenceTokens) throws IOException {
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    int pos = 0;
    final IStemmer morfologik = new DictionaryLookup(getDictionary());
    if (manualTagger == null && plaintextDictPath != null) {
      manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(plaintextDictPath));
    }


    for (final String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();
      final String lowerCaseWord = word.toLowerCase(RO_LOCALE);
      final List<WordData> taggerTokens = morfologik.lookup(lowerCaseWord);
      if (taggerTokens != null) {
        for (WordData wd : taggerTokens) {
          final String[] tagsArr = wd.getStem().toString().split("\\+");
          for (final String currTag : tagsArr) {
            l.add(new AnalyzedToken(word, 
                wd.getTag().toString(), currTag));
          }
        }
      }
      if (manualTagger != null) { // add user tags, if any
        final String[] manualTags = manualTagger.lookup(lowerCaseWord);
        if (manualTags != null) {
          for (int i = 0; i < manualTags.length / 2; i = i + 2) {
            l.add(new AnalyzedToken(word, manualTags[i+1], manualTags[i]));
          }
        }
      }


      if (l.isEmpty()) {
        l.add(new AnalyzedToken(word, null, null));
      }
      tokenReadings.add(new AnalyzedTokenReadings(l, pos));
      pos += word.length();
    }


    return tokenReadings;
  }


}
Source Code of org.languagetool.tagging.ro.RomanianTagger

Related Classes of org.languagetool.tagging.ro.RomanianTagger