Source Code of org.languagetool.tagging.eo.EsperantoTagger

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tagging.eo;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tagging.ManualTagger;
import org.languagetool.tagging.Tagger;


/**
 * A part-of-speech tagger for Esperanto.
 *
 * @author Dominique Pellé
 */
public class EsperantoTagger implements Tagger {
  // manual tagger is used to tag the list of closed Esperanto words
  // (small limited number of words which do not have regular ending).
  private ManualTagger manualTagger = null;


  // Set of transitive verbs and intransitive verbs.
  private Set<String> setTransitiveVerbs = null;
  private Set<String> setIntransitiveVerbs = null;


  // Verbs always end with this pattern.
  private static final Pattern patternVerb = Pattern.compile("(..+)(as|os|is|us|u|i)$");
  private static final Pattern patternPrefix = Pattern.compile("^(?:mal|mis|ek|re|fi|ne)(.*)");
  private static final Pattern patternSuffix = Pattern.compile("(.*)(?:ad|aĉ|eg|et)i$");


  // Participles -ant-, -int, ont-, -it-, -it-, -ot-
  private static final Pattern patternParticiple =
    Pattern.compile("((..+)([aio])(n?)t)([aoe])(j?)(n?)$");
  // Groups           11111111111111111  55555  66  77
  //                   222  33333  44                 
 
  private Set<String> setNonParticiple;


  // Pattern 'tabelvortoj'.
  private static final Pattern patternTabelvorto =
    Pattern.compile("^(i|ti|ki|ĉi|neni)(?:(?:([uoae])(j?)(n?))|(am|al|es|el|om))$");
  // Groups            111111111111111        222222  33  44    55555555555555
  //                                          


  // Pattern of 'tabelvortoj' which are also tagged adverbs.
  private static final Pattern patternTabelvortoAdverb = 
    Pattern.compile("^(?:ti|i|ĉi|neni)(?:am|om|el|e)$");


  /**
   * Load list of words from UTF-8 file (one word per line).
   */
  private Set<String> loadWords(final InputStream stream) throws IOException {
    final Set<String> words = new HashSet<>();
    try (
      InputStreamReader isr = new InputStreamReader(stream, "UTF-8");
      BufferedReader br = new BufferedReader(isr)) 
    {
      String line;
      while ((line = br.readLine()) != null) {
        line = line.trim();
        if (line.length() < 1) {
          continue;
        }
        if (line.charAt(0) == '#') { // ignore comments
          continue;
        }
        words.add(line);
      }
    }
    return words;
  }


  private void lazyInit() throws IOException {
    if (manualTagger != null) {
      return;
    }


    // A manual tagger is used for closed words.  Closed words are the small
    // limited set of words in Esperanto which have no standard ending, as
    // opposed to open words which are unlimited in numbers and which follow
    // strict rules for their suffixes.
    manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream("/eo/manual-tagger.txt"));


    // Load set of transitive and intransitive verbs.  Files don't contain
    // verbs with suffix -iĝ or -ig since transitivity is obvious for those verbs.
    // They also don't contain verbs with prefixes mal-, ek-, re-, mis- fi- and
    // suffixes -ad, -aĉ, -et, -eg since these affixes never alter transitivity.
    setTransitiveVerbs   = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream("/eo/verb-tr.txt"));
    setIntransitiveVerbs = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream("/eo/verb-ntr.txt"));
    setNonParticiple     = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream("/eo/root-ant-at.txt"));
  }


  // For a given verb (.*i) find whether it is transitive and/or non transitive.
  // Returns:
  // "tr" for a verb which is transitive
  // "nt" for a verb which is non-transitive
  // "tn" for a verb which is transitive and non-transitive
  // "xx" for an unknown verb.
  private String findTransitivity(String verb) {
    if (verb.endsWith("iĝi")) {
      return "nt";
    } else if (verb.endsWith("igi")) {
      // The verb "memmortigi is strange: even though it ends in -igi, it
      // is intransitive.
      return verb.equals("memmortigi") ? "nt" : "tr";
    }


    // This loop executes only once for most verbs (or very few times).
    for (;;) {
      final boolean isTransitive   = setTransitiveVerbs.contains(verb);
      final boolean isIntransitive = setIntransitiveVerbs.contains(verb);


      if (isTransitive) {
        return isIntransitive ? "tn" : "tr";
      } else if (isIntransitive) {
        return "nt";
      }


      // Verb is not explicitly listed as transitive or intransitive.
      // Try to remove a prefix mal-, ek-, re-, mis- fi- or
      // suffix -ad, -aĉ, -et, -eg since those never alter
      // transitivity.  Then look up verb again in case we find 
      // a verb with a known transitivity.  For example, given a verb
      // "malŝategi", we will probe "malŝategi", "ŝategi" "ŝati"
      // and then finally find out that "ŝati" is transitive.
      final Matcher matcherPrefix = patternPrefix.matcher(verb);
      if (matcherPrefix.find()) {
        // Remove a prefix and try again.
        verb = matcherPrefix.group(1);
        continue;
      }
      final Matcher matcherSuffix = patternSuffix.matcher(verb);
      if (matcherSuffix.find()) {
        // Remove a suffix and try again.
        verb = matcherSuffix.group(1) + "i";
        continue;
      }
      break;
    }
    return "xx"; // Unknown transitivity.
  }


  @Override
  public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException {


    lazyInit();
    Matcher matcher;


    final List<AnalyzedTokenReadings> tokenReadings = 
      new ArrayList<>();
    for (String word : sentenceTokens) {
      final List<AnalyzedToken> l = new ArrayList<>();


      // No Esperanto word is made of one letter only. This check avoids
      // spurious tagging as single letter words "A", "O", "E", etc.
      if (word.length() > 1) {
        final String lWord = word.toLowerCase();
        final String[] manualTags = manualTagger.lookup(lWord);


        if (manualTags != null) {
          // This is a closed word for which we know its lemmas and tags.
          for (int i = 0; i < manualTags.length; i += 2) {
            final String lemma  = manualTags[2*i];
            final String postag = manualTags[2*i + 1];
            l.add(new AnalyzedToken(word, postag, lemma));
          }
        } else {
          // This is an open word, we need to look at the word ending 
          // to determine its lemma and POS tag.  For verb, we also
          // need to look up the dictionary of known transitive and
          // intransitive verbs.


          // Tiu, kiu (tabelvortoj).
          if ((matcher = patternTabelvorto.matcher(lWord)).find()) {
            final String type1Group = matcher.group(1).substring(0, 1).toLowerCase();
            final String type2Group = matcher.group(2);
            final String plGroup    = matcher.group(3);
            final String accGroup   = matcher.group(4);
            final String type3Group = matcher.group(5);
            final String type;
            final String plural;
            final String accusative;


            if (accGroup == null) {
              accusative = "xxx";
            } else {
              accusative = accGroup.equalsIgnoreCase("n") ? "akz" : "nak";
            }
            if (plGroup == null) {
              plural = " pn ";
            } else {
              plural = plGroup.equalsIgnoreCase("j") ? " pl " : " np ";
            }
            type = ((type2Group == null) ? type3Group : type2Group).toLowerCase();


            l.add(new AnalyzedToken(word, "T " + 
              accusative + plural + type1Group + " " + type, null));


            if ((matcher = patternTabelvortoAdverb.matcher(lWord)).find()) {
              l.add(new AnalyzedToken(word, "E nak", lWord));
            }


          // Words ending in .*oj?n? are nouns.
          } else if (lWord.endsWith("o")) {
            l.add(new AnalyzedToken(word, "O nak np", lWord));
          } else if (lWord.length() >= 2 && lWord.endsWith("'")) {
            l.add(new AnalyzedToken(word, "O nak np", lWord.substring(0, lWord.length() - 1) + "o"));
          } else if (lWord.endsWith("oj")) {
            l.add(new AnalyzedToken(word, "O nak pl", lWord.substring(0, lWord.length() - 1)));
          } else if (lWord.endsWith("on")) {
            l.add(new AnalyzedToken(word, "O akz np", lWord.substring(0, lWord.length() - 1)));
          } else if (lWord.endsWith("ojn")) {
            l.add(new AnalyzedToken(word, "O akz pl", lWord.substring(0, lWord.length() - 2)));


          // Words ending in .*aj?n? are adjectives.
          } else if (lWord.endsWith("a")) {
            l.add(new AnalyzedToken(word, "A nak np", lWord));
          } else if (lWord.endsWith("aj")) {
            l.add(new AnalyzedToken(word, "A nak pl", lWord.substring(0, lWord.length() - 1)));
          } else if (lWord.endsWith("an")) {
            l.add(new AnalyzedToken(word, "A akz np", lWord.substring(0, lWord.length() - 1)));
          } else if (lWord.endsWith("ajn")) {
            l.add(new AnalyzedToken(word, "A akz pl", lWord.substring(0, lWord.length() - 2)));


          // Words ending in .*en? are adverbs.
          } else if (lWord.endsWith("e")) {
            l.add(new AnalyzedToken(word, "E nak", lWord));
          } else if (lWord.endsWith("en")) {
            l.add(new AnalyzedToken(word, "E akz", lWord.substring(0, lWord.length() - 1)));


          // Verbs.
          } else if ((matcher = patternVerb.matcher(lWord)).find()) {
            final String verb = matcher.group(1) + "i";
            final String tense = matcher.group(2);
            final String transitive = findTransitivity(verb);


            l.add(new AnalyzedToken(word, "V " + transitive + " " + tense, verb));


          // Irregular word (no tag).
          } else {
            l.add(new AnalyzedToken(word, null, null));
          }


          // Participle (can be combined with other tags).
          if ((matcher = patternParticiple.matcher(lWord)).find()) {
            if (!setNonParticiple.contains(matcher.group(1))) {
              final String verb = matcher.group(2) + "i";
              final String aio = matcher.group(3);
              final String antAt = matcher.group(4).equals("n") ? "n" : "-";
              final String aoe = matcher.group(5);
              final String plural = matcher.group(6).equals("j") ? "pl" : "np";
              final String accusative = matcher.group(7).equals("n") ? "akz" : "nak";
              final String transitive = findTransitivity(verb);


              l.add(new AnalyzedToken(word, "C " + accusative + " " + plural + " " +
                                      transitive + " " + aio + " " + antAt + " " + aoe,
                                      verb));
            }
          }
        }
      } else {
        // Single letter word (no tag).
        l.add(new AnalyzedToken(word, null, null));
      }
      tokenReadings.add(new AnalyzedTokenReadings(l, 0));
    }
    return tokenReadings;
  }


  @Override
  public AnalyzedTokenReadings createNullToken(String token, int startPos) {
    return new AnalyzedTokenReadings(
      new AnalyzedToken(token, null, null), startPos);
  }


  @Override
  public AnalyzedToken createToken(String token, String posTag) {
    return new AnalyzedToken(token, posTag, null);
  }
}
Source Code of org.languagetool.tagging.eo.EsperantoTagger

Related Classes of org.languagetool.tagging.eo.EsperantoTagger