/* LanguageTool, a natural language style checker
* Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.tagging.pl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.tagging.BaseTagger;
import org.languagetool.tools.StringTools;
/**
* Polish POS tagger based on FSA morphological dictionaries.
*
* @author Marcin Milkowski
*/
public class PolishTagger extends BaseTagger {

  private static final String RESOURCE_FILENAME = "/pl/polish.dict";

  /** Locale used when lowercasing tokens before dictionary lookup. */
  private static final Locale PL_LOCALE = new Locale("pl");

  /**
   * Returns the resource path of the dictionary used by this tagger.
   *
   * @return the dictionary resource path
   */
  @Override
  public final String getFileName() {
    return RESOURCE_FILENAME;
  }

  /**
   * Tags every token of a sentence using the dictionary.
   *
   * <p>For each token the lookup proceeds as follows:
   * <ol>
   *   <li>the token is looked up exactly as written;</li>
   *   <li>if the token differs from its lowercased form, the lowercased form
   *       is looked up as well and its readings are appended;</li>
   *   <li>if the token is already lowercase and nothing was found, the form
   *       with the first character uppercased is looked up;</li>
   *   <li>if none of the lookups returned anything, a single reading with a
   *       {@code null} POS tag and {@code null} lemma is added.</li>
   * </ol>
   * The start position passed to each {@link AnalyzedTokenReadings} is the sum
   * of the lengths of the preceding tokens.
   *
   * @param sentenceTokens the tokens of one sentence, in order
   * @return one {@link AnalyzedTokenReadings} per input token, in input order
   * @throws IOException declared because obtaining the dictionary may fail
   */
  @Override
  public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens)
      throws IOException {
    final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
    final IStemmer morfologik = new DictionaryLookup(getDictionary());
    int pos = 0;

    for (final String word : sentenceTokens) {
      final List<AnalyzedToken> readings = new ArrayList<>();
      final String lowerWord = word.toLowerCase(PL_LOCALE);
      final boolean isLowercase = word.equals(lowerWord);

      // The token exactly as written.
      final List<AnalyzedToken> taggerTokens =
          asAnalyzedTokenList(word, morfologik.lookup(word));
      addTokens(taggerTokens, readings);
      // Tracks whether any lookup returned entries; deliberately based on the
      // lookup results rather than on the number of readings added.
      boolean found = !taggerTokens.isEmpty();

      if (isLowercase) {
        // The lowercased form equals the token itself, so a second lookup
        // would only repeat the one above. Fall back to the capitalized form
        // when the plain lookup found nothing.
        if (!found) {
          final List<AnalyzedToken> upperTaggerTokens = asAnalyzedTokenList(word,
              morfologik.lookup(StringTools.uppercaseFirstChar(word)));
          addTokens(upperTaggerTokens, readings);
          found = !upperTaggerTokens.isEmpty();
        }
      } else {
        // The token contains uppercase characters: also try its lowercase form.
        final List<AnalyzedToken> lowerTaggerTokens =
            asAnalyzedTokenList(word, morfologik.lookup(lowerWord));
        addTokens(lowerTaggerTokens, readings);
        found = found || !lowerTaggerTokens.isEmpty();
      }

      if (!found) {
        // Unknown word: keep the token with no POS tag and no lemma.
        readings.add(new AnalyzedToken(word, null, null));
      }

      tokenReadings.add(new AnalyzedTokenReadings(readings, pos));
      pos += word.length();
    }
    return tokenReadings;
  }

  /**
   * Appends the given dictionary readings to {@code l}, emitting one
   * {@link AnalyzedToken} per POS tag: the tag string of each reading is split
   * on {@code '+'} and every part becomes a separate token sharing the
   * reading's token text and lemma.
   *
   * @param taggedTokens readings to expand; ignored when {@code null}
   * @param l the list that receives the expanded readings
   */
  private void addTokens(final List<AnalyzedToken> taggedTokens,
      final List<AnalyzedToken> l) {
    if (taggedTokens == null) {
      return;
    }
    for (final AnalyzedToken at : taggedTokens) {
      final String[] tagsArr = StringTools.asString(at.getPOSTag()).split("\\+");
      for (final String currTag : tagsArr) {
        l.add(new AnalyzedToken(at.getToken(), currTag, at.getLemma()));
      }
    }
  }
}