return tokenReadings;
}
/**
 * Tries to derive tags for a word by applying a few morphological rules and
 * looking up the derived base form in the dictionary.
 *
 * <p>Rules are tried in this order; the first one that applies decides the result:
 * <ol>
 *   <li>word ending in "ment" whose stem has a tag matching {@code ADJ_PART_FS}:
 *       a single token tagged "RG" is returned;</li>
 *   <li>word matching {@code PREFIXES_FOR_VERBS}: every reading of the second
 *       captured group whose tag matches {@code VERB} is copied, with the first
 *       captured group prepended to the lemma (the list may be empty);</li>
 *   <li>word containing U+0140 or U+013F: the readings of the word with U+0140
 *       rewritten as "l·" are returned.</li>
 * </ol>
 *
 * <p>All case conversions use {@code conversionLocale} so that the result does not
 * depend on the JVM default locale.
 *
 * @param word the word to analyze
 * @return the derived readings, or {@code null} when none of the rules applies
 * @throws RuntimeException if the dictionary cannot be loaded
 */
@Override
public List<AnalyzedToken> additionalTags(String word) {
  // A new lookup object is created on every call, as in the original code.
  final IStemmer dictLookup;
  try {
    dictLookup = new DictionaryLookup(getDictionary());
  } catch (IOException e) {
    throw new RuntimeException("Could not load Catalan dictionary from " + getFileName(), e);
  }
  final List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();

  // Any well-formed adverb with suffix -ment is tagged as an adverb (RG):
  // feminine singular adjective or feminine singular participle + -ment.
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    // The regex needs at least one character before "ment"; a bare "ment" is left unchanged.
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
        // One adverb reading is enough; the lower-cased word itself is used as lemma.
        additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
        return additionalTaggedTokens;
      }
    }
    // No suitable base form found: fall through to the remaining rules.
  }

  // Any well-formed verb with prefixes is tagged as a verb copying the original tags.
  final Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    // group(1) is prepended to the lemma, group(2) is the form looked up as a verb.
    // Lower-casing uses conversionLocale, consistent with the other branches.
    final String prefix = matcher.group(1).toLowerCase(conversionLocale);
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    final List<AnalyzedToken> taggerTokens =
        asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && VERB.matcher(posTag).matches()) {
        final String lemma = prefix.concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    // Returned even when empty: a prefix match stops further processing.
    return additionalTaggedTokens;
  }

  // Disabled rule, kept for reference:
  // Any well-formed noun with prefix ex- is tagged as a noun copying the original tags
  /*if (word.startsWith("ex")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleNoun = lowerWord.replaceAll("^ex(.+)$", "$1");
    List<AnalyzedToken> taggerTokens;
    taggerTokens = asAnalyzedTokenList(possibleNoun,dictLookup.lookup(possibleNoun));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null) {
        final Matcher m = NOUN.matcher(posTag);
        if (m.matches()) {
          String lemma = "ex".concat(taggerToken.getLemma());
          additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
        }
      }
    }
    return additionalTaggedTokens;
  }*/

  // Interpret deprecated characters of "ela geminada"
  // U+013F LATIN CAPITAL LETTER L WITH MIDDLE DOT
  // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("\u0140") || word.contains("\u013f")) {
    // Lower-casing first means only the small form has to be rewritten.
    final String lowerWord = word.toLowerCase(conversionLocale);
    // Literal substitution: String.replace avoids compiling a regex.
    final String possibleWord = lowerWord.replace("\u0140", "l·");
    // The readings keep the original spelling of the word as their token.
    return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
  }

  // No rule applied.
  return null;
}