@Override
public final RuleMatch[] match(final AnalyzedSentence text) {
List<RuleMatch> ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
AnalyzedTokenReadings reqTokenReadings = null;
int i = -1;
for (AnalyzedTokenReadings tokenReadings: tokens) {
i++;
String posTag = tokenReadings.getAnalyzedToken(0).getPOSTag();
//TODO: skip conj, e.g. «бодай»
if (posTag == null || posTag.contains(IPOSTag.unknown.getText()) || posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME) ){
reqTokenReadings = null;
continue;
}
String token = tokenReadings.getAnalyzedToken(0).getToken();
if( posTag.contains(REQUIRE_VIDMINOK_SUBSTR) && tokenReadings.getReadingsLength() == 1 ) {
String prep = token;
if( prep.equals("за") && reverseSearch(tokens, i, "що") )
continue;
if( prep.equalsIgnoreCase("понад") )
continue;
if( (prep.equalsIgnoreCase("окрім") || prep.equalsIgnoreCase("крім"))
&& tokens.length > i+1 && tokens[i+1].getAnalyzedToken(0).getToken().equalsIgnoreCase("як") ) {
reqTokenReadings = null;
continue;
}
reqTokenReadings = tokenReadings;
continue;
}
if( reqTokenReadings == null )
continue;
ArrayList<String> posTagsToFind = new ArrayList<String>();
// if( tokens.length > i+1 && Character.isUpperCase(tokenReadings.getAnalyzedToken(0).getToken().charAt(0))
// && hasRequiredPosTag(Arrays.asList("v_naz"), tokenReadings)
// && Character.isUpperCase(tokens[i+1].getAnalyzedToken(0).getToken().charAt(0)) )
// continue; // "у Конан Дойла"
//TODO: for numerics only v_naz
if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("понад") ) { //&& tokenReadings.getAnalyzedToken(0).getPOSTag().equals(IPOSTag.numr) ) {
posTagsToFind.add("v_naz");
}
String reqPosTag = reqTokenReadings.getAnalyzedToken(0).getPOSTag();
Matcher matcher = REQUIRE_VIDMINOK_REGEX.matcher(reqPosTag);
while( matcher.find() ) {
posTagsToFind.add(matcher.group(1));
}
for(AnalyzedToken readingToken: tokenReadings) {
if( IPOSTag.numr.match(readingToken.getPOSTag()) ) {
posTagsToFind.add("v_naz"); // TODO: only if noun is following?
break;
}
}
// System.out.println("For " + tokenReadings + " to match " + posTagsToFind + " of " + reqTokenReadings.getToken());
if( ! hasRequiredPosTag(posTagsToFind, tokenReadings) ) {
if( isTokenToSkip(tokenReadings) )
continue;
// if( isTokenToIgnore(tokenReadings) ) {
// reqTokenReadings = null;
// continue;
// }
String prep = reqTokenReadings.getAnalyzedToken(0).getToken();
if( prep.equalsIgnoreCase("до") ) {
if( tokenReadings.getAnalyzedToken(0).getToken().compareToIgnoreCase("Я") == 0 ) { // від А до Я
reqTokenReadings = null;
continue;
}
}
if( prep.equalsIgnoreCase("в") || prep.equalsIgnoreCase("у") ) {
if( hasRequiredPosTag(Arrays.asList("p:v_naz"), tokenReadings) ) { //TODO: only for subset: президенти/депутати/мери/гості... or by verb піти/йти/балотуватися/записатися...
reqTokenReadings = null;
continue;
}
}
// exceptions
if( tokens.length > i+1 ) {
if( isCapitalized( token )
&& STREETS.contains( tokens[i+1].getAnalyzedToken(0).getToken()) ) {
reqTokenReadings = null;
continue;
}
if( IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag())
&& (token.equals("мінус") || token.equals("плюс")
|| token.equals("мінімум") || token.equals("максимум") ) ) {
reqTokenReadings = null;
continue;
}
if( reqTokenReadings.getAnalyzedToken(0).getToken().equalsIgnoreCase("через")
&& token.equals("років")
&& IPOSTag.isNum(tokens[i+1].getAnalyzedToken(0).getPOSTag()) ) {
reqTokenReadings = null;
continue;
}