// NOTE(review): this chunk is truncated — the candidate-token loop, the outer
// if, the method's closing brace, and the final return (presumably
// events.iterator()) lie beyond the visible lines. Comments below only
// describe what is shown here.
@Override
protected Iterator<Event> createEvents(TokenSample tokenSample) {
// Training events for a maxent tokenizer: for each character position inside a
// whitespace-delimited candidate token, emit NO_SPLIT or SPLIT depending on
// the annotated token boundaries in the sample.
List<Event> events = new ArrayList<Event>(50);
Span tokens[] = tokenSample.getTokenSpans();
String text = tokenSample.getText();
if (tokens.length > 0) {
// Restrict to the span covered by the annotated tokens: from the start of the
// first token to the end of the last one.
int start = tokens[0].getStart();
int end = tokens[tokens.length - 1].getEnd();
String sent = text.substring(start, end);
// Candidate tokens are the whitespace-separated chunks of that span; offsets
// in candTokens are relative to sent, not to text.
Span[] candTokens = WhitespaceTokenizer.INSTANCE.tokenizePos(sent);
// Indices into tokens[] of the first/last annotated token found inside the
// current candidate. lastTrainingToken also serves as the resume point for
// the scan over tokens[] across candidates (tokens are assumed in order).
int firstTrainingToken = -1;
int lastTrainingToken = -1;
for (Span candToken : candTokens) {
Span cSpan = candToken;
String ctok = sent.substring(cSpan.getStart(), cSpan.getEnd());
//adjust cSpan to text offsets
cSpan = new Span(cSpan.getStart() + start, cSpan.getEnd() + start);
//should we skip this token
// Single-character candidates carry no split decision. When
// skipAlphaNumerics is set, candidates fully matching the alphaNumeric
// pattern are skipped too (fields declared outside this chunk — presumably
// a config flag and a compiled Pattern; verify against the enclosing class).
if (ctok.length() > 1
&& (!skipAlphaNumerics || !alphaNumeric.matcher(ctok).matches())) {
//find offsets of annotated tokens inside of candidate tokens
boolean foundTrainingTokens = false;
// Resume scanning after the last token matched by a previous candidate.
for (int ti = lastTrainingToken + 1; ti < tokens.length; ti++) {
if (cSpan.contains(tokens[ti])) {
// Annotated token lies entirely within the candidate.
if (!foundTrainingTokens) {
firstTrainingToken = ti;
foundTrainingTokens = true;
}
lastTrainingToken = ti;
}
else if (cSpan.getEnd() < tokens[ti].getEnd()) {
// Token extends past the candidate's end; since tokens are ordered,
// no later token can be contained either.
break;
}
else if (tokens[ti].getEnd() < cSpan.getStart()) {
//keep looking
// Token ends before the candidate begins — advance to the next token.
}
else {
// Token overlaps the candidate boundary without being contained:
// inconsistent annotation, log and ignore it.
if (logger.isLoggable(Level.WARNING)) {
logger.warning("Bad training token: " + tokens[ti] + " cand: " + cSpan +
" token="+text.substring(tokens[ti].getStart(), tokens[ti].getEnd()));
}
}
}
// create training data
if (foundTrainingTokens) {
for (int ti = firstTrainingToken; ti <= lastTrainingToken; ti++) {
Span tSpan = tokens[ti];
int cStart = cSpan.getStart();
// Every interior position of the annotated token is a NO_SPLIT; the
// context generator cg (declared outside this chunk) receives the
// candidate string and the position relative to the candidate start.
for (int i = tSpan.getStart() + 1; i < tSpan.getEnd(); i++) {
String[] context = cg.getContext(ctok, i - cStart);
events.add(new Event(TokenizerME.NO_SPLIT, context));
}
// A token boundary inside the candidate (i.e. not at the candidate's
// own end) is a SPLIT decision.
if (tSpan.getEnd() != cSpan.getEnd()) {
String[] context = cg.getContext(ctok, tSpan.getEnd() - cStart);
events.add(new Event(TokenizerME.SPLIT, context));
}
}
}
}