sentEnd = sentStart;
for (String aline : lines) {
sentEnd += aline.split("\t")[1].length()+1;
}
if ( !inputFormat.contains("tok") ) {
depNodes.add( new ConllDependencyNode(jCas,sentStart,sentEnd));
depNodes.get(depNodes.size()-1).setId(0);
depNodes.get(depNodes.size()-1).addToIndexes(jCas);
}
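/* Create one BaseToken per input line; inputFormat selects which tab-separated
 * columns are read. For every format except "tok", a ConllDependencyNode is
 * also added for each token when trainingMode is set. */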
if ( inputFormat.contains("tok") ) {
for (String aline : lines) {
String[] tokens = aline.split("\t");
wordEnd = wordStart + tokens[1].length();
BaseToken btoken = new BaseToken(jCas,wordStart,wordEnd);
btoken.setTokenNumber(wordNumber++);
btoken.addToIndexes();
documentText.append(tokens[1] + " ");
wordStart = wordEnd+1;
}
} else if ( inputFormat.contains("min") ) {
for (String aline : lines) {
String[] tokens = aline.split("\t");
wordEnd = wordStart + tokens[1].length();
if (trainingMode)
depNodes.add( new ConllDependencyNode(jCas,wordStart,wordEnd) );
BaseToken btoken = new BaseToken(jCas,wordStart,wordEnd);
btoken.setTokenNumber(wordNumber++);
btoken.addToIndexes();
documentText.append(tokens[1] + " ");
wordStart = wordEnd+1;
}
} else if ( inputFormat.contains("mpos")) {
for (String aline : lines) {
String[] tokens = aline.split("\t");
wordEnd = wordStart + tokens[1].length();
if (trainingMode)
depNodes.add( new ConllDependencyNode(jCas,wordStart,wordEnd) );
BaseToken btoken = new BaseToken(jCas,wordStart,wordEnd);
btoken.setTokenNumber(wordNumber++);
btoken.setPartOfSpeech(tokens[2]);
btoken.addToIndexes();
documentText.append(tokens[1] + " ");
wordStart = wordEnd+1;
}
} else if ( inputFormat.contains("mlem")) {
for (String aline : lines) {
String[] tokens = aline.split("\t");
wordEnd = wordStart + tokens[1].length();
if (trainingMode)
depNodes.add( new ConllDependencyNode(jCas,wordStart,wordEnd) );
BaseToken btoken = new BaseToken(jCas,wordStart,wordEnd);
btoken.setTokenNumber(wordNumber++);
btoken.setNormalizedForm(tokens[2]);
btoken.addToIndexes();
documentText.append(tokens[1] + " ");
wordStart = wordEnd+1;
}
} else if ( inputFormat.contains("dep")) {
for (String aline : lines) {
String[] tokens = aline.split("\t");
wordEnd = wordStart + tokens[1].length();
if (trainingMode)
depNodes.add( new ConllDependencyNode(jCas,wordStart,wordEnd) );
BaseToken btoken = new BaseToken(jCas,wordStart,wordEnd);
btoken.setTokenNumber(wordNumber++);
btoken.setNormalizedForm(tokens[2]);
btoken.setPartOfSpeech(tokens[3]);
btoken.addToIndexes();
documentText.append(tokens[1] + " ");
wordStart = wordEnd+1;
}
} else { // CONLL format assumed
if (!inputFormat.contains("conll")) { System.err.println("Warning: Assuming CONLL-x input format"); }
for (String aline : lines) {
String[] tokens = aline.split("\t");
wordEnd = wordStart + tokens[1].length();
if (trainingMode)
depNodes.add( new ConllDependencyNode(jCas,wordStart,wordEnd) );
BaseToken btoken = new BaseToken(jCas,wordStart,wordEnd);
btoken.setTokenNumber(wordNumber++);
btoken.setNormalizedForm(tokens[2]);
btoken.setPartOfSpeech(tokens[4]);
btoken.addToIndexes();