LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), tokens.length);
boolean [][] ending = new boolean[3][endings.length];
boolean [][] endingp1 = new boolean[3][endings.length];
boolean [][] endingp2 = new boolean[3][endings.length];
StringBuffer source = saveSource ? new StringBuffer() : null;
TokenSequence data = new StringTokenization (source);
String prevLabel = "NOLABEL";
Pattern ipattern = Pattern.compile ("I-.*");
String word, tag = null, phrase = null, label = null;
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].length() != 0) {
try {
String[] features = tokens[i].split (" ");
int fieldIdx = 0;
word = features[fieldIdx++]; // .toLowerCase();
if (doTags) tag = features[fieldIdx++];
if (doPhrases) phrase = features[fieldIdx++];
if (isTargetProcessing ()) label = features[fieldIdx++];
} catch (ArrayIndexOutOfBoundsException e) {
throw new IllegalArgumentException ("Invalid line "+tokens[i]+" : expected word "
+ (doTags ? ", tag" : "")
+ (doPhrases ? ", phrase" : "")
+ (isTargetProcessing () ? ", target" : "")
+ ".");
}
} else {
word = "-<S>-";
tag = "-<S>-";
phrase = "-<S>-";
label = "O";
}
// Transformations
if (doDigitCollapses) {
if (word.matches ("19\\d\\d"))
word = "<YEAR>";
else if (word.matches ("19\\d\\ds"))
word = "<YEARDECADE>";
else if (word.matches ("19\\d\\d-\\d+"))
word = "<YEARSPAN>";
else if (word.matches ("\\d+\\\\/\\d"))
word = "<FRACTION>";
else if (word.matches ("\\d[\\d,\\.]*"))
word = "<DIGITS>";
else if (word.matches ("19\\d\\d-\\d\\d-\\d--d"))
word = "<DATELINEDATE>";
else if (word.matches ("19\\d\\d-\\d\\d-\\d\\d"))
word = "<DATELINEDATE>";
else if (word.matches (".*-led"))
word = "<LED>";
else if (word.matches (".*-sponsored"))
word = "<LED>";
}
if (doDowncasing)
word = word.toLowerCase();
int start = source.length ();
if (saveSource) {
if (word.equals ("-<S>-")) source.append ("\n\n");
source.append (word); source.append (" ");
}
Token token = new StringSpan (source, start, source.length () - 1);
// Word and tag unigram at current time
if (doSpelling) {
for (int j = 0; j < endings.length; j++) {
ending[2][j] = ending[1][j];
ending[1][j] = ending[0][j];
ending[0][j] = endingPatterns[j].matcher(word).matches();
if (ending[0][j]) token.setFeatureValue (endingNames[0][0][j], 1);
}
}
if (doTags) {
token.setFeatureValue ("T="+tag, 1);
}
if (doPhrases) {
token.setFeatureValue ("P="+phrase, 1);
}
data.add (token);
if (isTargetProcessing ()) {
// Change so each segment always begins with a "B-",
// even if previous token did not have this label.
String oldLabel = label;