}
});
sorter.sort(unsorted, sorted);
unsorted.delete();
ByteSequencesReader reader = new ByteSequencesReader(sorted);
BytesRef scratchLine = new BytesRef();
// TODO: the flags themselves can also be double-char ("long") or numeric.
// Either way the trick is to encode each flag as a char, but each form must be parsed differently.
String currentEntry = null;
IntsRef currentOrds = new IntsRef();
String line;
while (reader.read(scratchLine)) {
line = scratchLine.utf8ToString();
String entry;
char wordForm[];
int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
wordForm = NOFLAGS;
entry = line;
} else {
// Note: a comment (morphological description) may follow the flags.
// We should really stop at any whitespace; currently only tab and space are recognized.
int end = line.indexOf('\t', flagSep);
if (end == -1)
end = line.length();
int end2 = line.indexOf(' ', flagSep);
if (end2 == -1)
end2 = line.length();
end = Math.min(end, end2);
String flagPart = line.substring(flagSep + 1, end);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(wordForm);
entry = line.substring(0, flagSep);
}
int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
if (cmp < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
} else {
encodeFlags(flagsScratch, wordForm);
int ord = flagLookup.add(flagsScratch);
if (ord < 0) {
// already exists in our hash
ord = (-ord)-1;
}
// finalize current entry, and switch "current" if necessary
if (cmp > 0 && currentEntry != null) {
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts, currentOrds);
}
// swap current
if (cmp > 0 || currentEntry == null) {
currentEntry = entry;
currentOrds = new IntsRef(); // must be this way
}
currentOrds.grow(currentOrds.length+1);
currentOrds.ints[currentOrds.length++] = ord;
}
}
// finalize last entry
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts, currentOrds);
reader.close();
sorted.delete();
}