package edu.stanford.nlp.sequences;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.util.PaddedList;
import edu.stanford.nlp.util.AbstractIterator;
import java.util.*;
import java.util.regex.*;
import java.io.*;
/**
* DocumentReader for the original CoNLL 03 format. In this format, there is
* one word per line, with extra attributes of a word (POS tag, chunk, etc.) in
* other space or tab separated columns, where leading and trailing whitespace
* on the line are ignored. Sentences are supposedly
* separated by a blank line (one with no non-whitespace characters), but
* where blank lines occur is in practice often fairly random. In particular,
* sometimes entities span blank lines. Nevertheless, in this class, like in
* our original CoNLL system, these blank lines are preserved as a special
* BOUNDARY token and detected and exploited by some features. The text is
* divided into documents at each '-DOCSTART-' token, which is seen as a
* special token, which is also preserved. The reader can read data in any
* of the IOB/IOE/etc. formats and output tokens in any other, based on the
* entitySubclassification flag.
* <p>
* This reader is specifically for replicating CoNLL systems. For normal use,
* you should use the saner ColumnDocumentReaderAndWriter.
*
* @author Jenny Finkel
* @author Huy Nguyen
* @author Christopher Manning
*/
public class CoNLLDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {

  private static final long serialVersionUID = 6281374154299530460L;

  /** Word given to the pseudo-token that stands in for a blank input line. */
  public static final String BOUNDARY = "*BOUNDARY*";

  /** Label of a token that is outside any entity; also assumed for tokens with no label. */
  public static final String OTHER = "O";

  /** Historically, this reader used to treat the whole input as one document, but now it doesn't */
  private static final boolean TREAT_FILE_AS_ONE_DOCUMENT = false;

  /** Recognizes a line that opens a new document. */
  private static final Pattern docPattern = Pattern.compile("^\\s*-DOCSTART-\\s");

  /** Recognizes a line with no non-whitespace characters. */
  private static final Pattern white = Pattern.compile("^\\s*$");

  /** Column separator; compiled once rather than on every line. */
  private static final Pattern columnSeparator = Pattern.compile("\\s+");

  // Internal codes for the output styles understood by entitySubclassify.
  private static final int STYLE_IOB1 = 0;
  private static final int STYLE_IOB2 = 1;
  private static final int STYLE_IOE1 = 2;
  private static final int STYLE_IOE2 = 3;
  private static final int STYLE_IO = 4;
  private static final int STYLE_SBIEO = 5;

  private SeqClassifierFlags flags; // = null;

  /**
   * Stores the flags that control reading and writing. Must be called
   * before reading or writing documents, since those operations consult the flags.
   *
   * @param flags The flags to use
   */
  @Override
  public void init(SeqClassifierFlags flags) {
    this.flags = flags;
  }

  /**
   * Describes this reader. Safe to call before {@link #init}: in that case
   * the flag values are reported as unavailable rather than throwing.
   */
  @Override
  public String toString() {
    if (flags == null) {
      return "CoNLLDocumentReaderAndWriter[uninitialized]";
    }
    return "CoNLLDocumentReaderAndWriter[entitySubclassification: " +
        flags.entitySubclassification + ", intern: " + flags.intern + ']';
  }

  /**
   * Returns an iterator over the documents in the input, each document
   * being a list of tokens.
   *
   * @param r Reader over CoNLL format text
   * @return Iterator over documents
   */
  @Override
  public Iterator<List<CoreLabel>> getIterator(Reader r) {
    return new CoNLLIterator(r);
  }


  /**
   * Iterates over documents. The input is split into per-document strings
   * when the iterator is constructed; each string is turned into tokens
   * only when {@link #next()} is called.
   */
  private class CoNLLIterator extends AbstractIterator<List<CoreLabel>> {

    private final Iterator<String> stringIter;

    public CoNLLIterator(Reader r) {
      stringIter = splitIntoDocs(r);
    }

    @Override
    public boolean hasNext() {
      return stringIter.hasNext();
    }

    @Override
    public List<CoreLabel> next() {
      return processDocument(stringIter.next());
    }

  } // end class CoNLLIterator


  /**
   * Reads all of the input and cuts it into one String per document. A new
   * document starts at each line matching the -DOCSTART- pattern; that line
   * is kept as the first line of the document it opens. Lines are
   * re-joined with '\n'.
   *
   * @param r Reader over the input
   * @return Iterator over the text of each document
   */
  private static Iterator<String> splitIntoDocs(Reader r) {
    if (TREAT_FILE_AS_ONE_DOCUMENT) {
      return Collections.singleton(IOUtils.slurpReader(r)).iterator();
    } else {
      Collection<String> docs = new ArrayList<String>();
      ObjectBank<String> ob = ObjectBank.getLineIterator(r);
      StringBuilder current = new StringBuilder();
      for (String line : ob) {
        if (docPattern.matcher(line).lookingAt()) {
          // Start new doc, store old one if non-empty
          if (current.length() > 0) {
            docs.add(current.toString());
            current = new StringBuilder();
          }
        }
        current.append(line);
        current.append('\n');
      }
      if (current.length() > 0) {
        docs.add(current.toString());
      }
      return docs.iterator();
    }
  }


  /**
   * Turns the text of one document into tokens, one per line. Blank lines
   * become BOUNDARY tokens unless flags.deleteBlankLines is set, in which
   * case they are dropped. The labels are then converted to the style
   * named by flags.entitySubclassification.
   *
   * @param doc Text of one document
   * @return The tokens of the document
   */
  private List<CoreLabel> processDocument(String doc) {
    List<CoreLabel> lis = new ArrayList<CoreLabel>();
    String[] lines = doc.split("\n");
    for (String line : lines) {
      if ( ! flags.deleteBlankLines || ! white.matcher(line).matches()) {
        lis.add(makeCoreLabel(line));
      }
    }
    entitySubclassify(lis, flags.entitySubclassification);
    return lis;
  }


  /**
   * Says whether a label has the form "X-Y", that is, a one character
   * prefix followed by a hyphen. Null labels do not.
   */
  private static boolean hasEntityPrefix(String ans) {
    return ans != null && ans.length() > 1 && ans.charAt(1) == '-';
  }

  /**
   * Returns the entity class of a label: the part after "X-" for a
   * prefixed label, the whole label otherwise, and OTHER for a missing
   * (null) label such as that of a padding token.
   */
  private static String baseOf(String ans) {
    if (ans == null) {
      return OTHER;
    }
    return hasEntityPrefix(ans) ? ans.substring(2) : ans;
  }

  /** Returns the prefix character of a prefixed label, or ' ' if the label has no prefix. */
  private static char prefixOf(String ans) {
    return hasEntityPrefix(ans) ? ans.charAt(0) : ' ';
  }

  /**
   * Says whether the prefixes alone show that an entity starts at the
   * current token: it is marked B or S, or the previous token is marked
   * as an entity end (E or S).
   */
  private static boolean startsEntity(char prefix, char prevPrefix) {
    return prefix == 'B' || prefix == 'S' || prevPrefix == 'E' || prevPrefix == 'S';
  }

  /**
   * Says whether the prefixes alone show that an entity ends at the
   * current token: it is marked E or S, or the next token is marked
   * as an entity start (B or S).
   */
  private static boolean endsEntity(char prefix, char nextPrefix) {
    return prefix == 'E' || prefix == 'S' || nextPrefix == 'B' || nextPrefix == 'S';
  }

  /**
   * Maps a style name to its internal code. An unrecognized (or null) name
   * is reported on stderr and treated as "io".
   */
  private static int styleCode(String style) {
    if ("iob1".equalsIgnoreCase(style)) {
      return STYLE_IOB1;
    } else if ("iob2".equalsIgnoreCase(style)) {
      return STYLE_IOB2;
    } else if ("ioe1".equalsIgnoreCase(style)) {
      return STYLE_IOE1;
    } else if ("ioe2".equalsIgnoreCase(style)) {
      return STYLE_IOE2;
    } else if ("io".equalsIgnoreCase(style)) {
      return STYLE_IO;
    } else if ("sbieo".equalsIgnoreCase(style)) {
      return STYLE_SBIEO;
    } else {
      System.err.println("entitySubclassify: unknown style: " + style);
      return STYLE_IO;
    }
  }

  /**
   * This can be used on the CoNLL data to map from a representation where
   * normally entities were marked I-PERS, but the beginning of a non-first
   * item of an entity sequence was marked B-PERS (IOB1 representation).
   * It changes this representation to other representations:
   * a 4 way representation of all entities, like S-PERS, B-PERS,
   * I-PERS, E-PERS for single word, beginning, internal, and end of entity
   * (SBIEO); always marking the first word of an entity (IOB2);
   * the reverse IOE1 and IOE2 and IO.
   * This code is very specific to the particular CoNLL way of labeling
   * classes. It will work on any of these styles of input: entity starts
   * and ends are recognized from B/S/E prefixes on the token and on its
   * neighbors. However, note that IO is a lossy mapping, which necessarily
   * loses information if two entities of the same class are adjacent.
   * If the labels are not of the form "X-Y+" then they will be
   * left unaltered, regardless of the value of style.
   *
   * @param tokens List of read in tokens with AnswerAnnotation
   * @param style Output style; one of iob[12], ioe[12], io, sbieo
   */
  private void entitySubclassify(List<CoreLabel> tokens,
                                 String style) {
    final int how = styleCode(style);
    // The padded view lets us ask for the token before the first and after
    // the last; those come back as the empty pad token, which has no label.
    tokens = new PaddedList<CoreLabel>(tokens, new CoreLabel());
    final int k = tokens.size();
    // New labels are computed for every token before any is written back,
    // so that each decision sees the original labels of its neighbors.
    final String[] newAnswers = new String[k];
    for (int i = 0; i < k; i++) {
      final String cAns = tokens.get(i).get(CoreAnnotations.AnswerAnnotation.class);
      if ( ! hasEntityPrefix(cAns)) {
        newAnswers[i] = cAns;
        continue;
      }
      final String pAns = tokens.get(i - 1).get(CoreAnnotations.AnswerAnnotation.class);
      final String nAns = tokens.get(i + 1).get(CoreAnnotations.AnswerAnnotation.class);
      final String base = baseOf(cAns);
      final char prefix = prefixOf(cAns);
      final boolean sameAsPrev = base.equals(baseOf(pAns));
      final boolean sameAsNext = base.equals(baseOf(nAns));
      final boolean startsHere = startsEntity(prefix, prefixOf(pAns));
      final boolean endsHere = endsEntity(prefix, prefixOf(nAns));
      // Start/end of an entity that directly touches another of the same class
      final boolean isStartAdjacentSame = sameAsPrev && startsHere;
      final boolean isEndAdjacentSame = sameAsNext && endsHere;
      // Start/end of any entity
      final boolean isFirst = ( ! sameAsPrev) || startsHere;
      final boolean isLast = ( ! sameAsNext) || endsHere;
      final String newPrefix;
      switch (how) {
        case STYLE_IOB1:
          newPrefix = isStartAdjacentSame ? "B-" : "I-";
          break;
        case STYLE_IOB2:
          newPrefix = isFirst ? "B-" : "I-";
          break;
        case STYLE_IOE1:
          newPrefix = isEndAdjacentSame ? "E-" : "I-";
          break;
        case STYLE_IOE2:
          newPrefix = isLast ? "E-" : "I-";
          break;
        case STYLE_SBIEO:
          if (isFirst && isLast) {
            newPrefix = "S-";
          } else if (isLast) {
            newPrefix = "E-";
          } else if (isFirst) {
            newPrefix = "B-";
          } else {
            newPrefix = "I-";
          }
          break;
        case STYLE_IO:
        default:
          newPrefix = "I-";
          break;
      }
      newAnswers[i] = intern(newPrefix + base);
    }
    for (int i = 0; i < k; i++) {
      if (newAnswers[i] != null) {
        tokens.get(i).set(CoreAnnotations.AnswerAnnotation.class, newAnswers[i]);
      }
    }
  }


  /** This deals with the CoNLL files for different languages which have
   *  between 2 and 5 columns on non-blank lines. Columns are separated by
   *  runs of whitespace; leading and trailing whitespace is ignored. A line
   *  with fewer than 2 columns becomes a BOUNDARY token labeled OTHER.
   *  With 5 columns the second is a lemma, which is used as the word if
   *  flags.useLemmaAsWord is set. The label is also recorded as the
   *  OriginalAnswerAnnotation.
   *
   *  @param line A line of CoNLL input
   *  @return The constructed token
   *  @throws RuntimeIOException If the line has more than 5 columns
   */
  private CoreLabel makeCoreLabel(String line) {
    CoreLabel wi = new CoreLabel();
    String[] bits = columnSeparator.split(line);
    if (bits.length > 0 && bits[0].length() == 0) {
      // Leading whitespace (or an empty line) gives an empty first field,
      // which is not a column: without dropping it every column would be
      // read one position off.
      bits = Arrays.copyOfRange(bits, 1, bits.length);
    }
    switch (bits.length) {
      case 0:
      case 1:
        wi.setWord(BOUNDARY);
        wi.set(CoreAnnotations.AnswerAnnotation.class, OTHER);
        break;
      case 2:
        wi.setWord(bits[0]);
        wi.set(CoreAnnotations.AnswerAnnotation.class, bits[1]);
        break;
      case 3:
        wi.setWord(bits[0]);
        wi.setTag(bits[1]);
        wi.set(CoreAnnotations.AnswerAnnotation.class, bits[2]);
        break;
      case 4:
        wi.setWord(bits[0]);
        wi.setTag(bits[1]);
        wi.set(CoreAnnotations.ChunkAnnotation.class, bits[2]);
        wi.set(CoreAnnotations.AnswerAnnotation.class, bits[3]);
        break;
      case 5:
        if (flags.useLemmaAsWord) {
          wi.setWord(bits[1]);
        } else {
          wi.setWord(bits[0]);
        }
        wi.set(CoreAnnotations.LemmaAnnotation.class, bits[1]);
        wi.setTag(bits[2]);
        wi.set(CoreAnnotations.ChunkAnnotation.class, bits[3]);
        wi.set(CoreAnnotations.AnswerAnnotation.class, bits[4]);
        break;
      default:
        throw new RuntimeIOException("Unexpected input (many fields): " + line);
    }
    wi.set(CoreAnnotations.OriginalAnswerAnnotation.class, wi.get(CoreAnnotations.AnswerAnnotation.class));
    return wi;
  }


  /** Interns the String if flags.intern is set; otherwise returns it as is. */
  private String intern(String s) {
    if (flags.intern) {
      return s.intern();
    } else {
      return s;
    }
  }


  /** Return the coding scheme to IOB1 coding, regardless of what was used
   *  internally. This is useful for scoring against CoNLL test output.
   *  A token is labeled B-X only if it starts an entity directly after a
   *  token of the same class X; every other entity token is labeled I-X.
   *  Labels not of the form "X-Y+" are left unaltered. Nothing is done if
   *  flags.retainEntitySubclassification is set.
   *
   *  @param tokens List of tokens in some NER encoding
   */
  private void deEndify(List<CoreLabel> tokens) {
    if (flags.retainEntitySubclassification) {
      return;
    }
    // The padded view supplies an empty, label-less token before the first one.
    tokens = new PaddedList<CoreLabel>(tokens, new CoreLabel());
    final int k = tokens.size();
    final String[] newAnswers = new String[k];
    for (int i = 0; i < k; i++) {
      final String cAns = tokens.get(i).get(CoreAnnotations.AnswerAnnotation.class);
      if (hasEntityPrefix(cAns)) {
        // pAns is null for the pad token before the first token; the
        // helpers treat that as an unprefixed OTHER label.
        final String pAns = tokens.get(i - 1).get(CoreAnnotations.AnswerAnnotation.class);
        final String base = baseOf(cAns);
        final boolean isSecond = base.equals(baseOf(pAns));
        // Checking the previous prefix too keeps the boundary between
        // adjacent entities when only ends are marked (IOE styles).
        final boolean isStart = startsEntity(prefixOf(cAns), prefixOf(pAns));
        if (isSecond && isStart) {
          newAnswers[i] = intern("B-" + base);
        } else {
          newAnswers[i] = intern("I-" + base);
        }
      } else {
        newAnswers[i] = cAns;
      }
    }
    for (int i = 0; i < k; i++) {
      if (newAnswers[i] != null) {
        tokens.get(i).set(CoreAnnotations.AnswerAnnotation.class, newAnswers[i]);
      }
    }
  }


  /** Write a standard CoNLL format output file. BOUNDARY tokens are written
   *  as empty lines; every other token is written as the tab-separated
   *  columns word, POS tag, chunk, gold label, guessed label. Unless the
   *  entitySubclassification flag is iob1, the guessed labels in doc are
   *  first converted (in place) by deEndify.
   *
   *  @param doc The document: A List of CoreLabel
   *  @param out Where to send the answers to
   */
  @Override
  public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
    if ( ! "iob1".equalsIgnoreCase(flags.entitySubclassification)) {
      deEndify(doc);
    }
    for (CoreLabel fl : doc) {
      String word = fl.word();
      // Compare by value: the word need not be the very same String object
      // as the constant (e.g., if the token was copied or deserialized).
      if (BOUNDARY.equals(word)) {
        out.println();
      } else {
        String gold = fl.getString(CoreAnnotations.OriginalAnswerAnnotation.class);
        String guess = fl.get(CoreAnnotations.AnswerAnnotation.class);
        String pos = fl.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
        String chunk = fl.getString(CoreAnnotations.ChunkAnnotation.class);
        out.println(word + '\t' + pos + '\t' + chunk + '\t' +
                    gold + '\t' + guess);
      }
    }
  }


  /** Count some stats on what occurs in a file: the number of documents,
   *  of tokens on non-blank lines, and of entities. Labels are counted as
   *  they are after reading with default flags. A new entity is counted at
   *  each non-O token whose class differs from that of the preceding
   *  non-boundary token, or whose prefix (or the preceding token's prefix)
   *  marks an entity start. Blank lines do not interrupt an entity.
   *
   *  @param args A single argument: the name of the CoNLL file to read
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {
    if (args.length < 1) {
      System.err.println("Usage: java " + CoNLLDocumentReaderAndWriter.class.getName() + " conllFile");
      return;
    }
    CoNLLDocumentReaderAndWriter f = new CoNLLDocumentReaderAndWriter();
    f.init(new SeqClassifierFlags());
    int numDocs = 0;
    int numTokens = 0;
    int numEntities = 0;
    String lastAnsBase = "";
    char lastAnsPrefix = ' ';
    Reader in = new FileReader(args[0]);
    try {
      for (Iterator<List<CoreLabel>> it = f.getIterator(in); it.hasNext(); ) {
        List<CoreLabel> doc = it.next();
        numDocs++;
        for (CoreLabel fl : doc) {
          if (BOUNDARY.equals(fl.word())) {
            continue;
          }
          String ans = fl.get(CoreAnnotations.AnswerAnnotation.class);
          String ansBase = baseOf(ans);
          char ansPrefix = prefixOf(ans);
          numTokens++;
          if ( ! OTHER.equals(ansBase)) {
            if ( ! ansBase.equals(lastAnsBase) || startsEntity(ansPrefix, lastAnsPrefix)) {
              numEntities++;
            }
          }
          // Remember this token so the next one can tell whether it
          // continues the same entity.
          lastAnsBase = ansBase;
          lastAnsPrefix = ansPrefix;
        }
      }
    } finally {
      in.close();
    }
    System.out.println("File " + args[0] + " has " + numDocs + " documents, " +
            numTokens + " (non-blank line) tokens and " +
            numEntities + " entities.");
  } // end main

} // end class CoNLLDocumentReaderAndWriter