/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;
import org.jdom.Element;
import org.jdom.Document;
import org.jdom.Namespace;
import org.jdom.Text;
import org.jdom.output.XMLOutputter;
import cc.mallet.types.*;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import gnu.trove.THashMap;
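/**
 * The result of running extraction on a single document: the tokenized input,
 * the predicted (and, optionally, true) labels, and the labeled spans derived
 * from them.
 *
 * Created: Oct 12, 2004
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
 * @version $Id: DocumentExtraction.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
 */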
//TODO: Add place where user can have general Transducers to change CRF tokenization into LabeledSpans
//TODO: Add field for CRF's labeled tokenization
public class DocumentExtraction implements Serializable {
// Tokenized form of the document, as passed to the constructor.
private Tokenization input;
// Predicted label sequence; assigned only by the Sequence-based constructor,
// so it stays null when the object is built directly from LabeledSpans.
private Sequence predictedLabels;
// True labels; assigned only when the supplied target is a LabelSequence.
private LabelSequence target;
// Predicted spans: either built by the TokenizationFilter or passed in directly.
private LabeledSpans extractedSpans;
// True spans; null when no target was supplied.
private LabeledSpans targetSpans;
// Underlying document object, obtained from input.getDocument().
private Object document;
// Label looked up in the dictionary under the "background" name.
private Label backgroundTag;
// Caller-supplied name for this extraction.
private String name;
/**
 * Creates an extraction that has predicted labels but no target labels.
 * Delegates to the full constructor with a null target and a fresh
 * {@link BIOTokenizationFilter}.
 *
 * @param name       name for this extraction
 * @param dict       label alphabet in which <tt>background</tt> is looked up
 * @param input      tokenized document
 * @param predicted  predicted label sequence, one entry per token
 * @param background name of the background label
 */
public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                           Sequence predicted, String background)
{
  this (name, dict, input, predicted, null, background, new BIOTokenizationFilter ());
}
/**
 * Creates an extraction with both predicted and target labels.
 * Delegates to the full constructor with a fresh {@link BIOTokenizationFilter}.
 *
 * @param name       name for this extraction
 * @param dict       label alphabet in which <tt>background</tt> is looked up
 * @param input      tokenized document
 * @param predicted  predicted label sequence, one entry per token
 * @param target     true label sequence; may be null
 * @param background name of the background label
 */
public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                           Sequence predicted, Sequence target, String background)
{
  this (name, dict, input, predicted, target, background, new BIOTokenizationFilter ());
}
/**
 * Creates an extraction from token-level label sequences, using
 * <tt>filter</tt> to turn each sequence into labeled spans.
 *
 * @param name       name for this extraction
 * @param dict       label alphabet in which <tt>background</tt> is looked up
 * @param input      tokenized document; also supplies the document object
 * @param predicted  predicted label sequence; asserted to have the same size as <tt>input</tt>
 * @param target     true label sequence, or null if there is none.  It is
 *                   stored as the target only when it is a LabelSequence, but
 *                   target spans are built from it either way.
 * @param background name of the background label
 * @param filter     converts a label sequence over <tt>input</tt> into LabeledSpans
 */
public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                           Sequence predicted, Sequence target, String background,
                           TokenizationFilter filter)
{
  this.document = input.getDocument ();
  assert (input.size() == predicted.size());

  this.name = name;
  this.input = input;
  this.predictedLabels = predicted;
  this.backgroundTag = dict.lookupLabel (background);

  this.extractedSpans =
    filter.constructLabeledSpans (dict, document, backgroundTag, input, predicted);

  // Without a target there is nothing more to set up.
  if (target == null) {
    return;
  }

  if (target instanceof LabelSequence) {
    this.target = (LabelSequence) target;
  }
  this.targetSpans =
    filter.constructLabeledSpans (dict, document, backgroundTag, input, target);
}
/**
 * Creates an extraction directly from already-computed spans.  No label
 * sequences are stored, so {@link #getPredictedLabels} and
 * {@link #getTarget} return null for objects built this way.
 *
 * @param name           name for this extraction
 * @param dict           label alphabet in which <tt>background</tt> is looked up
 * @param input          tokenized document; also supplies the document object
 * @param predictedSpans spans to report as extracted
 * @param trueSpans      spans to report as the target
 * @param background     name of the background label
 */
public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                           LabeledSpans predictedSpans, LabeledSpans trueSpans,
                           String background)
{
  this.name = name;
  this.input = input;
  this.document = input.getDocument ();
  this.backgroundTag = dict.lookupLabel (background);
  this.targetSpans = trueSpans;
  this.extractedSpans = predictedSpans;
}
public Object getDocument ()
return document;
public Tokenization getInput ()
return input;
public Sequence getPredictedLabels ()
return predictedLabels;
public LabeledSpans getExtractedSpans ()
return extractedSpans;
public LabeledSpans getTargetSpans ()
return targetSpans;
public LabelSequence getTarget ()
return target;
public String getName ()
return name;
public Label getBackgroundTag ()
return backgroundTag;
//xxx nyi
public Span subspan (int start, int end)
throw new UnsupportedOperationException ("not yet implemented.");
public Document toXmlDocument ()
return toXmlDocument ("doc", Namespace.NO_NAMESPACE);
/*
 * Disabled: earlier flat rendering that emits every extracted span as a
 * direct child of the root and does not handle nested spans.  It has the
 * same signature as the nesting-aware toXmlDocument (String, Namespace)
 * declared in this class, and two active definitions with one signature do
 * not compile.  Kept for reference only.
 *
public Document toXmlDocument (String rootEltName, Namespace ns)
{
  Element element = new Element (rootEltName, ns);
  for (int i = 0; i < extractedSpans.size(); i++) {
    LabeledSpan span = (LabeledSpan) extractedSpans.get(i);
    Label tag = span.getLabel();
    if (tag == backgroundTag) {
      element.addContent (span.getText ());
    } else {
      Element field = new Element (tag.toString(), ns);
      field.setText (span.getText ());
      element.addContent (field);
    }
  }
  return new Document (element);
}
*/
// does not do non-overlap sanity checking
public Document toXmlDocument (String rootEltName, Namespace ns)
ArrayList orderedByStart = new ArrayList (extractedSpans);
Collections.sort (orderedByStart, new Comparator () {
public int compare (Object o, Object o1)
int start1 = ((Span)o).getStartIdx ();
int start2 = ((Span)o1).getStartIdx ();
return Double.compare (start1, start2);
} );
ArrayList roots = new ArrayList (orderedByStart);
THashMap children = new THashMap ();
for (int i = 0; i < orderedByStart.size(); i++) {
LabeledSpan child = (LabeledSpan) orderedByStart.get (i);
for (int j = i-1; j >= 0; j--) {
LabeledSpan parent = (LabeledSpan) orderedByStart.get (j);
if (parent.isSubspan (child)) {
List childList = (List) children.get (parent);
if (childList == null) {
childList = new ArrayList ();
children.put (parent, childList);
roots.remove (child);
childList.add (child);
CharSequence doc = (CharSequence) document;
Span wholeDoc = new StringSpan (doc, 0, doc.length ());
return new Document (generateElement (rootEltName, wholeDoc, roots, children));
private Element generateElement (String parentName, Span span, List childSpans, THashMap tree)
Element parentElt = new Element (parentName);
if (childSpans == null || childSpans.isEmpty ()) {
parentElt.setContent (new Text (span.getText ()));
} else {
List childElts = new ArrayList (childSpans.size());
int start = span.getStartIdx ();
int current = 0;
for (int i = 0; i < childSpans.size(); i++) {
LabeledSpan childSpan = (LabeledSpan) childSpans.get (i);
Label childLabel = childSpan.getLabel();
int childStart = childSpan.getStartIdx () - start;
if (childStart > current) {
childElts.add (new Text (span.getText().substring (current, childStart)));
if (childLabel == backgroundTag) {
childElts.add (new Text (childSpan.getText()));
} else {
String name = childLabel.getEntry ().toString();
List grandchildren = (List) tree.get (childSpan);
childElts.add (generateElement (name, childSpan, grandchildren, tree));
current = childSpan.getEndIdx () - start;
if (current < span.getEndIdx ())
childElts.add (new Text (span.getText().substring (current)));
parentElt.addContent (childElts);
return parentElt;
public String toXmlString ()
Document jdom = toXmlDocument ();
XMLOutputter outputter = new XMLOutputter ();
return outputter.outputString (jdom);
public int size ()
return extractedSpans.size();
// Serialization support

// Explicit serial version UID for this Serializable class.
private static final long serialVersionUID = 1L;
// Version tag for the custom serialized form; readObject reads one int that
// it labels as the version.
private static final int CURRENT_SERIAL_VERSION = 1;
private void writeObject(ObjectOutputStream out) throws IOException {
private void readObject(ObjectInputStream in) throws IOException,
ClassNotFoundException {
in.readInt(); // read version