/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;
import org.jdom.Element;
import org.jdom.Document;
import org.jdom.Namespace;
import org.jdom.Text;
import org.jdom.output.XMLOutputter;
import cc.mallet.types.*;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import gnu.trove.THashMap;
/**
* Created: Oct 12, 2004
*
* @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
* @version $Id: DocumentExtraction.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
*/
//TODO: Add place where user can have general Transducers to change CRF tokenization into LabeledSpans
//TODO: Add field for CRF's labeled tokenization
public class DocumentExtraction implements Serializable {

  /** Tokenized input the labels and spans were computed over. */
  private Tokenization input;
  /** Per-token predicted labels; null when the object was built directly from spans. */
  private Sequence predictedLabels;
  /** Per-token true labels; null unless a LabelSequence target was supplied. */
  private LabelSequence target;
  /** Spans derived from (or supplied as) the prediction. */
  private LabeledSpans extractedSpans;
  /** Spans derived from (or supplied as) the true labeling; may be null. */
  private LabeledSpans targetSpans;
  /** The underlying document, as returned by <code>input.getDocument()</code>. */
  private Object document;
  /** Label that marks text not belonging to any extracted field. */
  private Label backgroundTag;
  /** Caller-supplied name of this extraction. */
  private String name;

  /**
   * Creates an extraction from predicted labels only (no target), converting
   * the labels to spans with a new {@link BIOTokenizationFilter}.
   *
   * @param name       name of this extraction
   * @param dict       alphabet used to look up the background label
   * @param input      tokenization that was labeled
   * @param predicted  predicted label sequence, one entry per token of <code>input</code>
   * @param background entry of the background label in <code>dict</code>
   */
  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input, Sequence predicted, String background)
  {
    this (name, dict, input, predicted, null, background, new BIOTokenizationFilter ());
  }

  /**
   * Creates an extraction from predicted and true labels, converting both to
   * spans with a new {@link BIOTokenizationFilter}.
   *
   * @param name       name of this extraction
   * @param dict       alphabet used to look up the background label
   * @param input      tokenization that was labeled
   * @param predicted  predicted label sequence, one entry per token of <code>input</code>
   * @param target     true label sequence, or null if unknown
   * @param background entry of the background label in <code>dict</code>
   */
  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input, Sequence predicted,
                             Sequence target, String background)
  {
    this (name, dict, input, predicted, target, background, new BIOTokenizationFilter ());
  }

  /**
   * Creates an extraction from predicted and true labels, using the given
   * filter to turn each label sequence into labeled spans.
   *
   * @param name       name of this extraction
   * @param dict       alphabet used to look up the background label
   * @param input      tokenization that was labeled
   * @param predicted  predicted label sequence; asserted to have the same size as <code>input</code>
   * @param target     true label sequence, or null if unknown.  {@link #getTarget()}
   *                   only returns it when it is a LabelSequence; target spans are
   *                   built for any non-null value.
   * @param background entry of the background label in <code>dict</code>
   * @param filter     converts a label sequence over <code>input</code> into spans
   */
  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                             Sequence predicted, Sequence target, String background,
                             TokenizationFilter filter)
  {
    this.document = input.getDocument ();
    this.name = name;
    assert (input.size() == predicted.size());
    this.backgroundTag = dict.lookupLabel (background);
    this.input = input;
    this.predictedLabels = predicted;
    this.extractedSpans = filter.constructLabeledSpans (dict, document, backgroundTag, input, predicted);
    if (target != null) {
      // A target of any other Sequence type still yields target spans, but
      // cannot be exposed through getTarget().
      if (target instanceof LabelSequence) {
        this.target = (LabelSequence) target;
      }
      this.targetSpans = filter.constructLabeledSpans (dict, document, backgroundTag, input, target);
    }
  }

  /**
   * Creates an extraction directly from precomputed spans.  No label
   * sequences are stored, so {@link #getPredictedLabels()} and
   * {@link #getTarget()} return null for objects built this way.
   *
   * @param name           name of this extraction
   * @param dict           alphabet used to look up the background label
   * @param input          tokenization the spans refer to
   * @param predictedSpans spans produced by the extractor
   * @param trueSpans      true spans; stored as given
   * @param background     entry of the background label in <code>dict</code>
   */
  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                             LabeledSpans predictedSpans, LabeledSpans trueSpans, String background)
  {
    this.document = input.getDocument ();
    this.name = name;
    this.backgroundTag = dict.lookupLabel (background);
    this.input = input;
    this.extractedSpans = predictedSpans;
    this.targetSpans = trueSpans;
  }

  /** Returns the document object obtained from the input tokenization. */
  public Object getDocument ()
  {
    return document;
  }

  /** Returns the tokenization this extraction was computed over. */
  public Tokenization getInput ()
  {
    return input;
  }

  /** Returns the predicted label sequence, or null if built from spans. */
  public Sequence getPredictedLabels ()
  {
    return predictedLabels;
  }

  /** Returns the predicted spans. */
  public LabeledSpans getExtractedSpans ()
  {
    return extractedSpans;
  }

  /** Returns the true spans, or null if none were supplied. */
  public LabeledSpans getTargetSpans ()
  {
    return targetSpans;
  }

  /** Returns the true label sequence, or null if none was stored. */
  public LabelSequence getTarget ()
  {
    return target;
  }

  /** Returns the name given at construction. */
  public String getName ()
  {
    return name;
  }

  /** Returns the label that marks background (non-field) text. */
  public Label getBackgroundTag ()
  {
    return backgroundTag;
  }

  /**
   * Not yet implemented.
   *
   * @throws UnsupportedOperationException always
   */
  //xxx nyi
  public Span subspan (int start, int end)
  {
    throw new UnsupportedOperationException ("not yet implemented.");
  }

  /**
   * Renders the extracted spans as an XML document whose root element is
   * named "doc" and is in no namespace.
   *
   * @see #toXmlDocument(String, Namespace)
   */
  public Document toXmlDocument ()
  {
    return toXmlDocument ("doc", Namespace.NO_NAMESPACE);
  }

  /**
   * Orders spans by start index.  Spans with the same start are ordered
   * longest first, so that an enclosing span always precedes the spans it
   * contains and can be found by scanning backwards.
   */
  private static final Comparator SPAN_ORDER = new Comparator () {
    public int compare (Object o1, Object o2)
    {
      Span s1 = (Span) o1;
      Span s2 = (Span) o2;
      int byStart = compareInts (s1.getStartIdx (), s2.getStartIdx ());
      if (byStart != 0) {
        return byStart;
      }
      // Descending end index: the longer span sorts first.
      return compareInts (s2.getEndIdx (), s1.getEndIdx ());
    }
  };

  /** Three-way int comparison that cannot overflow. */
  private static int compareInts (int a, int b)
  {
    return (a < b) ? -1 : ((a == b) ? 0 : 1);
  }

  /**
   * Renders the extracted spans as an XML document.  Each non-background
   * span becomes an element named after its label entry; a span nested in
   * another span becomes a child element of it.  Text not covered by any
   * span, and text of background spans, is emitted as plain text.
   * <p>
   * The document object must be a CharSequence; otherwise a
   * ClassCastException is thrown.  Does not do non-overlap sanity checking:
   * partially overlapping spans are not detected.
   *
   * @param rootEltName name of the root element
   * @param ns          namespace applied to the root and to every field element
   * @return a new JDOM document
   */
  public Document toXmlDocument (String rootEltName, Namespace ns)
  {
    List orderedSpans = new ArrayList (extractedSpans);
    Collections.sort (orderedSpans, SPAN_ORDER);

    // Build a forest: spans with no enclosing span are roots, every other
    // span is filed under its nearest enclosing predecessor.
    List roots = new ArrayList ();
    THashMap children = new THashMap ();
    for (int i = 0; i < orderedSpans.size (); i++) {
      LabeledSpan child = (LabeledSpan) orderedSpans.get (i);
      LabeledSpan parent = findEnclosingSpan (orderedSpans, i);
      if (parent == null) {
        roots.add (child);
      } else {
        List siblings = (List) children.get (parent);
        if (siblings == null) {
          siblings = new ArrayList ();
          children.put (parent, siblings);
        }
        siblings.add (child);
      }
    }

    CharSequence doc = (CharSequence) document;
    Span wholeDoc = new StringSpan (doc, 0, doc.length ());
    return new Document (generateElement (rootEltName, ns, wholeDoc, roots, children));
  }

  /**
   * Scans backwards from <code>childIdx</code> for the closest earlier span
   * that reports the span at <code>childIdx</code> as a subspan.
   *
   * @return the enclosing span, or null if there is none
   */
  private static LabeledSpan findEnclosingSpan (List orderedSpans, int childIdx)
  {
    LabeledSpan child = (LabeledSpan) orderedSpans.get (childIdx);
    for (int j = childIdx - 1; j >= 0; j--) {
      LabeledSpan candidate = (LabeledSpan) orderedSpans.get (j);
      if (candidate.isSubspan (child)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Recursively builds the element for one span.
   *
   * @param eltName    name of the element to create
   * @param ns         namespace of the created element and its descendants
   * @param span       span whose text the element covers
   * @param childSpans spans directly nested in <code>span</code>, ordered by
   *                   start index; may be null or empty
   * @param tree       maps a span to the list of spans directly nested in it
   */
  private Element generateElement (String eltName, Namespace ns, Span span, List childSpans, THashMap tree)
  {
    Element parentElt = new Element (eltName, ns);
    String spanText = span.getText ();

    if (childSpans == null || childSpans.isEmpty ()) {
      parentElt.setContent (new Text (spanText));
      return parentElt;
    }

    List childElts = new ArrayList (childSpans.size ());
    // Child indices are document-absolute; subtracting the parent's start
    // turns them into offsets into spanText.
    int start = span.getStartIdx ();
    int current = 0;
    for (int i = 0; i < childSpans.size (); i++) {
      LabeledSpan childSpan = (LabeledSpan) childSpans.get (i);
      Label childLabel = childSpan.getLabel ();
      int childStart = childSpan.getStartIdx () - start;

      // Emit any uncovered text between the previous child and this one.
      if (childStart > current) {
        childElts.add (new Text (spanText.substring (current, childStart)));
      }

      if (childLabel == backgroundTag) {
        childElts.add (new Text (childSpan.getText ()));
      } else {
        String childName = childLabel.getEntry ().toString ();
        List grandchildren = (List) tree.get (childSpan);
        childElts.add (generateElement (childName, ns, childSpan, grandchildren, tree));
      }
      current = childSpan.getEndIdx () - start;
    }

    // current is an offset into spanText, so compare it with the text
    // length rather than with the span's absolute end index.
    if (current < spanText.length ()) {
      childElts.add (new Text (spanText.substring (current)));
    }
    parentElt.addContent (childElts);
    return parentElt;
  }

  /** Returns {@link #toXmlDocument()} serialized with a default XMLOutputter. */
  public String toXmlString ()
  {
    Document jdom = toXmlDocument ();
    XMLOutputter outputter = new XMLOutputter ();
    return outputter.outputString (jdom);
  }

  /** Returns the number of extracted spans. */
  public int size ()
  {
    return extractedSpans.size();
  }

  // Serialization garbage

  private static final long serialVersionUID = 1L;
  private static final int CURRENT_SERIAL_VERSION = 1;

  /** Writes the default fields followed by the serial version number. */
  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    out.writeInt(CURRENT_SERIAL_VERSION);
  }

  /** Reads the default fields, then consumes the serial version number. */
  private void readObject(ObjectInputStream in) throws IOException,
          ClassNotFoundException {
    in.defaultReadObject();
    in.readInt(); // read version
  }
}