// Source code of cc.mallet.extract.DocumentExtraction

/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;


import org.jdom.Element;
import org.jdom.Document;
import org.jdom.Namespace;
import org.jdom.Text;
import org.jdom.output.XMLOutputter;


import cc.mallet.types.*;


import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;


import gnu.trove.THashMap;


/**
 * Created: Oct 12, 2004
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A>
 * @version $Id: DocumentExtraction.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
 */
//TODO: Add place where user can have general Transducers to change CRF tokenization into LabeledSpans
//TODO: Add field for CRF's labeled tokenization
public class DocumentExtraction implements Serializable {


  private Tokenization input;
  private Sequence predictedLabels;
  private LabelSequence target;


  private LabeledSpans extractedSpans;
  private LabeledSpans targetSpans;


  private Object document;
  private Label backgroundTag;
  private String name;




  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input, Sequence predicted, String background)
  {
    this (name, dict, input, predicted, null, background, new BIOTokenizationFilter ());
  }


  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input, Sequence predicted,
                             Sequence target, String background)
  {
    this (name, dict, input, predicted, target, background, new BIOTokenizationFilter ());
  }


  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                             Sequence predicted, Sequence target, String background,
                             TokenizationFilter filter)
  {


    this.document = input.getDocument ();
    this.name = name;
    assert (input.size() == predicted.size());


    this.backgroundTag = dict.lookupLabel (background);
    this.input = input;


    this.predictedLabels = predicted;
    this.extractedSpans = filter.constructLabeledSpans (dict, document, backgroundTag, input, predicted);


    if (target != null) {
      if (target instanceof LabelSequence) this.target = (LabelSequence) target;
      this.targetSpans = filter.constructLabeledSpans (dict, document, backgroundTag, input, target);
    }


  }


  public DocumentExtraction (String name, LabelAlphabet dict, Tokenization input,
                             LabeledSpans predictedSpans, LabeledSpans trueSpans, String background)
  {
    this.document = input.getDocument ();
    this.name = name;


    this.backgroundTag = dict.lookupLabel (background);
    this.input = input;


    this.extractedSpans = predictedSpans;
    this.targetSpans = trueSpans;
  }






  public Object getDocument ()
  {
    return document;
  }


  public Tokenization getInput ()
  {
    return input;
  }




  public Sequence getPredictedLabels ()
  {
    return predictedLabels;
  }




  public LabeledSpans getExtractedSpans ()
  {
    return extractedSpans;
  }


  public LabeledSpans getTargetSpans ()
  {
    return targetSpans;
  }


  public LabelSequence getTarget ()
  {
    return target;
  }




  public String getName ()
  {
    return name;
  }


  public Label getBackgroundTag ()
  {
    return backgroundTag;
  }


  //xxx nyi
  public Span subspan (int start, int end)
  {
    throw new UnsupportedOperationException ("not yet implemented.");
  }




  public Document toXmlDocument ()
  {
    return toXmlDocument ("doc", Namespace.NO_NAMESPACE);
  }


 /*
  public Document toXmlDocument (String rootEltName, Namespace ns)
  {
    Element element = new Element (rootEltName, ns);
    for (int i = 0; i < extractedSpans.size(); i++) {
       LabeledSpan span = (LabeledSpan) extractedSpans.get(i);
       Label tag = span.getLabel();
       if (tag == backgroundTag) {
         org.jdom.Parent p = element.addContent (span.getText ());
       } else {
         Element field = new Element (tag.toString(), ns);
         field.setText (span.getText ());
         element.addContent (field);
       }
     }
    return new Document (element);
  }
   */


  // does not do non-overlap sanity checking
  public Document toXmlDocument (String rootEltName, Namespace ns)
   {
     ArrayList orderedByStart = new ArrayList (extractedSpans);
     Collections.sort (orderedByStart, new Comparator () {
       public int compare (Object o, Object o1)
       {
         int start1 = ((Span)o).getStartIdx ();
         int start2 = ((Span)o1).getStartIdx ();
         return Double.compare (start1, start2);
       }
     } );


     ArrayList roots = new ArrayList (orderedByStart);
     THashMap children = new THashMap ();
     for (int i = 0; i < orderedByStart.size(); i++) {
       LabeledSpan child = (LabeledSpan) orderedByStart.get (i);
       for (int j = i-1; j >= 0; j--) {
         LabeledSpan parent = (LabeledSpan) orderedByStart.get (j);
         if (parent.isSubspan (child)) {
           List childList = (List) children.get (parent);
           if (childList == null) {
             childList = new ArrayList ();
             children.put (parent, childList);
           }
           roots.remove (child);
           childList.add (child);
           break;
         }
       }
     }


     CharSequence doc = (CharSequence) document;
     Span wholeDoc = new StringSpan (doc, 0, doc.length ());
     return new Document (generateElement (rootEltName, wholeDoc, roots, children));
   }




  private Element generateElement (String parentName, Span span, List childSpans, THashMap tree)
  {
    Element parentElt = new Element (parentName);
    if (childSpans == null || childSpans.isEmpty ()) {
      parentElt.setContent (new Text (span.getText ()));
    } else {
      List childElts = new ArrayList (childSpans.size());
      int start = span.getStartIdx ();
      int current = 0;
      for (int i = 0; i < childSpans.size(); i++) {
        LabeledSpan childSpan = (LabeledSpan) childSpans.get (i);
        Label childLabel = childSpan.getLabel();


        int childStart = childSpan.getStartIdx () - start;
        if (childStart > current) {
          childElts.add (new Text (span.getText().substring (current, childStart)));
        }


        if (childLabel == backgroundTag) {
          childElts.add (new Text (childSpan.getText()));
        } else {
          String name = childLabel.getEntry ().toString();
          List grandchildren = (List) tree.get (childSpan);
          childElts.add (generateElement (name, childSpan, grandchildren, tree));
        }


        current = childSpan.getEndIdx () - start;
      }


      if (current < span.getEndIdx ())
        childElts.add (new Text (span.getText().substring (current)));


      parentElt.addContent (childElts);
    }


    return parentElt;
  }




  public String toXmlString ()
  {
    Document jdom = toXmlDocument ();
    XMLOutputter outputter = new XMLOutputter ();
    return outputter.outputString (jdom);
  }


  public int size ()
  {
    return extractedSpans.size();
  }
  
  // Serialization garbage


  private static final long serialVersionUID = 1L;


  private static final int CURRENT_SERIAL_VERSION = 1;


  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    out.writeInt(CURRENT_SERIAL_VERSION);
  }


  private void readObject(ObjectInputStream in) throws IOException,
      ClassNotFoundException {
    in.defaultReadObject();
    in.readInt(); // read version
  }


}
// Source code of cc.mallet.extract.DocumentExtraction

// Related classes of cc.mallet.extract.DocumentExtraction