/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import cc.mallet.fst.CRF;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.types.*;
/**
* Extracts fields from a document by running a tokenization through a trained
* {@link cc.mallet.fst.CRF} and converting the predicted label sequence into
* spans via a {@link TokenizationFilter}.
* <p>
* Created: Oct 12, 2004
*
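* A minimal usage sketch (the trained CRF, the tokenization pipe, and the input
* document are placeholders supplied by the caller; any pipe whose output data
* is a {@link Tokenization} will do):
* <pre>
*   CRF crf = ...;              // a previously trained CRF
*   Pipe tokPipe = ...;         // converts a raw document into a Tokenization
*   CRFExtractor extractor = new CRFExtractor (crf, tokPipe);
*   Extraction extraction = extractor.extract (document);
* </pre>
*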
* @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
* @version $Id: CRFExtractor.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
*/
public class CRFExtractor implements Extractor {
private CRF crf;
private Pipe tokenizationPipe;
private Pipe featurePipe;
private String backgroundTag;
private TokenizationFilter filter;
public CRFExtractor (CRF crf) {
this (crf, new Noop ());
}
public CRFExtractor (File crfFile) throws IOException {
this (loadCrf(crfFile), new Noop ());
}
public CRFExtractor (CRF crf, Pipe tokpipe) {
this (crf, tokpipe, new BIOTokenizationFilter ());
}
public CRFExtractor (CRF crf, Pipe tokpipe, TokenizationFilter filter) {
this (crf, tokpipe, filter, "O");
}
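/**
* Creates an extractor.  The feature pipe is taken from the CRF's input pipe.
*  @param crf the trained CRF used to label token sequences
*  @param tokpipe pipe that converts a raw input object into a {@link Tokenization}
*  @param filter converts the CRF's predicted label sequence into extracted spans
*  @param backgroundTag label assigned to tokens that are not part of any field (e.g., "O")
*/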
public CRFExtractor (CRF crf, Pipe tokpipe, TokenizationFilter filter, String backgroundTag) {
this.crf = crf;
tokenizationPipe = tokpipe;
featurePipe = (Pipe) crf.getInputPipe ();
this.filter = filter;
this.backgroundTag = backgroundTag;
}
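/** Reads a serialized CRF from <tt>crfFile</tt>. */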
private static CRF loadCrf (File crfFile) throws IOException
{
ObjectInputStream ois = new ObjectInputStream( new FileInputStream( crfFile ) );
CRF crf = null;
// We shouldn't run into a ClassNotFound exception...
try {
crf = (CRF)ois.readObject();
} catch (ClassNotFoundException e) {
System.err.println ("Internal MALLET error: Could not read CRF from file "+crfFile+"\n"+e);
e.printStackTrace ();
throw new RuntimeException (e);
}
ois.close();
return crf;
}
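/**
* Performs extraction on an arbitrary input object.  A {@link Tokenization} or an
* {@link InstanceList} is used directly; any other object is first run through the
* tokenization pipe.
*/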
public Extraction extract (Object o)
{
// There is no polymorphic way to dispatch on the runtime type here, so test it explicitly. -cas
if (o instanceof Tokenization) {
return extract ((Tokenization) o);
}
else if (o instanceof InstanceList) {
return extract ((InstanceList) o);
}
else {
return extract (doTokenize (o));
}
}
private Tokenization doTokenize (Object obj)
{
Instance toked = new Instance (obj, null, null, null);
tokenizationPipe.pipe (toked);
return (Tokenization) toked.getData ();
}
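/**
* Performs extraction from a pre-tokenized document.  The tokens are run through the
* feature pipe, the CRF transduces the resulting feature sequence, and the predicted
* labels are converted into spans by the tokenization filter.
*/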
public Extraction extract (Tokenization spans)
{
// We assume the input is unpiped.
Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null));
Sequence output = crf.transduce ((Sequence) carrier.getData ());
Extraction extraction = new Extraction (this, getTargetAlphabet());
DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(),
spans,
output, null, backgroundTag,
filter);
extraction.addDocumentExtraction (docseq);
return extraction;
}
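/**
* Runs raw instances through the tokenization pipe and then the feature pipe,
* returning an InstanceList ready for the CRF.
*/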
public InstanceList pipeInstances (Iterator<Instance> source)
{
// I think that pipes should be associated neither with InstanceLists nor
// with Instances. -cas
InstanceList toked = new InstanceList (tokenizationPipe);
toked.addThruPipe (source);
InstanceList piped = new InstanceList (getFeaturePipe ());
piped.addThruPipe (toked.iterator());
return piped;
}
/** Assumes Instance.source contains the Tokenization object. */
public Extraction extract (InstanceList ilist) {
Extraction extraction = new Extraction (this, getTargetAlphabet ());
for (int i = 0; i < ilist.size(); i++) {
Instance inst = ilist.get(i);
Tokenization tok = (Tokenization)inst.getSource();
String name = inst.getName().toString();
Sequence input = (Sequence)inst.getData ();
Sequence target = (Sequence)inst.getTarget ();
Sequence output = crf.transduce(input);
DocumentExtraction docseq =
new DocumentExtraction (name, getTargetAlphabet(), tok,
output, target, backgroundTag,
filter);
extraction.addDocumentExtraction (docseq);
}
return extraction;
}
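/**
* Performs extraction on a stream of raw instances.  Each instance is tokenized,
* run through the feature pipe, transduced by the CRF, and added to the returned
* Extraction.
*/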
public Extraction extract (Iterator<Instance> source)
{
Extraction extraction = new Extraction (this, getTargetAlphabet ());
// Put all the instances through both pipes, then get the Viterbi path for each
InstanceList tokedList = new InstanceList (tokenizationPipe);
tokedList.addThruPipe (source);
InstanceList pipedList = new InstanceList (getFeaturePipe ());
pipedList.addThruPipe (tokedList.iterator());
Iterator<Instance> it1 = tokedList.iterator ();
Iterator<Instance> it2 = pipedList.iterator ();
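// The two lists are parallel: tokedList holds the Tokenizations, pipedList the
// corresponding feature sequences.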
while (it1.hasNext()) {
Instance toked = it1.next();
Instance piped = it2.next ();
Tokenization tok = (Tokenization) toked.getData();
String name = piped.getName().toString();
Sequence input = (Sequence) piped.getData ();
Sequence target = (Sequence) piped.getTarget ();
Sequence output = crf.transduce (input);
DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok,
output, target, backgroundTag,
filter);
extraction.addDocumentExtraction (docseq);
}
return extraction;
}
public TokenizationFilter getTokenizationFilter ()
{
return filter;
}
public String getBackgroundTag ()
{
return backgroundTag;
}
public Pipe getTokenizationPipe ()
{
return tokenizationPipe;
}
public void setTokenizationPipe (Pipe tokenizationPipe)
{
this.tokenizationPipe = tokenizationPipe;
}
public Pipe getFeaturePipe ()
{
return featurePipe;
}
// XXX This method is inherently dangerous: it should check that the pipe's alphabet equals the CRF's alphabet.
public void setFeaturePipe (Pipe featurePipe)
{
this.featurePipe = featurePipe;
}
public Alphabet getInputAlphabet ()
{
return crf.getInputAlphabet ();
}
public LabelAlphabet getTargetAlphabet ()
{
return (LabelAlphabet) crf.getOutputAlphabet ();
}
public CRF getCrf ()
{
return crf;
}
/**
* Transfers the first <tt>num</tt> pipes from the feature pipe to the tokenization pipe.
* The feature pipe must be a SerialPipes.  This destructively modifies the CRF object of
* the extractor.  This is useful if you have a CRF that has been trained with a single
* pipe, and you need to split it into separate feature and tokenization pipes.
*/
public void slicePipes (int num)
{
Pipe fpipe = getFeaturePipe ();
if (!(fpipe instanceof SerialPipes))
throw new IllegalArgumentException ("slicePipes: FeaturePipe must be a SerialPipes.");
SerialPipes sp = (SerialPipes) fpipe;
ArrayList pipes = new ArrayList ();
for (int i = 0; i < num; i++) {
pipes.add (sp.getPipe (0));
//sp.removePipe (0); TODO Fix this
}
//setTokenizationPipe (sp); TODO Fix this
throw new UnsupportedOperationException ("Not yet implemented...");
}
// Java serialization nonsense
// Serial version 0: Initial version
// Serial version 1: Add featurePipe
// Serial version 2: Add filter
private static final int CURRENT_SERIAL_VERSION = 2;
private static final long serialVersionUID = 1;
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
{
in.defaultReadObject ();
int version = in.readInt ();
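// Version 0 instances were serialized without a feature pipe; recover it from the CRF.
// Versions before 2 lack the tokenization filter; fall back to the BIO default.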
if ((version == 0) || (featurePipe == null)) {
featurePipe = (Pipe) crf.getInputPipe ();
}
if (version < 2) {
filter = new BIOTokenizationFilter ();
}
}
private void writeObject (ObjectOutputStream out) throws IOException
{
out.defaultWriteObject ();
out.writeInt (CURRENT_SERIAL_VERSION);
}
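/**
* Runs a single raw input through the feature pipe and returns the resulting
* feature sequence, suitable as input to the CRF.
*/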
public Sequence pipeInput (Object input)
{
InstanceList all = new InstanceList (getFeaturePipe ());
all.add (input, null, null, null);
return (Sequence) all.get (0).getData();
}
}