package edu.stanford.nlp.ie.crf;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.optimization.CmdEvaluator;
import edu.stanford.nlp.stats.MultiClassChunkEvalStats;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Triple;
import java.io.*;
import java.util.Collection;
import java.util.List;
/**
 * Evaluates a CRFClassifier on a set of data.
 * - Called periodically by QNMinimizer.
 * - If evalCmd is set, runs the command line specified by evalCmd;
 *   otherwise does the evaluation internally.
 * NOTE: when running conlleval via exec on Linux, the OS forks the
 * child by duplicating the memory of the current process. So if the
 * JVM holds a lot of memory, it will all be duplicated when the
 * child process is initially forked.
 * @author Angel Chang
 */
public class CRFClassifierEvaluator<IN extends CoreMap> extends CmdEvaluator {

  private CRFClassifier<IN> classifier;
  // NOTE: Default uses -r, specify without -r if IOB
  private String cmdStr = "/u/nlp/bin/conlleval -r";
  private String[] cmd;

  // TODO: Use data structure to hold data + features
  // Cache already featurized documents
  // Original object bank
  Collection<List<IN>> data;
  // Featurized data
  List<Triple<int[][][], int[], double[][][]>> featurizedData;

  public CRFClassifierEvaluator(String description,
                                CRFClassifier<IN> classifier,
                                Collection<List<IN>> data,
                                List<Triple<int[][][], int[], double[][][]>> featurizedData)
  {
    this.description = description;
    this.classifier = classifier;
    this.data = data;
    this.featurizedData = featurizedData;
    cmd = getCmd(cmdStr);
    saveOutput = true;
  }

  public CRFClassifierEvaluator(String description,
                                CRFClassifier<IN> classifier)
  {
    this.description = description;
    this.classifier = classifier;
    saveOutput = true;
  }

  /**
   * Set the data to test on.
   *
   * @param data the original object bank (documents of tokens)
   * @param featurizedData the same documents, already featurized (may be null)
   */
  public void setTestData(Collection<List<IN>> data, List<Triple<int[][][], int[], double[][][]>> featurizedData)
  {
    this.data = data;
    this.featurizedData = featurizedData;
  }

  /**
   * Set the evaluation command (set to null to skip evaluation using command line).
   * A blank (whitespace-only) command is treated the same as null.
   *
   * @param evalCmd the command line to run, or null/blank to disable
   */
  public void setEvalCmd(String evalCmd)
  {
    System.err.println("setEvalCmd to " + evalCmd);
    this.cmdStr = evalCmd;
    if (cmdStr != null) {
      cmdStr = cmdStr.trim();
      if (cmdStr.length() == 0) { cmdStr = null; }
    }
    cmd = getCmd(cmdStr);
  }

  /**
   * Update the classifier's weights to the given point (called by the
   * optimizer before each evaluation).
   *
   * @param x the current weight vector
   */
  public void setValues(double[] x)
  {
    classifier.updateWeightsForTest(x);
  }

  /** Returns the tokenized evaluation command, or null if none is set. */
  public String[] getCmd()
  {
    return cmd;
  }

  /**
   * Extracts the overall FB1 score from the saved output of the external
   * evaluation command (conlleval format: the token following "FB1:").
   *
   * @return the FB1 score, or -1 if it cannot be found or parsed
   */
  private double interpretCmdOutput() {
    String output = getOutput();
    String[] parts = output.split("\\s+");
    int fScoreIndex = 0;
    for (; fScoreIndex < parts.length; fScoreIndex++) {
      if (parts[fScoreIndex].equals("FB1:")) {
        break;
      }
    }
    // The score is the token immediately after the "FB1:" label
    fScoreIndex += 1;
    if (fScoreIndex < parts.length) {
      try {
        return Double.parseDouble(parts[fScoreIndex]);
      } catch (NumberFormatException nfe) {
        // Guard against malformed command output instead of crashing the optimizer
        System.err.println("ERROR in CRFClassifierEvaluator.interpretCmdOutput(), cannot parse FB1 score in output:\n"+output);
        return -1;
      }
    } else {
      System.err.println("ERROR in CRFClassifierEvaluator.interpretCmdOutput(), cannot find FB1 score in output:\n"+output);
      return -1;
    }
  }

  /**
   * Writes the classifier's answers on the test data to the given stream,
   * for consumption by the external evaluation command.
   *
   * @param outputStream the stream to write the answers to (owned by caller)
   */
  @Override
  public void outputToCmd(OutputStream outputStream)
  {
    try {
      PrintWriter pw = IOUtils.encodedOutputStreamPrintWriter(outputStream, null, true);
      classifier.classifyAndWriteAnswers(data, featurizedData, pw,
                                         classifier.makeReaderAndWriter());
    } catch (IOException ex) {
      throw new RuntimeIOException(ex);
    }
  }

  /**
   * Evaluates the classifier at the given weight vector. If an evaluation
   * command is set, runs it and parses the FB1 score from its output;
   * otherwise classifies to a temp file and scores it internally.
   *
   * @param x the weight vector to evaluate at
   * @return the evaluation score (FB1), or -1 on command-output parse failure
   */
  public double evaluate(double[] x) {
    double score = 0;
    setValues(x);
    if (getCmd() != null) {
      evaluateCmd(getCmd());
      score = interpretCmdOutput();
    } else {
      try {
        // TODO: Classify in memory instead of writing to tmp file
        // Fixed: suffix must include the dot; createTempFile uses it verbatim
        File f = File.createTempFile("CRFClassifierEvaluator", ".txt");
        f.deleteOnExit();
        // try-with-resources so the stream is closed even if classification throws
        try (OutputStream outputStream = new BufferedOutputStream(new FileOutputStream(f))) {
          PrintWriter pw = IOUtils.encodedOutputStreamPrintWriter(outputStream, null, true);
          classifier.classifyAndWriteAnswers(data, featurizedData, pw,
                                             classifier.makeReaderAndWriter());
        }
        MultiClassChunkEvalStats stats = new MultiClassChunkEvalStats("O");
        // Fixed: reader was previously never closed
        try (BufferedReader br = new BufferedReader(new FileReader(f))) {
          score = stats.score(br, "\t");
        }
        System.err.println(stats.getConllEvalString());
        f.delete();
      } catch (Exception ex) {
        throw new RuntimeException(ex);
      }
    }
    return score;
  }

}