/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;
import java.io.File;
import java.io.PrintWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.ColorUtils;
/**
 * Diagnosis class that outputs HTML pages that allow you to view errors on a more
 * global per-instance basis.
 *
 * Created: Mar 30, 2005
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
 * @version $Id: DocumentViewer.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
 */
public class DocumentViewer {
private static final String DOC_ERRS_CSS_FNAME = "docerrs.css";
private static final String DOC_ERRS_PRED_CSS_FNAME = "docerrs-by-pred.css";
private static final String DOC_ERRS_TRUE_CSS_FNAME = "docerrs-by-true.css";
private static final double SATURATION = 0.4;
private static class DualLabeledSpans {
DualLabeledSpans (LabeledSpans ls1, LabeledSpans ls2) {
ls = new LabeledSpans[] { ls1, ls2 };
private LabeledSpans[] ls;
int size () { return ls[0].size(); }
LabeledSpan get (int t, int i) { return ls[i].getLabeledSpan (t); }
* Writes several HTML files describing a given extraction. Each HTML file shows an entire
* document, with the extracted fields color-coded.
* @param directory Directory to write files to
* @param extraction Extraction to describe
* @throws IOException
public static void writeExtraction (File directory, Extraction extraction) throws IOException
outputIndex (directory, extraction);
outputStylesheets (directory, extraction);
outputDocuments (directory, extraction);
private static void outputStylesheets (File directory, Extraction extraction) throws IOException
// ERRS css
PrintWriter out = new PrintWriter (new FileWriter (new File (directory, DOC_ERRS_CSS_FNAME)));
out.println (".tf_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }");
out.println (".class_legend { visibility: hidden; }");
out.println (".correct { background-color:#33FF33; }");
out.println (".wrong { background-color:pink }");
out.println (".true { background-color:#99CCFF; }");
out.println (".pred { background-color:#FFFF66 }");
out.close ();
//PRED css
LabelAlphabet dict = extraction.getLabelAlphabet ();
String[] fields = determineFieldNames (dict);
String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
out = new PrintWriter (new FileWriter (new File (directory, DOC_ERRS_PRED_CSS_FNAME)));
out.println (".class_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }");
out.println (".tf_legend { visibility: hidden; }");
for (int i = 0; i < fields.length; i++) {
out.println (".pred_"+fields[i]+" { background-color:"+colors[i]+"; }");
out.close ();
//TRUE css
out = new PrintWriter (new FileWriter (new File (directory, DOC_ERRS_TRUE_CSS_FNAME)));
out.println (".class_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }");
out.println (".tf_legend { visibility: hidden; }");
for (int i = 0; i < fields.length; i++) {
out.println (".true_"+fields[i]+" { background-color:"+colors[i]+"; }");
out.close ();
private static void outputDocuments (File directory, Extraction extraction) throws IOException
for (int i = 0; i < extraction.getNumDocuments (); i++) {
PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "extraction"+i+".html")));
outputOneDocument (out, extraction.getDocumentExtraction (i));
out.close ();
private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr)
String name = docExtr.getName ();
out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>");
out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />");
out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_PRED_CSS_FNAME+"\" title=\"Pred\" />");
out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_TRUE_CSS_FNAME+"\" title=\"True\" />");
out.println ("</HEAD><BODY>");
outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ());
outputRightWrongLegend (out);
DualLabeledSpans spans = intersectSpans (docExtr);
for (int i = 0; i < spans.size(); i++) {
LabeledSpan predSpan = spans.get (i, 0);
LabeledSpan trueSpan = spans.get (i, 1);
Label predLabel = predSpan.getLabel ();
Label trueLabel = trueSpan.getLabel ();
boolean predNonBgrnd = !predSpan.isBackground ();
boolean trueNonBgrnd = !trueSpan.isBackground ();
boolean isBackground = !predNonBgrnd && !trueNonBgrnd;
String spanClass = null;
if (predNonBgrnd && trueNonBgrnd) {
if (predLabel == trueLabel) {
spanClass = "correct";
} else {
spanClass = "wrong";
} else if (predNonBgrnd) {
spanClass = "pred";
} else if (trueNonBgrnd) {
spanClass = "true";
if (!isBackground) out.print ("<SPAN CLASS=\"pred_"+predLabel+"\">");
if (!isBackground) out.print ("<SPAN CLASS=\"true_"+trueLabel+"\">");
if (spanClass != null) { out.print ("<SPAN CLASS=\""+spanClass+"\">"); }
String text = predSpan.getSpan ().getText ();
text = text.replaceAll ("<", "<");
text = text.replaceAll ("\n", "\n<P>");
out.print (text);
if (spanClass != null) { out.print ("</SPAN>"); }
if (!isBackground) out.print ("</SPAN></SPAN>");
out.println ();
out.println ("</BODY></HTML>");
private static void outputRightWrongLegend (PrintWriter out)
out.println ("<DIV CLASS=\"tf_legend\"><B>LEGEND</B><BR>");
out.println ("<SPAN CLASS='correct'>Correct</SPAN><BR />");
out.println ("<SPAN CLASS='wrong'>Wrong</SPAN><BR />");
out.println ("<SPAN CLASS='true'>False Negative</SPAN> (True field but predicted background)<BR />");
out.println ("<SPAN CLASS='pred'>False Positive</SPAN> (True background but predicted field)<BR />");
out.println ("</DIV>");
private static void outputClassLegend (PrintWriter out, LabelAlphabet dict)
out.println ("<DIV CLASS=\"class_legend\">");
out.println ("<H4>LEGEND</H4>");
String[] fields = determineFieldNames (dict);
String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
for (int i = 0; i < fields.length; i++) {
out.println ("<SPAN STYLE=\"background-color:"+colors[i]+"\">"+fields[i]+"</SPAN><BR />");
out.println ("</DIV>");
private static String[] determineFieldNames (LabelAlphabet dict)
List l = new ArrayList ();
for (int i = 0; i < dict.size (); i++) {
String lname = dict.lookupLabel (i).toString ();
if (!lname.startsWith ("B-") && !lname.startsWith ("I-")) {
l.add (lname);
return (String[]) l.toArray (new String [l.size ()]);
private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr)
int predIdx = 0;
int trueIdx = 0;
LabeledSpans trueSpans = docExtr.getTargetSpans ();
LabeledSpans predSpans = docExtr.getExtractedSpans ();
LabeledSpans retPredSpans = new LabeledSpans (predSpans.getDocument ());
LabeledSpans retTrueSpans = new LabeledSpans (predSpans.getDocument ());
while ((predIdx < predSpans.size()) && (trueIdx < trueSpans.size ())) {
LabeledSpan predSpan = predSpans.getLabeledSpan (predIdx);
LabeledSpan trueSpan = trueSpans.getLabeledSpan (trueIdx);
LabeledSpan newPredSpan = (LabeledSpan) predSpan.intersection (trueSpan);
LabeledSpan newTrueSpan = (LabeledSpan) trueSpan.intersection (predSpan);
retPredSpans.add (newPredSpan);
retTrueSpans.add (newTrueSpan);
if (predSpan.getEndIdx () <= trueSpan.getEndIdx ()) {
if (trueSpan.getEndIdx () <= predSpan.getEndIdx ()) {
assert (retPredSpans.size() == retTrueSpans.size());
return new DualLabeledSpans (retPredSpans, retTrueSpans);
private static void outputIndex (File directory, Extraction extraction) throws IOException
PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "index.html")));
out.println ("<HTML><HEAD><TITLE>Extraction Results</TITLE></HEAD><BODY><OL>");
for (int i = 0; i < extraction.getNumDocuments(); i++) {
String name = extraction.getDocumentExtraction (i).getName ();
out.println (" <LI><A HREF=\"extraction"+i+".html\">"+name+"</A></LI>");
out.println ("</OL></BODY></HTML>");
out.close ();