// BuildLexicalizedParserITest
// Copyright (c) 2002-2010 Leland Stanford Junior University
//This program is free software; you can redistribute it and/or
//modify it under the terms of the GNU General Public License
//as published by the Free Software Foundation; either version 2
//of the License, or (at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//For more information, bug reports, fixes, contact:
//Christopher Manning
//Dept of Computer Science, Gates 1A
//Stanford CA 94305-9010
//USA
//Support/Questions: java-nlp-user@lists.stanford.edu
//Licensing: java-nlp-support@lists.stanford.edu
//http://www-nlp.stanford.edu/software/lex-parser.shtml
package edu.stanford.nlp.parser.lexparser;
import junit.framework.TestCase;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Formatter;
import edu.stanford.nlp.io.TeeStream;
import edu.stanford.nlp.util.StringUtils;
/**
* This builds and tests several English parsers on one or a few input
* trees each. The goal is to make sure there are no crashes when
* executing the normally used training paths.
*
* @author John Bauer
*/
public class BuildLexicalizedParserITest extends TestCase {
// This is the example command line run by the
// makeSerializedParser.csh script:
//
//String commandLine = "-evals \"factDA,tsv\" -goodPCFG -saveToSerializedFile wsjPCFG.ser.gz -saveToTextFile wsjPCFG.txt -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219";
// We build a similar command line using temporary files for the
// parser and the output files
/**
 * Training command-line templates for the single-treebank English runs.
 * ParserTestCase.buildOneTreebankTestCase fills the four %s slots, in
 * order, with: the serialized-model temp file, the text-model temp file,
 * the training treebank path, and the training treebank path again (as
 * the -testTreebank argument).
 */
public static final String[] englishCommandLines = {"-evals factDA,tsv -goodPCFG -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -testTreebank %s",
"-evals factDA,tsv -goodPCFG -noTagSplit -saveToSerializedFile %s -saveToTextFile %s -compactGrammar 0 -maxLength 40 -train %s -testTreebank %s",
"-evals factDA,tsv -ijcai03 -v -printStates -compactGrammar 0 -correctTags -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -testTreebank %s",
"-evals factDA,tsv -ijcai03 -v -printStates -compactGrammar 0 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -testTreebank %s"};
/**
 * Training command-line templates for the English runs that use a second
 * treebank. ParserTestCase.buildTwoTreebankTestCase fills the five %s
 * slots, in order, with: the serialized-model temp file, the text-model
 * temp file, the training treebank path, the secondary treebank path
 * (after -train2), and the training treebank path again (as the
 * -testTreebank argument).
 * NOTE(review): the literal "0-9 0.5" after -train2 is presumably a file
 * range and a weight for the secondary treebank -- confirm against the
 * parser's option documentation.
 */
public static final String[] englishTwoTreebanks = {"-evals factDA,tsv -ijcai03 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -train2 %s 0-9 0.5 -testTreebank %s",
"-evals factDA,tsv -goodPCFG -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -train2 %s 0-9 0.5 -testTreebank %s"};
/**
 * Training command-line templates for the Chinese runs, all using
 * ChineseTreebankParserParams. The four %s slots are filled by
 * ParserTestCase.buildOneTreebankTestCase with the serialized-model temp
 * file, the text-model temp file, and the training treebank path twice
 * (for -train and -test).
 * NOTE(review): the first and fourth entries are character-for-character
 * identical, so that configuration is trained twice -- confirm whether
 * one of them was meant to carry different options.
 * NOTE(review): the flag is spelled -scTags in the second entry and
 * -sctags in the last one -- verify that option matching ignores case.
 */
public static final String[] chineseCommandLines = {"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -encoding utf-8 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -acl03chinese -encoding utf-8 -scTags -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 -chineseFactored -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -encoding utf-8 -saveToSerializedFile %s -saveToTextFile %s -maxLength 40 -train %s -test %s",
"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 -segmentMarkov -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s -sctags -acl03chinese"};
/**
 * Training command-line templates for the German runs, using
 * NegraPennTreebankParserParams. The two entries differ in the added
 * -PCFG flag and in the -unknownSuffixSize value. The four %s slots are
 * filled by ParserTestCase.buildOneTreebankTestCase with the
 * serialized-model temp file, the text-model temp file, and the training
 * treebank path twice (for -train and -test).
 */
public static final String[] germanCommandLines = {"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -encoding UTF-8 -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s",
"-evals factDA,tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -encoding UTF-8 -PCFG -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 1 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s"};
/**
 * Training command-line template for the French run, using
 * FrenchTreebankParserParams; the %s slots are filled the same way as for
 * the other single-treebank templates.
 */
public static final String[] frenchCommandLines = {"-evals factDA,tsv -maxLength 40 -tLPP edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams -encoding UTF-8 -frenchFactored -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s"};
/**
 * Training command-line template for the Arabic run, using
 * ArabicTreebankParserParams; the %s slots are filled the same way as for
 * the other single-treebank templates.
 */
public static final String[] arabicCommandLines = {"-evals factDA,tsv -maxLength 40 -tLPP edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams -encoding UTF-8 -arabicFactored -saveToSerializedFile %s -saveToTextFile %s -train %s -test %s"};
// TODO: weird: this parser is not saved anywhere
//LexicalizedParser invoked with arguments: -evals factDA,tsv -acl03pcfg -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219
/**
 * Template for re-running a saved serialized model. buildAndTest fills
 * the two %s slots with the serialized model file path and the test
 * treebank path.
 * NOTE(review): the trailing "0-1" is presumably a file range for the
 * test treebank -- confirm against the parser's option documentation.
 */
public static final String baseTestSerCommandLine = "-encoding utf-8 -loadFromSerializedFile %s -testTreebank %s 0-1";
/**
 * Template for re-running a saved text-format model. buildAndTest fills
 * the two %s slots with the text model file path and the test treebank
 * path.
 */
public static final String baseTestTextCommandLine = "-encoding utf-8 -loadFromTextFile %s -testTreebank %s 0-1";
// public static final String PERF_EVAL = "factor LP/LR summary evalb: LP: ";
// Treebank fixture paths. They are relative paths (presumably resolved
// against the working directory the tests are launched from -- verify).
// In the test methods below, each language's one-tree file is passed as
// the training path and its three-tree file as the test path.
public static final String englishOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/en-onetree.txt";
// Passed as the secondary (-train2) treebank in the two-treebank English runs.
public static final String englishSecondTree = "projects/core/data/edu/stanford/nlp/parser/trees/en-secondtree.txt";
public static final String englishThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/en-threetrees.txt";
public static final String chineseOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/zh-onetree.txt";
public static final String chineseThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/zh-threetrees.txt";
// Note: the German fixture files are named "onesent"/"threesents" rather
// than "onetree"/"threetrees" like the other languages.
public static final String germanOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/de-onesent.txt";
public static final String germanThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/de-threesents.txt";
public static final String frenchOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/fr-onetree.txt";
public static final String frenchThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/fr-threetrees.txt";
public static final String arabicOneTree = "projects/core/data/edu/stanford/nlp/parser/trees/ar-onetree.txt";
public static final String arabicThreeTrees = "projects/core/data/edu/stanford/nlp/parser/trees/ar-threetrees.txt";
public static class ParserTestCase {
public final String[] trainCommandLine;
public final String testPath;
public final File parserFile;
public final File textFile;
ParserTestCase(String[] trainCommandLine, String testPath,
File parserFile, File textFile) {
this.trainCommandLine = trainCommandLine;
this.testPath = testPath;
this.parserFile = parserFile;
this.textFile = textFile;
}
public static ParserTestCase
buildOneTreebankTestCase(String baseCommandLine,
String trainPath,
String testPath)
throws IOException
{
File parserFile = File.createTempFile("parser", ".ser.gz");
File textFile = File.createTempFile("parser", ".txt");
Formatter commandLineFormatter = new Formatter();
// Note that we test on the train path. The goal is we should
// get 100% accuracy if everything worked. We will test on
// unknown trees in the next step.
commandLineFormatter.format(baseCommandLine, parserFile.getPath(),
textFile.getPath(), trainPath, trainPath);
String[] trainCommandLine =
commandLineFormatter.toString().split("\\s+");
ParserTestCase test = new ParserTestCase(trainCommandLine, testPath,
parserFile, textFile);
return test;
}
public static ParserTestCase
buildTwoTreebankTestCase(String baseCommandLine,
String trainPath,
String secondaryPath,
String testPath)
throws IOException
{
File parserFile = File.createTempFile("parser", ".ser.gz");
File textFile = File.createTempFile("parser", ".txt");
Formatter commandLineFormatter = new Formatter();
// Note that we test on the train path. The goal is we should
// get 100% accuracy if everything worked. We will test on
// unknown trees in the next step.
commandLineFormatter.format(baseCommandLine, parserFile.getPath(),
textFile.getPath(), trainPath,
secondaryPath, trainPath);
String[] trainCommandLine =
commandLineFormatter.toString().split("\\s+");
ParserTestCase test = new ParserTestCase(trainCommandLine, testPath,
parserFile, textFile);
return test;
}
}
/**
 * Trains a parser with the test case's command line, checks that the
 * evaluation line of the training output reports a perfect score, then
 * reloads the model from both the serialized file and the text file and
 * runs each on the test case's test treebank.
 *
 * While the parser runs, System.out and System.err are replaced by tees
 * that also copy everything into an in-memory buffer.  The original
 * streams are restored in a finally block, so a failing assertion or an
 * exception from the parser cannot leave the tees installed for tests
 * that run afterwards.
 *
 * @param test the training command line, model files and test treebank
 * @throws IOException if flushing the tee streams fails
 */
static public void buildAndTest(ParserTestCase test)
  throws IOException
{
  PrintStream originalOut = System.out;
  PrintStream originalErr = System.err;

  System.out.println("Training:");
  System.out.println(StringUtils.join(test.trainCommandLine));

  // Both tees write into the same buffer, so captured output interleaves
  // stdout and stderr in the order it was written.
  ByteArrayOutputStream savedOutput = new ByteArrayOutputStream();
  TeeStream teeOut = new TeeStream(savedOutput, originalOut);
  PrintStream teeOutPS = new PrintStream(teeOut);
  TeeStream teeErr = new TeeStream(savedOutput, originalErr);
  PrintStream teeErrPS = new PrintStream(teeErr);

  System.setOut(teeOutPS);
  System.setErr(teeErrPS);
  try {
    LexicalizedParser.main(test.trainCommandLine);

    // Make sure everything has reached the buffer before reading it.
    teeOutPS.flush();
    teeErrPS.flush();
    teeOut.flush();
    teeErr.flush();

    String[] outputLines =
      savedOutput.toString().split("(?:\\n|\\r)+");
    // The evaluation summary is looked up five lines from the end; fail
    // with a readable message instead of an index error on short output.
    assertTrue("Expected at least 5 lines of training output but got " +
               outputLines.length, outputLines.length >= 5);
    String perfLine = outputLines[outputLines.length - 5];
    System.out.println(perfLine);
    assertEquals("factor LP/LR summary evalb: LP: 100.0 LR: 100.0 F1: 100.0 Exact: 100.0 N: 1", perfLine.trim());

    runSavedModel(baseTestSerCommandLine, test.parserFile, test.testPath);
    runSavedModel(baseTestTextCommandLine, test.textFile, test.testPath);
  } finally {
    try {
      teeOutPS.flush();
      teeErrPS.flush();
      teeOut.flush();
      teeErr.flush();
    } finally {
      System.setOut(originalOut);
      System.setErr(originalErr);
    }
  }
}

/**
 * Fills the given load-and-test template with the model file and test
 * treebank, echoes the resulting command line, and runs the parser on it.
 */
private static void runSavedModel(String baseCommandLine,
                                  File modelFile,
                                  String testPath)
  throws IOException
{
  // Split on runs of whitespace, matching how the training line is split.
  String[] testCommandLine =
    String.format(baseCommandLine, modelFile.getPath(), testPath)
          .trim().split("\\s+");
  System.out.println("Testing:");
  System.out.println(StringUtils.join(testCommandLine));
  LexicalizedParser.main(testCommandLine);
}
/**
 * Runs the build-and-test cycle for every English configuration: first
 * the single-treebank command lines, then the ones that add a second
 * training treebank.  Each run trains on the one-tree file and re-tests
 * the saved model on the three-tree file.
 */
public void testBuildEnglishParser()
  throws IOException
{
  for (int i = 0; i < englishCommandLines.length; ++i) {
    buildAndTest(ParserTestCase.buildOneTreebankTestCase(
        englishCommandLines[i], englishOneTree, englishThreeTrees));
  }
  for (int i = 0; i < englishTwoTreebanks.length; ++i) {
    buildAndTest(ParserTestCase.buildTwoTreebankTestCase(
        englishTwoTreebanks[i], englishOneTree, englishSecondTree,
        englishThreeTrees));
  }
}
/**
 * Runs the build-and-test cycle for every Chinese configuration, training
 * on the one-tree file and re-testing on the three-tree file.
 */
public void testBuildChineseParser()
  throws IOException
{
  for (int i = 0; i < chineseCommandLines.length; ++i) {
    buildAndTest(ParserTestCase.buildOneTreebankTestCase(
        chineseCommandLines[i], chineseOneTree, chineseThreeTrees));
  }
}
/**
 * Runs the build-and-test cycle for every German configuration, training
 * on the one-sentence file and re-testing on the three-sentence file.
 */
public void testBuildGermanParser()
  throws IOException
{
  for (int i = 0; i < germanCommandLines.length; ++i) {
    buildAndTest(ParserTestCase.buildOneTreebankTestCase(
        germanCommandLines[i], germanOneTree, germanThreeTrees));
  }
}
/**
 * Runs the build-and-test cycle for every French configuration, training
 * on the one-tree file and re-testing on the three-tree file.
 */
public void testBuildFrenchParser()
  throws IOException
{
  for (int i = 0; i < frenchCommandLines.length; ++i) {
    buildAndTest(ParserTestCase.buildOneTreebankTestCase(
        frenchCommandLines[i], frenchOneTree, frenchThreeTrees));
  }
}
/**
 * Runs the build-and-test cycle for every Arabic configuration, training
 * on the one-tree file and re-testing on the three-tree file.
 */
public void testBuildArabicParser()
  throws IOException
{
  for (int i = 0; i < arabicCommandLines.length; ++i) {
    buildAndTest(ParserTestCase.buildOneTreebankTestCase(
        arabicCommandLines[i], arabicOneTree, arabicThreeTrees));
  }
}
}