package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import java.io.Serializable;
import java.util.Properties;
/**
 * Parser settings that only influence behavior at test (parsing) time.
 * <br>
 * The Options class keeps its TestOptions in a transient field, so
 * anything configured here at creation time is lost once the parser is
 * serialized. An option that has to survive a save/reload cycle belongs
 * in TrainOptions or directly in Options instead.
 *
 * @author Dan Klein
 */
public class TestOptions implements Serializable {

  private static final long serialVersionUID = 7256526346598L;

  /** Tagger model path used as the initial value of {@link #taggerSerializedFile}. */
  static final String DEFAULT_PRE_TAGGER =
    "/u/nlp/data/pos-tagger/distrib/wsj-0-18-bidirectional-nodistsim.tagger";

  /**
   * When false, a sentence the PCFG parser fails on is retried in parse
   * recovery mode with every tag allowed for every word, at a log
   * probability of -1000. When true, those extra taggings are not added.
   * Defaults to false; the -noRecoveryTagging option sets it to true.
   */
  public boolean noRecoveryTagging = false;

  /**
   * When true, a sentence the PCFG factor fails to parse triggers
   * parse recovery mode.
   */
  public boolean doRecovery = true;

  /**
   * When true, the Factored Parser does not use the n^4 "speed-up".
   */
  public boolean useN5 = false;

  /**
   * When true, use the approximate factored algorithm, which merely
   * rescores the PCFG k best, instead of the exact factored algorithm.
   * The approximate algorithm needs the dependency grammar to exist for
   * rescoring, but does not need the dependency grammar to be run. Code
   * that is only required for exact A* factored parsing should therefore
   * be guarded with
   * <code>if (op.doPCFG &amp;&amp; op.doDep &amp;&amp; ! Test.useFastFactored)</code>.
   */
  public boolean useFastFactored = false;

  /** When true, use the faster iterative deepening CKY algorithm. */
  public boolean iterativeCKY = false;

  /**
   * Longest sentence (counting punctuation, etc.) that will be parsed.
   */
  // The initializer -0xDEADBEEF is a large positive number: the hex
  // literal is a negative int in two's complement, so negating it
  // yields 559038737.
  public int maxLength = -0xDEADBEEF;

  /**
   * Upper bound on the combined number of edges and hooks the factored
   * parser builds before giving up. This should probably scale with the
   * length of the sentence being parsed. In general, though, a sentence
   * that cannot be parsed within this much work has no good parse on
   * which the PCFG and Dependency parsers agree. (Normally, depending on
   * other flags, the parser then simply returns the best PCFG parse.)
   */
  public int MAX_ITEMS = 200000;

  /**
   * Smoothing mass (as an m-estimate) applied to unknown words.
   * A negative value leaves the choice to the code in the lexicon class.
   */
  public double unseenSmooth = -1.0;

  /**
   * Parse the trees of the test treebank in order of increasing length.
   */
  public boolean increasingLength = false;

  /**
   * Tag each sentence first, then parse given those (coarse) tags.
   */
  public boolean preTag = false;

  /**
   * Parse using only the tags given by the correct answer or by the
   * POS tagger. Initialized from {@link #preTag}.
   */
  public boolean forceTags = preTag;

  // NOTE(review): undocumented in the original; by its name this looks
  // like a variant of forceTags that only constrains how tags begin --
  // confirm against the code that reads it.
  public boolean forceTagBeginnings = false;

  /**
   * POS tagger model that is used when preTag is enabled.
   */
  public String taggerSerializedFile = DEFAULT_PRE_TAGGER;

  /**
   * Only meaningful together with force tags: functionals are stripped
   * when the tags are forced, so a tag has to start appropriately while
   * the parser assigns the functional part. Initialized from
   * {@link #preTag}.
   */
  public boolean noFunctionalForcing = preTag;

  /**
   * Write output files that EvalB can read.
   */
  public boolean evalb = false;

  /**
   * Print a lot of additional output while parsing.
   */
  public boolean verbose = false; // Leave this default alone; enable with -v

  // NOTE(review): undocumented compile-time constant; its meaning is not
  // visible from this class.
  public final boolean exhaustiveTest = false;

  /**
   * When true, and the sum of a constituent's inside and outside score
   * is worse than the best known score for the sentence by more than
   * <code>pcfgThresholdValue</code>, <code>oScore()</code> returns -Inf
   * as the outside score (otherwise it returns the true outside score).
   */
  public final boolean pcfgThreshold = false;

  /** Threshold consulted when {@link #pcfgThreshold} is true. */
  public final double pcfgThresholdValue = -2.0;

  /**
   * Print every best PCFG parse.
   */
  public boolean printAllBestParses = false;

  /**
   * Weight on dependency log probabilities: the negative log probability
   * scores of the dependency grammar are simply multiplied by this value.
   */
  public double depWeight = 1.0;

  // NOTE(review): undocumented in the original; presumably controls
  // pruning of punctuation -- confirm against the code that reads it.
  public boolean prunePunc = false;

  /**
   * When a token list has no sentence final punctuation near its end,
   * automatically append the default one. This may help parsing when
   * the treebank is fully punctuated. Not done when reading a treebank.
   */
  public boolean addMissingFinalPunctuation;

  /**
   * Selects the format of the output trees: choose among penn, oneline
   */
  public String outputFormat = "penn";

  /**
   * Options string handed to the tree printer together with
   * {@link #outputFormat} (see {@link #treePrint}).
   */
  public String outputFormatOptions = "";

  /**
   * When true, the parses of a file are written to a new file with the
   * same name plus an added ".stp" extension.
   */
  public boolean writeOutputFiles;

  /**
   * Directory that receives the output files when writeOutputFiles is
   * true. Leaving it unset (<code>null</code>) means the directory of
   * the source files is used. Use <code>""</code> or <code>.</code> for
   * the current directory.
   */
  public String outputFilesDirectory;

  /**
   * Extension given to the output files when writeOutputFiles is true.
   * Use <code>""</code> for no extension.
   */
  public String outputFilesExtension = "stp";

  /**
   * Prefix given to the output files when writeOutputFiles is true.
   */
  public String outputFilesPrefix = "parses";

  /**
   * When not null, the k-best equivocation is output. Has to be
   * specified together with printPCFGkBest.
   */
  public String outputkBestEquivocation;

  /**
   * Largest span that is considered for word-hood. Used when parsing
   * unsegmented Chinese text and when parsing lattices. Leave it at 1
   * unless you know what you are doing.
   */
  public int maxSpanForTags = 1;

  /**
   * Switches on normalization of scores for sentence length. It has no
   * effect (other than reduced efficiency) unless maxSpanForTags is
   * greater than one. Works only for PCFG (so far).
   */
  public boolean lengthNormalization = false;

  /**
   * For generating sample parses rather than finding the best parse.
   * (NOT YET USED.)
   */
  public boolean sample = false;

  /** Print the k-best parses from the PCFG, when k &gt; 0. */
  public int printPCFGkBest = 0;

  /** Number of trees to use when running a kBest eval. */
  public int evalPCFGkBest = 100;

  /**
   * Print k good parses from the factored parser, when k &gt; 0.
   * NOTE(review): the previous comment on this field was a verbatim copy
   * of the one on printPCFGkBest; this wording is inferred from the field
   * name and should be confirmed against the factored parser code.
   */
  public int printFactoredKGood = 0;

  /**
   * Which evaluations to report and how to report them
   * (using LexicalizedParser). Known evaluations are:
   * pcfgLB, pcfgCB, pcfgDA, pcfgTA, pcfgLL, pcfgRUO, pcfgCUO, pcfgCatE,
   * pcfgChildSpecific,
   * depDA, depTA, depLL,
   * factLB, factCB, factDA, factTA, factLL, factChildSpecific.
   * The constructor enables pcfgLB, depDA, factLB and factTA by default;
   * negate the ones you do not want
   * (e.g., <code>-evals "depDA=false"</code>).
   * <br>
   * LB = ParseEval labeled bracketing, <br>
   * CB = crossing brackets and zero crossing bracket rate, <br>
   * DA = dependency accuracy, TA = tagging accuracy, <br>
   * LL = log likelihood score, <br>
   * RUO/CUO = rules/categories under and over proposed, <br>
   * CatE = evaluation by phrasal category. <br>
   * ChildSpecific: supply an argument with =. F1 is returned for only
   * those nodes that have at least one child matching this regular
   * expression. <br>
   * Known styles are: runningAverages, summary, tsv. <br>
   * The default style is summary, which likewise has to be negated if
   * it is not wanted.
   * Invalid names in the argument to this option are not reported!
   */
  public Properties evals;

  /**
   * To find k good fast factored parses, this many times k of the best
   * PCFG parses are examined.
   */
  public int fastFactoredCandidateMultiplier = 3;

  /**
   * To find k good factored parses, this many additional best PCFG
   * parses are examined.
   */
  public int fastFactoredCandidateAddend = 50;

  /**
   * When true, the Lexicon is used to score P(w|t) in the backoff inside
   * the dependency grammar. (Otherwise, an MLE is used if w is seen, and
   * a constant if w is unseen.)
   */
  public boolean useLexiconToScoreDependencyPwGt = false;

  /**
   * When true, perform non-projective dependency parsing.
   */
  public boolean useNonProjectiveDependencyParser = false;

  /**
   * Number of threads used at test time. For example, -testTreebank can
   * use this to run X times faster, at the cost of output that is not
   * quite as nicely ordered.
   */
  public int testingThreads = 1;

  /**
   * Suppress the bulk of the text while evaluating and print only the
   * final scores.
   */
  public boolean quietEvaluation = false;

  /**
   * Creates test options with the default evaluations and reporting
   * style switched on in {@link #evals}.
   */
  public TestOptions() {
    evals = defaultEvals();
  }

  /**
   * Builds the initial evaluation settings: each default evaluation and
   * the default reporting style mapped to "true".
   */
  private static Properties defaultEvals() {
    Properties defaults = new Properties();
    String[] enabledByDefault = {"pcfgLB", "depDA", "factLB", "factTA", "summary"};
    for (String key : enabledByDefault) {
      defaults.setProperty(key, "true");
    }
    return defaults;
  }

  /**
   * Determines the method used for printing trees on output.
   *
   * @param tlpParams The treebank parser params
   * @return A suitable tree printing object
   */
  public TreePrint treePrint(TreebankLangParserParams tlpParams) {
    // Arguments are evaluated left to right, so the language pack is
    // still fetched before either head finder.
    return new TreePrint(
        outputFormat,
        outputFormatOptions,
        tlpParams.treebankLanguagePack(),
        tlpParams.headFinder(),
        tlpParams.typedDependencyHeadFinder());
  }

  /** Writes the {@link #toString()} summary of these options to standard error. */
  public void display() {
    System.err.println(toString());
  }

  /**
   * Gives a one-line summary of a selection of these options; most
   * fields are not included.
   */
  @Override
  public String toString() {
    StringBuilder summary = new StringBuilder("Test parameters");
    summary.append(" maxLength=").append(maxLength);
    summary.append(" preTag=").append(preTag);
    summary.append(" outputFormat=").append(outputFormat);
    summary.append(" outputFormatOptions=").append(outputFormatOptions);
    summary.append(" printAllBestParses=").append(printAllBestParses);
    summary.append(" testingThreads=").append(testingThreads);
    summary.append(" quietEvaluation=").append(quietEvaluation);
    return summary.toString();
  }

}