trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs >= 3) {
try {
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
trainFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
trainFilter = new NumberRangesFileFilter(args[argIndex], true);
argIndex++;
}
}
} else if (args[argIndex].equalsIgnoreCase("-encoding")) { // sets encoding for TreebankLangParserParams
encoding = args[argIndex + 1];
op.tlpParams.setInputEncoding(encoding);
op.tlpParams.setOutputEncoding(encoding);
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
// load the parser from a binary serialized file
// the next argument must be the path to the parser file
serializedInputFileOrUrl = args[argIndex + 1];
argIndex += 2;
// doesn't make sense to load from TextFile -pichuan
// } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
// // load the parser from declarative text file
// // the next argument must be the path to the parser file
// textInputFileOrUrl = args[argIndex + 1];
// argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
saveToSerializedFile = true;
serializedOutputFileOrUrl = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
// save the parser to declarative text file
saveToTextFile = true;
textOutputFileOrUrl = args[argIndex + 1];
argIndex += 2;
} else if (args[argIndex].equalsIgnoreCase("-treebank")) {
// the next argument is the treebank path and range for testing
int numSubArgs = numSubArgs(args, argIndex);
argIndex++;
if (numSubArgs == 1) {
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs > 1) {
testPath = args[argIndex++];
if (numSubArgs == 2) {
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
} else if (numSubArgs >= 3) {
try {
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
testFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
testFilter = new NumberRangesFileFilter(args[argIndex++], true);
}
}
}
} else {
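// hand the flag to the language pack's option parser; if it consumed nothing, warn and skip the unknown option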
int j = op.tlpParams.setOptionFlag(args, argIndex);
if (j == argIndex) {
System.err.println("Unknown option ignored: " + args[argIndex]);
j++;
}
argIndex = j;
}
} // end while loop through arguments
TreebankLangParserParams tlpParams = op.tlpParams;
// all other arguments are order dependent and
// are processed in order below
ChineseLexiconAndWordSegmenter cs = null;
if (!train && op.testOptions.verbose) {
System.out.println("Currently " + new Date());
printArgs(args, System.out);
}
if (train) {
printArgs(args, System.out);
// so we train a parser using the treebank
if (treebankPath == null) {
// the next arg must be the treebank path, since it wasn't given earlier
treebankPath = args[argIndex];
argIndex++;
if (args.length > argIndex + 1) {
try {
// the next two args might be the range
int low = Integer.parseInt(args[argIndex]);
int high = Integer.parseInt(args[argIndex + 1]);
trainFilter = new NumberRangeFileFilter(low, high, true);
argIndex += 2;
} catch (NumberFormatException e) {
// maybe it's a ranges expression?
trainFilter = new NumberRangesFileFilter(args[argIndex], true);
argIndex++;
}
}
}
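// read the training treebank and train the combined lexicon and word segmenter over fresh word and tag indices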
Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
Index<String> wordIndex = new HashIndex<String>();
Index<String> tagIndex = new HashIndex<String>();
cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
} else if (textInputFileOrUrl != null) {
// so we load the segmenter from a text grammar file
// XXXXX fix later -pichuan
//cs = new LexicalizedParser(textInputFileOrUrl, true, op);
} else {
// so we load a serialized segmenter
if (serializedInputFileOrUrl == null) {
// the next argument must be the path to the serialized parser
serializedInputFileOrUrl = args[argIndex];
argIndex++;
}
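// deserialize the lexicon and segmenter from the given file or URL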
try {
cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
} catch (IllegalArgumentException e) {
System.err.println("Error loading segmenter, exiting...");
System.exit(0);
}
}
// the following has to go after reading the parser to make sure
// op and tlpParams are the same for train and test
TreePrint treePrint = op.testOptions.treePrint(tlpParams);
if (testFilter != null) {
if (testPath == null) {
if (treebankPath == null) {
throw new RuntimeException("No test treebank path specified...");
} else {
System.err.println("No test treebank path specified. Using train path: \"" + treebankPath + "\"");
testPath = treebankPath;
}
}
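// load the test treebank into memory using the resolved path and file filter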
testTreebank = tlpParams.testMemoryTreebank();
testTreebank.loadPath(testPath, testFilter);
}
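// install the language-specific sister-splitting categories in the training options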
op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));
// at this point we should be sure that op.tlpParams is
// set appropriately (from command line, or from grammar file),
// and will never change again. We also set the tlpParams of the
// LexicalizedParser instance to be the same object. This is
// redundancy that we probably should take out eventually.
//
// -- Roger
if (op.testOptions.verbose) {
System.err.println("Lexicon is " + cs.getClass().getName());
}
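// PrintWriters for standard output and standard error, obtained from the language pack so its encoding is used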
PrintWriter pwOut = tlpParams.pw();
PrintWriter pwErr = tlpParams.pw(System.err);
// Now what do we do with the parser we've made
if (saveToTextFile) {
// save the parser to textGrammar format
if (textOutputFileOrUrl != null) {
saveSegmenterDataToText(cs, textOutputFileOrUrl);
} else {
System.err.println("Usage: must specify a text segmenter data output path");
}
}
if (saveToSerializedFile) {
if (serializedOutputFileOrUrl == null && argIndex < args.length) {
// the next argument must be the path to serialize to
serializedOutputFileOrUrl = args[argIndex];
argIndex++;
}
if (serializedOutputFileOrUrl != null) {
saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
} else if (textOutputFileOrUrl == null && testTreebank == null) {
// no saving/parsing request has been specified
System.err.println("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
}
}
/* --------------------- Testing part!!!! ----------------------- */
if (op.testOptions.verbose) {
// printOptions(false, op);
}
if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
// test parser on treebank
if (testTreebank == null) {
// the next argument is the treebank path and range for testing
testTreebank = tlpParams.testMemoryTreebank();
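// if a numeric low/high range does not follow the path, load the entire treebank; otherwise restrict it to that file-number range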
if (args.length < argIndex + 4) {
testTreebank.loadPath(args[argIndex + 1]);
} else {
int testlow = Integer.parseInt(args[argIndex + 2]);
int testhigh = Integer.parseInt(args[argIndex + 3]);
testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
}
}
/* TODO - test segmenting on treebank. -pichuan */
// lp.testOnTreebank(testTreebank);
// } else if (argIndex >= args.length) {