*
* @param args Command line arguments
*/
public static void main(String[] args) {
CommandLineParser commandLine = new CommandLineParser();
Option<String> source_file = commandLine.addStringOption('s',"source-text","SOURCE_FILENAME","name of file containing source language corpus");
//Option<String> source_file_encoding = commandLine.addStringOption("source-encoding","SOURCE_ENCODING","ISO-8859-1","source language file encoding");
Option<String> source_file_encoding = commandLine.addStringOption("source-encoding","SOURCE_ENCODING","UTF-8","source language file encoding");
Option<Boolean> source_file_gz = commandLine.addBooleanOption("source-text-gzipped",false,"is the source text gzipped");
Option<String> target_file = commandLine.addStringOption('t',"target-text","TARGET_FILENAME","name of file containing target language corpus");
//Option<String> target_file_encoding = commandLine.addStringOption("target-encoding","TARGET_ENCODING","ISO-8859-1","target language file encoding");
Option<String> target_file_encoding = commandLine.addStringOption("target-encoding","TARGET_ENCODING","UTF-8","target language file encoding");
Option<Boolean> target_file_gz = commandLine.addBooleanOption("target-text-gzipped",false,"is the target text gzipped");
Option<String> alignment_file = commandLine.addStringOption('a',"alignment","ALIGNMENT_FILENAME","name of file containing word alignments for the sentences in the corpus");
Option<Boolean> alignment_file_gz = commandLine.addBooleanOption("alignment-file-gzipped",false,"is the alignment file gzipped");
Option<Integer> num_lines = commandLine.addIntegerOption('l',"lines","LINE_COUNT","number of aligned sentences in the corpus");
Option<String> output_file = commandLine.addStringOption('o',"output","OUTPUT_FILENAME","file where aligned word pairs will be written");
Option<String> output_file_encoding = commandLine.addStringOption("output-encoding","OUTPUT_ENCODING","UTF-8","output file encoding");
Option<Boolean> output_file_gz = commandLine.addBooleanOption("output-text-gzipped",false,"should the output file be gzipped");
commandLine.parse(args);
try {
// Re-wrap System.out and System.err (with autoflush) using the value of the source-encoding option
try {
System.setOut(new PrintStream(System.out, true, commandLine.getValue(source_file_encoding)));
System.setErr(new PrintStream(System.err, true, commandLine.getValue(source_file_encoding)));
} catch (UnsupportedEncodingException e1) {
System.err.println(commandLine.getValue(source_file_encoding) + " is not a valid encoding; using system default encoding for System.out and System.err.");
} catch (SecurityException e2) {
System.err.println("Security manager is configured to disallow changes to System.out or System.err; using system default encoding.");
}
// The number of lines to read
int number_of_lines = commandLine.getValue(num_lines);
// Set up the source text for reading
Scanner source_text;
if (commandLine.getValue(source_file).endsWith(".gz") || commandLine.getValue(source_file_gz)) {
source_text = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(source_file))),commandLine.getValue(source_file_encoding))));
} else {
source_text = new Scanner( new File(commandLine.getValue(source_file)), commandLine.getValue(source_file_encoding));
}
// Set up the target text for reading
Scanner target_text;
if (commandLine.getValue(target_file).endsWith(".gz") || commandLine.getValue(target_file_gz)) {
target_text = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(target_file))),commandLine.getValue(target_file_encoding))));
} else {
target_text = new Scanner( new File(commandLine.getValue(target_file)), commandLine.getValue(target_file_encoding));
}
// Set up the alignment file for reading
Scanner alignments;
if (commandLine.getValue(alignment_file).endsWith(".gz") || commandLine.getValue(alignment_file_gz)) {
alignments = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(alignment_file))))));
} else {
alignments = new Scanner( new File(commandLine.getValue(alignment_file)));
}
// Set up the output file for writing
Writer outputFile;
if (commandLine.getValue(output_file).endsWith(".gz") || commandLine.getValue(output_file_gz)) {
outputFile = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(commandLine.getValue(output_file))),commandLine.getValue(output_file_encoding));
} else {
outputFile = new OutputStreamWriter(new FileOutputStream(commandLine.getValue(output_file)),commandLine.getValue(output_file_encoding));
}
try {
extract(number_of_lines, source_text, target_text, alignments, outputFile);
} catch (NoSuchElementException e) {
System.err.println("There are more than " + number_of_lines + " lines of input. Please determine the actual number of lines of input, and re-run with the appropriate command line flag.");
commandLine.printUsage();
System.exit(-1);
}
} catch (FileNotFoundException e) {
e.printStackTrace();