/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.util;
import joshua.util.io.LineReader;
import joshua.util.io.IndexedReader;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import java.io.IOException;
/**
* This program extracts the 1-best output translations from the
* n-best output translations generated by
* {@link joshua.decoder.JoshuaDecoder}.
*
* @author wren ng thornton <wren@users.sourceforge.net>
* @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
*/
/* TODO: This class should be renamed, something like ExtractBestCandidates
* or ExtractBestTranslations. Saying "top" implies more than one
* (the top how many?) and "cand" is unnecessary abbreviation (also,
* who cares about candidacy?). Once we rename this, the
* ./example2/decode_example2.sh script will need updating (as will
* the end-to-end code)
*/
public class ExtractTopCand {

    /**
     * Usage: <code>java ExtractTopCand nbestInputFile 1bestOutputFile</code>.
     * <p>
     * If the input file name is "-" then input is read from
     * <code>System.in</code>. If the output file name is "-"
     * then output is directed to <code>System.out</code>. If
     * a file already exists with the output file name, it is
     * truncated before writing. The bulk of this program is
     * implemented by
     * {@link #extractOneBest(IndexedReader,BufferedWriter)}.
     * <p>
     * The process exits with status 1 both on a usage error
     * and when an I/O error (including a malformed input line)
     * is reported, so that calling scripts can detect failure.
     *
     * @param args exactly two elements: the n-best input file
     *             name and the 1-best output file name
     */
    public static void main(String[] args) {
        if (2 != args.length) {
            System.err.println("Usage: ExtractTopCand nbestInputFile 1bestOutputFile\n (use \"-\" for stdin/stdout)");
            System.exit(1);
        }

        try {
            // TODO: see documentation for extractOneBest
            // regarding using an n-best SegmentFileParser.
            //
            // The reader is opened before the writer so that a
            // missing input file is reported before the output
            // file gets truncated.
            IndexedReader<String> nbestReader =
                new IndexedReader<String>("line",
                    "-".equals(args[0])
                        ? new LineReader(System.in)
                        : new LineReader(args[0]));

            /* TODO: This duplicates FileUtility.getWriteFileStream
             * but with the addition of defaulting to System.out;
             * should fix that (without breaking other clients
             * of that method). We ultimately want something which
             * autochecks for errors (like Writer); has a newLine
             * method (like BufferedWriter); can wrap System.out;
             * can autoflush; and it'd be handy to have the
             * print/println methods of PrintStream/PrintWriter
             * to boot. PrintWriter *almost* gives us all this,
             * but it swallows errors and gives no way to
             * retrieve them >:(
             */
            BufferedWriter onebestWriter =
                new BufferedWriter(
                    new OutputStreamWriter(
                        ("-".equals(args[1])
                            ? System.out
                            : new FileOutputStream(args[1], false)
                        ), "UTF-8"));

            // extractOneBest closes both the reader and the writer.
            extractOneBest(nbestReader, onebestWriter);

        } catch (IOException ioe) {
            // NOTE: if our onebest was System.out, then that
            // will already have been closed by the finally
            // block in extractOneBest. Printing to a closed
            // PrintStream generates no exceptions. We should be
            // printing to System.err anyways, but this is
            // something subtle to be aware of.
            System.err.println("There was an error: " + ioe.getMessage());
            // Signal the failure to the caller; previously the
            // program exited with status 0 even after an error.
            System.exit(1);
        }
    }


    /**
     * Prints the one-best translation for each segment ID from
     * the reader as a line on the writer, and closes both
     * before exiting. The translations for a segment are printed
     * in the order of the first occurrence of the segment ID.
     * Any information about the segment other than the translation
     * (including segment ID) is not printed to the writer.
     *
     * <h4>Developer Notes</h4>
     * This implementation assumes:
     * <ol>
     * <li>all translations for a segment are contiguous</li>
     * <li>the 1-best translation is the first one encountered.</li>
     * </ol>
     * We will need to alter the implementation if these
     * assumptions no longer hold for the output of JoshuaDecoder
     * (or any sensible n-best format passed to this method).
     * <p>
     * We should switch to using an n-best
     * {@link joshua.decoder.segment_file.SegmentFileParser}
     * to ensure future compatibility with being able to configure
     * the output format of the decoder. The MERT code needs
     * such a SegmentFileParser anyways, so that will reduce
     * the code duplication between these two classes.
     *
     * @param nbestReader   source of n-best lines whose fields are
     *                      separated by "|||"; closed before returning
     * @param onebestWriter destination for the 1-best translations,
     *                      one per line; closed before returning
     * @throws IOException if a line has no segment ID or fewer than
     *                     three fields, or if reading, writing or
     *                     closing fails
     */
    protected static void extractOneBest(
            IndexedReader<String> nbestReader, BufferedWriter onebestWriter)
            throws IOException {
        try {
            String prevID = null;
            for (String line : nbestReader) {
                String[] columns = Regex.threeBarsWithSpace.split(line);

                // We allow non-integer segment IDs because the
                // Segment interface does, and we have no reason
                // to add new restrictions.
                //
                // A line consisting solely of the delimiter can
                // split into a zero-length array (this is what
                // java.util.regex.Pattern.split does, since it
                // discards trailing empty strings; Regex.split is
                // assumed to behave the same way). Treat that as
                // an empty segment ID so that it gets the regular
                // malformed-line error below rather than an
                // ArrayIndexOutOfBoundsException.
                String newID = (0 == columns.length)
                    ? ""
                    : columns[0].trim();

                // We want to give the same error message
                // regardless of whether there's a leading space
                // or not. And, we don't want to accidentally
                // accept lines with lots and lots of columns.
                if ("".equals(newID) || newID.startsWith("|||")) {
                    throw nbestReader.wrapIOException(
                        new IOException("Malformed line, missing segment ID:\n" + line));
                }

                // Make sure there's a translation there too
                // TODO: good error message for when the second
                // "|||" doesn't have a following field, m/\|{3}\s*$/
                if (3 > columns.length) {
                    throw nbestReader.wrapIOException(
                        new IOException("Malformed line, should have at least two \" ||| \":\n" + line));
                }

                // Only the first line seen for each run of equal
                // segment IDs is emitted; it is taken to be the
                // 1-best translation.
                if (null == prevID || ! prevID.equals(newID)) {
                    String translation = columns[1];
                    onebestWriter.write(translation, 0, translation.length());
                    onebestWriter.newLine();
                    // Flushed after every translation so that the
                    // output is written out as soon as it is produced.
                    onebestWriter.flush();

                    prevID = newID;
                }
            }
        } finally {
            // Close both resources even if closing the reader fails.
            try {
                nbestReader.close();
            } finally {
                onebestWriter.close();
            }
        }
    }
}