// Source Code of joshua.util.ExtractTopCand

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */


package joshua.util;


import joshua.util.io.LineReader;
import joshua.util.io.IndexedReader;


import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import java.io.IOException;




/**
 * This program extracts the 1-best output translations from the
 * n-best output translations generated by
 * {@link joshua.decoder.JoshuaDecoder}.
 *
 * @author wren ng thornton <wren@users.sourceforge.net>
 * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
 */
/* TODO: This class should be renamed to something like ExtractBestCandidates
 * or ExtractBestTranslations. Saying "top" implies more than one
 * (the top how many?) and "cand" is an unnecessary abbreviation (also,
 * who cares about candidacy?). Once we rename this, the
 * ./example2/decode_example2.sh script will need updating (as will
 * the end-to-end code).
 */
public class ExtractTopCand {
  
  /**
   * Usage: <code>java ExtractTopCand nbestInputFile 1bestOutputFile</code>.
   * <p>
   * If the input file name is "-" then input is read from
   * <code>System.in</code>. If the output file name is "-"
   * then output is directed to <code>System.out</code>. If
   * a file already exists with the output file name, it is
   * truncated before writing. The bulk of this program is
   * implemented by
   * {@link #extractOneBest(IndexedReader,BufferedWriter)}.
   */
  public static void main(String[] args) {
    if (2 != args.length) {
      System.err.println("Usage: ExtractTopCand nbestInputFile 1bestOutputFile\n       (use \"-\" for stdin/stdout)");
      System.exit(1);
    }
    
    try {
      // TODO: see documentation for extractOneBest
      // regarding using an n-best SegmentFileParser.
      IndexedReader<String> nbestReader =
        new IndexedReader<String>("line",
          "-".equals(args[0])
            ? new LineReader(System.in)
            : new LineReader(args[0]));
      
      /* TODO: This duplicates FileUtility.getWriteFileStream
       * but with the addition of defaulting to System.out;
       * should fix that (without breaking other clients
       * of that method). We ultimately want something which
       * autochecks for errors (like Writer); has a newLine
       * method (like BufferedWriter); can wrap System.out;
       * can autoflush; and it'd be handy to have the
       * print/println methods of PrintStream/PrintWriter
       * to boot. PrintWriter *almost* gives us all this,
       * but it swallows errors and gives no way to
       * retrieve them >:(
       */
      BufferedWriter onebestWriter =
        new BufferedWriter(
          new OutputStreamWriter(
            ("-".equals(args[1])
              ? System.out
              : new FileOutputStream(args[1], false)
            ), "UTF-8"));
      
      extractOneBest(nbestReader, onebestWriter);
      
    } catch (IOException ioe) {
      // NOTE: if our onebest was System.out, then that
      // will already have been closed by the finally
      // block. Printing to a closed PrintStream generates
      // no exceptions. We should be printing to System.err
      // anyways, but this something subtle to be aware of.
      System.err.println("There was an error: " + ioe.getMessage());
    }
  }
  
  
  /**
   * Prints the one-best translation for each segment ID from
   * the reader as a line on the writer, and closes both
   * before exiting. The translations for a segment are printed
   * in the order of the first occurance of the segment ID.
   * Any information about the segment other than the translation
   * (including segment ID) is not printed to the writer.
   * 
   * <h4>Developer Notes</h4>
   * This implementation assumes:
   * <ol>
   * <li>all translations for a segment are contiguous</li>
   * <li>the 1-best translation is the first one encountered.</li>
   * </ol>
   * We will need to alter the implementation if these
   * assumptions no longer hold for the output of JoshuaDecoder
   * (or any sensible n-best format passed to this method).
   * <p>
   * We should switch to using an n-best
   * {@link joshua.decoder.segment_file.SegmentFileParser}
   * to ensure future compatibility with being able to configure
   * the output format of the decoder. The MERT code needs
   * such a SegmentFileParser anyways, so that will reduce
   * the code duplication between these two classes.
   */
  protected static void extractOneBest(
    IndexedReader<String> nbestReader, BufferedWriter onebestWriter)
  throws IOException {
    
    try {
      String prevID = null;
      for (String line : nbestReader) {
        
        String[] columns = Regex.threeBarsWithSpace.split(line);
        
        // We allow non-integer segment IDs because the
        // Segment interface does, and we have no reason
        // to add new restrictions.
        String newID = columns[0].trim();
        
        // We want to give the same error message
        // regardless of whether there's a leading space
        // or not. And, we don't want to accidentally
        // accept lines with lots and lots of columns.
        if ("".equals(newID) || newID.startsWith("|||")) {
          throw nbestReader.wrapIOException(
            new IOException("Malformed line, missing segment ID:\n" + line));
        }
        
        // Make sure there's a translation there too
        // TODO: good error message for when the second
        // "|||" doesn't have a following field, m/\|{3}\s*$/
        if (3 > columns.length) {
          throw nbestReader.wrapIOException(
            new IOException("Malformed line, should have at least two \" ||| \":\n" + line));
        }
        
        
        if (null == prevID || ! prevID.equals(newID)) {
          onebestWriter.write(columns[1], 0, columns[1].length());
          onebestWriter.newLine();
          onebestWriter.flush();
          
          prevID = newID;
        }
      }
    } finally {
      try {
        nbestReader.close();
      } finally {
        onebestWriter.close();
      }
    }
  }
}
// Source Code of joshua.util.ExtractTopCand

// Related Classes of joshua.util.ExtractTopCand