Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.ConvertCorpus

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import java.io.IOException;
import java.io.ObjectInput;
import java.util.logging.Logger;

import joshua.corpus.CorpusArray;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.io.BinaryIn;

/**
* Given a corpus and an existing symbol table, read the corpus,
* and create a binary representation of the corpus using the
* provided symbol table.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2009-05-22 23:31:12 -0500 (Fri, 22 May 2009) $
*/
public class ConvertCorpus {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(ConvertCorpus.class.getName());
 
  /**
   * Given a corpus and an existing symbol table, read the
   * corpus, and create a binary representation of the corpus
   * using the provided symbol table.
   *
   * @param args Command line arguments
   * @throws ClassNotFoundException
   * @throws IOException
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException {

    // Read the command line arguments
    if (args.length < 3) {
      System.err.println(
          "Usage: java " + SuffixArray.class.getName() +
          " target_corpus tgt.lm.vocab tgt.corpus");
      System.exit(-1);
    }
    String corpusFileName = args[0];
    String binaryVocabFilename = args[1];
    String binaryCorpusFilename = args[2];
    String charset = (args.length > 3) ? args[3] : "UTF-8";
   
    // Read the provided symbol table
    logger.info("Reading provided symbol table");
    Vocabulary symbolTable = new Vocabulary();
    ObjectInput in = BinaryIn.vocabulary(binaryVocabFilename);
    symbolTable.readExternal(in);
   
    // Read the provided corpus
    logger.info("Reading provided corpus");
    Vocabulary oldSymbolTable = new Vocabulary();
    int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, oldSymbolTable, true);
    CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, oldSymbolTable, lengths[0], lengths[1]);
   
    // Change the internal integer-string mappings
    // of the corpus to use those provided by the given symbol table.
    logger.info("Converting corpus to use new symbol mappings");
    corpusArray.setSymbolTable(symbolTable);
   
    // Write the corpus to disk in binary format
    logger.info("Writing corpus to disk in binary format, using new symbol mappings");
    corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset);
   
  }

}
TOP

Related Classes of joshua.corpus.suffix_array.ConvertCorpus

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.