package edu.umd.hooka;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.streaming.StreamXmlRecordReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader;
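/**
 * Compiles an XML-format parallel corpus into a numberized bitext: each
 * parallel chunk is preprocessed, its words are replaced by integer
 * vocabulary ids, and the result is emitted as PhrasePair records in a
 * SequenceFile, with the vocabularies written to ${root}/vocab.E and
 * ${root}/vocab.F.
 */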
public class CorpusVocabNormalizerAndNumberizer {
enum BitextCompilerCounters { EN_WORDS, FR_WORDS, CHUNKS, WRONG_LANGUAGE, SRC_TOO_LONG, TGT_TOO_LONG }
private static final Logger sLogger = Logger.getLogger(CorpusVocabNormalizerAndNumberizer.class);
static final String SRC_LANG = "ha.sourcelang";
static final String TGT_LANG = "ha.targetlang";
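/**
 * Maps one XML parallel chunk to a PhrasePair keyed by the chunk's id string.
 * Vocabularies accumulate in static fields and are written out once in
 * close(), which is only correct when the job runs a single map task (see
 * preprocessAndNumberizeFiles, which calls setNumMapTasks(1)).
 */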
public static class BitextCompilerMapper extends MapReduceBase
implements Mapper<Text, Text, Text, PhrasePair> {
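// Shared across all map() calls in this JVM and persisted in close();
// correct only under the single-map-task setup used by this job.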
static Vocab vocE = null;
static Vocab vocF = null;
ParallelCorpusReader pcr = new ParallelCorpusReader();
Language src = null;
Language tgt = null;
AlignmentWordPreprocessor sawp = null;
AlignmentWordPreprocessor tawp = null;
LanguagePair lp = null;
JobConf job_ = null;
@Override
public void configure(JobConf job) {
sLogger.setLevel(Level.OFF);
src = Language.languageForISO639_1(job.get(SRC_LANG));
tgt = Language.languageForISO639_1(job.get(TGT_LANG));
sLogger.debug("Source language: "+src.code());
sLogger.debug("Target language: "+tgt.code());
boolean useVocabServer = false;
if (!useVocabServer) {
if (vocE == null) vocE = new VocabularyWritable();
if (vocF == null) vocF = new VocabularyWritable();
} else {
try {
vocE = new VocabServerClient(job.get("ha.vocabserver.host"),
Integer.parseInt(job.get("ha.vocabserver.port1")));
vocF = new VocabServerClient(job.get("ha.vocabserver.host"),
Integer.parseInt(job.get("ha.vocabserver.port2")));
} catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); }
}
lp = LanguagePair.languageForISO639_1Pair(
src.code() + "-" + tgt.code());
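// ha.trunc.use (default true) selects word preprocessors specialized to the
// language pair; otherwise generic preprocessors are created.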
if (job.getBoolean("ha.trunc.use", true)) {
sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
} else {
sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
}
job_ = job;
}
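/**
 * Converts a token sequence into vocabulary ids, adding previously unseen
 * tokens to the vocabulary as a side effect.
 */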
public int[] convertStrings(String[] s, Vocab v) {
int[] res = new int[s.length];
for (int i = 0; i < s.length; ++i) {
res[i] = v.addOrGet(s[i]);
sLogger.debug(s[i] + " --> " + res[i]);
}
return res;
}
Text ok = new Text("");
@Override
public void close() {
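// Runs once at the end of the (single) map task: persist the accumulated vocabularies.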
System.err.println("Target: " + vocE.size() + " types. Writing to "+job_.get("root",null)+"/vocab.E");
System.err.println("Source: " + vocF.size() + " types .Writing to "+job_.get("root",null)+"/vocab.F");
//write out vocabulary to file
try {
FileSystem fs = FileSystem.get(job_);
DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(job_.get("root",null)+"/vocab.E"))));
((VocabularyWritable) vocE).write(dos);
dos.close();
DataOutputStream dos2 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(job_.get("root",null)+"/vocab.F"))));
((VocabularyWritable) vocF).write(dos2);
dos2.close();
} catch (IOException e) {
throw new RuntimeException("Vocabulary could not be written to disk.", e);
}
}
// Read in the XML-format bitext and emit each instance as a PhrasePair, keyed by a unique string id.
public void map(Text key, Text value,
OutputCollector<Text, PhrasePair> oc,
Reporter reporter) throws IOException {
// key: one parallel sentence pair, with its alignment if present, in XML format; the value is ignored.
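// Illustratively, a record looks something like the following (the exact
// schema is whatever ParallelCorpusReader.parseString accepts):
//   <pchunk name="doc1.3"><s lang="de">...</s><s lang="en">...</s></pchunk>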
ParallelChunk c = pcr.parseString(key.toString());
ok.set(c.idString());
// A Chunk holds the sentence's tokens, split on whitespace (no further tokenization).
Chunk fc = c.getChunk(src);
Chunk ec = c.getChunk(tgt);
if (fc == null || ec == null) {
reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
return;
}
if (fc.getLength() > 200) {
reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
return;
}
if (ec.getLength() > 200) {
reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
return;
}
// ec, fc: target/source sentences as word sequences
// vocE, vocF: their vocabularies (VocabularyWritable)
// ee, fe: the same sentences as sequences of vocabulary ids
sLogger.debug("Target sentence:");
int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
sLogger.debug("Source sentence:");
int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);
// e, f: the whole sentences wrapped as Phrases (the int argument distinguishes the two sides: 0 = target, 1 = source)
Phrase e = new Phrase(ee, 0);
Phrase f = new Phrase(fe, 1);
PhrasePair b = new PhrasePair(f, e);
ReferenceAlignment ra = c.getReferenceAlignment(lp);
if (ra != null) {
b.setAlignment(ra);
}
reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
oc.collect(ok, b);
}
}
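/**
 * InputFormat that delivers each XML element delimited by the configured
 * stream.recordreader.begin/end tags as one record, via Hadoop streaming's
 * StreamXmlRecordReader. Compressed files are never split, and are currently
 * rejected outright in getRecordReader.
 */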
public static class XMLInput extends FileInputFormat<Text, Text> implements JobConfigurable {
private CompressionCodecFactory compressionCodecs = null;
public void configure(JobConf conf) {
compressionCodecs = new CompressionCodecFactory(conf);
}
protected boolean isSplitable(FileSystem fs, Path file) {
if (compressionCodecs == null) return true;
return compressionCodecs.getCodec(file) == null;
}
public RecordReader<Text, Text> getRecordReader(
InputSplit genericSplit, JobConf job,
Reporter reporter)
throws IOException {
reporter.setStatus(genericSplit.toString());
FileSplit split = (FileSplit)genericSplit;
final Path file = split.getPath();
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(file);
if (compressionCodecs != null && compressionCodecs.getCodec(file) != null)
throw new RuntimeException("Not handling compression!");
return new StreamXmlRecordReader(fileIn, split, reporter, job, fs);
}
}
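/**
 * Configures and runs the bitext compilation job: reads the XML bitext from
 * inputPaths and writes numberized PhrasePairs to output as a SequenceFile.
 */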
@SuppressWarnings({ "deprecation", "null" })
public static void preprocessAndNumberizeFiles(Configuration c,
String inputPaths, Path output) throws IOException {
sLogger.setLevel(Level.INFO);
JobConf conf = new JobConf(c);
conf.setJobName("bitext.compile");
// Optional vocab-server mode: not enabled here, so these servers are never
// started and the cleanup in the finally block is a no-op.
Thread vst1 = null;
Thread vst2 = null;
VocabServer vocabServer1 = null;
VocabServer vocabServer2 = null;
try {
// inputPaths: the XML bitext handed in by the caller (e.g. HadoopAlign's main method)
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(PhrasePair.class);
conf.setMapperClass(BitextCompilerMapper.class);
conf.setReducerClass(IdentityReducer.class);
// A single map task is required so the static vocabularies in
// BitextCompilerMapper accumulate over the whole corpus and are written exactly once.
conf.setNumMapTasks(1);
conf.setNumReduceTasks(1);
FileInputFormat.setInputPaths(conf, inputPaths);
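// Configure StreamXmlRecordReader to treat each <pchunk>...</pchunk> element as one record.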
conf.set("stream.recordreader.begin", "<pchunk");
conf.set("stream.recordreader.end", "</pchunk>");
conf.set("stream.recordreader.slowmatch", "false");
conf.set("stream.recordreader.maxrec", "100000");
conf.setInputFormat(XMLInput.class);
FileOutputFormat.setOutputPath(conf, output);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setJarByClass(CorpusVocabNormalizerAndNumberizer.class);
System.out.println("Running job "+conf.getJobName());
System.out.println("Input: " + inputPaths);
System.out.println("Output: "+output);
JobClient.runJob(conf);
} finally {
try {
if (vocabServer1 != null) vocabServer1.stopServer();
if (vocabServer2 != null) vocabServer2.stopServer();
if (vst1 != null) vst1.join();
if (vst2 != null) vst2.join();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
}
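/** Ad-hoc driver for running the compilation job with hard-coded paths. */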
public static void main(String[] args) {
try {
Configuration c = new Configuration();
c.set(SRC_LANG, "de");
c.set(TGT_LANG, "en");
// c.set("mapred.job.tracker", "local");
// c.set("fs.default.name", "file:///");
// FileSystem.get(c).delete(new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"), true);
// preprocessAndNumberizeFiles(c, "/Users/ferhanture/edu/research/programs/hadoop-aligner/training-data.tar/eu-nc-wmt2008.de-en/eu-nc-wmt2008.de-en.xml", new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"));
preprocessAndNumberizeFiles(c, "/umd-lin/fture/mt/eu-nc-wmt2008.de-en.xml", new Path("/umd-lin/fture/mt/aligner/comp-bitext"));
} catch (Exception e) { e.printStackTrace(); }
}
}