package edu.umd.hooka.alignment;
import edu.umd.hooka.alignment.model1.Model1;
import edu.umd.hooka.ttables.TTable;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import edu.umd.hooka.Alignment;
import edu.umd.hooka.PhrasePair;
/**
* Reads a bitext and generates a TTable object (serialized) based on the
* (e,f) cooccurrences in the text.
*
* @author redpony
*
*/
/**
 * Map-only Hadoop job that computes the Model 1 Viterbi alignment for every
 * sentence pair in a bitext, writing one alignment string per pair and
 * accumulating log-probability / word-count counters so the driver can report
 * the corpus-level cross-entropy and perplexity.
 */
public class M1ViterbiExtract {
	//static final String bitext ="/shared/bitexts/small.ar-en.ldc/ar-en.bitext";
	//static final String ttable ="/user/redpony/small.ar-en.ttable";
	static final String bitext ="/shared/bitexts/hansards.fr-en/hansards.aachen.bitext";
	static final String ttable ="/user/redpony/hansards.aachen.ttable";

	/**
	 * Deserializes a {@link TTable_monolithic_IFAs} from the given path on the
	 * default filesystem.
	 *
	 * @param path location of the serialized translation table
	 * @return the deserialized table (as its {@link TTable} supertype)
	 * @throws IOException if the file cannot be opened or read
	 */
	static protected TTable loadTTable(Path path) throws IOException {
		org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
		FileSystem fileSys = FileSystem.get(conf);
		DataInputStream in = new DataInputStream(new BufferedInputStream(fileSys.open(path)));
		// FIX: the stream was previously never closed, leaking a filesystem
		// handle on every call. try/finally keeps the code Java-6 compatible.
		try {
			TTable_monolithic_IFAs tt = new TTable_monolithic_IFAs();
			tt.readFields(in);
			return tt;
		} finally {
			in.close();
		}
	}

	/**
	 * Mapper: emits {@code (sentence id, alignment string)} for each input
	 * {@link PhrasePair} and increments the LOGPROB / WORDCOUNT counters.
	 */
	public static class M1ViterbiMapper extends MapReduceBase
			implements Mapper<IntWritable,PhrasePair,IntWritable,Text> {

		Text out = new Text();
		PerplexityReporter cr = new PerplexityReporter();
		// Lazily initialized on the first map() call so the (potentially
		// large) translation table is loaded once per task, not per job setup.
		Model1 m1 = null;

		public void map(IntWritable key, PhrasePair value,
				OutputCollector<IntWritable,Text> output,
				Reporter reporter) throws IOException {
			if (m1 == null) {
				Path pathTTable = new Path(ttable);
				TTable tt = loadTTable(pathTTable);
				m1 = new Model1(tt, true);
			}
			cr.reset();
			Alignment a = m1.viterbiAlign(value, cr);
			out.set(a.toString());
			output.collect(key, out);
			// Hadoop counters hold longs only, so the fractional part of the
			// log-probability is truncated toward zero.
			reporter.incrCounter(CrossEntropyCounters.LOGPROB, (long)(cr.getTotalLogProb()));
			reporter.incrCounter(CrossEntropyCounters.WORDCOUNT, cr.getTotalWordCount());
		}

		public void close() {
		}
	}

	/**
	 * Configures and runs the map-only alignment job, then prints the
	 * Viterbi cross-entropy (bits/word) and perplexity from the counters.
	 *
	 * @param args unused
	 * @throws IOException if the job fails to submit or run
	 */
	@SuppressWarnings("deprecation")
	public static void main(String[] args) throws IOException {
		int mapTasks = 15;
		JobConf conf = new JobConf(M1ViterbiMapper.class);
		conf.setJobName("m1viterbi");
		// FIX: was LongWritable.class, but M1ViterbiMapper emits IntWritable
		// keys; with zero reducers the mismatch fails the job at collect time.
		conf.setOutputKeyClass(IntWritable.class);
		conf.setOutputValueClass(Text.class);
		conf.setMapperClass(M1ViterbiMapper.class);
		conf.setNumMapTasks(mapTasks);
		conf.setNumReduceTasks(0); // map-only: alignments go straight to output
		conf.setInputFormat(SequenceFileInputFormat.class);
		FileInputFormat.setInputPaths(conf, new Path(bitext));
		FileOutputFormat.setOutputPath(conf, new Path("somealigns.test"));
		RunningJob rj = JobClient.runJob(conf);
		Counters cs = rj.getCounters();
		double lp = (double)cs.getCounter(CrossEntropyCounters.LOGPROB);
		double wc = (double)cs.getCounter(CrossEntropyCounters.WORDCOUNT);
		// Counters accumulate natural-log probabilities; divide by ln(2) to
		// convert to bits per word before exponentiating for perplexity.
		double ce = (lp / wc) / Math.log(2.0);
		System.out.println("Viterbi cross-entropy: " + ce + " perplexity: " + Math.pow(2.0, ce));
	}
}