Package edu.umd.hooka.alignment

Source Code of edu.umd.hooka.alignment.M1ViterbiExtract$M1ViterbiMapper

package edu.umd.hooka.alignment;

import edu.umd.hooka.alignment.model1.Model1;
import edu.umd.hooka.ttables.TTable;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;

import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

import edu.umd.hooka.Alignment;
import edu.umd.hooka.PhrasePair;

/**
* Reads a bitext and generates a TTable object (serialized) based on the
* (e,f) cooccurrences in the text.
*
* @author redpony
*
*/
public class M1ViterbiExtract {
 
  //static final String bitext  ="/shared/bitexts/small.ar-en.ldc/ar-en.bitext";
  //static final String ttable  ="/user/redpony/small.ar-en.ttable";

  static final String bitext  ="/shared/bitexts/hansards.fr-en/hansards.aachen.bitext";
  static final String ttable  ="/user/redpony/hansards.aachen.ttable";

  static protected TTable loadTTable(Path path) throws IOException {
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    FileSystem fileSys = FileSystem.get(conf);
 
    DataInput in = new DataInputStream(new BufferedInputStream(fileSys.open(path)));
    TTable_monolithic_IFAs tt = new TTable_monolithic_IFAs();
    tt.readFields(in);
   
    return tt;
  }
 
  public static class M1ViterbiMapper extends MapReduceBase
    implements Mapper<IntWritable,PhrasePair,IntWritable,Text> {
   
    Text out = new Text();
    PerplexityReporter cr = new PerplexityReporter();
    Model1 m1 = null;
    public void map(IntWritable key, PhrasePair value,
                        OutputCollector<IntWritable,Text> output,
                        Reporter reporter) throws IOException {
      if (m1 == null) {
        Path pathTTable = new Path(ttable);
        TTable tt = loadTTable(pathTTable);
        m1 = new Model1(tt, true);
      }
      cr.reset();
      Alignment a = m1.viterbiAlign(value, cr);
      out.set(a.toString());
      output.collect(key, out);
      reporter.incrCounter(CrossEntropyCounters.LOGPROB, (long)(cr.getTotalLogProb()));
      reporter.incrCounter(CrossEntropyCounters.WORDCOUNT, cr.getTotalWordCount());
    }
   
    public void close() {
    }
  }
   
  @SuppressWarnings("deprecation")
  public static void main(String[] args) throws IOException {
    int mapTasks    = 15;
   
    JobConf conf = new JobConf(M1ViterbiMapper.class);
    conf.setJobName("m1viterbi");
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(M1ViterbiMapper.class);           
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(bitext));
    FileOutputFormat.setOutputPath(conf, new Path("somealigns.test"));

    RunningJob rj = JobClient.runJob(conf);
    Counters cs = rj.getCounters();
    double lp = (double)cs.getCounter(CrossEntropyCounters.LOGPROB);
    double wc = (double)cs.getCounter(CrossEntropyCounters.WORDCOUNT);
    double ce = (lp / wc) / Math.log(2.0);
    System.out.println("Viterbi cross-entropy: " + ce + "   perplexity: " + Math.pow(2.0, ce));
  }
 
}
TOP

Related Classes of edu.umd.hooka.alignment.M1ViterbiExtract$M1ViterbiMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.