Package edu.umd.hooka.alignment

Source Code of edu.umd.hooka.alignment.HSymAlign$Reduce

package edu.umd.hooka.alignment;

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import edu.umd.hooka.Alignment;

public class HSymAlign {

  public static class MapClass extends MapReduceBase
    implements Mapper<LongWritable,Text,IntWritable,Text> {
       
    private Text l = new Text();
    private IntWritable linenum = new IntWritable(1);
       
    public void map(LongWritable key, Text value,
        OutputCollector<IntWritable,Text> output,
        Reporter reporter) throws IOException {
      String line = value.toString();
      if (line.length() == 0) { return; }
      String[] toks = line.split("\\s*\\|\\|\\|\\s*");
      if (toks.length != 2)
        throw new IOException("Expected input of form '0 ||| /path/to/input'");
      String pfx = toks[0];
      if (pfx.length() != 1 && (!pfx.equals("0") || !pfx.equals("1")))
        throw new IOException("Excepted transpose field to be 0 or 1");
      Path p = new Path(toks[1]);
      org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
      FileSystem fileSys = FileSystem.get(conf);
      BufferedReader giza = new BufferedReader(new InputStreamReader(fileSys.open(p), "UTF8"));
      int lc = 0;
      String comment;
      while ((comment = giza.readLine()) != null) {
        String e = giza.readLine();
        String f = giza.readLine();
        lc++;
        linenum.set(lc);
            l.set(pfx + " ||| " + comment + " ||| " + e + " ||| " + f);
            output.collect(linenum, l);
      }
    }
  }
 
  public static class Reduce extends MapReduceBase
    implements Reducer<IntWritable,Text,IntWritable,Text> {
       
    Text alout = new Text();
    Refiner r = null;
   
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<IntWritable,Text> output,
        Reporter reporter) throws IOException {
      if (r == null) {
        try {
          r = RefinerFactory.getForName("grow-diag-final-and");
        } catch (Exception e) {
          throw new IOException("Caught exception: " + e);
        }
      }
      Text ta = values.next();
      Text tb = values.next();
      if (ta == null || tb == null) { throw new IOException("Layout error!"); }
      String sa = ta.toString();
      String sb = tb.toString();
      String e2f = sa;
      String f2e = sa;
      if (sb.charAt(0) == '0') { f2e = sb; } else { e2f = sb; }
      String[] ae2f = e2f.split("\\s*\\|\\|\\|\\s*");
      String[] af2e = f2e.split("\\s*\\|\\|\\|\\s*");
      Alignment a1 = Alignment.fromGiza(ae2f[1], ae2f[2], true);
      Alignment a2 = Alignment.fromGiza(af2e[1], af2e[2], false);
      Alignment a = r.refine(a1, a2);
      alout.set(a.toString());
      output.collect(key, alout);
    }
  }
 
  public static void main(String[] args) {
    JobConf conf = new JobConf(HSymAlign.class);
    conf.setJobName("alignment-sym");
   
    conf.setOutputKeyClass(IntWritable.class);            // the keys are words (strings)
    conf.setOutputValueClass(Text.class);   // the values are counts (ints)
       
    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);
           
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(500);
    String filename="infiles";
    String outputPath="align";
    FileInputFormat.setInputPaths(conf, new Path(filename));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
             
    try{
        JobClient.runJob(conf);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of edu.umd.hooka.alignment.HSymAlign$Reduce

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.