package edu.umd.hooka.alignment;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import edu.umd.hooka.Alignment;
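/**
 * Symmetrizes two directional GIZA++ word alignments (e-to-f and f-to-e)
 * using the grow-diag-final-and heuristic. The job reads a manifest of
 * "&lt;transpose&gt; ||| &lt;path&gt;" lines, streams each referenced GIZA A3 file in
 * the map phase keyed by sentence number, and refines the two directions
 * of each sentence pair into one alignment in the reduce phase.
 */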
public class HSymAlign {
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable,Text,IntWritable,Text> {
private Text l = new Text();
private IntWritable linenum = new IntWritable(1);
private JobConf job;
public void configure(JobConf conf) { job = conf; }
public void map(LongWritable key, Text value,
OutputCollector<IntWritable,Text> output,
Reporter reporter) throws IOException {
String line = value.toString();
if (line.length() == 0) { return; }
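// Each manifest line has the form "<transpose flag> ||| <path to GIZA A3 file>".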
String[] toks = line.split("\\s*\\|\\|\\|\\s*");
if (toks.length != 2)
throw new IOException("Expected input of form '0 ||| /path/to/input'");
String pfx = toks[0];
if (!pfx.equals("0") && !pfx.equals("1"))
throw new IOException("Expected transpose field to be 0 or 1");
Path p = new Path(toks[1]);
FileSystem fileSys = FileSystem.get(job);
BufferedReader giza = new BufferedReader(new InputStreamReader(fileSys.open(p), "UTF-8"));
int lc = 0;
String comment;
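// GIZA A3 output comes in three-line records: a "# Sentence pair (n) ..."
// comment, the plain sentence, and the aligned sentence with "({ ... })" sets.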
while ((comment = giza.readLine()) != null) {
String e = giza.readLine();
String f = giza.readLine();
if (e == null || f == null)
throw new IOException("Truncated GIZA file: incomplete record after sentence pair " + lc);
lc++;
linenum.set(lc);
l.set(pfx + " ||| " + comment + " ||| " + e + " ||| " + f);
output.collect(linenum, l);
}
giza.close();
}
}
public static class Reduce extends MapReduceBase
implements Reducer<IntWritable,Text,IntWritable,Text> {
Text alout = new Text();
Refiner r = null;
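// Created lazily in reduce() rather than in configure(), since configure(JobConf)
// cannot declare the checked exception RefinerFactory.getForName may throw.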
public void reduce(IntWritable key, Iterator<Text> values,
OutputCollector<IntWritable,Text> output,
Reporter reporter) throws IOException {
if (r == null) {
try {
r = RefinerFactory.getForName("grow-diag-final-and");
} catch (Exception e) {
throw new IOException("Caught exception: " + e);
}
}
if (!values.hasNext()) { throw new IOException("Layout error: no alignments for key " + key); }
// Copy each value to a String immediately: Hadoop reuses the same Text
// instance across calls to values.next().
String sa = values.next().toString();
if (!values.hasNext()) { throw new IOException("Layout error: expected two alignments for key " + key); }
String sb = values.next().toString();
// The transpose prefix marks the direction: '0' is f-to-e, anything else e-to-f.
String e2f = sa;
String f2e = sa;
if (sb.charAt(0) == '0') { f2e = sb; } else { e2f = sb; }
String[] ae2f = e2f.split("\\s*\\|\\|\\|\\s*");
String[] af2e = f2e.split("\\s*\\|\\|\\|\\s*");
// A record is "pfx ||| comment ||| e ||| f"; fields 2 and 3 are the GIZA sentence lines.
Alignment a1 = Alignment.fromGiza(ae2f[2], ae2f[3], true);
Alignment a2 = Alignment.fromGiza(af2e[2], af2e[3], false);
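// Merge the two directional alignments into a single symmetric alignment.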
Alignment a = r.refine(a1, a2);
alout.set(a.toString());
output.collect(key, alout);
}
}
public static void main(String[] args) {
JobConf conf = new JobConf(HSymAlign.class);
conf.setJobName("alignment-sym");
conf.setOutputKeyClass(IntWritable.class); // keys are 1-based sentence (line) numbers
conf.setOutputValueClass(Text.class); // values are alignment records (map) and symmetrized alignments (reduce)
conf.setMapperClass(MapClass.class);
conf.setReducerClass(Reduce.class);
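// One map task streams the GIZA files; the per-sentence symmetrization
// work is spread across the reducers.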
conf.setNumMapTasks(1);
conf.setNumReduceTasks(500);
String filename="infiles";
String outputPath="align";
FileInputFormat.setInputPaths(conf, new Path(filename));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
try{
JobClient.runJob(conf);
} catch (Exception e) {
e.printStackTrace();
}
}
}