package ivory.core.preprocess;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.TermDocVector;
import ivory.core.data.stat.DocLengthTable;
import ivory.core.data.stat.DocLengthTable4B;
import ivory.core.data.stat.PrefixEncodedGlobalStats;
import ivory.core.util.CLIRUtils;
import ivory.pwsim.score.ScoringModel;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.mapred.NullInputFormat;
import edu.umd.cloud9.mapred.NullMapper;
import edu.umd.cloud9.mapred.NullOutputFormat;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapIF;
/**
 * Translates term doc vectors in a foreign language (e.g., German) into the target language
 * (e.g., English), using the CLIR technique described in "Combining Bidirectional Translation
 * and Synonymy for Cross-Language Information Retrieval", Jianqiang Wang and Douglas W. Oard,
 * SIGIR'06.
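 *
 * <p>A minimal usage sketch (assumptions: the {@code PowerTool} superclass drives
 * {@code runTool()} via {@code run()}, and the path and model class below are
 * illustrative, not taken from this file):</p>
 *
 * <pre>
 * Configuration conf = new Configuration();
 * conf.set("Ivory.IndexPath", "/path/to/index");            // hypothetical path
 * conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25"); // hypothetical model class
 * // ... plus Ivory.Lang, Ivory.CollectionName, and the six vocabulary/ttable paths read in runTool()
 * new BuildTranslatedTermDocVectors(conf).run();
 * </pre>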
* @author ferhanture
*
*/
@SuppressWarnings("deprecation")
public class BuildTranslatedTermDocVectors extends PowerTool {
private static final Logger LOG = Logger.getLogger(BuildTranslatedTermDocVectors.class);
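// Emit every SAMPLING-th document; with SAMPLING = 1 (the default), no sampling is done.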
private static int SAMPLING = 1;
protected static enum Docs {
ZERO, SHORT, Total
};
protected static enum DF {
TransDf, NoDf
}
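// Maps each (docno, TermDocVector) in the source language to a translated, weighted
// term doc vector (HMapSFW) in the target language.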
private static class MyMapperTrans extends MapReduceBase implements
Mapper<IntWritable, TermDocVector, IntWritable, HMapSFW> {
private ScoringModel model;
private HMapIFW transDfTable;
// eVocabSrc is the English vocabulary for probability table e2f_Probs.
// eVocabTrg is the English vocabulary for probability table f2e_Probs.
// fVocabSrc is the German vocabulary for probability table f2e_Probs.
// fVocabTrg is the German vocabulary for probability table e2f_Probs.
static Vocab eVocabSrc, fVocabSrc, fVocabTrg, eVocabTrg;
static TTable_monolithic_IFAs f2e_Probs, e2f_Probs;
static float avgDocLen;
static int numDocs;
static boolean isNormalize;
private String language;
int MIN_SIZE = 0; // minimum number of terms per document, set via the Conf object; filters out noise from Wikipedia stubs and other very short articles
public void configure(JobConf job) {
// LOG.setLevel(Level.DEBUG);
numDocs = job.getInt("Ivory.CollectionDocumentCount", -1);
avgDocLen = job.getFloat("Ivory.AvgDocLen", -1);
isNormalize = job.getBoolean("Ivory.Normalize", false);
language = job.get("Ivory.Lang");
LOG.debug(numDocs+" "+avgDocLen);
MIN_SIZE = job.getInt("Ivory.MinNumTerms", 0);
FileSystem localFs = null;
try {
localFs = FileSystem.getLocal(job);
} catch (IOException e) {
throw new RuntimeException("Error initializing local FileSystem!", e);
}
Path[] localFiles = null;
try {
localFiles = DistributedCache.getLocalCacheFiles(job);
} catch (IOException e1) {
throw new RuntimeException("Error initializing cache file paths!");
}
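// NOTE: the localFiles[] indices used below must match the order in which the files
// are added to the DistributedCache in runTool().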
// Load translated df values.
try {
transDfTable = CLIRUtils.readTransDfTable(localFiles[0], localFs);
} catch (Exception e) {
LOG.error(e.getMessage());
throw new RuntimeException("Error initializing DfTable!", e);
}
// Load vocabularies and probability tables.
try {
eVocabTrg = HadoopAlign.loadVocab(localFiles[1], localFs);
fVocabSrc = HadoopAlign.loadVocab(localFiles[2], localFs);
f2e_Probs = new TTable_monolithic_IFAs(localFs, localFiles[3], true);
eVocabSrc = HadoopAlign.loadVocab(localFiles[4], localFs);
fVocabTrg = HadoopAlign.loadVocab(localFiles[5], localFs);
e2f_Probs = new TTable_monolithic_IFAs(localFs, localFiles[6], true);
} catch (IOException e) {
throw new RuntimeException("Error initializing vocabularies/prob table!", e);
}
try {
model = (ScoringModel) Class.forName(job.get("Ivory.ScoringModel")).newInstance();
} catch (Exception e) {
throw new RuntimeException("Error initializing Ivory.ScoringModel!");
}
// this only needs to be set once for the entire collection
model.setDocCount(numDocs);
model.setAvgDocLength(avgDocLen);
if(job.get("debug")!=null){
LOG.setLevel(Level.DEBUG);
}
LOG.debug(numDocs);
LOG.debug(avgDocLen);
LOG.debug("---------");
}
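// Conceptually (following Wang & Oard, SIGIR'06), each target-language term e in the
// translated vector accumulates tf(e) = sum over source-language terms f of P(e|f) * tf(f),
// with both translation directions available for filtering unreliable pairs; the details
// are encapsulated in CLIRUtils.translateTFs() and CLIRUtils.createTermDocVector().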
public void map(IntWritable docno, TermDocVector doc,
OutputCollector<IntWritable, HMapSFW> output, Reporter reporter)
throws IOException {
// For generating sample document vectors; no sampling when SAMPLING == 1.
if (docno.get() % SAMPLING != 0) {
return;
}
if (!language.equals("english")) {
// Shift docnos to distinguish the two collections in the PWSim sliding-window algorithm.
docno.set(docno.get() + 1000000000);
}
// Translate the doc vector.
HMapIFW tfS = new HMapIFW();
int docLen = CLIRUtils.translateTFs(doc, tfS, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, LOG);
HMapSFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabSrc, model, transDfTable, isNormalize, LOG);
// If no translation of any term made it into the target vocabulary, drop the document:
// our model was not capable of translating it.
if (v.isEmpty()) {
reporter.incrCounter(Docs.ZERO, 1);
} else if (v.size() < MIN_SIZE) {
reporter.incrCounter(Docs.SHORT, 1);
} else {
reporter.incrCounter(Docs.Total, 1);
output.collect(docno, v);
}
}
}
public BuildTranslatedTermDocVectors(Configuration conf) {
super(conf);
}
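// Parameters that must be present in the Configuration (the PowerTool superclass is
// assumed to validate these via getRequiredParameters() before runTool() is invoked).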
public static final String[] RequiredParameters = { "Ivory.IndexPath",
"Ivory.ScoringModel" };
public String[] getRequiredParameters() {
return RequiredParameters;
}
@Override
public int runTool() throws Exception {
String indexPath = getConf().get("Ivory.IndexPath");
String scoringModel = getConf().get("Ivory.ScoringModel");
RetrievalEnvironment env = new RetrievalEnvironment(indexPath, FileSystem.get(getConf()));
String outputPath = env.getWeightedTermDocVectorsDirectory();
String transDfFile = indexPath + "/transDf.dat";
String fVocab_f2e = getConf().get("Ivory.F_Vocab_F2E"); // de side of P(e|f)
String eVocab_f2e = getConf().get("Ivory.E_Vocab_F2E"); // en side of P(e|f)
String ttable_f2e = getConf().get("Ivory.TTable_F2E"); // P(e|f)
String eVocab_e2f = getConf().get("Ivory.E_Vocab_E2F"); // en side of P(f|e)
String fVocab_e2f = getConf().get("Ivory.F_Vocab_E2F"); // de side of P(f|e)
String ttable_e2f = getConf().get("Ivory.TTable_E2F"); // P(f|e)
createTranslatedDFFile(transDfFile);
JobConf conf = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
conf.setJobName("BuildTranslatedTermDocVectors");
FileSystem fs = FileSystem.get(conf);
if(fs.exists(new Path(outputPath))){
LOG.info(outputPath+": Translated term doc vectors already exist! Nothing to do for this job...");
return 0;
}
String collectionName = getConf().get("Ivory.CollectionName");
String inputPath = env.getTermDocVectorsDirectory();
LOG.info("Preparing to build document vectors using " + scoringModel);
LOG.info("Document vectors to be stored in " + outputPath);
LOG.info("CollectionName: " + collectionName);
LOG.info("Input path: " + inputPath);
/////// Configuration setup
conf.set("Ivory.IndexPath", indexPath);
conf.set("Ivory.ScoringModel", scoringModel);
DocLengthTable mDLTable;
try {
mDLTable = new DocLengthTable4B(env.getDoclengthsData(), fs);
} catch (IOException e) {
throw new RuntimeException("Error initializing doclengths file!", e);
}
LOG.info("Average doc length: " + mDLTable.getAvgDocLength());
LOG.info("Number of docs: " + mDLTable.getDocCount());
conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
conf.setInt("Ivory.CollectionDocumentCount", env.readCollectionDocumentCount());
conf.setNumMapTasks(300);
conf.setNumReduceTasks(0);
conf.set("mapred.child.java.opts", "-Xmx2048m");
conf.setInt("mapred.map.max.attempts", 10);
conf.setInt("mapred.reduce.max.attempts", 10);
conf.setInt("mapred.task.timeout", 6000000);
//////// Cache files
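// NOTE: the order of these files must match the localFiles[] indices read in
// MyMapperTrans.configure().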
DistributedCache.addCacheFile(new URI(transDfFile), conf);
DistributedCache.addCacheFile(new URI(eVocab_f2e), conf);
DistributedCache.addCacheFile(new URI(fVocab_f2e), conf);
DistributedCache.addCacheFile(new URI(ttable_f2e), conf);
DistributedCache.addCacheFile(new URI(eVocab_e2f), conf);
DistributedCache.addCacheFile(new URI(fVocab_e2f), conf);
DistributedCache.addCacheFile(new URI(ttable_e2f), conf);
FileInputFormat.setInputPaths(conf, new Path(inputPath));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(HMapSFW.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(HMapSFW.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.setMapperClass(MyMapperTrans.class);
long startTime = System.currentTimeMillis();
JobClient.runJob(conf);
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
return 0;
}
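// Runs a trivial single-map Hadoop job (NullInputFormat/NullMapper, no reducers) whose
// sole task materializes the translated df table as a SequenceFile at transDfFile,
// unless the file already exists.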
private void createTranslatedDFFile(String transDfFile) {
try {
JobConf conf2 = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
conf2.setJobName("BuildTranslatedDfTable");
FileSystem fs2 = FileSystem.get(conf2);
if(fs2.exists(new Path(transDfFile))){
LOG.info("Translated Df file already exists! Nothing to do for this job...");
}else{
LOG.info("Creating translated Df file ...");
conf2.set("mapred.child.java.opts", "-Xmx2048m");
conf2.setInt("mapred.map.max.attempts", 10);
conf2.setInt("mapred.reduce.max.attempts", 10);
conf2.setInt("mapred.task.timeout", 6000000);
conf2.set("TransDfFile", transDfFile);
conf2.setSpeculativeExecution(false);
conf2.setNumMapTasks(1);
conf2.setNumReduceTasks(0);
conf2.setInputFormat(NullInputFormat.class);
conf2.setOutputFormat(NullOutputFormat.class);
conf2.setMapperClass(DataWriterMapper.class);
JobClient.runJob(conf2);
LOG.info("Done");
}
} catch (IOException e) {
throw new RuntimeException("Error creating translated df file!", e);
}
}
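// The lone "mapper" of the BuildTranslatedDfTable job: loads the E2F vocabularies and
// translation table plus this index's term/df data, translates the df statistics into
// the target language, and writes the result as a SequenceFile.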
private static class DataWriterMapper extends NullMapper {
public void run(JobConf conf, Reporter reporter) throws IOException {
Logger sLogger = Logger.getLogger(DataWriterMapper.class);
sLogger.setLevel(Level.DEBUG);
String indexPath = conf.get("Ivory.IndexPath");
FileSystem fs2 = FileSystem.get(conf);
RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs2);
String transDfFile = conf.get("TransDfFile");
String eFile = conf.get("Ivory.E_Vocab_E2F");
String fFile = conf.get("Ivory.F_Vocab_E2F");
String e2fttableFile = conf.get("Ivory.TTable_E2F");
String termsFile = env.getIndexTermsData();
String dfByTermFile = env.getDfByTermData();
sLogger.debug(e2fttableFile + " " + eFile + " " + termsFile);
if (!fs2.exists(new Path(fFile)) || !fs2.exists(new Path(eFile))
|| !fs2.exists(new Path(e2fttableFile)) || !fs2.exists(new Path(termsFile))
|| !fs2.exists(new Path(dfByTermFile))) {
throw new RuntimeException("Error: translation files do not exist!");
}
Vocab eVocab_e2f = null, fVocab_e2f = null;
TTable_monolithic_IFAs en2DeProbs = null;
try {
eVocab_e2f = HadoopAlign.loadVocab(new Path(eFile), conf);
fVocab_e2f = HadoopAlign.loadVocab(new Path(fFile), conf);
en2DeProbs = new TTable_monolithic_IFAs(fs2, new Path(e2fttableFile), true);
} catch (IOException e) {
throw new RuntimeException("Error loading vocabularies/prob table!", e);
}
PrefixEncodedGlobalStats globalStatsMap = new PrefixEncodedGlobalStats(new Path(termsFile), fs2);
globalStatsMap.loadDFStats(new Path(dfByTermFile), fs2);
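// Conceptually, the df of each target-language term e is estimated as
// df(e) = sum over source-language terms f of P(f|e) * df(f); the exact computation
// is encapsulated in CLIRUtils.translateDFTable().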
HMapIFW transDfTable = CLIRUtils.translateDFTable(eVocab_e2f, fVocab_e2f, en2DeProbs, globalStatsMap);
SequenceFile.Writer writer = SequenceFile.createWriter(fs2, conf, new Path(transDfFile), IntWritable.class, FloatWritable.class);
for(MapIF.Entry term : transDfTable.entrySet()){
reporter.incrCounter(DF.TransDf, 1);
writer.append(new IntWritable(term.getKey()), new FloatWritable(term.getValue()));
}
writer.close();
}
}
}