Examples of edu.umd.hooka.Vocab

edu.umd.hooka.Vocab

   * @throws IOException
   */
  public static void createTTableFromHooka(String srcVocabFile, String trgVocabFile, String tableFile, String finalSrcVocabFile, 
      String finalTrgVocabFile, String finalTableFile, float probThreshold, int numTrans, FileSystem fs) throws IOException{
    logger.setLevel(Level.DEBUG);
    Vocab srcVocab = HadoopAlign.loadVocab(new Path(srcVocabFile), fs);
    Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
    TTable_monolithic_IFAs ttable = new TTable_monolithic_IFAs(fs, new Path(tableFile), true);


    logger.debug(ttable.getMaxE() + "," + ttable.getMaxF());


    Vocab finalSrcVocab = new VocabularyWritable();
    Vocab finalTrgVocab = new VocabularyWritable();
    TTable_monolithic_IFAs finalTTable = new TTable_monolithic_IFAs();


    String srcTerm = null, trgTerm = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    float sumOfProbs = 0.0f, prob;
    //    int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0;    // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);


    //modify current ttable wrt foll. criteria: top numTrans translations per source term, unless cumulative prob. distr. exceeds probThreshold before that.
    for (int srcIndex = 1; srcIndex < srcVocab.size(); srcIndex++) {
      int[] translations;
      try {
        translations = ttable.get(srcIndex).getTranslations(0f);
      } catch (Exception e) {
        logger.warn("No translations found for "+srcVocab.get(srcIndex)+". Ignoring...");
        continue;
      }


      srcTerm = srcVocab.get(srcIndex);
      curIndex = finalSrcVocab.addOrGet(srcTerm);


      //initialize this term
      topTrans.clear();
      sumOfProbs = 0.0f;
      logger.debug("Processing: " + srcTerm + " with index: " + curIndex + " ("+srcIndex+"); " + translations.length + " translations");
      for (int trgIndex : translations) {
        try {
          trgTerm = trgVocab.get(trgIndex);
        } catch (Exception e) {
          logger.debug("Skipping " + trgIndex);
          continue;
        }
        prob = ttable.get(srcIndex, trgIndex);
        logger.debug("Found: " + trgTerm + " with " + prob);


        topTrans.add(new PairOfFloatString(prob, trgTerm));
        // keep top numTrans translations
        if (topTrans.size() > numTrans) {
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }
        sumOfProbs += prob;


        if (sumOfProbs > probThreshold) {
          logger.debug("Sum of probs > "+probThreshold+", early termination.");
          break;
        }  
      }


      //store previous term's top translations to ttable
      if(topTrans.size() > 0){
        addToTable(curIndex, topTrans, sumOfProbs, finalTTable, finalTrgVocab, probThreshold, stats);
      }
    }
    System.err.println("Vocabulary Target: "+finalTrgVocab.size()+" elements");
    System.err.println("Vocabulary Source: "+finalSrcVocab.size()+" elements");
    System.err.println(stats);


    FSDataOutputStream outputStream1 = fs.create(new Path(finalTrgVocabFile));
    ((VocabularyWritable) finalTrgVocab).write(outputStream1);

View Full Code Here

    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    Configuration conf = new Configuration();
    HookaStats stats = new HookaStats(-1, -1);
    try {
      FileSystem fs = FileSystem.get(conf);
      Vocab eVocabTrg = HadoopAlign.loadVocab(new Path(trgEVocabFile), conf);
      Vocab fVocabSrc = HadoopAlign.loadVocab(new Path(srcFVocabFile), conf);
      TTable_monolithic_IFAs f2e_Probs = new TTable_monolithic_IFAs(fs, new Path(ttableF2EFile), true);
      Vocab eVocabSrc = HadoopAlign.loadVocab(new Path(srcEVocabFile), conf);
      Vocab fVocabTrg = HadoopAlign.loadVocab(new Path(trgFVocabFile), conf);
      TTable_monolithic_IFAs e2f_Probs = new TTable_monolithic_IFAs(fs, new Path(ttableE2FFile), true);


      TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
      for (int e1 = 1; e1 < eVocabSrc.size(); e1++) {
        String eTerm = eVocabSrc.get(e1);


        float sumOfProbs = 0;
        int[] fS = e2f_Probs.get(e1).getTranslations(0.0f);
        for (int f1 : fS) {
          float prob1 = e2f_Probs.get(e1, f1);


          String fTerm = fVocabTrg.get(f1);         
          int f2 = fVocabSrc.get(fTerm);
          int e2 = eVocabTrg.get(eTerm);         


          float prob2 = f2e_Probs.get(f2, e2);
          float prob = prob1*prob2;

View Full Code Here

      // simplification mode
      createTTableFromHooka(hookaSrcVocab, hookaTrgVocab, hookaTTable, srcVocabFile, trgVocabFile, ttableFile, cumProbThreshold, numTrans, fs);
    } else if (srcWord != null && trgWord != null) {
      // query mode
      try {
        Vocab srcVocab = HadoopAlign.loadVocab(new Path(srcVocabFile), fs);
        Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
        TTable_monolithic_IFAs src2trgProbs = new TTable_monolithic_IFAs(fs, new Path(ttableFile), true);
        System.out.println("Source vocab size: " + srcVocab.size());
        System.out.println("Target vocab size: " + trgVocab.size());
        int srcId = -1;
        try {
          srcId = srcVocab.get(srcWord);
        } catch (Exception e) {
          System.err.println(srcWord + " not found in source-side vocabulary " + srcVocabFile);
          System.exit(-1);
        }
        if (trgWord.equals("ALL")) {
          int[] trgs = src2trgProbs.get(srcId).getTranslations(0.0f);
          System.out.println("(" + srcId + "," + srcWord + ") has "+ trgs.length + " translations:");
          for (int i = 0; i < trgs.length; i++) {
            trgWord = trgVocab.get(trgs[i]);
            System.out.println("Prob("+trgWord+"|"+srcWord+")="+src2trgProbs.get(srcId, trgs[i]));
          }
        }else {
          int trgId = -1;
          try {
            trgId = trgVocab.get(trgWord);
          } catch (Exception e) {
            System.err.println(trgWord + " not found in target-side vocabulary " + trgVocabFile);
            System.exit(-1);  
          }
          System.out.println("Prob("+trgWord+"|"+srcWord+")="+src2trgProbs.get(srcId, trgId));

View Full Code Here

      }else {
        LOG.info("No document output! Terminating...");
        return -1;
      }
      // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
      }  
      LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }


    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");


    return 0;

View Full Code Here

      if(finalNumDocs > 0){
        LOG.info("Changed doc count from "+env.readCollectionDocumentCount() + " to = "+finalNumDocs);
        env.writeCollectionDocumentCount(finalNumDocs);
      }
      // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
      }  
      LOG.info("Changed term count to : "+env.readCollectionTermCount() + " = " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }
    
    LOG.info("Preprocessing job finished in "+(System.currentTimeMillis()-preprocessStartTime)/1000.0+" seconds");


    return 0;

View Full Code Here

      }else {
        LOG.info("No document output! Terminating...");
        return -1;
      }
      // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
      }  
      LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }


    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");


    return 0;

View Full Code Here

   * @throws IOException
   */
  public static void createTTableFromHooka(String srcVocabFile, String trgVocabFile, String tableFile, String finalSrcVocabFile, String finalTrgVocabFile, String finalTableFile, float probThreshold, int numTrans, FileSystem fs) throws IOException{
    logger.setLevel(Level.INFO);


    Vocab srcVocab = HadoopAlign.loadVocab(new Path(srcVocabFile), fs);
    Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
    TTable_monolithic_IFAs ttable = new TTable_monolithic_IFAs(fs, new Path(tableFile), true);


    Vocab finalSrcVocab = new VocabularyWritable();
    Vocab finalTrgVocab = new VocabularyWritable();
    TTable_monolithic_IFAs finalTTable = new TTable_monolithic_IFAs();


    String srcTerm = null, trgTerm = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    float sumOfProbs = 0.0f, prob;
    //    int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0;    // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);


    //modify current ttable wrt foll. criteria: top numTrans translations per source term, unless cumulative prob. distr. exceeds probThreshold before that.
    for(int srcIndex=1; srcIndex<srcVocab.size(); srcIndex++){
      int[] translations;
      try {
        translations = ttable.get(srcIndex).getTranslations(0.0f);
      } catch (Exception e) {
        logger.warn("No translations found for "+srcVocab.get(srcIndex)+". Ignoring...");
        continue;
      }


      srcTerm = srcVocab.get(srcIndex);
      curIndex = finalSrcVocab.addOrGet(srcTerm);


      //initialize this term
      topTrans.clear();
      sumOfProbs = 0.0f;
      logger.debug("Processing: "+srcTerm+" with index: "+curIndex+" ("+srcIndex+")");
      for(int trgIndex : translations){
        trgTerm = trgVocab.get(trgIndex);
        prob = ttable.get(srcIndex, trgIndex);


        topTrans.add(new PairOfFloatString(prob, trgTerm));
        // keep top numTrans translations
        if(topTrans.size() > numTrans){
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }
        sumOfProbs += prob;


        if(sumOfProbs > probThreshold){
          logger.debug("Sum of probs > "+probThreshold+", early termination.");
          break;
        }  
      }


      //store previous term's top translations to ttable
      if(topTrans.size() > 0){
        addToTable(curIndex, topTrans, sumOfProbs, finalTTable, finalTrgVocab, probThreshold, stats);
      }
    }
    logger.info("Vocabulary Target: "+finalTrgVocab.size()+" elements");
    logger.info("Vocabulary Source: "+finalSrcVocab.size()+" elements");
    logger.info(stats);


    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(finalTrgVocabFile))));
    ((VocabularyWritable) finalTrgVocab).write(dos);

View Full Code Here

    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    Configuration conf = new Configuration();
    HookaStats stats = new HookaStats(-1, -1);
    try {
      FileSystem fs = FileSystem.get(conf);
      Vocab eVocabTrg = HadoopAlign.loadVocab(new Path(trgEVocabFile), conf);
      Vocab fVocabSrc = HadoopAlign.loadVocab(new Path(srcFVocabFile), conf);
      TTable_monolithic_IFAs f2e_Probs = new TTable_monolithic_IFAs(fs, new Path(ttableF2EFile), true);
      Vocab eVocabSrc = HadoopAlign.loadVocab(new Path(srcEVocabFile), conf);
      Vocab fVocabTrg = HadoopAlign.loadVocab(new Path(trgFVocabFile), conf);
      TTable_monolithic_IFAs e2f_Probs = new TTable_monolithic_IFAs(fs, new Path(ttableE2FFile), true);


      TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
      for (int e1 = 1; e1 < eVocabSrc.size(); e1++) {
        String eTerm = eVocabSrc.get(e1);


        float sumOfProbs = 0;
        int[] fS = e2f_Probs.get(e1).getTranslations(0.0f);
        for (int f1 : fS) {
          float prob1 = e2f_Probs.get(e1, f1);


          String fTerm = fVocabTrg.get(f1);         
          int f2 = fVocabSrc.get(fTerm);
          int e2 = eVocabTrg.get(eTerm);         


          float prob2 = f2e_Probs.get(f2, e2);
          float prob = prob1*prob2;

View Full Code Here

      FileSystem localFS = FileSystem.getLocal(conf);


      // query mode
      if (args.length == 5) {
        String srcTerm = args[0], trgTerm = args[1];
        Vocab srcVocab = HadoopAlign.loadVocab(new Path(args[2]), localFS);
        Vocab trgVocab = HadoopAlign.loadVocab(new Path(args[3]), localFS);
        TTable_monolithic_IFAs src2trgProbs = new TTable_monolithic_IFAs(localFS, new Path(args[4]), true);


        if (trgTerm.equals("ALL")) {
          int[] trgs = src2trgProbs.get(srcVocab.get(srcTerm)).getTranslations(0.0f);
          System.out.println(srcTerm + " has "+ trgs.length + " translations:");
          for (int i = 0; i < trgs.length; i++) {
            trgTerm = trgVocab.get(trgs[i]);
            System.out.println("Prob("+trgTerm+"|"+srcTerm+")="+src2trgProbs.get(srcVocab.get(srcTerm), trgVocab.get(trgTerm)));
          }
        }else {
          System.out.println("Prob("+trgTerm+"|"+srcTerm+")="+src2trgProbs.get(srcVocab.get(srcTerm), trgVocab.get(trgTerm)));
        }
        return;
      }


      // create mode

View Full Code Here

      }else {
        LOG.info("No document output! Terminating...");
        return -1;
      }
      // set Property.CollectionTermCount to the size of the target vocab. since all docs are translated into that vocab. This property is read by WriteRandomVectors via RunComputeSignatures.
      Vocab engVocabH = null;
      try {
        engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
      } catch (IOException e) {
        e.printStackTrace();
      }  
      LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
      env.writeCollectionTermCount(engVocabH.size());
    }


    LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");


    return 0;

View Full Code Here

0 1 2 3 4

TOP

Related Classes of edu.umd.hooka.Vocab

edu.umd.hooka.ttables.TTableTest

ivory.app.PreprocessWikipedia

ivory.core.driver.PreprocessWikipedia

ivory.core.preprocess.BuildTranslatedTermDocVectors$DataWriterMapper

ivory.core.util.CLIRUtils

ivory.core.util.CLIRUtilsTest

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.