* @throws IOException if the GIZA dictionary file cannot be read, or the vocabulary/probability output files cannot be written
*/
public static void createTTableFromGIZA(String filename, String srcVocabFile, String trgVocabFile, String probsFile, FileSystem fs) throws IOException{
  logger.setLevel(Level.INFO);
  TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
  VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();

  int cnt = 0;                                             // total dictionary lines read
  int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0; // for statistical purposes only

  // In GIZA output, dictionary entries are in random order (w.r.t. prob value), so we keep
  // a sorted set of the current source term's translations: at most NUM_TRANS entries, or
  // fewer once the cumulative probability mass exceeds PROB_THRESHOLD (early termination).
  //
  // NOTE(review): unlike the previous version, I/O errors now propagate (the method already
  // declares throws IOException) instead of being printed and swallowed, which used to
  // produce silently empty/partial output files. try-with-resources closes the reader on
  // every path, fixing a stream leak when an exception was thrown mid-parse.
  try (BufferedReader bis = new BufferedReader(
      new InputStreamReader(new FileInputStream(new File(filename)), "UTF-8"))) {
    String srcTerm = null, trgTerm = null, prev = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    boolean earlyTerminate = false, skipTerm = false;
    float sumOfProbs = 0.0f, prob;

    // Each line is "trgTerm srcTerm prob"; lines for the same source term are contiguous.
    String line;
    while ((line = bis.readLine()) != null) {
      String[] parts = line.split(" ");
      if (parts.length != 3) {
        throw new RuntimeException("Unknown format: " + line);
      }
      cnt++;
      trgTerm = parts[0];
      srcTerm = parts[1];
      prob = Float.parseFloat(parts[2]);

      if (prev == null || !srcTerm.equals(prev)) {
        // New source term: first flush the previous term's top translations to the ttable.
        if (topTrans.size() > 0) {
          int finalNumTrans = addToTable(curIndex, topTrans, table, trgVocab);
          if (finalNumTrans < NUM_TRANS) {
            cntShortTail++;
            sumShortTail += finalNumTrans;
          } else {
            cntLongTail++;
          }
        }
        logger.debug("Line:" + line);

        // Reset per-term state for the new source term.
        sumOfProbs = 0.0f;
        topTrans.clear();
        earlyTerminate = false;
        skipTerm = false;
        prev = srcTerm;
        int prevIndex = curIndex;
        curIndex = srcVocab.addOrGet(srcTerm);
        if (curIndex <= prevIndex) {
          // We've seen this foreign term before, probably due to a tokenization or sorting
          // error in the aligner. Revert curIndex and ignore all of this term's entries.
          curIndex = prevIndex;
          skipTerm = true;
          continue;
        }
        logger.debug("Processing: " + srcTerm + " with index: " + curIndex);
        topTrans.add(new PairOfFloatString(prob, trgTerm));
        sumOfProbs += prob;
      } else if (!earlyTerminate && !skipTerm) {
        // Same source term: keep accumulating, retaining only the top NUM_TRANS entries.
        topTrans.add(new PairOfFloatString(prob, trgTerm));
        if (topTrans.size() > NUM_TRANS) {
          // pollFirst() removes the lowest-probability entry; keep sumOfProbs in sync.
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }
        sumOfProbs += prob;
      } else {
        logger.debug("Skipped");
      }

      if (sumOfProbs > PROB_THRESHOLD) {
        earlyTerminate = true;
        logger.debug("Sum of probs > " + PROB_THRESHOLD + ", early termination.");
      }
    }

    // Flush the final source term's top translations to the ttable.
    if (topTrans.size() > 0) {
      int finalNumTrans = addToTable(curIndex, topTrans, table, trgVocab);
      if (finalNumTrans < NUM_TRANS) {
        cntShortTail++;
        sumShortTail += finalNumTrans;
      } else {
        cntLongTail++;
      }
    }

    logger.info("Vocabulary Target: " + trgVocab.size() + " elements");
    logger.info("Vocabulary Source: " + srcVocab.size() + " elements");
    logger.info("# source terms with > " + PROB_THRESHOLD + " probability covered: " + cntShortTail + " and average translations per term: " + (sumShortTail / (cntShortTail + 0.0f)));
    logger.info("# source terms with <= " + PROB_THRESHOLD + " probability covered: " + cntLongTail + " (each has " + NUM_TRANS + " translations)");
  }

  // Persist the two vocabularies and the translation table to HDFS; try-with-resources
  // guarantees each stream is flushed and closed even if a later write fails.
  try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(trgVocabFile))))) {
    trgVocab.write(dos);
  }
  try (DataOutputStream dos2 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(srcVocabFile))))) {
    srcVocab.write(dos2);
  }
  try (DataOutputStream dos3 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(probsFile))))) {
    table.write(dos3);
  }
}