Package edu.umd.cloud9.io.pair

Examples of edu.umd.cloud9.io.pair.PairOfLongInt


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-int-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(enIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(enIntDocVector2, value);
    reader.close();
  }
View Full Code Here


    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(deIntDocVector2, value);
    reader.close();
  }
View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapIFW map = new HMapIFW();
    WeightedIntDocVector value = new WeightedIntDocVector();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(galagoIndex + "/test_wt-int-doc-vectors/part-00000")));

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(galagoIntDocVector1, value);

    reader.next(key, value);
    System.out.println("*** top 10 terms ***");
    map = value.getWeightedTerms();
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(galagoIntDocVector2, value);
    reader.close();
  }
View Full Code Here

    FileSystem fs = FileSystem.get(conf);

    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    WeightedIntDocVector value = new WeightedIntDocVector();
    HMapIFW map = new HMapIFW();

    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-int-doc-vectors/part-00000")));
    reader.next(key, value);
    map = value.getWeightedTerms();
    System.out.println("*** top 10 terms ***");
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(opennlpIntDocVector1, value);

    reader.next(key, value);
    map = value.getWeightedTerms();
    System.out.println("*** top 10 terms ***");
    for ( MapIF.Entry entry : map.getEntriesSortedByValue(10)) {
      System.out.println(entry.getKey() + ": " + entry.getValue());
    }
    verifyIntDocVector(opennlpIntDocVector2, value);
    reader.close();
  }
View Full Code Here

   *     FileSystem object
   * @return
   *     mapping from term ids to df values
   */
  public static HMapIFW readTransDfTable(Path path, FileSystem fs) {
    HMapIFW transDfTable = new HMapIFW();
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      IntWritable key = (IntWritable) reader.getKeyClass().newInstance();
      FloatWritable value = (FloatWritable) reader.getValueClass().newInstance();

      while (reader.next(key, value)) {
        transDfTable.put(key.get(), value.get());
        //        logger.info(key.get()+"-->"+value.get());
        key = (IntWritable) reader.getKeyClass().newInstance();
        value = (FloatWritable) reader.getValueClass().newInstance();
      }
      reader.close();
View Full Code Here

   *     contains mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, FrequencySortedDictionary dict, DfTableArray dfTable){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = e2f_probs.get(e).getTranslations(0.0f);
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        int id = dict.getId(fTerm);
        if(id != -1){
          float df_f = dfTable.getDf(id);       
          df += (probEF*df_f);
        }else{
          logger.debug(fTerm+" not in dict");
        }
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

   *     mapping from F-terms to their df values
   * @return
   *     mapping from E-terms to their computed df values
   */
  public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){
    HMapIFW transDfTable = new HMapIFW();
    for(int e=1;e<eVocabSrc.size();e++){
      int[] fS = null;
      try {
        fS = e2f_probs.get(e).getTranslations(0.0f);
      } catch (Exception e1) {
        e1.printStackTrace();
      }
      float df=0;
      for(int f : fS){
        float probEF = e2f_probs.get(e, f);
        String fTerm = fVocabTrg.get(f);
        if(!dfs.containsKey(fTerm)){  //only if word is in the collection, can it contribute to the df values.
          continue;
        }     
        float df_f = dfs.get(fTerm);
        df+=(probEF*df_f);
      }
      transDfTable.put(e, df);
    }
    return transDfTable;
  }
View Full Code Here

      // add token translations into a #combine of #weight array structures
      JsonArray tokensArr = new JsonArray();
      if (tokenWeight > 0) {
        for (String srcToken : stemmedSourceTokens) {
          HMapSFW nbestDist = translation.getDistributionOf(srcToken);

          if (defaultTokenizer.isStopWord(srcToken)){
            continue;
          }
          LOG.info("Processing "+srcToken);

          // combine translations from N-best AND bilingual dictionary
          List<PairOfFloatMap> tokenRepresentationList = new ArrayList<PairOfFloatMap>();

          // Pr{bitext}
          if (bitextWeight > 0) {
            HMapSFW bitextDist = clGenerator.getTranslations(origQuery.trim(), srcToken, pairsInGrammar, stemmed2Stemmed);
            if(bitextDist != null && !bitextDist.isEmpty()){
              tokenRepresentationList.add(new PairOfFloatMap(bitextDist, bitextWeight));
            }
          }

          // Pr{scfg}
          if (scfgWeight > 0) {
            HMapSFW scfgDist = scfgGenerator.getTranslations(origQuery.trim(), srcToken, probMap, stemmed2Stemmed);
            if (scfgDist != null && !scfgDist.isEmpty() ){
              tokenRepresentationList.add(new PairOfFloatMap(scfgDist, scfgWeight));
            }
          }

          // Pr{n-best}
          if (mtWeight > 0 && nbestDist != null && !nbestDist.isEmpty()) {
            Utils.normalize(nbestDist);
            tokenRepresentationList.add(new PairOfFloatMap(nbestDist, mtWeight));
          }

          JsonArray combinedArr;
          float scale = 1;
          if (scaling) {
            scale = scale * translation.getSourceTokenCnt().get(srcToken) / ((float)translation.getCount());
          }
          if(tokenRepresentationList.size() == 0) {
            continue;       // if empty distr., do not represent this source token in query
          } else if(tokenRepresentationList.size() == 1) {
            combinedArr = Utils.createJsonArrayFromProbabilities(Utils.scaleProbMap(lexProbThreshold, scale, tokenRepresentationList.get(0).getMap()));
          } else {
            combinedArr = Utils.createJsonArrayFromProbabilities(Utils.combineProbMaps(lexProbThreshold, scale, tokenRepresentationList));
          }

          JsonObject tokenWeightedArr = new JsonObject();         
          tokenWeightedArr.add("#weight", combinedArr);

          // optional: if this source token has occurred more than once per query, reflect this in the representation
          //  for (int i = 0; i < Math.ceil(tokenCount.get(srcToken)/(float)kBest); i++) {
          //    tokensArr.put(tokenWeightedArr);
          //  }
          tokensArr.add(tokenWeightedArr);
        }
        queryTJson.add("#combine", tokensArr);
      }

      // combine the token-based and phrase-based representations into a #combweight structure
      JsonArray queryJsonArr = new JsonArray();

      HMapSFW scaledPhrase2Weight = null;
      if (phraseWeight > 0) {
        scaledPhrase2Weight = Utils.scaleProbMap(lexProbThreshold, phraseWeight, translation.getPhraseDist());     
        for (String phrase : scaledPhrase2Weight.keySet()) {
          queryJsonArr.add(new JsonPrimitive(scaledPhrase2Weight.get(phrase)));
          queryJsonArr.add(new JsonPrimitive(phrase));
        }
      }
      if (tokenWeight > 0) {
        queryJsonArr.add(new JsonPrimitive(tokenWeight));
View Full Code Here

    }  
    return probMap;
  }

  private String getBestTranslation(String query, String token) {
    HMapSFW probDist = query2probMap.get(query).get(token);

    if(probDist == null){
      return token;
    }

    float maxProb = 0f;
    String maxProbTrans = null;
    for (edu.umd.cloud9.util.map.MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }
View Full Code Here

    }
    return maxProbTrans;
  }

  protected HMapSFW getTranslations(String query, String token, Map<String, HMapSFW> probMap, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = null;
    try {
      probDist = probMap.get(token);
    } catch (NullPointerException e) {
      LOG.info("Prob map not found for " + query);
      e.printStackTrace();
    }
   
    if(probDist == null){
      // borrow OOV word heuristic from MT: if no translation found, include itself as translation
      probDist = new HMapSFW();
      String targetStem = stemmed2Stemmed.get(token);
      String target = (stemmed2Stemmed == null || targetStem == null) ? token : stemmed2Stemmed.get(token);
      probDist.put(target, 1);     
      return probDist;
    }

    return probDist;
  }
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.pair.PairOfLongInt

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.