Package edu.umd.cloud9.io.map

Examples of edu.umd.cloud9.io.map.HMapSFW


  /**
   * Combines a list of weighted probability distributions into a single distribution.
   * Each entry's combined value is the weighted average of its probability across the
   * input distributions, multiplied by scale; entries at or below threshold are dropped.
   *
   * @param threshold
   *    minimum combined probability for an entry to be kept
   * @param scale
   *    factor applied to each combined probability
   * @param probMaps
   *    list of probability distributions
   * @return combined probability distribution
   */
  public static HMapSFW combineProbMaps(float threshold, float scale, List<PairOfFloatMap> probMaps) {
    HMapSFW combinedProbMap = new HMapSFW();

    int numDistributions = probMaps.size();

    // get a combined set of all translation alternatives
    // compute normalization factor when sum of weights is not 1.0
    Set<String> translationAlternatives = new HashSet<String>();
    float sumWeights = 0;
    for (int i=0; i < numDistributions; i++) {
      HMapSFW dist = probMaps.get(i).getMap();
      float weight = probMaps.get(i).getWeight();

      // don't add vocabulary from a distribution that has 0 weight
      if (weight > 0) {
        translationAlternatives.addAll(dist.keySet());
        sumWeights += weight;
      }
    }
   
    // normalize by sumWeights
    for (String e : translationAlternatives) {
      float combinedProb = 0f;
      for (int i = 0; i < numDistributions; i++) {
        HMapSFW dist = probMaps.get(i).getMap();
        float weight = probMaps.get(i).getWeight();
        // only count distributions that actually contain e in the weighted average
        if (dist.containsKey(e)) {
          combinedProb += (weight / sumWeights) * dist.get(e);    // Prob(e|f) = weighted average of all distributions
        }
      }
      combinedProb *= scale;
      if (combinedProb > threshold) {
        combinedProbMap.put(e, combinedProb);
      }
    }
    return combinedProbMap;
  }
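A minimal usage sketch, assuming PairOfFloatMap wraps an HMapSFW together with a weight via a (map, weight) constructor; its exact API is not shown in this snippet:

    // two hypothetical translation distributions for the same source term
    HMapSFW d1 = new HMapSFW();
    d1.put("house", 0.7f);
    d1.put("home", 0.3f);

    HMapSFW d2 = new HMapSFW();
    d2.put("house", 0.5f);
    d2.put("building", 0.5f);

    List<PairOfFloatMap> maps = new ArrayList<PairOfFloatMap>();
    maps.add(new PairOfFloatMap(d1, 0.6f));   // assumed (map, weight) constructor
    maps.add(new PairOfFloatMap(d2, 0.4f));

    // keep entries whose weighted average, scaled by 1.0, exceeds 0.1
    HMapSFW combined = combineProbMaps(0.1f, 1.0f, maps);
    // "house" -> 0.6*0.7 + 0.4*0.5 = 0.62, "home" -> 0.18, "building" -> 0.2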


  /**
   * Normalizes each distribution in probMap in place: values are first normalized to
   * sum to 1, entries at or below lexProbThreshold are dropped, and the remaining
   * entries are re-normalized and kept in decreasing order of probability until the
   * cumulative probability reaches cumProbThreshold or maxNumTrans entries are kept.
   *
   * @param probMap
   *    map from source terms to translation probability distributions
   * @param lexProbThreshold
   *    minimum normalized probability for an entry to be kept
   * @param cumProbThreshold
   *    cumulative probability at which to stop adding translations
   * @param maxNumTrans
   *    maximum number of translations kept per source term
   */
  public static void normalize(Map<String, HMapSFW> probMap, float lexProbThreshold, float cumProbThreshold, int maxNumTrans) {
    for (String sourceTerm : probMap.keySet()) {
      HMapSFW probDist = probMap.get(sourceTerm);
      TreeSet<PairOfStringFloat> sortedFilteredProbDist = new TreeSet<PairOfStringFloat>();
      HMapSFW normProbDist = new HMapSFW();

      // compute normalization factor
      float sumProb = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        sumProb += entry.getValue();
      }

      // normalize values and remove low-prob entries based on normalized values
      float sumProb2 = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        float pr = entry.getValue() / sumProb;
        if (pr > lexProbThreshold) {
          sumProb2 += pr;
          sortedFilteredProbDist.add(new PairOfStringFloat(entry.getKey(), pr));
        }
      }

      // re-normalize values after removal of low-prob entries
      float cumProb = 0;
      int cnt = 0;
      while (cnt < maxNumTrans && cumProb < cumProbThreshold && !sortedFilteredProbDist.isEmpty()) {
        PairOfStringFloat entry = sortedFilteredProbDist.pollLast();
        float pr = entry.getValue() / sumProb2;
        cumProb += pr;
        normProbDist.put(entry.getKey(), pr);
        cnt++;
      }

      probMap.put(sourceTerm, normProbDist);
    }
  }
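A usage sketch with illustrative values, assuming PairOfStringFloat orders pairs by their float value so that pollLast() in the loop above yields the most probable remaining translation:

    Map<String, HMapSFW> probMap = new HashMap<String, HMapSFW>();
    HMapSFW dist = new HMapSFW();
    dist.put("house", 6f);
    dist.put("home", 3f);
    dist.put("shack", 1f);
    probMap.put("casa", dist);

    // drop entries at or below 0.2 after normalization, then keep top translations
    // until 90% of the remaining mass or 2 entries are covered
    normalize(probMap, 0.2f, 0.9f, 2);
    // "casa" now maps to { house=0.666..., home=0.333... }; "shack" was filtered at 0.1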

          if (trans != null) {
            tokenTranslations.put(trans);
          }
        } else {
          JSONObject tokenTrans = new JSONObject();
          HMapSFW distr = getTranslations(token, stemmed2Stemmed);
          if (distr == null) { continue; }
          JSONArray weights = Utils.probMap2JSON(distr);
          if (weights != null) {       
            tokenTrans.put("#weight", weights);
            tokenTranslations.put(tokenTrans);
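For reference, the per-token object assembled above follows Indri-style weighted alternatives. Assuming Utils.probMap2JSON flattens the distribution into alternating weight/term entries (its implementation is not shown here), a token with two translations would serialize roughly as:

    // hypothetical output for one token:
    // { "#weight" : [ 0.7, "house", 0.3, "home" ] }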

    }
    return token;
  }

  protected HMapSFW getTranslations(String token, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = new HMapSFW();
    int f = fVocab_f2e.get(token);
    if (f <= 0) {
      // LOG.info("OOV: "+token);

      // heuristic: if no translation found, include itself as only translation
      // note: test stemmed2Stemmed for null before looking the token up in it
      String targetStem = (stemmed2Stemmed == null) ? null : stemmed2Stemmed.get(token);
      String target = (targetStem == null) ? token : targetStem;
      probDist.put(target, 1);
      return probDist;
    }
    PriorityQueue<PairOfFloatInt> eS = f2eProbs.get(f).getTranslationsWithProbs(lexProbThreshold);
    //    LOG.info("Adding "+ eS.size() +" translations for "+token+","+f);

    float sumProbEF = 0;
    int numTrans = 0;
    //tf(e) = sum_f{tf(f)*prob(e|f)}
    while (numTrans < numTransPerToken && !eS.isEmpty()) {
      PairOfFloatInt entry = eS.poll();
      float probEF = entry.getLeftElement();
      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);

      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);

      if (probEF > 0 && e > 0 && !docLangTokenizer.isStemmedStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {     
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);
          float splitProb = probEF / eTokens.length;
          for (String eToken : eTokens) {
            // filter tokens that are not in the index for efficiency
            if (env.getPostingsList(eToken) != null) {
              probDist.put(eToken, splitProb);
            }
          }
          // here we add probability for tokens that we ignored in above condition,
          // but it works better (empirically) this way
          // AND it is consistent with what we would get if we did not do the index-filtering above
          // only faster
          sumProbEF += probEF;     
        } else {
          if (env.getPostingsList(eTerm) != null) {
            probDist.increment(eTerm, probEF);
            sumProbEF += probEF;
          }
        }
        numTrans++;
        //          LOG.info("adding "+eTerm+","+probEF+","+sumProbEF);
      } else {
        LOG.info("Skipped target stopword/OOV " + eTerm);
      }

      // early terminate if cumulative prob. has reached specified threshold
      if (sumProbEF > cumProbThreshold || numTrans >= numTransPerToken) {
        break;
      }
    }

    // normalize weights (guard against an empty distribution, which would divide by zero)
    if (sumProbEF > 0) {
      for (String e : probDist.keySet()) {
        probDist.put(e, probDist.get(e) / sumProbEF);
      }
    }

    //    LOG.info("Translations of "+token+"="+probDist);

    return probDist;
  }
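The bigram-segmentation branch above distributes each translation's probability mass uniformly over its bigrams. A self-contained sketch of that heuristic, using a hypothetical helper and a plain character-bigram split in place of the original's docLangTokenizer.processContent call and postings-index filter:

    // split probEF evenly over the character bigrams of a translated term
    static HMapSFW splitOverBigrams(String eTerm, float probEF) {
      HMapSFW dist = new HMapSFW();
      int numBigrams = Math.max(1, eTerm.length() - 1);
      float splitProb = probEF / numBigrams;   // equal share per bigram
      for (int i = 0; i + 2 <= eTerm.length(); i++) {
        dist.increment(eTerm.substring(i, i + 2), splitProb);
      }
      return dist;   // empty for single-character terms
    }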

    }
    return queryJson;
  }

  private String getBestTranslation(String token) {
    HMapSFW probDist = probMap.get(token);

    if (probDist == null) {
      //      LOG.info("OOV: "+token);

      // heuristic: if no translation found, include itself as only translation
      return token;
    }

    float maxProb = 0f;
    String maxProbTrans = null;
    for (edu.umd.cloud9.util.map.MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }
    // fall back to the token itself if the distribution was empty
    return (maxProbTrans == null) ? token : maxProbTrans;
  }

  public int getQueryLength(){
    return length; 
  }

  protected HMapSFW getTranslations(String token, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = probMap.get(token);
    //    LOG.info("Translations of "+token+"="+probDist);
    if (probDist == null) {
      //      LOG.info("OOV: "+token);

      // heuristic: if no translation found, include itself as only translation
      probDist = new HMapSFW();
      // note: test stemmed2Stemmed for null before looking the token up in it
      String targetStem = (stemmed2Stemmed == null) ? null : stemmed2Stemmed.get(token);
      String target = (targetStem == null) ? token : targetStem;
      probDist.put(target, 1);
      return probDist;
    }

    // // support for bigram segmentation
    //    if (bigramSegment) {

  /**
   * Builds a scored term-document vector from a table of term frequencies, using the
   * given scoring model (cast to Bm25 below) and optionally length-normalizing the result.
   *
   * @param docLen
   *    document length
   * @param tfTable
   *    term frequencies, keyed by term id
   * @param eVocabSrc
   *    vocabulary for mapping term ids to term strings
   * @param scoringModel
   *    scoring model used to weight each term
   * @param dfTable
   *    document frequencies, keyed by term id
   * @param isNormalize
   *    whether to L2-normalize the resulting vector
   * @param sLogger
   *    logger for debug output; the class logger is used if null
   */
  public static HMapSFW createTermDocVector(int docLen, HMapIFW tfTable, Vocab eVocabSrc, ScoringModel scoringModel, HMapIFW dfTable, boolean isNormalize, Logger sLogger) {
    if (sLogger == null) {
      sLogger = logger;
    }
    HMapSFW v = new HMapSFW();
    float normalization = 0;
    for (int e : tfTable.keySet()) {
      // retrieve term string, tf and df
      String eTerm = eVocabSrc.get(e);
      float tf = tfTable.get(e);
      float df = dfTable.get(e);

      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);
      if (score > 0) {
        v.put(eTerm, score);
        if (isNormalize) {
          normalization += Math.pow(score, 2);
        }
      }
      sLogger.debug(eTerm + " " + tf + " " + df + " " + score);
    }

    // length-normalize doc vector
    if (isNormalize) {
      normalization = (float) Math.sqrt(normalization);
      for (Entry<String> e : v.entrySet()) {
        v.put(e.getKey(), e.getValue() / normalization);
      }
    }
    return v;
  }
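For intuition about the per-term score computed above, here is a simplified stand-in for Bm25.computeDocumentWeight, whose actual implementation is not shown here; k1, b, numDocs, and avgDocLen are hypothetical parameters:

    // classic BM25 weight: idf times a saturated, length-normalized tf (illustration only)
    static float bm25Weight(float tf, float df, int docLen,
                            float k1, float b, long numDocs, float avgDocLen) {
      float idf = (float) Math.log((numDocs - df + 0.5f) / (df + 0.5f));
      float tfNorm = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLen / avgDocLen));
      return idf * tfNorm;
    }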

      //translate doc vector   
      HMapIFW tfS = new HMapIFW();
     
      int docLen = CLIRUtils.translateTFs(doc, tfS, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, LOG);
      HMapSFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabSrc, model, transDfTable, isNormalize, LOG);
     
      // if no translation of any word is in the target vocab, drop the document
      // (i.e., our model was not able to translate it)
      if (v.isEmpty()) {
        reporter.incrCounter(Docs.ZERO, 1);
      } else if (v.size() < MIN_SIZE) {
        reporter.incrCounter(Docs.SHORT, 1);
      } else {
        reporter.incrCounter(Docs.Total, 1);
        output.collect(docno, v);
      }

public class HMapSFWTest {

  @Test
  public void testBasic() throws IOException {
    HMapSFW m = new HMapSFW();

    m.put("hi", 5.0f);
    m.put("there", 22.0f);

    assertEquals(2, m.size());
    assertEquals(5.0f, m.get("hi"), 10e-6);

    m.remove("hi");
    assertEquals(1, m.size());

    assertEquals(22.0f, m.get("there"), 10e-6);
  }

  @Test
  public void testAccent() throws IOException {
    HMapSFW map1 = new HMapSFW();

    // '\u00E0': [LATIN SMALL LETTER A WITH GRAVE]
    // '\u00E6': [LATIN SMALL LETTER AE]
    // '\u00E7': [LATIN SMALL LETTER C WITH CEDILLA]
    // '\u00FC': [LATIN SMALL LETTER U WITH DIAERESIS]

    map1.put("\u00E0", 1.0f);
    map1.put("\u00E6", 2.0f);
    map1.put("\u00E7", 3.0f);
    map1.put("\u00FC", 4.0f);

    assertEquals(1.0f, map1.get("\u00E0"), 10e-6);
    assertEquals(2.0f, map1.get("\u00E6"), 10e-6);
    assertEquals(3.0f, map1.get("\u00E7"), 10e-6);
    assertEquals(4.0f, map1.get("\u00FC"), 10e-6);

    map1.put("\u00E0", 10.0f);
    map1.remove("\u00E6");
    map1.remove("\u00E7");
    map1.put("\u00E7", 2.0f);

    assertEquals(10.0f, map1.get("\u00E0"), 10e-6);
    assertEquals(2.0f, map1.get("\u00E7"), 10e-6);
    assertEquals(4.0f, map1.get("\u00FC"), 10e-6);

    assertEquals(3, map1.size());

    // Test serialization
    HMapSFW map2 = HMapSFW.create(map1.serialize());

    assertEquals(10.0f, map2.get("\u00E0"), 10e-6);
    assertEquals(2.0f, map2.get("\u00E7"), 10e-6);
    assertEquals(4.0f, map2.get("\u00FC"), 10e-6);
  }
}
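Beyond the in-memory round trip exercised in testAccent, the serialize()/create() pair makes it easy to persist or ship an HMapSFW. A minimal sketch, assuming serialize() yields a byte array, as the create(map1.serialize()) call above suggests:

    HMapSFW original = new HMapSFW();
    original.put("term", 0.5f);

    // serialize() returns the map's byte representation; create() rebuilds it
    byte[] bytes = original.serialize();
    HMapSFW copy = HMapSFW.create(bytes);

    assertEquals(0.5f, copy.get("term"), 10e-6);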
