Examples of edu.umd.cloud9.io.map.HMapSFW

edu.umd.cloud9.io.map.HMapSFW
Writable representing a map where keys are Strings and values are floats. This class is specialized for String objects to avoid the overhead that comes with wrapping Strings inside Text objects. @author Jimmy Lin

      sLogger = logger;
    }


    //sLogger.setLevel(Level.DEBUG);


    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(int e : tfTable.keySet()){
      // retrieve term string, tf and df
      String eTerm = eVocab.get(e);
      float tf = tfTable.get(e);
      float df = dfTable.get(e);


      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);


      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);
      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }    
      }
    }


    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }

View Full Code Here

      sLogger = logger;
    }


    //sLogger.setLevel(Level.DEBUG);


    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(int e : tfTable.keySet()){
      // retrieve term string, tf and df
      String eTerm = eVocab.get(e);
      float tf = tfTable.get(e);
      float df = dfTable.get(eTerm);


      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);


      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);
      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }   
      }
    }


    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }

View Full Code Here

      sLogger = logger;
    }


    //    sLogger.setLevel(Level.DEBUG);


    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(edu.umd.cloud9.util.map.MapIF.Entry entry : tfTable.entrySet()){
      // retrieve term string, tf and df
      String eTerm = eVocab.get(entry.getKey());
      float tf = entry.getValue();
      int eId = dict.getId(eTerm);
      if(eId < 1){    //OOV
        continue;
      }
      int df = dfTable.getDf(eId);
      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);
      if(df<1){
        sLogger.warn("Suspicious DF WARNING = "+eTerm+" "+tf+" "+df+" "+score);
      }


      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);


      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }    
      }
    }


    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }

View Full Code Here

      sLogger = logger;
    }


    //    sLogger.setLevel(Level.DEBUG);


    HMapSFW v = new HMapSFW();
    float normalization=0;
    for(edu.umd.cloud9.util.map.MapKI.Entry<String> entry : tfTable.entrySet()){
      // retrieve term string, tf and df
      String eTerm = entry.getKey();
      float tf = entry.getValue();
      int eId = dict.getId(eTerm);
      if(eId < 1){    //OOV
        continue;
      }
      int df = dfTable.getDf(eId);
      // compute score via scoring model
      float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);
      if(df<1){
        sLogger.warn("Suspicious DF WARNING = "+eTerm+" "+tf+" "+df+" "+score);
      }


      sLogger.debug(eTerm+" "+tf+" "+df+" "+score);


      if(score>0){
        v.put(eTerm, score);
        if(isNormalize){
          normalization+=Math.pow(score, 2);
        }   
      }
    }


    // length-normalize doc vector
    if(isNormalize){
      normalization = (float) Math.sqrt(normalization);
      for(Entry<String> e : v.entrySet()){
        v.put(e.getKey(), e.getValue()/normalization);
      }
    }
    return v;
  }

View Full Code Here

    transPhrase = transPhrase.trim();


    //LOG.info("Found translation phrase " + transPhrase);


    if (!phrase2score.containsKey(fPhrase)) {
      phrase2score.put(fPhrase, new HMapSFW());
    }
    // if same phrase extracted from multiple rules, average prob.s


    HMapKF<String> scoreTable = phrase2score.get(fPhrase);

View Full Code Here

            if (ids == null || ids.size() != 1) continue;
            for (int e : ids) {
              String eTerm = rhs[e];
              if (docLangTokenizer.isStopWord(eTerm))  continue;
              if (scfgDist.containsKey(fTerm)) {
                HMapSFW eToken2Prob = scfgDist.get(fTerm);
                if(eToken2Prob.containsKey(eTerm)) {
                  eToken2Prob.increment(eTerm, prob);
                }else {
                  eToken2Prob.put(eTerm, prob);
                }
              }else {
                HMapSFW eToken2Prob = new HMapSFW();
                eToken2Prob.put(eTerm, prob);
                scfgDist.put(fTerm, eToken2Prob);
              }
            }
            // keep track of alignments to identify source and target phrases
          }else {

View Full Code Here

   * @param threshold
   * @param scale
   * @param probMap
   */
  public static HMapSFW scaleProbMap(float threshold, float scale, HMapSFW probMap) {
    HMapSFW scaledProbMap = new HMapSFW();
    
    for (Entry<String> entry : probMap.entrySet()) {
      float pr = entry.getValue() * scale;
      if (pr > threshold) {
        scaledProbMap.put(entry.getKey(), pr);
      }
    }
    return scaledProbMap;
  }

View Full Code Here

   *    value between 0 and 1 that determines total probability in final distribution (e.g., 0.2 scale will scale [0.8 0.1 0.1] into [0.16 0.02 0.02])
   * @param probMaps
   *    list of probability distributions
   */
  public static HMapSFW combineProbMaps(float threshold, float scale, List<PairOfFloatMap> probMaps) {
    HMapSFW combinedProbMap = new HMapSFW();


    int numDistributions = probMaps.size();


    // get a combined set of all translation alternatives
    // compute normalization factor when sum of weights is not 1.0
    Set<String> translationAlternatives = new HashSet<String>();
    float sumWeights = 0;
    for (int i=0; i < numDistributions; i++) {
      HMapSFW dist = probMaps.get(i).getMap();
      float weight = probMaps.get(i).getWeight();


      // don't add vocabulary from a distribution that has 0 weight
      if (weight > 0) {
        translationAlternatives.addAll(dist.keySet());
        sumWeights += weight;
      }
    }
    
    // normalize by sumWeights
    for (String e : translationAlternatives) {
      float combinedProb = 0f;
      for (int i=0; i < numDistributions; i++) {
        HMapSFW dist = probMaps.get(i).getMap();
        float weight = probMaps.get(i).getWeight();
        combinedProb += (weight/sumWeights) * dist.get(e);    // Prob(e|f) = weighted average of all distributions
      }
      combinedProb *= scale;
      if (combinedProb > threshold) {
        combinedProbMap.put(e, combinedProb);
      }

View Full Code Here

   * @param cumProbThreshold
   * @param maxNumTrans
   */
  public static void normalize(Map<String, HMapSFW> probMap, float lexProbThreshold, float cumProbThreshold, int maxNumTrans) {
    for (String sourceTerm : probMap.keySet()) {
      HMapSFW probDist = probMap.get(sourceTerm);
      TreeSet<PairOfStringFloat> sortedFilteredProbDist = new TreeSet<PairOfStringFloat>();
      HMapSFW normProbDist = new HMapSFW();


      // compute normalization factor
      float sumProb = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        sumProb += entry.getValue(); 
      }


      // normalize values and remove low-prob entries based on normalized values
      float sumProb2 = 0;
      for (Entry<String> entry : probDist.entrySet()) {
        float pr = entry.getValue() / sumProb;
        if (pr > lexProbThreshold) {
          sumProb2 += pr;
          sortedFilteredProbDist.add(new PairOfStringFloat(entry.getKey(), pr));
        }
      }


      // re-normalize values after removal of low-prob entries
      float cumProb = 0;
      int cnt = 0;
      while (cnt < maxNumTrans && cumProb < cumProbThreshold && !sortedFilteredProbDist.isEmpty()) {
        PairOfStringFloat entry = sortedFilteredProbDist.pollLast();
        float pr = entry.getValue() / sumProb2;
        cumProb += pr;
        normProbDist.put(entry.getKey(), pr);
        cnt++;
      }


      probMap.put(sourceTerm, normProbDist);
    }

View Full Code Here

    public void map(IntWritable docno, HMapSFW docvector,
        OutputCollector<IntWritable, PairOfFloatInt> output, Reporter reporter) throws IOException {
      for (int i = 0; i < vectors.size(); i++) {
        reporter.incrCounter(Pairs.Total, 1);
        IntWritable sampleDocno = (IntWritable) vectors.get(i).getLeftElement();
        HMapSFW fromSample = (HMapSFW) vectors.get(i).getRightElement();


        float cs = CLIRUtils.cosine(docvector, fromSample);       
        if (cs >= threshold) {
          sLogger.debug(sampleDocno + "," + fromSample + "\n" + fromSample.length());
          sLogger.debug(docno + "," + docvector + "\n" + docvector.length());
          sLogger.debug(cs);
          reporter.incrCounter(Pairs.Emitted, 1);
          output.collect(new IntWritable(sampleDocno.get()), new PairOfFloatInt(cs, docno.get()));
        }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of edu.umd.cloud9.io.map.HMapSFW

bak.pcj.IntIterator

edu.umd.cloud9.example.bfs.BfsNodeTest

edu.umd.cloud9.example.pagerank.PageRankNodeTest

edu.umd.cloud9.io.array.ArrayListOfIntsWritable

edu.umd.cloud9.io.fastutil.Int2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.Int2IntOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2IntOpenHashMapWritableTest

edu.umd.cloud9.io.map.HMapIDW

edu.umd.cloud9.io.map.HMapIDWTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Corporation. Contact coftware#gmail.com.