Examples of edu.umd.cloud9.collection.DocnoMapping

edu.umd.cloud9.io.pair.PairOfFloatString

Interface for an object that maintains a bidirectional mapping between docids and docnos. A docid is a globally-unique String identifier for a document in the collection. For many types of information retrieval algorithms, documents in the collection must be sequentially numbered; thus, each document in the collection must be assigned a unique integer identifier, which is its docno. Typically, the docid/docno mappings are stored in a mappings file, which is loaded into memory by concrete objects implementing this interface.

Unless there are compelling reasons otherwise, it is preferable to start numbering docnos from one instead of zero. This is because zero cannot be represented in many common compression schemes that are used in information retrieval (e.g., Golomb codes).
@author Jimmy Lin

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    System.out.println("opennlpterm1\n"+key+";"+value);

View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    verifyTermDocVector(enTermDocVector1, value);

View Full Code Here

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);


    SequenceFile.Reader reader;
    IntWritable key = new IntWritable();
    HMapSFW value = new HMapSFW();


    reader = new SequenceFile.Reader(fs.getConf(),
        SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-term-doc-vectors/part-00000")));
    reader.next(key, value);
    verifyTermDocVector(deTermDocVector1, value);

View Full Code Here

    Map<String,HMapSFW> scfgDist = new HashMap<String,HMapSFW>();


    // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
    HMapSFW phraseDist = new HMapSFW();


    HMapSIW srcTokenCnt = new HMapSIW();


    Set<String> bagOfTargetTokens = new HashSet<String>();


    try {
      FSDataInputStream fis = fs.open(new Path(grammarFile));

View Full Code Here

      return null;
    }
    PriorityQueue<PairOfFloatInt> eS = f2eProbs.get(f).getTranslationsWithProbs(lexProbThreshold);


    if (!eS.isEmpty()) {
      PairOfFloatInt entry = eS.poll();
      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);
      return eTerm;
    }
    return token;
  }

View Full Code Here


    float sumProbEF = 0;
    int numTrans = 0;
    //tf(e) = sum_f{tf(f)*prob(e|f)}
    while (numTrans < numTransPerToken && !eS.isEmpty()) {
      PairOfFloatInt entry = eS.poll();
      float probEF = entry.getLeftElement();
      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);


      //      LOG.info("Pr("+eTerm+"|"+token+")="+probEF);


      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (translateOnly == null || !translateOnly.equals("indri") || indriPuncPattern.matcher(eTerm).matches()) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {

View Full Code Here

            curIndex = prevIndex;    // revert curIndex value since we're skipping this one
            skipTerm = true;
            continue;
          }
          logger.debug("Processing: "+srcTerm+" with index: "+curIndex);      
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      
        }else if(!earlyTerminate && !skipTerm && !delims.contains(srcTerm)){  //continue adding translation term,prob pairs (except if early termination is ON)
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      


          // keep top numTrans translations
          if(topTrans.size() > numTrans){
            PairOfFloatString pair = topTrans.pollFirst();
            float removedProb = pair.getLeftElement();
            sumOfProbs -= removedProb;
            logger.debug("Removed from queue: "+pair.getRightElement()+" (sum: "+sumOfProbs+")");      
          }
        }else{
          logger.debug("Skipped line: "+line);
        }
      }

View Full Code Here

          continue;
        }
        prob = ttable.get(srcIndex, trgIndex);
        logger.debug("Found: " + trgTerm + " with " + prob);


        topTrans.add(new PairOfFloatString(prob, trgTerm));
        // keep top numTrans translations
        if (topTrans.size() > numTrans) {
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }

View Full Code Here

    List<Integer> sortedIndices = new ArrayList<Integer>();
    HMapIF index2ProbMap = new HMapIF();


    float sumOfProbs = 0.0f;    //only extract the top K<15 if the mass prob. exceeds MAX_probThreshold
    while(!topTrans.isEmpty() && sumOfProbs < cumProbThreshold){
      PairOfFloatString e = topTrans.pollLast();
      String term = e.getRightElement();
      float pr = e.getLeftElement()/cumProb;    // normalize
      logger.debug(term+"-->"+pr);
      int trgIndex = trgVocab.addOrGet(term);
      sumOfProbs += e.getLeftElement();         // keep track of unnormalized cumulative prob for determining cutoff
      sortedIndices.add(trgIndex);
      index2ProbMap.put(trgIndex, pr);
    }


    // to enable faster access with binary search, we sort entries by vocabulary index.

View Full Code Here

          int e2 = eVocabTrg.get(eTerm);         


          float prob2 = f2e_Probs.get(f2, e2);
          float prob = prob1*prob2;
          sumOfProbs += prob;
          topTrans.add(new PairOfFloatString(prob, fTerm));
        }
        logger.info("Adding "+eTerm);
        addToTable(e1, topTrans, sumOfProbs, table, fVocabTrg, 1.0f, stats);      
      }
      logger.info(stats);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of edu.umd.cloud9.collection.DocnoMapping

bak.pcj.IntIterator

edu.umd.cloud9.example.bfs.BfsNodeTest

edu.umd.cloud9.example.pagerank.PageRankNodeTest

edu.umd.cloud9.io.array.ArrayListOfIntsWritable

edu.umd.cloud9.io.fastutil.Int2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.Int2IntOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2FloatOpenHashMapWritableTest

edu.umd.cloud9.io.fastutil.String2IntOpenHashMapWritableTest

edu.umd.cloud9.io.map.HMapIDW

edu.umd.cloud9.io.map.HMapIDWTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.