Examples of Bm25
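
The snippets below come from Ivory's cross-lingual pairwise-similarity code and all follow the same pattern: create a Bm25 scoring model, configure the collection statistics (document count and average document or sentence length), then set each term's document frequency and ask the model for a weight. A minimal sketch of that pattern, using only the calls that appear in the examples (the statistics are made-up illustrative values and the class name is not from the original code):

    import ivory.pwsim.score.Bm25;

    public class Bm25UsageSketch {
      public static void main(String[] args) {
        Bm25 model = new Bm25();

        // Collection-level statistics (illustrative values, not taken from any real index).
        model.setDocCount(100000);     // number of documents (or sentences) in the collection
        model.setAvgDocLength(21.5f);  // average document/sentence length

        // Per-term statistics: document frequency, then tf and document length for one document.
        model.setDF(350);
        float weight = model.computeDocumentWeight(3, 17);  // tf = 3, docLen = 17

        System.out.println("BM25 weight: " + weight);
      }
    }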


Examples of ivory.pwsim.score.Bm25

    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(fs, fLang, tokenizerFile, true, conf.get("fStopword"), conf.get("fStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully from " + fLang + " " + tokenizerFile + "," + conf.get("fStopword") + "," + conf.get("fStemmedStopword"));

    // average sentence length = just a heuristic derived from sample text
    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));        

    // we use the df table of the English side, so read the collection document count from the English dir
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, fs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());  

Examples of ivory.pwsim.score.Bm25

    String tokenizerFile = conf.get("eTokenizer");
    eTok = TokenizerFactory.createTokenizer(fs, eLang, tokenizerFile, true, conf.get("eStopword"), conf.get("eStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully from " + eLang + " " + tokenizerFile + "," + conf.get("eStopword") + "," + conf.get("eStemmedStopword"));

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);

Examples of ivory.pwsim.score.Bm25

    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(localFs, fLang, tokenizerFile, true, conf.get("fStopword"), null, null);
    sLogger.info("Tokenizer and vocabs created successfully.");

    // average sentence length = just a heuristic derived from sample text
    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));        

    // we use the df table of the English side, so read the collection document count from the English dir
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, localFs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());  

Examples of ivory.pwsim.score.Bm25

    String tokenizerFile = conf.get("eTokenizer");
    eTok = TokenizerFactory.createTokenizer(localFs, eLang, tokenizerFile, true, conf.get("eStopword"), null, null);
    sLogger.info("Tokenizer and vocabs created successfully.");

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), localFs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), localFs);
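Here dict and dfTable hold the English-side term dictionary and document-frequency table. A natural next step is to map a token to its term id, look up its df, and hand it to the scoring model before computing a weight. The sketch below illustrates that lookup; the accessors dict.getId(String) and dfTable.getDf(int), and the assumption that eScoreFn exposes setDF and computeDocumentWeight through ScoringModel, should be treated as assumptions rather than confirmed API:

    // Hypothetical helper, not part of the snippet above: score a single token.
    // Assumes dict.getId(String) returns the term id (<= 0 if the token is unknown)
    // and dfTable.getDf(int) returns that term's document frequency.
    private float scoreToken(String token, int tf, int sentLen) {
      int termId = dict.getId(token);
      if (termId <= 0) {
        return 0f;  // token not in the English-side dictionary
      }
      eScoreFn.setDF(dfTable.getDf(termId));
      return eScoreFn.computeDocumentWeight(tf, sentLen);
    }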

Examples of ivory.pwsim.score.Bm25

  static TTable_monolithic_IFAs f2e_Probs, e2f_Probs;
  private static Options options;

  private List<HMapSFW> translateDocVectors(String eLang,
      String eTokenizerModelFile, String eStopwordsFile, List<HMapSIW> docs, float avgLen, HMapSIW transDfTable) {
    Bm25 mModel = new Bm25();
    // set number of docs
    mModel.setDocCount(docs.size());

    // set average doc length
    mModel.setAvgDocLength(avgLen);

    List<HMapSFW> transDocs = new ArrayList<HMapSFW>();
    Tokenizer tokenizer = TokenizerFactory.createTokenizer(eLang,
        eTokenizerModelFile, true, eStopwordsFile, eStopwordsFile + ".stemmed", null);

Examples of ivory.pwsim.score.Bm25

    }
  }

  private List<HMapSFW> buildDocVectors(List<HMapSIW> term2tfVectors, float avgLen,
      HMapSIW dfTable) {
    Bm25 mModel = new Bm25();
    // set number of docs
    mModel.setDocCount(term2tfVectors.size());

    // set average doc length
    mModel.setAvgDocLength(avgLen);

    // tf-idf computation
    List<HMapSFW> docVectors = new ArrayList<HMapSFW>();
    for (HMapSIW enDoc : term2tfVectors) {
      HMapSFW v = new HMapSFW();
      int docLen = 0;
      for (Entry<String> item : enDoc.entrySet()) {
        int tf = item.getValue();
        docLen += tf;
      }
      float sum2 = 0;
      for (Entry<String> item : enDoc.entrySet()) {
        String term = item.getKey();
        int tf = item.getValue();
        int df = dfTable.get(term);
        mModel.setDF(df);
        float score = mModel.computeDocumentWeight(tf, docLen);
        if (score > 0) {
          v.put(term, score);
          sum2 += score * score;
        }
      }
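The snippet is cut off while sum2 is still being accumulated; a sum of squared weights is the kind of quantity normally used to L2-normalize the document vector before it is added to docVectors. A plausible continuation under that assumption (illustrative only, not the original code; keySet() on HMapSFW is assumed):

      // Assumed continuation: L2-normalize the weights using sum2, then keep the vector.
      float norm = (float) Math.sqrt(sum2);
      if (norm > 0) {
        for (String term : v.keySet()) {   // keySet() is assumed to exist on HMapSFW
          v.put(term, v.get(term) / norm);
        }
      }
      docVectors.add(v);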

Examples of ivory.pwsim.score.Bm25

    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(fs, fLang, tokenizerFile, true, conf.get("fStopword"), conf.get("fStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully.");

    // average sentence length = just a heuristic derived from sample text
    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));        

    // we use the df table of the English side, so read the collection document count from the English dir
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, fs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());  

Examples of ivory.pwsim.score.Bm25

    String tokenizerFile = conf.get("eTokenizer");
    eTok = TokenizerFactory.createTokenizer(fs, eLang, tokenizerFile, true, conf.get("eStopword"), conf.get("eStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully.");

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
