Package ivory.core.data.dictionary

Examples of ivory.core.data.dictionary.DefaultFrequencySortedDictionary


    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), localFs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), localFs);
  }
View Full Code Here


    Path termsFilePath = new Path(env.getIndexTermsData());
    Path termIDsFilePath = new Path(env.getIndexTermIdsData());
    Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());

    DefaultFrequencySortedDictionary termIDMap = new DefaultFrequencySortedDictionary(termsFilePath, termIDsFilePath, idToTermFilePath, fs);

    for (int i=1; i<=200; i++) {
      System.out.println(String.format("%d\t%s\t%d", i, termIDMap.getTerm(i), dfs.getDf(i)));
    }
  }
View Full Code Here

    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path termsFilePath = new Path("etc/trec-index-terms.dat");
    Path termIDsFilePath = new Path("etc/trec-index-termids.dat");
    Path idToTermFilePath = new Path("etc/trec-index-termid-mapping.dat");

    DefaultFrequencySortedDictionary dictionary =
        new DefaultFrequencySortedDictionary(termsFilePath, termIDsFilePath, idToTermFilePath, fs);

    assertEquals(312232, dictionary.size());
    assertEquals("page", dictionary.getTerm(1));
    assertEquals("time", dictionary.getTerm(2));
    assertEquals("will", dictionary.getTerm(3));
    assertEquals("year", dictionary.getTerm(4));
    assertEquals("nikaan", dictionary.getTerm(100000));

    assertEquals(1, dictionary.getId("page"));
    assertEquals(2, dictionary.getId("time"));
    assertEquals(3, dictionary.getId("will"));
    assertEquals(4, dictionary.getId("year"));
    assertEquals(100000, dictionary.getId("nikaan"));
   
    assertEquals(null, dictionary.getTerm(312233));

    Iterator<String> iter = dictionary.iterator();
    assertTrue(iter.hasNext());
    assertEquals("page", iter.next());
    assertTrue(iter.hasNext());
    assertEquals("time", iter.next());
    assertTrue(iter.hasNext());
    assertEquals("will", iter.next());
    assertTrue(iter.hasNext());
    assertEquals("year", iter.next());
    assertTrue(iter.hasNext());

    int cnt = 0;
    for (@SuppressWarnings("unused") String s : dictionary) {
      cnt++;
    }
    assertEquals(dictionary.size(), cnt);

    cnt = 0;
    iter = dictionary.iterator();
    while(iter.hasNext()) {
      cnt++;
      iter.next();
    }
    assertEquals(dictionary.size(), cnt);
  }
View Full Code Here

        idToTermFile = idToTermFile.substring(idToTermFile.lastIndexOf("/") + 1);

        LOG.info("Looking for the following files in dcache: " + termsFile + ", " + termidsFile + ", " + idToTermFile);
        // Take a different code path if we're in standalone mode.
        if (conf.get("mapred.job.tracker").equals("local")) {
          dictionary = new DefaultFrequencySortedDictionary(new Path(termsFile),
              new Path(termidsFile), new Path(idToTermFile), FileSystem.getLocal(conf));
        } else {
          // We need to figure out which file in the DistributedCache is which...
          Map<String, Path> pathMapping = Maps.newHashMap();
          Path[] localFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
          for (Path p : localFiles) {
            LOG.info("In DistributedCache: " + p);
            if (p.toString().contains(termsFile)) {
              pathMapping.put(termsFile, p);
            } else if (p.toString().contains(termidsFile)) {
              pathMapping.put(termidsFile, p);
            } else if (p.toString().contains(idToTermFile)) {
              pathMapping.put(idToTermFile, p);
            }
          }

          LOG.info(" - terms: " + pathMapping.get(termsFile));
          LOG.info(" - id: " + pathMapping.get(termidsFile));
          LOG.info(" - idToTerms: " + pathMapping.get(idToTermFile));

    String s = localFiles.length + " " + localFiles[0].toString() + " " + localFiles[1].toString() + " " + localFiles[2].toString();
    if (pathMapping.get(termsFile) == null ) {

        throw new RuntimeException(s);
    }

          dictionary = new DefaultFrequencySortedDictionary(pathMapping.get(termsFile),
              pathMapping.get(termidsFile), pathMapping.get(idToTermFile), FileSystem.getLocal(conf));
        }
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Error initializing data!", e);
View Full Code Here

      LOG.info(" - idToTerms: " + pathMapping.get(idToTermFile));
      LOG.info(" - df data: " + pathMapping.get(dfFile));
      LOG.info(" - dl data: " + pathMapping.get(dlFile));

      try{
        dict = new DefaultFrequencySortedDictionary(pathMapping.get(termsFile),
            pathMapping.get(termidsFile), pathMapping.get(idToTermFile), FileSystem.getLocal(conf));
        dfTable = new DfTableArray(pathMapping.get(dfFile), FileSystem.getLocal(conf));
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Error loading Terms File for dictionary from "+localFiles[0]);
View Full Code Here

        gs.loadDFStats(localFiles[1], fs);
        gs.loadCFStats(localFiles[2], fs);

        String indexPath = job.get("Ivory.IndexPath");
        sLogger.info("loading TermIdMap from " + indexPath);
        mTermIdMap = new DefaultFrequencySortedDictionary(localFiles[3], localFiles[4],
            localFiles[5], fs);
      } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException("Error loading global term stats!");
      }
View Full Code Here

    postingsIndex = new IntPostingsForwardIndex(indexPath, fs);
    LOG.info(" - Number of terms: " + readCollectionTermCount());
    LOG.info("Done!");

    try {
      termidMap = new DefaultFrequencySortedDictionary(new Path(getIndexTermsData()),
          new Path(getIndexTermIdsData()), new Path(getIndexTermIdMappingData()), fs);
    } catch (Exception e) {
      throw new ConfigurationException("Error initializing dictionary!");
    }
View Full Code Here

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);

    //for backward compatibility
    //    String indexTermsFile = localFiles[12].toString();
    //    String dfTableFile = localFiles[0].toString();
View Full Code Here

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), localFs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), localFs);
  }
View Full Code Here

        LOG.info(" - id: " + pathMapping.get(termidsFile));
        LOG.info(" - idToTerms: " + pathMapping.get(idToTermFile));
        LOG.info(" - df data: " + pathMapping.get(dfFile));

        try{
          dict = new DefaultFrequencySortedDictionary(pathMapping.get(termsFile),
              pathMapping.get(termidsFile), pathMapping.get(idToTermFile), FileSystem.getLocal(conf));
          dfTable = new DfTableArray(pathMapping.get(dfFile), FileSystem.getLocal(conf));
        } catch (Exception e) {
          e.printStackTrace();
          throw new RuntimeException("Error loading Terms File for dictionary from "+localFiles[0]);
View Full Code Here

TOP

Related Classes of ivory.core.data.dictionary.DefaultFrequencySortedDictionary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by ORACLE Inc. Contact coftware#gmail.com.