Examples of ivory.core.RetrievalEnvironment

ivory.core.RetrievalEnvironment
@author Don Metzler @author Jimmy Lin

    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);


    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");


    String termsFilePath = env.getIndexTermsData();
    String dfByTermFilePath = env.getDfByTermData();


    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);


    if (fs.exists(weightedVectorsPath)) {
      //fs.delete(weightedVectorsPath, true);
      sLogger.info("Output path already exists!");
      return 0;
    }


    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath))) {
      throw new RuntimeException("Error, terms file " + termsFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);


    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
      throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);


    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
      throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

View Full Code Here


    // Job configuration for WriteRandomVectors, seeded from this tool's configuration.
    JobConf job = new JobConf(getConf(), WriteRandomVectors.class);
    // Disabled overrides for the job tracker ("local") and default file system ("file:///").
//    job.set("mapred.job.tracker", "local");
//    job.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(job);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    // K is taken from the collection term count read through the environment, narrowed to int.
    K = (int) env.readCollectionTermCount();
    job.setJobName("WriteRandomVectors");
    
    // D is assigned outside this excerpt; both D and K must be positive to continue.
    if(D<=0 || K<=0){
      throw new RuntimeException("parameters not read properly");
    }

View Full Code Here

      throw new RuntimeException("Parameters not read properly! Quitting...");
    }
    // Job configuration for ComputeSignaturesRandom, derived from the surrounding conf.
    JobConf job = new JobConf(conf, ComputeSignaturesRandom.class);
    FileSystem fs = FileSystem.get(job);


    // The job name embeds D and the collection name read through the environment rooted at dir.
    RetrievalEnvironment re = new RetrievalEnvironment(dir, fs);
    job.setJobName("ComputeSignatures_random_D="+D+":"+re.readCollectionName());


    // File names for this job are produced by PwsimEnvironment from dir and a data-set label.
    String inputPath = PwsimEnvironment.getFileNameWithPars(dir, "IntDocs");
    String outputPath = PwsimEnvironment.getFileNameWithPars(dir, "SignaturesRandom");
    // Only the first part file of the random vectors is registered in the distributed cache.
    String randomVectorFile = PwsimEnvironment.getFileNameWithPars(dir, "RandomVectors")+"/part-00000";
    DistributedCache.addCacheFile(new URI(randomVectorFile), job);

View Full Code Here

          sLogger.info(" - index: " + child.getTextContent().trim());


          indexPath = child.getTextContent().trim();
          if(mEnv == null){
            try {
              mEnv = new RetrievalEnvironment(indexPath, fs);
              mEnv.initialize(true);
            } catch (Exception e) {
              e.printStackTrace();
              throw new RuntimeException();
            }
          }
        }


        if ("findex".equals(child.getNodeName())) {
          sLogger.info(" - findex: " + child.getTextContent().trim());


          // initialize forward index
          findexPath = child.getTextContent().trim();
        }


        if ("docscore".equals(child.getNodeName())) {
          sLogger.info(" - docscore: " + child.getTextContent().trim());


          String type = XMLTools.getAttributeValue(child, "type", "");
          String provider = XMLTools.getAttributeValue(child, "provider", "");
          String path = child.getTextContent();


          if (type.equals("") || provider.equals("") || path.equals("")) {
            throw new RuntimeException("Invalid docscore!");
          }
          System.out.println("$$ Loading docscore: type=" + type + ", provider=" +
              provider + ", path="
              + path);
          sLogger.info("Loading docscore: type=" + type + ", provider=" +
              provider + ", path="
              + path);


          if(mEnv == null){
            try {
              mEnv = new RetrievalEnvironment(indexPath, fs);
              mEnv.initialize(true);
            } catch (Exception e) {
              e.printStackTrace();
              throw new RuntimeException();
            }
          }
          mEnv.loadDocScore(type, provider, path);
        }
        
        if("importancemodel".equals(child.getNodeName())) {
          sLogger.info(" - importancemodel: " + child.getTextContent().trim());
          
          String importanceModelId = XMLTools.getAttributeValue(child, "id", null);
          if(importanceModelId == null) {
            throw new RuntimeException("Invalid importance model!");
          }
          
          ConceptImportanceModel importanceModel = null;
          try {
            importanceModel = ConceptImportanceModel.get(child);
          }
          catch(Exception e) {
            throw new RuntimeException(e);
          }
          
          if(mEnv == null){
            try {
              mEnv = new RetrievalEnvironment(indexPath, fs);
              mEnv.initialize(true);
            } catch (Exception e) {
              e.printStackTrace();
              throw new RuntimeException();
            }
          }
          mEnv.addImportanceModel(importanceModelId, importanceModel);
        }


      }


      if (indexPath == null) {
        throw new RuntimeException("Error: must specify an index location!");
      }


      if (findexPath == null)
        sLogger.warn("forward index not specified: will not be able to access documents.");
    }
    if(mEnv == null){
      try {
        mEnv = new RetrievalEnvironment(indexPath, fs);
        mEnv.initialize(true);
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException();
      }

View Full Code Here

  public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);


    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);


    String collectionName = env.readCollectionName();


    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();


    String postingsType = conf.get(Constants.PostingsListsType,
        ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass =
        (Class<? extends PostingsList>) Class.forName(postingsType);


    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));


    if (!fs.exists(new Path(indexPath))) {
      fs.mkdirs(new Path(indexPath));
    }


    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());


    if (fs.exists(postingsPath)) {
      LOG.info("Postings already exist: no indexing will be performed.");
      return 0;
    }


    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);


    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");


    Job job = new Job(conf,
        BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);


    job.setNumReduceTasks(reduceTasks);


    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);


    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);


    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);


    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);


    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");


    env.writePostingsType(postingsClass.getCanonicalName());


    return 0;
  }

View Full Code Here


    LOG.info("PowerTool: " + BuildDictionary.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));


    // A missing index directory is logged and treated as "nothing to do" (returns 0).
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    if (!fs.exists(new Path(indexPath))) {
      LOG.error("index path doesn't existing: skipping!");
      return 0;
    }


    // Skip the job only when every one of the seven dictionary/statistics files is present;
    // if any is missing the job below runs.
    if (fs.exists(new Path(env.getIndexTermsData())) &&
        fs.exists(new Path(env.getIndexTermIdsData())) &&
        fs.exists(new Path(env.getIndexTermIdMappingData())) &&
        fs.exists(new Path(env.getDfByTermData())) &&
        fs.exists(new Path(env.getCfByTermData())) &&
        fs.exists(new Path(env.getDfByIntData())) &&
        fs.exists(new Path(env.getCfByIntData()))) {
      LOG.info("term and term id data exist: skipping!");
      return 0;
    }


    // The collection term count is narrowed to int and handed to the job through conf.
    conf.setInt(Constants.CollectionTermCount, (int) env.readCollectionTermCount());
    conf.set("mapred.child.java.opts", "-Xmx2048m");


    // The temp directory is used as the job's output path, so any previous contents are
    // removed recursively first.
    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);


    Job job = new Job(conf,
        BuildDictionary.class.getSimpleName() + ":" + collectionName);


    job.setJarByClass(BuildDictionary.class);
    // A single reduce task is configured for this job.
    job.setNumReduceTasks(1);


    FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(job, tmpPath);


    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

View Full Code Here

    // Running totals accumulated over every index listed in indexPaths.
    long collectionLength = 0;
    int docCount = 0;
    // indexPaths is split on commas; each piece is opened as its own RetrievalEnvironment.
    for (String index : indexPaths.split(",")) {
      sLogger.info("reading stats for " + index);


      RetrievalEnvironment env = new RetrievalEnvironment(index, fs);


      long l = env.readCollectionLength();
      int n = env.readCollectionDocumentCount();


      sLogger.info(" - CollectionLength: " + l);
      sLogger.info(" - CollectionDocumentCount: " + n);


      collectionLength += l;
      docCount += n;
    }


    // The cast applies to collectionLength first, so this is floating-point division.
    // NOTE(review): with docCount == 0 this yields NaN or Infinity instead of throwing, and that
    // value is written below; confirm whether a zero document count can occur here.
    float avgdl = (float) collectionLength / docCount;


    sLogger.info("all index segments: ");
    sLogger.info(" - CollectionLength: " + collectionLength);
    sLogger.info(" - CollectionDocumentCount: " + docCount);
    sLogger.info(" - AverageDocumentLenght: " + avgdl);


    // The combined statistics are written through an environment rooted at dataOutputPath.
    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);


    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);


    return 0;
  }

View Full Code Here

    public void setup(Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context)
        throws IOException {
      LOG.info("Starting setup.");
      Configuration conf = context.getConfiguration();
      FileSystem fs = FileSystem.get(conf);
      RetrievalEnvironment env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);


      numTerms = conf.getInt(Constants.CollectionTermCount, 0);


      terms = new String[numTerms];
      seqNums = new int[numTerms];
      dfs = new int[numTerms];
      cfs = new long[numTerms];


      termsOut = fs.create(new Path(env.getIndexTermsData()), true);
      //termsOut.writeInt(numTerms);


      idsOut = fs.create(new Path(env.getIndexTermIdsData()), true);
      idsOut.writeInt(numTerms);


      idsToTermOut = fs.create(new Path(env.getIndexTermIdMappingData()), true);
      idsToTermOut.writeInt(numTerms);


      dfByTermOut = fs.create(new Path(env.getDfByTermData()), true);
      dfByTermOut.writeInt(numTerms);


      cfByTermOut = fs.create(new Path(env.getCfByTermData()), true);
      cfByTermOut.writeInt(numTerms);


      dfByIntOut = fs.create(new Path(env.getDfByIntData()), true);
      dfByIntOut.writeInt(numTerms);


      cfByIntOut = fs.create(new Path(env.getCfByIntData()), true);
      cfByIntOut.writeInt(numTerms);
      LOG.info("Finished setup.");
    }

View Full Code Here

  @Override
  public int runTool() throws Exception {
    String indexPath = getConf().get("Ivory.IndexPath");
    String scoringModel = getConf().get("Ivory.ScoringModel");


    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, FileSystem.get(getConf()));


    String outputPath = env.getWeightedTermDocVectorsDirectory();
    String transDfFile = indexPath+"/transDf.dat";
    String fVocab_f2e= getConf().get("Ivory.F_Vocab_F2E");        //de from P(e|f)
    String eVocab_f2e = getConf().get("Ivory.E_Vocab_F2E");        //en from P(e|f)
    String ttable_f2e = getConf().get("Ivory.TTable_F2E");        //P(e|f)
    String eVocab_e2f  = getConf().get("Ivory.E_Vocab_E2F");    //en from P(f|e)
    String fVocab_e2f  = getConf().get("Ivory.F_Vocab_E2F");    //de from P(f|e)
    String ttable_e2f= getConf().get("Ivory.TTable_E2F");      //P(f|e)
  
    createTranslatedDFFile(transDfFile);


    JobConf conf = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
    conf.setJobName("BuildTranslatedTermDocVectors");
    FileSystem fs = FileSystem.get(conf);


    if(fs.exists(new Path(outputPath))){
      LOG.info(outputPath+": Translated term doc vectors already exist! Nothing to do for this job...");
      return 0;
    }


    String collectionName = getConf().get("Ivory.CollectionName");
    String inputPath = env.getTermDocVectorsDirectory();


    LOG.info("Preparing to build document vectors using " + scoringModel);
    LOG.info("Document vectors to be stored in " + outputPath);
    LOG.info("CollectionName: " + collectionName);
    LOG.info("Input path: " + inputPath);


    ///////Configuration setup


    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.ScoringModel", scoringModel);
    DocLengthTable mDLTable;
    try {
      mDLTable = new DocLengthTable4B(env.getDoclengthsData(), fs);
    } catch (IOException e1) {
      throw new RuntimeException("Error initializing Doclengths file");
    }
    LOG.info(mDLTable.getAvgDocLength()+" is average doc len.");
    LOG.info(mDLTable.getDocCount()+" is num docs.");


    conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
    conf.setInt("Ivory.CollectionDocumentCount", env.readCollectionDocumentCount());
    
    conf.setNumMapTasks(300);      
    conf.setNumReduceTasks(0);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.map.max.attempts", 10);

View Full Code Here

    job2.setJobName("ConvertMap2Pairs");
    
    FileSystem fs = FileSystem.get(job2);
    
    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    int blockSize = getConf().getInt("Ivory.BlockSize", 0);    
    int numDocs = env.readCollectionDocumentCount();
    int numBlocks = numDocs / blockSize + 1;


    String inputPath = null;
    for (int i = 0; i < numBlocks; i++) {
      inputPath = conf.get("Ivory.PCPOutputPath")+"/block"+i;      //one block of output of PCP algorithm

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of ivory.core.RetrievalEnvironment

edu.umd.cloud9.collection.DocnoMapping

ivory.app.PreprocessClueWebEnglish

ivory.app.PreprocessClueWebEnglishMultipleSegments

ivory.app.PreprocessCollection

ivory.app.PreprocessMedline

ivory.app.PreprocessTextCollection

ivory.app.PreprocessTrecForeign

ivory.app.PreprocessWikipedia

ivory.bloomir.preprocessing.GenerateBloomFilters

ivory.bloomir.preprocessing.GenerateCompressedPostings

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.