Package ivory.core

Examples of ivory.core.RetrievalEnvironment
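
The snippets below are collected from Ivory's Hadoop-based indexing and retrieval tools. They share one pattern: construct a RetrievalEnvironment over an index directory and a FileSystem, then use it to read collection statistics and resolve index-relative paths. Here is a minimal, self-contained sketch of that pattern (the index path is a placeholder):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;

    import ivory.core.RetrievalEnvironment;

    public class RetrievalEnvironmentDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // "/path/to/index" stands in for an existing Ivory index directory.
        RetrievalEnvironment env = new RetrievalEnvironment("/path/to/index", fs);
        env.initialize(true);  // load global statistics and dictionaries

        // Collection-level statistics stored with the index.
        System.out.println("docs:   " + env.readCollectionDocumentCount());
        System.out.println("length: " + env.readCollectionLength());
        System.out.println("terms:  " + env.readCollectionTermCount());

        // Index-relative paths are resolved through the environment.
        System.out.println("postings: " + env.getPostingsDirectory());
      }
    }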


    // PowerTool: BuildWeightedTermDocVectors (driver setup)
    JobConf conf = new JobConf(getConf(), BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");

    String termsFilePath = env.getIndexTermsData();
    String dfByTermFilePath = env.getDfByTermData();

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
      //fs.delete(weightedVectorsPath, true);
      sLogger.info("Output path already exists!");
      return 0;
    }

    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath))) {
      throw new RuntimeException("Error, terms file " + termsFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);

    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
      throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
      throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);
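
On the task side, files shipped through the DistributedCache above are read back from their local copies. A sketch of the matching lookup in a mapper, using the same old-style mapred API as the driver (field and logger names are illustrative):

    // Inside the mapper, matching the driver above.
    public void configure(JobConf conf) {
      try {
        // Local paths of the terms, df, and doc-length files added by the driver.
        Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
        for (Path p : localFiles) {
          sLogger.info("cached file: " + p);
        }
      } catch (IOException e) {
        throw new RuntimeException("Error retrieving cached files", e);
      }
    }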



    JobConf job = new JobConf(getConf(), WriteRandomVectors.class);
//    job.set("mapred.job.tracker", "local");
//    job.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(job);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    K = (int) env.readCollectionTermCount();
    job.setJobName("WriteRandomVectors");
   
    if(D<=0 || K<=0){
      throw new RuntimeException("parameters not read properly");
    }
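
In this fragment, K is the vocabulary size read back from the index via readCollectionTermCount(), while D is supplied externally (presumably the number of random projection vectors to generate); the guard refuses to launch the job until both are positive.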

      throw new RuntimeException("Parameters not read properly! Quitting...");
    }
    JobConf job = new JobConf(conf, ComputeSignaturesRandom.class);
    FileSystem fs = FileSystem.get(job);

    RetrievalEnvironment re = new RetrievalEnvironment(dir, fs);
    job.setJobName("ComputeSignatures_random_D="+D+":"+re.readCollectionName());

    String inputPath = PwsimEnvironment.getFileNameWithPars(dir, "IntDocs");
    String outputPath = PwsimEnvironment.getFileNameWithPars(dir, "SignaturesRandom");
    String randomVectorFile = PwsimEnvironment.getFileNameWithPars(dir, "RandomVectors")+"/part-00000";
    DistributedCache.addCacheFile(new URI(randomVectorFile), job);
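
This job reads the integer doc vectors and writes random-projection signatures. The random vector file produced by the previous job is shipped to every task through the DistributedCache; note the hard-coded part-00000, which assumes the vectors land in a single part file.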

          sLogger.info(" - index: " + child.getTextContent().trim());

          indexPath = child.getTextContent().trim();
          if (mEnv == null) {
            try {
              mEnv = new RetrievalEnvironment(indexPath, fs);
              mEnv.initialize(true);
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          }
        }

        if ("findex".equals(child.getNodeName())) {
          sLogger.info(" - findex: " + child.getTextContent().trim());

          // initialize forward index
          findexPath = child.getTextContent().trim();
        }

        if ("docscore".equals(child.getNodeName())) {
          sLogger.info(" - docscore: " + child.getTextContent().trim());

          String type = XMLTools.getAttributeValue(child, "type", "");
          String provider = XMLTools.getAttributeValue(child, "provider", "");
          String path = child.getTextContent();

          if (type.equals("") || provider.equals("") || path.equals("")) {
            throw new RuntimeException("Invalid docscore!");
          }
          System.out.println("$$ Loading docscore: type=" + type + ", provider=" +
              provider + ", path="
              + path);
          sLogger.info("Loading docscore: type=" + type + ", provider=" +
              provider + ", path="
              + path);

          if (mEnv == null) {
            try {
              mEnv = new RetrievalEnvironment(indexPath, fs);
              mEnv.initialize(true);
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          }
          mEnv.loadDocScore(type, provider, path);
        }
       
        if("importancemodel".equals(child.getNodeName())) {
          sLogger.info(" - importancemodel: " + child.getTextContent().trim());
         
          String importanceModelId = XMLTools.getAttributeValue(child, "id", null);
          if(importanceModelId == null) {
            throw new RuntimeException("Invalid importance model!");
          }
         
          ConceptImportanceModel importanceModel = null;
          try {
            importanceModel = ConceptImportanceModel.get(child);
          }
          catch(Exception e) {
            throw new RuntimeException(e);
          }
         
          if(mEnv == null){
            try {
              mEnv = new RetrievalEnvironment(indexPath, fs);
              mEnv.initialize(true);
            } catch (Exception e) {
              e.printStackTrace();
              throw new RuntimeException();
            }
          }
          mEnv.addImportanceModel(importanceModelId, importanceModel);
        }

      }

      if (indexPath == null) {
        throw new RuntimeException("Error: must specify an index location!");
      }

      if (findexPath == null)
        sLogger.warn("forward index not specified: will not be able to access documents.");
    }
    if (mEnv == null) {
      try {
        mEnv = new RetrievalEnvironment(indexPath, fs);
        mEnv.initialize(true);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
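
The null-guarded construction of mEnv is repeated four times in this fragment. A small helper method — a sketch, not part of the original source — would factor it out:

    // Hypothetical helper consolidating the repeated lazy initialization.
    private RetrievalEnvironment ensureEnv(String indexPath, FileSystem fs) {
      if (mEnv == null) {
        try {
          mEnv = new RetrievalEnvironment(indexPath, fs);
          mEnv.initialize(true);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }
      return mEnv;
    }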

  public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
        ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass =
        (Class<? extends PostingsList>) Class.forName(postingsType);

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
      fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
      LOG.info("Postings already exist: no indexing will be performed.");
      return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf,
        BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(postingsClass.getCanonicalName());

    return 0;
  }
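
A driver would populate the configuration this tool reads before invoking it. A minimal sketch using the same Constants keys (the values are illustrative):

    Configuration conf = new Configuration();
    conf.set(Constants.IndexPath, "/path/to/index");        // placeholder index location
    conf.setInt(Constants.NumReduceTasks, 10);
    conf.setInt(Constants.MinSplitSize, 64 * 1024 * 1024);  // bytes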

    LOG.info("PowerTool: " + BuildDictionary.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    if (!fs.exists(new Path(indexPath))) {
      LOG.error("index path doesn't existing: skipping!");
      return 0;
    }

    if (fs.exists(new Path(env.getIndexTermsData())) &&
        fs.exists(new Path(env.getIndexTermIdsData())) &&
        fs.exists(new Path(env.getIndexTermIdMappingData())) &&
        fs.exists(new Path(env.getDfByTermData())) &&
        fs.exists(new Path(env.getCfByTermData())) &&
        fs.exists(new Path(env.getDfByIntData())) &&
        fs.exists(new Path(env.getCfByIntData()))) {
      LOG.info("term and term id data exist: skipping!");
      return 0;
    }

    conf.setInt(Constants.CollectionTermCount, (int) env.readCollectionTermCount());
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    Job job = new Job(conf,
        BuildDictionary.class.getSimpleName() + ":" + collectionName);

    job.setJarByClass(BuildDictionary.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(job, tmpPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
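
The single reducer (setNumReduceTasks(1)) is the point of this job: every term's document and collection frequency funnels through one task, which writes the dictionary and frequency files; the setup() fragment further below shows those output streams being opened.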

    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
      sLogger.info("reading stats for " + index);

      RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

      long l = env.readCollectionLength();
      int n = env.readCollectionDocumentCount();

      sLogger.info(" - CollectionLength: " + l);
      sLogger.info(" - CollectionDocumentCount: " + n);

      collectionLength += l;
      docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    sLogger.info("all index segments: ");
    sLogger.info(" - CollectionLength: " + collectionLength);
    sLogger.info(" - CollectionDocumentCount: " + docCount);
    sLogger.info(" - AverageDocumentLenght: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);

    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
  }
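
For example, merging two segments with collection lengths 1,200,000 and 800,000 over 10,000 and 6,000 documents gives avgdl = 2,000,000 / 16,000 = 125.0; the float cast on collectionLength is what keeps the division from truncating.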

    public void setup(Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context)
        throws IOException {
      LOG.info("Starting setup.");
      Configuration conf = context.getConfiguration();
      FileSystem fs = FileSystem.get(conf);
      RetrievalEnvironment env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);

      numTerms = conf.getInt(Constants.CollectionTermCount, 0);

      terms = new String[numTerms];
      seqNums = new int[numTerms];
      dfs = new int[numTerms];
      cfs = new long[numTerms];

      termsOut = fs.create(new Path(env.getIndexTermsData()), true);
      //termsOut.writeInt(numTerms);

      idsOut = fs.create(new Path(env.getIndexTermIdsData()), true);
      idsOut.writeInt(numTerms);

      idsToTermOut = fs.create(new Path(env.getIndexTermIdMappingData()), true);
      idsToTermOut.writeInt(numTerms);

      dfByTermOut = fs.create(new Path(env.getDfByTermData()), true);
      dfByTermOut.writeInt(numTerms);

      cfByTermOut = fs.create(new Path(env.getCfByTermData()), true);
      cfByTermOut.writeInt(numTerms);

      dfByIntOut = fs.create(new Path(env.getDfByIntData()), true);
      dfByIntOut.writeInt(numTerms);

      cfByIntOut = fs.create(new Path(env.getCfByIntData()), true);
      cfByIntOut.writeInt(numTerms);
      LOG.info("Finished setup.");
    }
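
Note that numTerms comes from Constants.CollectionTermCount, which the driver sets from env.readCollectionTermCount() (see the BuildDictionary fragment above); with the default of 0, the parallel arrays here would be empty, so the driver must have written that value first.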

  @Override
  public int runTool() throws Exception {
    String indexPath = getConf().get("Ivory.IndexPath");
    String scoringModel = getConf().get("Ivory.ScoringModel");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, FileSystem.get(getConf()));

    String outputPath = env.getWeightedTermDocVectorsDirectory();
    String transDfFile = indexPath + "/transDf.dat";
    String fVocab_f2e = getConf().get("Ivory.F_Vocab_F2E");  // de side of P(e|f)
    String eVocab_f2e = getConf().get("Ivory.E_Vocab_F2E");  // en side of P(e|f)
    String ttable_f2e = getConf().get("Ivory.TTable_F2E");   // P(e|f)
    String eVocab_e2f = getConf().get("Ivory.E_Vocab_E2F");  // en side of P(f|e)
    String fVocab_e2f = getConf().get("Ivory.F_Vocab_E2F");  // de side of P(f|e)
    String ttable_e2f = getConf().get("Ivory.TTable_E2F");   // P(f|e)
 
    createTranslatedDFFile(transDfFile);

    JobConf conf = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
    conf.setJobName("BuildTranslatedTermDocVectors");
    FileSystem fs = FileSystem.get(conf);

    if (fs.exists(new Path(outputPath))) {
      LOG.info(outputPath + ": Translated term doc vectors already exist! Nothing to do for this job...");
      return 0;
    }

    String collectionName = getConf().get("Ivory.CollectionName");
    String inputPath = env.getTermDocVectorsDirectory();

    LOG.info("Preparing to build document vectors using " + scoringModel);
    LOG.info("Document vectors to be stored in " + outputPath);
    LOG.info("CollectionName: " + collectionName);
    LOG.info("Input path: " + inputPath);

    // Configuration setup

    conf.set("Ivory.IndexPath", indexPath);
    conf.set("Ivory.ScoringModel", scoringModel);
    DocLengthTable mDLTable;
    try {
      mDLTable = new DocLengthTable4B(env.getDoclengthsData(), fs);
    } catch (IOException e) {
      throw new RuntimeException("Error initializing doclengths file", e);
    }
    LOG.info("Average doc length: " + mDLTable.getAvgDocLength());
    LOG.info("Number of docs: " + mDLTable.getDocCount());

    conf.setFloat("Ivory.AvgDocLen", mDLTable.getAvgDocLength());
    conf.setInt("Ivory.CollectionDocumentCount", env.readCollectionDocumentCount());
   
    conf.setNumMapTasks(300);     
    conf.setNumReduceTasks(0);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.map.max.attempts", 10);

    job2.setJobName("ConvertMap2Pairs");
   
    FileSystem fs = FileSystem.get(job2);
   
    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    int blockSize = getConf().getInt("Ivory.BlockSize", 0);   
    int numDocs = env.readCollectionDocumentCount();
    int numBlocks = numDocs / blockSize + 1;
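    // Rounds up, but yields one extra block when numDocs divides evenly by blockSize;
    // also assumes Ivory.BlockSize was set to a positive value (the default of 0 would throw).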

    String inputPath = null;
    for (int i = 0; i < numBlocks; i++) {
      inputPath = conf.get("Ivory.PCPOutputPath")+"/block"+i;      //one block of output of PCP algorithm


