Package edu.umd.cloud9.io

Examples of edu.umd.cloud9.io.FSLineReader


          "-output_path=" + indexRootPath + "/wiki-docid-tmp",
          "-output_file=" + mappingFile.toString(),
          "-wiki_language=" + collectionLang };
      LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));

      BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
      tool.setConf(conf);
      tool.run(arr);

      fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
    } else {
      LOG.info("Docno mapping already exists at: " + mappingFile);
    }

    // Repack Wikipedia into sequential compressed block
    if (!fs.exists(new Path(seqCollection + "/part-00000"))) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { "-input=" + rawCollection,
          "-output=" + seqCollection,
          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);     
    }

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
View Full Code Here


        "-mapping_file=" + mappingFile.toString(),
        "-compression_type=block",
        "-wiki_language=" + collectionLang };
    LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

    RepackWikipedia tool = new RepackWikipedia();
    tool.setConf(conf);
    tool.run(arr);

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
    conf.setInt(Constants.NumMapTasks, numMappers);
    conf.setInt(Constants.NumReduceTasks, numReducers);
    conf.set(Constants.CollectionPath, seqCollection);
View Full Code Here

    // Repack Wikipedia into sequential compressed block
    p = new Path(seqCollection);
    if (!fs.exists(p)) {
      LOG.info(seqCollection + " doesn't exist, creating...");
      String[] arr = new String[] { rawCollection, seqCollection, mappingFile.toString(), "block"};
      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    }

    conf.set("Ivory.CollectionName", "Wikipedia-"+collectionLang);
    conf.setInt("Ivory.NumMapTasks", numMappers);
    conf.setInt("Ivory.NumReduceTasks", numReducers);
View Full Code Here

          "-mapping_file=" + mappingFile.toString(),
          "-compression_type=block",
          "-wiki_language=" + collectionLang };
      LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

      RepackWikipedia tool = new RepackWikipedia();
      tool.setConf(conf);
      tool.run(arr);
    } else {
      LOG.info("Repacked collection already exists at: " + seqCollection);     
    }

    conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
View Full Code Here

        if (fileStats[i].getPath().getName().startsWith("_")) {
          continue;
        }

        LOG.info("processing " + fileStats[i].getPath());
        FSLineReader reader = new FSLineReader(fileStats[i].getPath(), fs);

        Text line = new Text();
        while (reader.readLine(line) > 0) {
          String[] arr = line.toString().split("\\t+", 2);

          int docno = Integer.parseInt(arr[0]);
          int len = Integer.parseInt(arr[1]);

          // Note that because of speculative execution there may be
          // multiple copies of doclength data. Therefore, we can't
          // just count number of doclengths read. Instead, keep track
          // of largest docno encountered.
          if (docno < docnoOffset) {
            throw new RuntimeException(
                "Error: docno " + docno + " < docnoOffset " + docnoOffset + "!");
          }

          doclengths[docno - docnoOffset] = len;

          if (docno > maxDocno) {
            maxDocno = docno;
          }
          if (docno < minDocno) {
            minDocno = docno;
          }
        }
        reader.close();
        context.getCounter(DocLengths.Files).increment(1);
      }

      LOG.info("min docno: " + minDocno);
      LOG.info("max docno: " + maxDocno);
View Full Code Here

      }

      if(localFiles!=null && localFiles.length > 0){
        samplesMap = new HMapIIW();
        try {
          FSLineReader reader = new FSLineReader(localFiles[0], FileSystem.getLocal(job));
          Text t = new Text();
          while(reader.readLine(t)!=0){
            int docno = Integer.parseInt(t.toString());
            samplesMap.put(docno, 1);
          }
          reader.close();
        } catch (IOException e1) {
        }
        sLogger.info(samplesMap);
      }
    }
View Full Code Here

      // if cache is non-empty, a docnos file has been entered
      if(localFiles != null){
        sLogger.setLevel(Level.INFO);
        samplesMap = new HMapII();
        try {
          FSLineReader reader = new FSLineReader(localFiles[0], FileSystem.getLocal(conf));
          Text t = new Text();
          while(reader.readLine(t)!=0){
            int docno = Integer.parseInt(t.toString());
            samplesMap.put(docno, 1);
          }
          reader.close();
        } catch (IOException e1) {
        }
        sLogger.info(samplesMap);
      }
    }
View Full Code Here

      FileSystem fs = FileSystem.get(conf);

      sLogger.info("reading " + inputFile);

      FSLineReader reader = new FSLineReader(new Path(inputFile), fs);
      FSDataOutputStream writer = fs.create(new Path(outputFile), true);

      Text line = new Text();
      while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        String docid = arr[2];
        int rank = Integer.parseInt(arr[3]);

        long start = System.currentTimeMillis();
        ClueWarcRecord doc = null;

        for (int i = 0; i < 10; i++) {
          doc = indexes[i].getDocument(docid);
          if (doc != null)
            break;
        }
        String url = doc.getHeaderMetadataItem("WARC-Target-URI");
        long duration = System.currentTimeMillis() - start;

        reporter.incrCounter(MyCounter.Count, 1);
        reporter.incrCounter(MyCounter.Time, duration);

        if (rank == 1 || rank % 100 == 0)
          sLogger.info(line + " " + url + " (" + duration + "ms)");
        writer.write(new String(line + " " + url + "\n").getBytes());
      }

      reader.close();
      writer.close();

    }
View Full Code Here

      FileSystem fs = FileSystem.get(conf);

      sLogger.info("reading " + inputFile);

      FSLineReader reader = new FSLineReader(new Path(inputFile), fs);
      FSDataOutputStream writer = fs.create(new Path(outputFile), true);

      Text line = new Text();
      while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        String docid = arr[2];
        int rank = Integer.parseInt(arr[3]);

        long start = System.currentTimeMillis();
        String url = findex.getDocument(docid).getHeaderMetadataItem("WARC-Target-URI");
        long duration = System.currentTimeMillis() - start;

        reporter.incrCounter(MyCounter.Count, 1);
        reporter.incrCounter(MyCounter.Time, duration);

        if (rank == 1 || rank % 100 == 0)
          sLogger.info(line + " " + url + " (" + duration + "ms)");
        writer.write(new String(line + " " + url + "\n").getBytes());
      }

      reader.close();
      writer.close();

    }
View Full Code Here

     
      if(localFiles.length > 1){
        dotProductThresholds = new float[D];
        int i = 0;
        try {
          FSLineReader reader = new FSLineReader(localFiles[1], FileSystem.getLocal(job));
          Text t = new Text();
          while(reader.readLine(t)!=0){
            float val = Float.parseFloat(t.toString());
            sLogger.debug(i + " --> "+val);
            dotProductThresholds[i] = val;
          }
          reader.close();
        } catch (IOException e1) {
        }
        sLogger.info("Dot product thresholds read");
      }else{
        sLogger.info("Dot product thresholds file not specified in option Ivory.DotProdThreshFile");
View Full Code Here

TOP

Related Classes of edu.umd.cloud9.io.FSLineReader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.