Package org.apache.nutch.io

Examples of org.apache.nutch.io.LongWritable


    // XXX One should check for this later, when actually
    // XXX reading the entries.
    FetcherOutput fo = new FetcherOutput();
    fetcherReader.next(fo);
    started = fo.getFetchDate();
    LongWritable w = new LongWritable(-1);
    try {
      fetcherReader.finalKey(w);
    } catch (Throwable eof) {
      // the file is truncated - probably due to a crashed fetcher.
      // Use just the part that we can...
      LOG.warning(" - data in segment " + dir + " is corrupt, using only " + w.get() + " entries.");
    }
    // go back until you get a good entry
    size = w.get()+1;
    boolean ok = false;
    int back = 0;
    do {
      try {
        fetcherReader.seek(size - 2 - back);
View Full Code Here


      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
View Full Code Here

    }

    public void run() {
      for (int i = 0; i < count; i++) {
        try {
          LongWritable param = new LongWritable(RANDOM.nextLong());
          LongWritable value =
            (LongWritable)client.call(param, new InetSocketAddress(PORT));
          if (!param.equals(value)) {
            LOG.severe("Call failed!");
            failed = true;
            break;
View Full Code Here

    public void run() {
      for (int i = 0; i < count; i++) {
        try {
          Writable[] params = new Writable[addresses.length];
          for (int j = 0; j < addresses.length; j++)
            params[j] = new LongWritable(RANDOM.nextLong());
          Writable[] values = client.call(params, addresses);
          for (int j = 0; j < addresses.length; j++) {
            if (!params[j].equals(values[j])) {
              LOG.severe("Call failed!");
              failed = true;
View Full Code Here

    // tokenize the value
    StringTokenizer st = new StringTokenizer(text);
    while (st.hasMoreTokens()) {
      // output <token,1> pairs
      output.collect(new UTF8(st.nextToken()), new LongWritable(1));
    }
  }
View Full Code Here

  public void map(WritableComparable key, Writable value,
                  OutputCollector output) throws IOException {
    String text = ((UTF8)value).toString();
    Matcher matcher = pattern.matcher(text);
    while (matcher.find()) {
      output.collect(new UTF8(matcher.group(group)), new LongWritable(1));
    }
  }
View Full Code Here

    while (values.hasNext()) {
      sum += ((LongWritable)values.next()).get();
    }

    // output sum
    output.collect(key, new LongWritable(sum));
  }
View Full Code Here

    // XXX One should check for this later, when actually
    // XXX reading the entries.
    FetcherOutput fo = new FetcherOutput();
    fetcherReader.next(fo);
    started = fo.getFetchDate();
    LongWritable w = new LongWritable(-1);
    try {
      fetcherReader.finalKey(w);
    } catch (Throwable eof) {
      // the file is truncated - probably due to a crashed fetcher.
      // Use just the part that we can...
      LOG.warning(" - data in segment " + dir + " is corrupt, using only " + w.get() + " entries.");
    }
    // go back until you get a good entry
    size = w.get()+1;
    boolean ok = false;
    int back = 0;
    do {
      try {
        fetcherReader.seek(size - 2 - back);
View Full Code Here

      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
View Full Code Here

TOP

Related Classes of org.apache.nutch.io.LongWritable

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.