Examples of FetcherOutput


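FetcherOutput is the per-page record that the Nutch fetcher writes into a segment: it carries the FetchListEntry, an MD5 checksum, a fetch date, and a status code (e.g. SUCCESS, CANT_PARSE). As a quick orientation before the examples below, here is a minimal construction sketch based on the test snippet further down; the URL and score are illustrative values:

    // Minimal sketch (values are illustrative): build a FetcherOutput
    // for a successfully fetched page, keyed by the MD5 of its URL.
    String url = "http://www.example.com/example.html";
    FetchListEntry fle =
      new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" });
    FetcherOutput fo =
      new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
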
Examples of net.nutch.fetcher.FetcherOutput

      (new File(directory, ParseText.DIR_NAME)).getPath(), ParseText.class);

    try {
      LongWritable key = new LongWritable();
      ParserOutput val = new ParserOutput();
      FetcherOutput fo = new FetcherOutput();
      int count = 0;
      int status;
      while (parserOutputReader.next(key,val)) {
        fetcherNPReader.next(fo);
        // safeguard against mismatched entries
        if (fetcherNPReader.key() != key.get())
          throw new IOException("Mismatch between entries under "
            + FetcherOutput.DIR_NAME_NP + " and in " + sortedFile.getName());
        // reset status in fo (FetcherOutput), using status in ParserOutput
        switch (val.getStatus()) {
        case ParserOutput.SUCCESS:
          fo.setStatus(FetcherOutput.SUCCESS);
          break;
        case ParserOutput.UNKNOWN:
        case ParserOutput.FAILURE:
          fo.setStatus(FetcherOutput.CANT_PARSE);
          break;
        case ParserOutput.NOFETCH:
        default:
          // do not reset
        }
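
The same mapping could be factored into a small helper; a sketch follows (the method name and the -1 "leave unchanged" sentinel are assumptions, not part of the original):

    // Hypothetical helper equivalent to the switch above: map a
    // ParserOutput status to a FetcherOutput status, or return -1
    // where the original code leaves fo's status untouched.
    private static int toFetcherStatus(int parserStatus) {
      switch (parserStatus) {
      case ParserOutput.SUCCESS:
        return FetcherOutput.SUCCESS;
      case ParserOutput.UNKNOWN:
      case ParserOutput.FAILURE:
        return FetcherOutput.CANT_PARSE;
      case ParserOutput.NOFETCH:
      default:
        return -1;
      }
    }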

Examples of net.nutch.fetcher.FetcherOutput

     * This thread participates in looping through
     * entries of FetcherOutput and Content
     */
    public void run() {

      FetcherOutput fetcherOutput = new FetcherOutput();
      Content content = new Content();

      FetchListEntry fle = null;
      String url = null;

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit

        t0 = System.currentTimeMillis();

        try {

          // must be read in order! thus synchronize threads.
          synchronized (ParseSegment.this) {
            t1 = System.currentTimeMillis();

            try {
              if (fetcherNPReader.next(fetcherOutput) == null ||
                  contentReader.next(content) == null)
                return;
            } catch (EOFException eof) {
              // only partial data available, stop this thread,
              // other threads will be stopped also.
              return;
            }

            entry++;
            myEntry = entry;
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("Read in entry "+entry);

            // safeguard against mismatched files
            //if (entry != fetcherNPReader.key() ||
            //    entry != contentReader.key()) {
            //  LOG.severe("Mismatched entries under "
            //    + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
            //  continue;
            //}
          }

          t2 = System.currentTimeMillis();

          fle = fetcherOutput.getFetchListEntry();
          url = fle.getPage().getURL().toString();

          LOG.fine("parsing " + url);            // parse the page

          // safeguard against mismatched files
          if (!url.equals(content.getUrl())) {
            LOG.severe("Mismatched entries under "
              + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
            continue;
          }

          // if fetch was successful or
          // previously unable to parse (so try again)
          if (fetcherOutput.getStatus() == FetcherOutput.SUCCESS ||
              fetcherOutput.getStatus() == FetcherOutput.CANT_PARSE) {
            handleContent(url, content);
            synchronized (ParseSegment.this) {
              pages++;                    // record successful parse
              bytes += content.getContent().length;
              if ((pages % 100) == 0)

Examples of net.nutch.fetcher.FetcherOutput

      processedRecords = 0L;
      delta = System.currentTimeMillis();
      while (it.hasNext()) {
        SegmentReader sr = (SegmentReader) it.next();
        String name = sr.segmentDir.getName();
        FetcherOutput fo = new FetcherOutput();
        for (long i = 0; i < sr.size; i++) {
          try {
            if (!sr.get(i, fo, null, null, null)) break;

            Document doc = new Document();
            doc.add(new Field("sd", name + "|" + i, true, false, false));
            doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
            doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
            doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
            iw.addDocument(doc);
            processedRecords++;
            if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
              LOG.info(" Processed " + processedRecords + " records (" +
                      (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
              delta = System.currentTimeMillis();
            }
            if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
              iw.optimize();
              iw.close();
              LOG.info(" - creating next subindex...");
              masterDir = new File(fsmtIndexDir, "" + masters.size());
              if (!masterDir.mkdirs()) {
                LOG.severe("Could not create a master index dir: " + masterDir);
                return;
              }
              masters.add(masterDir);
              iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
              iw.setUseCompoundFile(false);
              iw.mergeFactor = INDEX_MERGE_FACTOR;
              iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
            }
          } catch (Throwable t) {
            // we can assume the data is invalid from now on - break here
            LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
            break;
          }
        }
      }
      iw.optimize();
      LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
      s1 = System.currentTimeMillis();
      // merge all other indexes using the latest IndexWriter (still open):
      if (masters.size() > 1) {
        LOG.info(" - merging subindexes...");
        stage = SegmentMergeStatus.STAGE_MERGEIDX;
        IndexReader[] ireaders = new IndexReader[masters.size() - 1];
        for (int i = 0; i < masters.size() - 1; i++) ireaders[i] = IndexReader.open((File)masters.get(i));
        iw.addIndexes(ireaders);
        for (int i = 0; i < masters.size() - 1; i++) {
          ireaders[i].close();
          FileUtil.fullyDelete((File)masters.get(i));
        }
      }
      iw.close();
      LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
      LOG.info("* Removing duplicate entries...");
      stage = SegmentMergeStatus.STAGE_DEDUP;
      IndexReader ir = IndexReader.open(masterDir);
      int i = 0;
      long cnt = 0L;
      processedRecords = 0L;
      s1 = System.currentTimeMillis();
      delta = s1;
      TermEnum te = ir.terms();
      while(te.next()) {
        Term t = te.term();
        if (t == null) continue;
        if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
        cnt++;
        processedRecords = cnt / 2;
        if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
          LOG.info(" Processed " + processedRecords + " records (" +
                  (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
          delta = System.currentTimeMillis();
        }
        // Enumerate all docs with the same URL hash or content hash
        TermDocs td = ir.termDocs(t);
        if (td == null) continue;
        int id = -1;
        String time = null;
        Document doc = null;
        // Keep only the latest version of the document with
        // the same hash (url or content). Note: even if the content
        // hash is identical, other metadata may be different, so even
        // in this case it makes sense to keep the latest version.
        while (td.next()) {
          int docid = td.doc();
          if (!ir.isDeleted(docid)) {
            doc = ir.document(docid);
            if (time == null) {
              time = doc.get("time");
              id = docid;
              continue;
            }
            String dtime = doc.get("time");
            // "time" is a DateField, and can be compared lexicographically
            if (dtime.compareTo(time) > 0) {
              if (id != -1) {
                ir.delete(id);
              }
              time = dtime;
              id = docid;
            } else {
              ir.delete(docid);
            }
          }
        }
      }
      //
      // keep the IndexReader open...
      //
     
      LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
      stage = SegmentMergeStatus.STAGE_WRITING;
      processedRecords = 0L;
      Vector outDirs = new Vector();
      File outDir = new File(output, SegmentWriter.getNewSegmentName());
      outDirs.add(outDir);
      LOG.info("* Merging all segments into " + output.getName());
      s1 = System.currentTimeMillis();
      delta = s1;
      nfs.mkdirs(outDir);
      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
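
For reference, the temporary dedup index built above holds four fields per record; in this era of Lucene the boolean Field constructor arguments correspond to (store, index, tokenize):

    // "sd"   segment name + "|" + record number  (stored only)    - locates the record
    // "uh"   MD5 hash of the page URL            (stored+indexed) - URL duplicates
    // "ch"   MD5 hash of the page content        (stored+indexed) - content duplicates
    // "time" fetch date as a DateField string    (stored only)    - compared
    //        lexicographically to keep only the latest version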

Examples of net.nutch.fetcher.FetcherOutput

      readers.add(sr);
    }
    LOG.info("Input: " + total + " entries in " + readers.size() + " segments.");
    if (!parsed)
      LOG.warning(" - some input segments are non-parsed, forcing non-parsed output!");
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long outputCnt = 0L;
    int segCnt = 1;

Examples of net.nutch.fetcher.FetcherOutput

    // XXX We assume that all other data files contain the
    // XXX same number of valid entries - which is not always
    // XXX true if Fetcher crashed in the middle of update.
    // XXX One should check for this later, when actually
    // XXX reading the entries.
    FetcherOutput fo = new FetcherOutput();
    fetcherReader.next(fo);
    started = fo.getFetchDate();
    LongWritable w = new LongWritable();
    w.set(++size);
    try {
      while (fetcherReader.seek(w)) {
        w.set(++size);
      }
    } catch (Throwable eof) {
      // the file is truncated - probably due to a crashed fetcher.
      // Use just the part that we can...
      LOG.warning(" - data in segment " + dir + " is corrupt, using only " + size + " entries.");
    }
    // go back until you get a good entry
    boolean ok = false;
    int back = 0;
    do {
      try {
        fetcherReader.seek(size - 2 - back);
        fetcherReader.next(fo);
        ok = true;
      } catch (Throwable t) {
        back++;
      }
    } while (!ok && back < 10);
    if (back >= 10)
      throw new Exception(" - fetcher output is unreadable");
    size = size - back;
    if (back > 0) LOG.warning(" - fetcher output truncated by " + back + " to " + size);
    finished = fo.getFetchDate();
    // reposition to the start
    fetcherReader.reset();
  }

Examples of net.nutch.fetcher.FetcherOutput

   * @param output where to dump to
   * @throws Exception
   */
  public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
    reset();
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long recNo = 0L;
    if (!sorted) {
      while(next(fo, co, pt, pd)) {
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
    } else {
      File unsortedFile = new File(segmentDir, ".unsorted");
      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
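
A minimal usage sketch for dump (how the SegmentReader itself is opened is elided here; the merge example above iterates a collection of already-open readers):

    // Sketch: print every record of an open segment reader, unsorted;
    // pass true instead to sort the dump by URL first.
    SegmentReader sr = (SegmentReader) readers.get(0);  // assumes the readers list from the merge example
    sr.dump(false, System.out);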

Examples of net.nutch.fetcher.FetcherOutput

        rnd = "/" + System.currentTimeMillis();
        url += rnd;
      }
      url += "/example.html";
      FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" + rnd });
      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
      StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
      if (unique) {
        content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
      }
      for (int k = 0; k < 10; k++) {

Examples of org.apache.nutch.fetcher.FetcherOutput

              // Score at this stage is 1.0f.
              metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore())); // TODO MC
                               
              // WritableComparable outkey = new UTF8(d.urlString);
              WritableComparable outkey = new Text(url);
              Writable outvalue = new FetcherOutput(datum, null, new ParseImpl(parse));                
                   
              // output.collect(outkey, outvalue);
              Text key=Nutchwax.generateWaxKey(outkey, collectionName);
              output.collect(key, outvalue);                       
            }

Examples of org.apache.nutch.fetcher.FetcherOutput

        LOG.warn(getParseRateLogMessage(url, noSpacesMimetype,
          kbPerSecond));
      }
    }

    Writable v = new FetcherOutput(datum, null,
      parse != null ? new ParseImpl(parse) : null);      
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      LOG.info("multiple: "+SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())+" "+url);
      output.collect(Nutchwax.generateWaxKey(url,SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())), v);
    }

Examples of org.apache.nutch.fetcher.FetcherOutput

        }

        public void write(WritableComparable key, Writable value)
          throws IOException
        {                
          FetcherOutput fo = (FetcherOutput)value;
          MapWritable mw = fo.getCrawlDatum().getMetaData();
          Text cdxLine = (Text)mw.get(ImportArcs.CDXKEY);
         
          if (cdxLine != null)
          {
            cdxOut.append(key, cdxLine);
          }
         
          mw.remove(ImportArcs.CDXKEY);
          fetchOut.append(key, fo.getCrawlDatum());
         
          if (fo.getParse() != null)
          {
            parseOut.write(key, fo.getParse());        
          }
        }

        public void close(Reporter reporter) throws IOException
        {