Package org.apache.nutch.fetcher

Examples of org.apache.nutch.fetcher.FetcherOutput


              // Score at this stage is 1.0f.
              metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore())); // TODO MC
                               
              // WritableComparable outkey = new UTF8(d.urlString);
              WritableComparable outkey = new Text(url);
              Writable outvalue = new FetcherOutput(datum, null, new ParseImpl(parse));                
                   
              // output.collect(outkey, outvalue);
              Text key=Nutchwax.generateWaxKey(outkey, collectionName);
              output.collect(key, outvalue);                       
            }
View Full Code Here


        LOG.warn(getParseRateLogMessage(url, noSpacesMimetype,
          kbPerSecond));
      }
    }

    Writable v = new FetcherOutput(datum, null,
      parse != null ? new ParseImpl(parse) : null);      
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      LOG.info("multiple: "+SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())+" "+url);
      output.collect(Nutchwax.generateWaxKey(url,SqlSearcher.getCollectionNameWithTimestamp(this.collectionName,arcData.getDate())), v);
    }
View Full Code Here

        }

        public void write(WritableComparable key, Writable value)
          throws IOException
        {                
          FetcherOutput fo = (FetcherOutput)value;
          MapWritable mw = fo.getCrawlDatum().getMetaData();
          Text cdxLine = (Text)mw.get(ImportArcs.CDXKEY);
         
          if (cdxLine != null)
          {
            cdxOut.append(key, cdxLine);
          }
         
          mw.remove(ImportArcs.CDXKEY);
          fetchOut.append(key, fo.getCrawlDatum());
         
          if (fo.getParse() != null)
          {
            parseOut.write(key, fo.getParse());        
          }
        }

        public void close(Reporter reporter) throws IOException
        {
View Full Code Here

      readers.add(sr);
    }
    LOG.info("Input: " + total + " entries in " + readers.size() + " segments.");
    if (!parsed)
      LOG.warning(" - some input segments are non-parsed, forcing non-parsed output!");
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long outputCnt = 0L;
    int segCnt = 1;
    File outDir = new File(output, SegmentWriter.getNewSegmentName());
    LOG.info("Writing output in " + output);
    try {
      LOG.info(" - starting first output segment in " + outDir.getName());
      SegmentWriter sw = new SegmentWriter(nfs,
            outDir, true, parsed, withContent, withParseText, withParseData);
      long delta = System.currentTimeMillis();
      for (int i = 0; i < readers.size(); i++) {
        SegmentReader sr = (SegmentReader)readers.get(i);
        for (long k = 0L; k < sr.size; k++) {
          try {
            if (!sr.next(fo, co, pt, pd)) break;
          } catch (Throwable t) {
            LOG.warning(" - error reading entry #" + k + " from " + sr.segmentDir.getName());
            break;
          }
          // try to filter url
          String url = fo.getUrl().toString();
          boolean toSave = true; // default to save if no pattern given
          if (this.pattern != null) {
            if (matcher.contains(url,this.pattern)) {
              toSave = plusSign ? true: false;
            } else {
View Full Code Here

    // XXX We assume that all other data files contain the
    // XXX same number of valid entries - which is not always
    // XXX true if Fetcher crashed in the middle of update.
    // XXX One should check for this later, when actually
    // XXX reading the entries.
    FetcherOutput fo = new FetcherOutput();
    fetcherReader.next(fo);
    started = fo.getFetchDate();
    LongWritable w = new LongWritable(-1);
    try {
      fetcherReader.finalKey(w);
    } catch (Throwable eof) {
      // the file is truncated - probably due to a crashed fetcher.
      // Use just the part that we can...
      LOG.warning(" - data in segment " + dir + " is corrupt, using only " + w.get() + " entries.");
    }
    // go back until you get a good entry
    size = w.get()+1;
    boolean ok = false;
    int back = 0;
    do {
      try {
        fetcherReader.seek(size - 2 - back);
        fetcherReader.next(fo);
        ok = true;
      } catch (Throwable t) {
        back++;
      }
    } while (!ok && back < 10);
    if (back >= 10)
      throw new Exception(" - fetcher output is unreadable");
    if (back > 0) LOG.warning(" - fetcher output truncated by " + back + " to " + size);
    size = size - back;
    finished = fo.getFetchDate();
    // reposition to the start
    fetcherReader.reset();
  }
View Full Code Here

   * @param output where to dump to
   * @throws Exception
   */
  public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
    reset();
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long recNo = 0L;
    if (!sorted) {
      while(next(fo, co, pt, pd)) {
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
    } else {
      File unsortedFile = new File(segmentDir, ".unsorted");
      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
View Full Code Here

      processedRecords = 0L;
      delta = System.currentTimeMillis();
      while (it.hasNext()) {
        SegmentReader sr = (SegmentReader) it.next();
        String name = sr.segmentDir.getName();
        FetcherOutput fo = new FetcherOutput();
        for (long i = 0; i < sr.size; i++) {
          try {
            if (!sr.get(i, fo, null, null, null)) break;

            Document doc = new Document();
           
            // compute boost
            float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
                    scorePower, boostByLinkCount, fo.getAnchors().length);
            doc.add(new Field("sd", name + "|" + i, true, false, false));
            doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
            doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
            doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
            doc.add(new Field("score", boost + "", true, false, false));
            doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
            iw.addDocument(doc);
            processedRecords++;
            if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
              LOG.info(" Processed " + processedRecords + " records (" +
                      (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
              delta = System.currentTimeMillis();
            }
            if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
              iw.optimize();
              iw.close();
              LOG.info(" - creating next subindex...");
              masterDir = new File(fsmtIndexDir, "" + masters.size());
              if (!masterDir.mkdirs()) {
                LOG.severe("Could not create a master index dir: " + masterDir);
                return;
              }
              masters.add(masterDir);
              iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
              iw.setUseCompoundFile(false);
              iw.mergeFactor = INDEX_MERGE_FACTOR;
              iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
            }
          } catch (Throwable t) {
            // we can assume the data is invalid from now on - break here
            LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
            break;
          }
        }
      }
      iw.optimize();
      LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
      s1 = System.currentTimeMillis();
      // merge all other indexes using the latest IndexWriter (still open):
      if (masters.size() > 1) {
        LOG.info(" - merging subindexes...");
        stage = SegmentMergeStatus.STAGE_MERGEIDX;
        IndexReader[] ireaders = new IndexReader[masters.size() - 1];
        for (int i = 0; i < masters.size() - 1; i++) ireaders[i] = IndexReader.open((File)masters.get(i));
        iw.addIndexes(ireaders);
        for (int i = 0; i < masters.size() - 1; i++) {
          ireaders[i].close();
          FileUtil.fullyDelete((File)masters.get(i));
        }
      }
      iw.close();
      LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
      LOG.info("* Removing duplicate entries...");
      stage = SegmentMergeStatus.STAGE_DEDUP;
      IndexReader ir = IndexReader.open(masterDir);
      int i = 0;
      long cnt = 0L;
      processedRecords = 0L;
      s1 = System.currentTimeMillis();
      delta = s1;
      TermEnum te = ir.terms();
      while(te.next()) {
        Term t = te.term();
        if (t == null) continue;
        if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
        cnt++;
        processedRecords = cnt / 2;
        if (cnt > 0 && (cnt % (LOG_STEP  * 2) == 0)) {
          LOG.info(" Processed " + processedRecords + " records (" +
                  (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
          delta = System.currentTimeMillis();
        }
        // Enumerate all docs with the same URL hash or content hash
        TermDocs td = ir.termDocs(t);
        if (td == null) continue;
        if (t.field().equals("uh")) {
          // Keep only the latest version of the document with
          // the same url hash. Note: even if the content
          // hash is identical, other metadata may be different, so even
          // in this case it makes sense to keep the latest version.
          int id = -1;
          String time = null;
          Document doc = null;
          while (td.next()) {
            int docid = td.doc();
            if (!ir.isDeleted(docid)) {
              doc = ir.document(docid);
              if (time == null) {
                time = doc.get("time");
                id = docid;
                continue;
              }
              String dtime = doc.get("time");
              // "time" is a DateField, and can be compared lexicographically
              if (dtime.compareTo(time) > 0) {
                if (id != -1) {
                  ir.delete(id);
                }
                time = dtime;
                id = docid;
              } else {
                ir.delete(docid);
              }
            }
          }
        } else if (t.field().equals("ch")) {
          // Keep only the version of the document with
          // the highest score, and then with the shortest url.
          int id = -1;
          int ul = 0;
          float score = 0.0f;
          Document doc = null;
          while (td.next()) {
            int docid = td.doc();
            if (!ir.isDeleted(docid)) {
              doc = ir.document(docid);
              if (ul == 0) {
                try {
                  ul = Integer.parseInt(doc.get("ul"));
                  score = Float.parseFloat(doc.get("score"));
                } catch (Exception e) {};
                id = docid;
                continue;
              }
              int dul = 0;
              float dscore = 0.0f;
              try {
                dul = Integer.parseInt(doc.get("ul"));
                dscore = Float.parseFloat(doc.get("score"));
              } catch (Exception e) {};
              int cmp = Float.compare(dscore, score);
              if (cmp == 0) {
                // equal scores, select the one with shortest url
                if (dul < ul) {
                  if (id != -1) {
                    ir.delete(id);
                  }
                  ul = dul;
                  id = docid;
                } else {
                  ir.delete(docid);
                }
              } else if (cmp < 0) {
                ir.delete(docid);
              } else {
                if (id != -1) {
                  ir.delete(id);
                }
                ul = dul;
                id = docid;
              }
            }
          }
        }
      }
      //
      // keep the IndexReader open...
      //
     
      LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
      stage = SegmentMergeStatus.STAGE_WRITING;
      processedRecords = 0L;
      Vector outDirs = new Vector();
      File outDir = new File(output, SegmentWriter.getNewSegmentName());
      outDirs.add(outDir);
      LOG.info("* Merging all segments into " + output.getName());
      s1 = System.currentTimeMillis();
      delta = s1;
      nfs.mkdirs(outDir);
      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
View Full Code Here

      (new File(directory, ParseText.DIR_NAME)).getPath(), ParseText.class);

    try {
      LongWritable key = new LongWritable();
      ParserOutput val = new ParserOutput();
      FetcherOutput fo = new FetcherOutput();
      int count = 0;
      int status;
      while (parserOutputReader.next(key,val)) {
        fetcherNPReader.next(fo);
        // safe guarding
View Full Code Here

     * This thread participates in looping through
     * entries of FetcherOutput and Content
     */
    public void run() {

      FetcherOutput fetcherOutput = new FetcherOutput();
      Content content = new Content();

      FetchListEntry fle = null;
      String url = null;

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit

        t0 = System.currentTimeMillis();

        try {

          // must be read in order! thus synchronize threads.
          synchronized (ParseSegment.this) {
            t1 = System.currentTimeMillis();

            try {
              if (fetcherNPReader.next(fetcherOutput) == null ||
                contentReader.next(content) == null)
              return;
            } catch (EOFException eof) {
              // only partial data available, stop this thread,
              // other threads will be stopped also.
              return;
            }

            entry++;
            myEntry = entry;
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("Read in entry "+entry);

            // safe guard against mismatched files
            //if (entry != fetcherNPReader.key() ||
            //    entry != contentReader.key()) {
            //  LOG.severe("Mismatched entries under "
            //    + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
            //  continue;
            //}
          }

          t2 = System.currentTimeMillis();

          fle = fetcherOutput.getFetchListEntry();
          url = fle.getPage().getURL().toString();

          LOG.fine("parsing " + url);            // parse the page

          // safe guard against mismatched files
View Full Code Here

        rnd = "/" + System.currentTimeMillis();
        url += rnd;
      }
      url += "/example.html";
      FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" + rnd });
      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), ProtocolStatus.STATUS_SUCCESS);
      StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
      if (unique) {
        content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
      }
      for (int k = 0; k < 10; k++) {
View Full Code Here

TOP

Related Classes of org.apache.nutch.fetcher.FetcherOutput

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.