Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.CrawlDatum
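Before the extracted examples, a minimal self-contained sketch of constructing and inspecting a CrawlDatum directly. This is only an illustration (assuming Nutch 1.x on the classpath; the status, time and score values are arbitrary):

import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumSketch {
  public static void main(String[] args) {
    // Create an empty datum and mark the URL as not yet fetched
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);

    // Record when the URL should be fetched and give it an initial score
    datum.setFetchTime(System.currentTimeMillis());
    datum.setScore(1.0f);

    // getStatusName() turns the numeric status byte into a readable label
    System.out.println(CrawlDatum.getStatusName(datum.getStatus())
        + " at " + datum.getFetchTime());
  }
}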


    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      Assert.assertTrue(sampleTexts[i].equals(text));
View Full Code Here


    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      Assert.assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here
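
The two fragments above come from parser tests and omit their field declarations. A rough, self-contained version of the same fetch-and-parse flow might look like the following; the local file path is hypothetical, and every API call is taken from the fragments above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

public class FetchAndParseSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();

    // Hypothetical local file to fetch with the file: protocol
    String urlString = "file:/tmp/sample/test.html";

    // Resolve the protocol plugin for the URL and fetch the raw content;
    // an empty CrawlDatum stands in for the crawl state of the URL
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // Run the configured parser plugins and pick the parse for this URL
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    System.out.println(parse.getText());
  }
}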

  protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception  {
    // Get a MapFile reader for the <Text,CrawlDatum> pairs
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
   
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    byte finalStatus = 0x0;
   
    for (MapFile.Reader reader : readers) {
      while (reader.next(key, value)) {
        LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus()));
       
        // Only consider fetch status
        if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) {
          finalStatus = value.getStatus();
        }
      }
     
      // Close the reader
      reader.close();
View Full Code Here

   
    // The URL that redirects to our main URL
    String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/";
   
    // The CrawlDatum value we will write
    CrawlDatum value = new CrawlDatum();
   
    // Path of the segment's crawl_fetch directory
    Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");

    // Get a writer for map files containing <Text,CrawlDatum> pairs
    MapFile.Writer writer = new MapFile.Writer(conf, fs, crawlFetchPath.toString(), Text.class, CrawlDatum.class);

    // If we're handling a redirect, first add the linked datum
    // - before the redirect status, because url sorts before redirectUrl
    // - before the fetch status, to check whether the fetch datum is preferred over the linked datum when merging
    if (redirect) {
      // Write our main record URL with status linked
      LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
      value = new CrawlDatum();
      value.setStatus(CrawlDatum.STATUS_LINKED);
      writer.append(new Text(url), value);
    }

    // Whether we're fetching now
    if (fetch) {
      LOG.info(url + " > " + CrawlDatum.getStatusName(status));
     
      // Set the status
      value.setStatus(status);

      // Write the pair
      writer.append(new Text(url), value);
    }

    // Whether we're handling a redirect now
    if (redirect) {
      // And the redirect URL with redirect status, pointing to our main URL
      LOG.info(redirectUrl + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
      value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
      writer.append(new Text(redirectUrl), value);
    }
   
    // Close the writer
    writer.close();
View Full Code Here

   * important that segments be named in an increasing lexicographic order as
   * their creation time increases.
   */
  public void reduce(Text key, Iterator<MetaWrapper> values,
      OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
    String lastPTname = null;
    TreeMap<String, ArrayList<CrawlDatum>> linked =
      new TreeMap<String, ArrayList<CrawlDatum>>();
    while (values.hasNext()) {
      MetaWrapper wrapper = values.next();
      Object o = wrapper.get();
      String spString = wrapper.getMeta(SEGMENT_PART_KEY);
      if (spString == null) {
        throw new IOException("Null segment part, key=" + key);       
      }
      SegmentPart sp = SegmentPart.parse(spString);
      if (o instanceof CrawlDatum) {
        CrawlDatum val = (CrawlDatum)o;
        // check which output dir it belongs to
        if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
          if (lastG == null) {
            lastG = val;
            lastGname = sp.segmentName;
          } else {
            // take newer
            if (lastGname.compareTo(sp.segmentName) < 0) {
              lastG = val;
              lastGname = sp.segmentName;
            }
          }
        } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
          // only consider fetch status and ignore fetch retry status
          // https://issues.apache.org/jira/browse/NUTCH-1520
          // https://issues.apache.org/jira/browse/NUTCH-1113
          if (CrawlDatum.hasFetchStatus(val) &&
            val.getStatus() != CrawlDatum.STATUS_FETCH_RETRY &&
            val.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            if (lastF == null) {
              lastF = val;
              lastFname = sp.segmentName;
            } else {
              if (lastFname.compareTo(sp.segmentName) < 0) {
                lastF = val;
                lastFname = sp.segmentName;
              }
            }
          }
        } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
          if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
            if (lastSig == null) {
              lastSig = val;
              lastSigname = sp.segmentName;
            } else {
              // take newer
              if (lastSigname.compareTo(sp.segmentName) < 0) {
                lastSig = val;
                lastSigname = sp.segmentName;
              }
            }
            continue;
          }
          // collect all LINKED values from the latest segment
          ArrayList<CrawlDatum> segLinked = linked.get(sp.segmentName);
          if (segLinked == null) {
            segLinked = new ArrayList<CrawlDatum>();
            linked.put(sp.segmentName, segLinked);
          }
          segLinked.add(val);
        } else {
          throw new IOException("Cannot determine segment part: " + sp.partName);
        }
      } else if (o instanceof Content) {
        if (lastC == null) {
          lastC = (Content)o;
          lastCname = sp.segmentName;
        } else {
          if (lastCname.compareTo(sp.segmentName) < 0) {
            lastC = (Content)o;
            lastCname = sp.segmentName;
          }
        }
      } else if (o instanceof ParseData) {
        if (lastPD == null) {
          lastPD = (ParseData)o;
          lastPDname = sp.segmentName;
        } else {
          if (lastPDname.compareTo(sp.segmentName) < 0) {
            lastPD = (ParseData)o;
            lastPDname = sp.segmentName;
          }
        }
      } else if (o instanceof ParseText) {
        if (lastPT == null) {
          lastPT = (ParseText)o;
          lastPTname = sp.segmentName;
        } else {
          if (lastPTname.compareTo(sp.segmentName) < 0) {
            lastPT = (ParseText)o;
            lastPTname = sp.segmentName;
          }
        }
      }
    }
    // perform filtering based on full merge record
    if (mergeFilters != null &&
       !mergeFilters.filter(key, lastG, lastF, lastSig, lastC, lastPD, lastPT,
                    linked.isEmpty() ? null : linked.lastEntry().getValue())){
      return;
    }

    curCount++;
    String sliceName = null;
    MetaWrapper wrapper = new MetaWrapper();
    if (sliceSize > 0) {
      sliceName = String.valueOf(curCount / sliceSize);
      wrapper.setMeta(SEGMENT_SLICE_KEY, sliceName);
    }
    SegmentPart sp = new SegmentPart();
    // now output the latest values
    if (lastG != null) {
      wrapper.set(lastG);
      sp.partName = CrawlDatum.GENERATE_DIR_NAME;
      sp.segmentName = lastGname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastF != null) {
      wrapper.set(lastF);
      sp.partName = CrawlDatum.FETCH_DIR_NAME;
      sp.segmentName = lastFname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastSig != null) {
      wrapper.set(lastSig);
      sp.partName = CrawlDatum.PARSE_DIR_NAME;
      sp.segmentName = lastSigname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastC != null) {
      wrapper.set(lastC);
      sp.partName = Content.DIR_NAME;
      sp.segmentName = lastCname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastPD != null) {
      wrapper.set(lastPD);
      sp.partName = ParseData.DIR_NAME;
      sp.segmentName = lastPDname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastPT != null) {
      wrapper.set(lastPT);
      sp.partName = ParseText.DIR_NAME;
      sp.segmentName = lastPTname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (linked.size() > 0) {
      String name = linked.lastKey();
      sp.partName = CrawlDatum.PARSE_DIR_NAME;
      sp.segmentName = name;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      ArrayList<CrawlDatum> segLinked = linked.get(name);
      for (int i = 0; i < segLinked.size(); i++) {
        CrawlDatum link = segLinked.get(i);
        wrapper.set(link);
        output.collect(key, wrapper);
      }
    }
  }
View Full Code Here

  public void reduce(Text key, Iterator<NutchWritable> values,
                     OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;

    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        final CrawlDatum datum = (CrawlDatum)value;
        if (CrawlDatum.hasDbStatus(datum)) {
          dbDatum = datum;
        }
        else if (CrawlDatum.hasFetchStatus(datum)) {
          // don't index unmodified (empty) pages
          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            fetchDatum = datum;
          }
        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
                   CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
          continue;
        } else {
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
        }
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;

        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
View Full Code Here
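
The reducer above unwraps NutchWritable values coming from several segment parts; on the map side those values are wrapped the same way before being collected. A minimal sketch of that wrapping, assuming the org.apache.nutch.crawl.NutchWritable wrapper from Nutch 1.x (the helper method and its names are illustrative only):

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;

public class WrapSketch {
  // Emit a CrawlDatum under its URL key, wrapped so that datums, parse data
  // and inlinks can all travel through the same shuffle as NutchWritable
  static void emit(Text url, CrawlDatum datum,
      OutputCollector<Text, NutchWritable> output) throws IOException {
    output.collect(url, new NutchWritable(datum));
  }
}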

    try {
      String uri = req.getUri().toString();
      LOG.info("URI: " + uri);
      addMyHeader(res, "URI", uri);
      Text url = new Text(uri);
      CrawlDatum cd = seg.getCrawlDatum(url);
      if (cd != null) {
        addMyHeader(res, "Res", "found");
        LOG.info("-got " + cd.toString());
        ProtocolStatus ps = (ProtocolStatus)cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
        if (ps != null) {
          Integer TrCode = protoCodes.get(ps.getCode());
          if (TrCode != null) {
            res.setStatus(TrCode.intValue());           
          } else {
View Full Code Here

        if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
          // enough .. let's simply
          // read all the entries from the input without processing them
          try {
            Text url = new Text();
            CrawlDatum datum = new CrawlDatum();
            hasMore = reader.next(url, datum);
            timelimitcount++;
          } catch (IOException e) {
            LOG.error("QueueFeeder error reading input, record " + cnt, e);
            return;
          }
          continue;
        }
        int feed = size - queues.getTotalSize();
        if (feed <= 0) {
          // queues are full - spin-wait until they have some free space
          try {
            Thread.sleep(1000);
          } catch (Exception e) {}
          continue;
        } else {
          LOG.debug("-feeding " + feed + " input urls ...");
          while (feed > 0 && hasMore) {
            try {
              Text url = new Text();
              CrawlDatum datum = new CrawlDatum();
              hasMore = reader.next(url, datum);
              if (hasMore) {
                queues.addFetchItem(url, datum);
                cnt++;
                feed--;
View Full Code Here
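
The QueueFeeder loop above pulls <Text, CrawlDatum> entries off a reader for the generated fetch list. A standalone sketch of iterating such a part file follows; the segment path is hypothetical, and it assumes the part is a SequenceFile of <Text, CrawlDatum> pairs, as the loop implies:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class FetchListSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Hypothetical path to one part of a segment's generated fetch list
    Path fetchList = new Path(
        "crawl/segments/20180101000000/" + CrawlDatum.GENERATE_DIR_NAME + "/part-00000");

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchList, conf);
    try {
      Text url = new Text();
      CrawlDatum datum = new CrawlDatum();
      // next() fills both writables and returns false at end of file
      while (reader.next(url, datum)) {
        System.out.println(url + "\t" + CrawlDatum.getStatusName(datum.getStatus()));
      }
    } finally {
      reader.close();
    }
  }
}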

    public CrawlDatum getCrawlDatum(Text url) throws IOException {
      synchronized (crawlLock) {
        if (crawl == null)
          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
      }
      return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
    }
View Full Code Here
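
getCrawlDatum() above performs a keyed lookup against the segment's crawl_fetch MapFiles through a private helper. A rough equivalent using Hadoop's MapFileOutputFormat directly could look like this; the segment path is hypothetical, and it assumes the segment was partitioned with the default HashPartitioner so that getEntry() can route the key to the right part file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumLookupSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Hypothetical segment path and URL to look up
    Path segment = new Path("crawl/segments/20180101000000");
    Text url = new Text("http://nutch.apache.org/");

    // Open one reader per part-* MapFile under crawl_fetch
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(
        fs, new Path(segment, CrawlDatum.FETCH_DIR_NAME), conf);

    // Route the key to its part file and read the CrawlDatum, if any
    CrawlDatum value = new CrawlDatum();
    CrawlDatum found = (CrawlDatum) MapFileOutputFormat.getEntry(
        readers, new HashPartitioner<Text, CrawlDatum>(), url, value);
    System.out.println(found == null ? "not found"
        : CrawlDatum.getStatusName(found.getStatus()));

    for (MapFile.Reader reader : readers) {
      reader.close();
    }
  }
}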

    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
      cnt = 0L;
      long start = Long.MAX_VALUE;
      long end = Long.MIN_VALUE;
      CrawlDatum value = new CrawlDatum();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (value.getFetchTime() < start) start = value.getFetchTime();
          if (value.getFetchTime() > end) end = value.getFetchTime();
        }
        mreaders[i].close();
      }
      stats.start = start;
      stats.end = end;
      stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
      cnt = 0L;
      long errors = 0L;
      ParseData value = new ParseData();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (!value.getStatus().isSuccess()) errors++;
        }
        mreaders[i].close();
      }
      stats.parsed = cnt;
      stats.parseErrors = errors;
View Full Code Here
