Examples of CrawlDatum
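CrawlDatum is Nutch's per-URL crawl record: it carries a status byte, fetch time, score and arbitrary metadata, and is passed between the CrawlDb, generator, fetcher and indexer. As a primer for the excerpts below, here is a minimal sketch of building one by hand (a sketch only, assuming the Nutch 1.x API, where getMetaData() returns a Hadoop MapWritable):

    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDatum;

    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS); // status byte
    datum.setFetchTime(System.currentTimeMillis());   // last fetch time in ms
    datum.setScore(1.0f);                             // page score
    // arbitrary per-URL metadata travels with the datum
    datum.getMetaData().put(new Text("myKey"), new Text("myValue"));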


Examples of org.apache.nutch.crawl.CrawlDatum

    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);

    Inlinks inlinks = new Inlinks();

    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
    } catch (Exception e) {
      // the filter should not throw on a well-formed document
      Assert.fail(e.getMessage());
    }

Examples of org.apache.nutch.crawl.CrawlDatum

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      Assert.assertTrue(sampleTexts[i].equals(text));
    }

Examples of org.apache.nutch.crawl.CrawlDatum

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      Assert.assertTrue(parse.getText().equals(expectedText));
    }
  }

Examples of org.apache.nutch.crawl.CrawlDatum

  protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception  {
    // Get a MapFile reader for the <Text,CrawlDatum> pairs
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
   
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    byte finalStatus = 0x0;
   
    for (MapFile.Reader reader : readers) {
      while (reader.next(key, value)) {
        LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus()));
       
        // Only consider fetch status
        if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) {
          finalStatus = value.getStatus();
        }
      }
     
      // Close the reader again
      reader.close();
    }

    return finalStatus;
  }
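A hypothetical call site for checkMergedSegment (the expected status depends on the scenario being tested) would assert on the returned fetch status byte:

    byte finalStatus = checkMergedSegment(testDir, mergedSegment);
    Assert.assertEquals(CrawlDatum.STATUS_FETCH_SUCCESS, finalStatus);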

Examples of org.apache.nutch.crawl.CrawlDatum

   
    // The URL of our redirecting URL
    String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/";
   
    // Our value
    CrawlDatum value = new CrawlDatum();
   
    // Path of the segment's crawl_fetch directory
    Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");

    // Get a writer for map files containing <Text,CrawlDatum> pairs
    MapFile.Writer writer = new MapFile.Writer(conf, fs, crawlFetchPath.toString(), Text.class, CrawlDatum.class);

    // Whether we're handling a redirect now
    // first add the linked datum
    // - before redirect status because url sorts before redirectUrl
    // - before fetch status to check whether fetch datum is preferred over linked datum when merging
    if (redirect) {
      // We're writing our main record URL with status linked
      LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
      value = new CrawlDatum();
      value.setStatus(CrawlDatum.STATUS_LINKED);
      writer.append(new Text(url), value);
    }

    // Whether we're fetching now
    if (fetch) {
      LOG.info(url + " > " + CrawlDatum.getStatusName(status));
     
      // Set the status
      value.setStatus(status);

      // Write the pair
      writer.append(new Text(url), value);
    }

    // Whether we're handling a redirect now
    if (redirect) {
      // And the redirect URL with redirect status, pointing to our main URL
      LOG.info(redirectUrl + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
      value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
      writer.append(new Text(redirectUrl), value);
    }
   
    // Close the writer
    writer.close();
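Note that MapFile.Writer requires keys to be appended in sorted order, which is why the linked and fetch records for url are written before the entry for redirectUrl (as the comment above notes, url sorts before redirectUrl). A short sketch of reading the pairs back, assuming the same fs and conf:

    MapFile.Reader reader = new MapFile.Reader(fs, crawlFetchPath.toString(), conf);
    Text key = new Text();
    CrawlDatum datum = new CrawlDatum();
    while (reader.next(key, datum)) {
      LOG.info(key + " > " + CrawlDatum.getStatusName(datum.getStatus()));
    }
    reader.close();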

Examples of org.apache.nutch.crawl.CrawlDatum

   * important that segments be named in an increasing lexicographic order as
   * their creation time increases.
   */
  public void reduce(Text key, Iterator<MetaWrapper> values,
      OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
    String lastPTname = null;
    TreeMap<String, ArrayList<CrawlDatum>> linked =
      new TreeMap<String, ArrayList<CrawlDatum>>();
    while (values.hasNext()) {
      MetaWrapper wrapper = values.next();
      Object o = wrapper.get();
      String spString = wrapper.getMeta(SEGMENT_PART_KEY);
      if (spString == null) {
        throw new IOException("Null segment part, key=" + key);       
      }
      SegmentPart sp = SegmentPart.parse(spString);
      if (o instanceof CrawlDatum) {
        CrawlDatum val = (CrawlDatum)o;
        // check which output dir it belongs to
        if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
          if (lastG == null) {
            lastG = val;
            lastGname = sp.segmentName;
          } else {
            // take newer
            if (lastGname.compareTo(sp.segmentName) < 0) {
              lastG = val;
              lastGname = sp.segmentName;
            }
          }
        } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
          // only consider fetch status and ignore fetch retry status
          // https://issues.apache.org/jira/browse/NUTCH-1520
          // https://issues.apache.org/jira/browse/NUTCH-1113
          if (CrawlDatum.hasFetchStatus(val) &&
            val.getStatus() != CrawlDatum.STATUS_FETCH_RETRY &&
            val.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            if (lastF == null) {
              lastF = val;
              lastFname = sp.segmentName;
            } else {
              if (lastFname.compareTo(sp.segmentName) < 0) {
                lastF = val;
                lastFname = sp.segmentName;
              }
            }
          }
        } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
          if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
            if (lastSig == null) {
              lastSig = val;
              lastSigname = sp.segmentName;
            } else {
              // take newer
              if (lastSigname.compareTo(sp.segmentName) < 0) {
                lastSig = val;
                lastSigname = sp.segmentName;
              }
            }
            continue;
          }
          // collect all LINKED values from the latest segment
          ArrayList<CrawlDatum> segLinked = linked.get(sp.segmentName);
          if (segLinked == null) {
            segLinked = new ArrayList<CrawlDatum>();
            linked.put(sp.segmentName, segLinked);
          }
          segLinked.add(val);
        } else {
          throw new IOException("Cannot determine segment part: " + sp.partName);
        }
      } else if (o instanceof Content) {
        if (lastC == null) {
          lastC = (Content)o;
          lastCname = sp.segmentName;
        } else {
          if (lastCname.compareTo(sp.segmentName) < 0) {
            lastC = (Content)o;
            lastCname = sp.segmentName;
          }
        }
      } else if (o instanceof ParseData) {
        if (lastPD == null) {
          lastPD = (ParseData)o;
          lastPDname = sp.segmentName;
        } else {
          if (lastPDname.compareTo(sp.segmentName) < 0) {
            lastPD = (ParseData)o;
            lastPDname = sp.segmentName;
          }
        }
      } else if (o instanceof ParseText) {
        if (lastPT == null) {
          lastPT = (ParseText)o;
          lastPTname = sp.segmentName;
        } else {
          if (lastPTname.compareTo(sp.segmentName) < 0) {
            lastPT = (ParseText)o;
            lastPTname = sp.segmentName;
          }
        }
      }
    }
    // perform filtering based on full merge record
    if (mergeFilters != null &&
       !mergeFilters.filter(key, lastG, lastF, lastSig, lastC, lastPD, lastPT,
                    linked.isEmpty() ? null : linked.lastEntry().getValue())){
      return;
    }

    curCount++;
    String sliceName = null;
    MetaWrapper wrapper = new MetaWrapper();
    if (sliceSize > 0) {
      sliceName = String.valueOf(curCount / sliceSize);
      wrapper.setMeta(SEGMENT_SLICE_KEY, sliceName);
    }
    SegmentPart sp = new SegmentPart();
    // now output the latest values
    if (lastG != null) {
      wrapper.set(lastG);
      sp.partName = CrawlDatum.GENERATE_DIR_NAME;
      sp.segmentName = lastGname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastF != null) {
      wrapper.set(lastF);
      sp.partName = CrawlDatum.FETCH_DIR_NAME;
      sp.segmentName = lastFname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastSig != null) {
      wrapper.set(lastSig);
      sp.partName = CrawlDatum.PARSE_DIR_NAME;
      sp.segmentName = lastSigname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastC != null) {
      wrapper.set(lastC);
      sp.partName = Content.DIR_NAME;
      sp.segmentName = lastCname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastPD != null) {
      wrapper.set(lastPD);
      sp.partName = ParseData.DIR_NAME;
      sp.segmentName = lastPDname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastPT != null) {
      wrapper.set(lastPT);
      sp.partName = ParseText.DIR_NAME;
      sp.segmentName = lastPTname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (linked.size() > 0) {
      String name = linked.lastKey();
      sp.partName = CrawlDatum.PARSE_DIR_NAME;
      sp.segmentName = name;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      ArrayList<CrawlDatum> segLinked = linked.get(name);
      for (int i = 0; i < segLinked.size(); i++) {
        CrawlDatum link = segLinked.get(i);
        wrapper.set(link);
        output.collect(key, wrapper);
      }
    }
  }
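Every record type above follows the same rule: keep the value from the lexicographically latest segment name, which by the naming convention quoted above is the newest segment. A hypothetical helper capturing that recurring comparison:

    // Hypothetical: true when candidateName names a newer segment than lastName
    // (segment names sort lexicographically by creation time).
    private static boolean isNewer(String lastName, String candidateName) {
      return lastName == null || lastName.compareTo(candidateName) < 0;
    }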

Examples of org.apache.nutch.crawl.CrawlDatum

  public void reduce(Text key, Iterator<NutchWritable> values,
                     OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;

    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        final CrawlDatum datum = (CrawlDatum)value;
        if (CrawlDatum.hasDbStatus(datum)) {
          dbDatum = datum;
        }
        else if (CrawlDatum.hasFetchStatus(datum)) {
          // don't index unmodified (empty) pages
          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            fetchDatum = datum;
          }
        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
                   CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
          continue;
        } else {
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
        }
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;

        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
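A small sketch of how the status helpers used above classify a datum (status constants as in Nutch 1.x):

    CrawlDatum d = new CrawlDatum();
    d.setStatus(CrawlDatum.STATUS_DB_FETCHED);
    boolean isDb = CrawlDatum.hasDbStatus(d);       // true: a CrawlDb status
    d.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
    boolean isFetch = CrawlDatum.hasFetchStatus(d); // true: a fetch status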

Examples of org.apache.nutch.crawl.CrawlDatum

    try {
      String uri = req.getUri().toString();
      LOG.info("URI: " + uri);
      addMyHeader(res, "URI", uri);
      Text url = new Text(uri);
      CrawlDatum cd = seg.getCrawlDatum(url);
      if (cd != null) {
        addMyHeader(res, "Res", "found");
        LOG.info("-got " + cd.toString());
        ProtocolStatus ps = (ProtocolStatus)cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
        if (ps != null) {
          Integer trCode = protoCodes.get(ps.getCode());
          if (trCode != null) {
            res.setStatus(trCode.intValue());
          } else {

Examples of org.apache.nutch.crawl.CrawlDatum

        if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
          // enough .. let's simply
          // read all the entries from the input without processing them
          try {
            Text url = new Text();
            CrawlDatum datum = new CrawlDatum();
            hasMore = reader.next(url, datum);
            timelimitcount++;
          } catch (IOException e) {
            LOG.error("QueueFeeder error reading input, record " + cnt, e);
            return;
          }
          continue;
        }
        int feed = size - queues.getTotalSize();
        if (feed <= 0) {
          // queues are full - spin-wait until they have some free space
          try {
            Thread.sleep(1000);
          } catch (Exception e) {}
          continue;
        } else {
          LOG.debug("-feeding " + feed + " input urls ...");
          while (feed > 0 && hasMore) {
            try {
              Text url = new Text();
              CrawlDatum datum = new CrawlDatum();
              hasMore = reader.next(url, datum);
              if (hasMore) {
                queues.addFetchItem(url, datum);
                cnt++;
                feed--;

Examples of org.apache.nutch.crawl.CrawlDatum

    public CrawlDatum getCrawlDatum(Text url) throws IOException {
      synchronized (crawlLock) {
        if (crawl == null)
          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
      }
      return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
    }
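Hypothetical usage, looking up the fetch datum recorded for a URL in this segment:

    CrawlDatum cd = seg.getCrawlDatum(new Text("http://nutch.apache.org/"));
    if (cd != null) {
      LOG.info("status: " + CrawlDatum.getStatusName(cd.getStatus()));
    }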