Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.CrawlDatum
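A CrawlDatum is the per-URL record Nutch keeps in the CrawlDb and in segments: fetch status, fetch time, retry count, score, signature and metadata. As a minimal sketch (assuming the Nutch 1.x API used in the snippets below), a datum can be built directly before handing it to a protocol or scoring call:

    // Minimal CrawlDatum construction (illustrative; not taken from the snippets below).
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);   // not fetched yet
    datum.setFetchTime(System.currentTimeMillis());    // next fetch time in milliseconds
    datum.setScore(1.0f);                              // initial score
    System.out.println(datum);                         // CrawlDatum provides a readable toString()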


        Configuration conf = NutchConfiguration.create();
        for (int i = 0; i < sampleFiles.length; i++) {
            urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

            protocol = new ProtocolFactory(conf).getProtocol(urlString);
            content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
            parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content);

            //check that there are 3 outlinks:
            //http://test.channel.com
            //http://www-scf.usc.edu/~mattmann/
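The assertions elided above would walk the parsed outlinks. A hedged sketch of how that check might continue, using Parse.getData().getOutlinks() from org.apache.nutch.parse (the outlink order and the exact assertions are assumptions, not the original test body):

    // Hypothetical continuation of the outlink check above.
    Outlink[] outlinks = parse.getData().getOutlinks();
    assertEquals(3, outlinks.length);
    // The two URLs listed in the comment above; their position in the array is assumed.
    assertEquals("http://test.channel.com", outlinks[0].getToUrl());
    assertEquals("http://www-scf.usc.edu/~mattmann/", outlinks[1].getToUrl());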


    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
    String text = parse.getText();
    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

   * name (not all segment data contain time information). Therefore it is extremely
   * important that segments be named in an increasing lexicographic order as
   * their creation time increases.
   */
  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
    String lastPDname = null;
    String lastPTname = null;
    TreeMap linked = new TreeMap();
    while (values.hasNext()) {
      MetaWrapper wrapper = (MetaWrapper)values.next();
      Object o = wrapper.get();
      String spString = wrapper.getMeta(SEGMENT_PART_KEY);
      if (spString == null) {
        throw new IOException("Null segment part, key=" + key);       
      }
      SegmentPart sp = SegmentPart.parse(spString);
      if (o instanceof CrawlDatum) {
        CrawlDatum val = (CrawlDatum)o;
        // check which output dir it belongs to
        if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
          if (lastG == null) {
            lastG = val;
            lastGname = sp.segmentName;
          } else {
            // take newer
            if (lastGname.compareTo(sp.segmentName) < 0) {
              lastG = val;
              lastGname = sp.segmentName;
            }
          }
        } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
          if (lastF == null) {
            lastF = val;
            lastFname = sp.segmentName;
          } else {
            // take newer
            if (lastFname.compareTo(sp.segmentName) < 0) {
              lastF = val;
              lastFname = sp.segmentName;
            }
          }
        } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
          if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
            if (lastSig == null) {
              lastSig = val;
              lastSigname = sp.segmentName;
            } else {
              // take newer
              if (lastSigname.compareTo(sp.segmentName) < 0) {
                lastSig = val;
                lastSigname = sp.segmentName;
              }
            }
            continue;
          }
          // collect all LINKED values from the latest segment
          ArrayList segLinked = (ArrayList)linked.get(sp.segmentName);
          if (segLinked == null) {
            segLinked = new ArrayList();
            linked.put(sp.segmentName, segLinked);
          }
          segLinked.add(val);
        } else {
          throw new IOException("Cannot determine segment part: " + sp.partName);
        }
      } else if (o instanceof Content) {
        if (lastC == null) {
          lastC = (Content)o;
          lastCname = sp.segmentName;
        } else {
          if (lastCname.compareTo(sp.segmentName) < 0) {
            lastC = (Content)o;
            lastCname = sp.segmentName;
          }
        }
      } else if (o instanceof ParseData) {
        if (lastPD == null) {
          lastPD = (ParseData)o;
          lastPDname = sp.segmentName;
        } else {
          if (lastPDname.compareTo(sp.segmentName) < 0) {
            lastPD = (ParseData)o;
            lastPDname = sp.segmentName;
          }
        }
      } else if (o instanceof ParseText) {
        if (lastPT == null) {
          lastPT = (ParseText)o;
          lastPTname = sp.segmentName;
        } else {
          if (lastPTname.compareTo(sp.segmentName) < 0) {
            lastPT = (ParseText)o;
            lastPTname = sp.segmentName;
          }
        }
      }
    }
    curCount++;
    String sliceName = null;
    MetaWrapper wrapper = new MetaWrapper();
    if (sliceSize > 0) {
      sliceName = String.valueOf(curCount / sliceSize);
      wrapper.setMeta(SEGMENT_SLICE_KEY, sliceName);
    }
    SegmentPart sp = new SegmentPart();
    // now output the latest values
    if (lastG != null) {
      wrapper.set(lastG);
      sp.partName = CrawlDatum.GENERATE_DIR_NAME;
      sp.segmentName = lastGname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastF != null) {
      wrapper.set(lastF);
      sp.partName = CrawlDatum.FETCH_DIR_NAME;
      sp.segmentName = lastFname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastSig != null) {
      wrapper.set(lastSig);
      sp.partName = CrawlDatum.PARSE_DIR_NAME;
      sp.segmentName = lastSigname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastC != null) {
      wrapper.set(lastC);
      sp.partName = Content.DIR_NAME;
      sp.segmentName = lastCname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastPD != null) {
      wrapper.set(lastPD);
      sp.partName = ParseData.DIR_NAME;
      sp.segmentName = lastPDname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (lastPT != null) {
      wrapper.set(lastPT);
      sp.partName = ParseText.DIR_NAME;
      sp.segmentName = lastPTname;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      output.collect(key, wrapper);
    }
    if (linked.size() > 0) {
      String name = (String)linked.lastKey();
      sp.partName = CrawlDatum.PARSE_DIR_NAME;
      sp.segmentName = name;
      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
      ArrayList segLinked = (ArrayList)linked.get(name);
      for (int i = 0; i < segLinked.size(); i++) {
        CrawlDatum link = (CrawlDatum)segLinked.get(i);
        wrapper.set(link);
        output.collect(key, wrapper);
      }
    }
  }
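Every "take newer" branch in the reducer above decides which value wins purely by String.compareTo of segment names, which is why the Javadoc insists that names increase lexicographically with creation time. A small illustration, assuming the usual timestamp-style segment names:

    // Timestamp-style segment names sort lexicographically in creation order,
    // so compareTo() can stand in for "which segment is newer".
    String older = "20180101000000";
    String newer = "20180102123000";
    if (older.compareTo(newer) < 0) {
      // the newer segment wins, exactly like the lastGname/lastFname checks above
    }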

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      assertTrue(sampleTexts[i].equals(text));

    Configuration conf = NutchConfiguration.create();
    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);

      assertTrue(parse.getText().startsWith(expectedText));
    }
  }

    public void run() {
      synchronized (Fetcher.this) {activeThreads++;} // count threads
     
      try {
        Text key = new Text();
        CrawlDatum datum = new CrawlDatum();
       
        while (true) {
          // TODO : NUTCH-258 ...
          // If something bad happened, then exit
          // if (conf.getBoolean("fetcher.exit", false)) {
          //   break;
          // }
         
          try {                                   // get next entry from input
            if (!input.next(key, datum)) {
              break;                              // at eof, exit
            }
          } catch (IOException e) {
            if (LOG.isFatalEnabled()) {
              e.printStackTrace(LogUtil.getFatalStream(LOG));
              LOG.fatal("fetcher caught:"+e.toString());
            }
            break;
          }

          synchronized (Fetcher.this) {
            lastRequestStart = System.currentTimeMillis();
          }

          // url may be changed through redirects.
          Text url = new Text();
          url.set(key);
          try {
            if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }

            // fetch the page
            boolean redirecting;
            int redirectCount = 0;
            do {
              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(url.toString());
              ProtocolOutput output = protocol.getProtocolOutput(url, datum);
              ProtocolStatus status = output.getStatus();
              Content content = output.getContent();
              ParseStatus pstatus = null;

              switch(status.getCode()) {

              case ProtocolStatus.SUCCESS:        // got a page
                pstatus = output(url, datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS);
                updateStatus(content.getContent().length);
                if (pstatus != null && pstatus.isSuccess() &&
                        pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                  String newUrl = pstatus.getMessage();
                  newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                  newUrl = this.urlFilters.filter(newUrl);
                  if (newUrl != null && !newUrl.equals(url.toString())) {
                    // record that we were redirected
                    output(url, datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM);
                    url = new Text(newUrl);
                    if (maxRedirect > 0) {
                      redirecting = true;
                      redirectCount++;
                      if (LOG.isDebugEnabled()) {
                        LOG.debug(" - content redirect to " + url + " (fetching now)");
                      }
                    } else {
                      output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
                      if (LOG.isDebugEnabled()) {
                        LOG.debug(" - content redirect to " + url + " (fetching later)");
                      }
                    }
                  } else if (LOG.isDebugEnabled()) {
                    LOG.debug(" - content redirect skipped: " +
                             (newUrl != null ? "to same url" : "filtered"));
                  }
                }
                break;

              case ProtocolStatus.MOVED:         // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                }
                output(url, datum, content, status, code);
                String newUrl = status.getMessage();
                newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                newUrl = this.urlFilters.filter(newUrl);
                if (newUrl != null && !newUrl.equals(url.toString())) {
                  url = new Text(newUrl);
                  if (maxRedirect > 0) {
                    redirecting = true;
                    redirectCount++;
                    if (LOG.isDebugEnabled()) {
                      LOG.debug(" - protocol redirect to " + url + " (fetching now)");
                    }
                  } else {
                    output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
                    if (LOG.isDebugEnabled()) {
                      LOG.debug(" - protocol redirect to " + url + " (fetching later)");
                    }
                  }
                } else if (LOG.isDebugEnabled()) {
                  LOG.debug(" - protocol redirect skipped: " +
                           (newUrl != null ? "to same url" : "filtered"));
                }
                break;

              // failures - increase the retry counter
              case ProtocolStatus.EXCEPTION:
                logError(url, status.getMessage());
              /* FALLTHROUGH */
              case ProtocolStatus.RETRY:          // retry
                datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
              /* FALLTHROUGH */
              // intermittent blocking - retry without increasing the counter
              case ProtocolStatus.WOULDBLOCK:
              case ProtocolStatus.BLOCKED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
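Both redirect branches above run the target URL through the same normalize-then-filter pipeline before deciding whether to follow it; a null result means the URL was rejected by a filter. A standalone sketch of that pipeline, assuming a Nutch Configuration named conf is in scope (checked exceptions omitted for brevity):

    // Illustrative normalize-then-filter step, mirroring the redirect handling above.
    URLNormalizers normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
    URLFilters urlFilters = new URLFilters(conf);
    String newUrl = normalizers.normalize("http://example.com/redirect-target",
                                          URLNormalizers.SCOPE_FETCHER);
    newUrl = urlFilters.filter(newUrl);
    if (newUrl != null) {
      // follow the redirect, subject to the maxRedirect limit
    }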

    Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
    if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) {
      cnt = 0L;
      long start = Long.MAX_VALUE;
      long end = Long.MIN_VALUE;
      CrawlDatum value = new CrawlDatum();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (value.getFetchTime() < start) start = value.getFetchTime();
          if (value.getFetchTime() > end) end = value.getFetchTime();
        }
        mreaders[i].close();
      }
      stats.start = start;
      stats.end = end;
      stats.fetched = cnt;
    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(parseDir) && fs.isDirectory(parseDir)) {
      cnt = 0L;
      long errors = 0L;
      ParseData value = new ParseData();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (!value.getStatus().isSuccess()) errors++;
        }
        mreaders[i].close();
      }
      stats.parsed = cnt;
      stats.parseErrors = errors;

  /** Increase the score by a sum of inlinked scores. */
  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
    float adjust = 0.0f;
    for (int i = 0; i < inlinked.size(); i++) {
      CrawlDatum linked = (CrawlDatum)inlinked.get(i);
      adjust += linked.getScore();
    }
    if (old == null) old = datum;
    datum.setScore(old.getScore() + adjust);
  }
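The effect is a plain sum: the stored score becomes the previous score plus the scores of all inlinked CrawlDatum entries. A hedged usage sketch (scoringFilter is an illustrative name for whatever ScoringFilter implementation declares this method):

    // old score 1.0, inlinks contribute 0.5 and 0.25 -> datum ends up with 1.75.
    CrawlDatum old = new CrawlDatum();
    old.setScore(1.0f);
    CrawlDatum datum = new CrawlDatum();

    List inlinked = new ArrayList();
    CrawlDatum in1 = new CrawlDatum(); in1.setScore(0.5f);
    CrawlDatum in2 = new CrawlDatum(); in2.setScore(0.25f);
    inlinked.add(in1);
    inlinked.add(in2);

    scoringFilter.updateDbScore(new Text("http://example.com/"), old, datum, inlinked);
    // datum.getScore() == 1.75f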

   
//    if (verbose) {
//      LOGGER.setLevel(Level.FINE);
//    }
   
    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
    Content content = out.getContent();
   
    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());

    this.urlString = createUrl(this.testFile.getName());

    System.out.println("Testing file: " + this.urlString + "...");
    this.protocol =new ProtocolFactory(NutchConfiguration.create()).getProtocol(this.urlString);
    this.content = this.protocol.getProtocolOutput(new Text(this.urlString), new CrawlDatum()).getContent();
  }
