Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.ProtocolStatus


                        Text key = new Text();
                        CrawlDatum value = new CrawlDatum();
                        if (readers[StatusData].next(key, value)) {
    //                        value.getFetchInterval();
                            currRec.status = value.getStatus();
                            ProtocolStatus pstatus = (ProtocolStatus) value.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
                            currRec.protocol_code = (null == pstatus) ? 0 : pstatus.getCode();
                            if (logger.isDebugEnabled()) {
                                logger.debug("       STATUS OF "+key.toString()+": "+currRec.status+" "+CrawlDatum.getStatusName(currRec.status)+" (code "+currRec.protocol_code+")");
                            }
                            break;
                        } else {
View Full Code Here


    ProtocolOutput output = null;
    try {
      return getProtocolOutput(new FetchListEntry(true,
            new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here

        } else {                                    // convert to exception
          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

    byte version = in.readByte();                 // read version
    fetchListEntry = FetchListEntry.read(in);
    md5Hash = MD5Hash.read(in);
    if (version < 5) {
      int status = in.readByte();
      protocolStatus = new ProtocolStatus(oldToNewMap[status]);
    } else {
      protocolStatus = ProtocolStatus.read(in);
    }

    if (version < 4) {
View Full Code Here

    ProtocolOutput output = null;
    try {
      return getProtocolOutput(new FetchListEntry(true,
            new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here

        } else {                                    // convert to exception
          throw new FtpError(code);
        }
      }
    } catch (Exception e) {
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

  public ProtocolOutput getProtocolOutput(String urlString) {
    try {
      return getProtocolOutput(new FetchListEntry(true, new Page(urlString, 1.0f), new String[0]));
    } catch (MalformedURLException mue) {
      return new ProtocolOutput(null, new ProtocolStatus(mue));
    }
  }
View Full Code Here

    try {
      URL url = new URL(urlString);

        try {
          if (!RobotRulesParser.isAllowed(url))
                  return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
        } catch (Throwable e) {
          // XXX Maybe bogus: assume this is allowed.
          LOG.fine("Exception checking robot rules for " + url + ": " + e);
        }

        InetAddress addr = blockAddr(url);
        HttpResponse response;
        try {
          response = new HttpResponse(url); // make a request
        } finally {
          unblockAddr(addr);
        }

        int code = response.getCode();
        Content c = response.toContent();

        if (code == 200) { // got a good response
          return new ProtocolOutput(c); // return it

        } else if (code == 410) { // page is gone
          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));

        } else if (code >= 300 && code < 400) { // handle redirect
          String location = response.getHeader("Location");
          // some broken servers, such as MS IIS, use lowercase header name...
          if (location == null) location = response.getHeader("location");
          if (location == null) location = "";
          url = new URL(url, location);
          int protocolStatusCode;
          switch (code) {
            case 300:   // multiple choices, preferred value in Location
              protocolStatusCode = ProtocolStatus.MOVED;
              break;
            case 301:   // moved permanently
            case 305:   // use proxy (Location is URL of proxy)
              protocolStatusCode = ProtocolStatus.MOVED;
              break;
            case 302:   // found (temporarily moved)
            case 303:   // see other (redirect after POST)
            case 307:   // temporary redirect
              protocolStatusCode = ProtocolStatus.TEMP_MOVED;
              break;
            case 304:   // not modified
              protocolStatusCode = ProtocolStatus.NOTMODIFIED;
              break;
            default:
              protocolStatusCode = ProtocolStatus.MOVED;
          }
          // handle this in the higher layer.
          return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, url));
        } else if (code == 400) { // bad request, mark as GONE
          LOG.fine("400 Bad request: " + url);
          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url));
        } else if (code == 401) { // requires authorization, but no valid auth provided.
          LOG.fine("401 Authentication Required");
          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                  + urlString));
        } else if (code == 404) {
          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, url));
        } else if (code == 410) { // permanently GONE
          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, url));
        } else {
          return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                  + url));
        }
    } catch (Throwable e) {
      e.printStackTrace();
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

      URL u = new URL(urlString);
     
      if (checkRobots) {
        try {
          if (!robots.isAllowed(this, u)) {
            return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
          }
        } catch (Throwable e) {
          // XXX Maybe bogus: assume this is allowed.
          if (logger.isTraceEnabled()) {
            logger.trace("Exception checking robot rules for " + url + ": " + e);
          }
        }
      }
     
      long crawlDelay = robots.getCrawlDelay(this, u);
      long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
      if (checkBlocking && maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
        // skip this page, otherwise the thread would block for too long.
        LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="
                + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000));
        return new ProtocolOutput(null, ProtocolStatus.STATUS_WOULDBLOCK);
      }
      String host = null;
      if (checkBlocking) {
        try {
          host = blockAddr(u, delay);
        } catch (BlockedException be) {
          return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
        }
      }
      Response response;
      try {
        response = getResponse(u, datum, false); // make a request
      } finally {
        if (checkBlocking) unblockAddr(host, delay);
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 301:   // moved permanently
          case 305:   // use proxy (Location is URL of proxy)
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 302:   // found (temporarily moved)
          case 303:   // see other (redirect after POST)
          case 307:   // temporary redirect
            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
            break;
          case 304:   // not modified
            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
            break;
          default:
            protocolStatusCode = ProtocolStatus.MOVED;
        }
        // handle this in the higher layer.
        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
      } else if (code == 400) { // bad request, mark as GONE
        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else if (code == 401) { // requires authorization, but no valid auth provided.
        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                + urlString));
      } else if (code == 404) {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
      } else if (code == 410) { // permanently GONE
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                + u));
      }
    } catch (Throwable e) {
      e.printStackTrace(LogUtil.getErrorStream(logger));
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 301:   // moved permanently
          case 305:   // use proxy (Location is URL of proxy)
            protocolStatusCode = ProtocolStatus.MOVED;
            break;
          case 302:   // found (temporarily moved)
          case 303:   // see other (redirect after POST)
          case 307:   // temporary redirect
            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
            break;
          case 304:   // not modified
            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
            break;
          default:
            protocolStatusCode = ProtocolStatus.MOVED;
        }
        // handle this in the higher layer.
        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
      } else if (code == 400) { // bad request, mark as GONE
        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else if (code == 401) { // requires authorization, but no valid auth provided.
        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
                + urlString));
      } else if (code == 404) {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
      } else if (code == 410) { // permanently GONE
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
      } else {
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
                + u));
      }
    } catch (Throwable e) {
      logger.error("Failed to get protocol output", e);
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.ProtocolStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.