Package org.apache.nutch.net.protocols

Examples of org.apache.nutch.net.protocols.Response


    String urlString = url.toString();
    try {
      URL u = new URL(urlString);
     
      long startTime = System.currentTimeMillis();
      Response response = getResponse(u, datum, false); // make a request
     
      if(this.responseTime) {
        int elapsedTime = (int) (System.currentTimeMillis() - startTime);
        datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
View Full Code Here


   
    String urlString = url.toString();
    try {
      URL u = new URL(urlString);
      String host = null;
      Response response = getResponse(u, datum, false); // make a request
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
View Full Code Here

  public ProtocolOutput getProtocolOutput(String url, WebPage page) {

    try {
      URL u = new URL(url);
      String host = null;
      Response response = getResponse(u, page, false); // make a request
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
          (content == null ? EMPTY_CONTENT : content),
          response.getHeader("Content-Type"),
          response.getHeaders(), mimeTypes);

      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it

      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c,
            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " + code + " url=" + url));
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
        case 300:   // multiple choices, preferred value in Location
View Full Code Here

    if (robotRules == null) {                     // cache miss
      URL redir = null;
      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
      try {
        Response response = http.getResponse(new URL(url, "/robots.txt"),
                                             new WebPage(), true);
        // try one level of redirection ?
        if (response.getCode() == 301 || response.getCode() == 302) {
          String redirection = response.getHeader("Location");
          if (redirection == null) {
            // some versions of MS IIS are known to mangle this header
            redirection = response.getHeader("location");
          }
          if (redirection != null) {
            if (!redirection.startsWith("http")) {
              // RFC says it should be absolute, but apparently it isn't
              redir = new URL(url, redirection);
            } else {
              redir = new URL(redirection);
            }

            response = http.getResponse(redir, new WebPage(), true);
          }
        }

        if (response.getCode() == 200)               // found rules: parse them
          robotRules = parseRules(response.getContent());
        else if ( (response.getCode() == 403) && (!allowForbidden) )
          robotRules = FORBID_ALL_RULES;            // use forbid all
        else if (response.getCode() >= 500) {
          cacheRule = false;
          robotRules = EMPTY_RULES;
        }else
          robotRules = EMPTY_RULES;                 // use default rules
      } catch (Throwable t) {
View Full Code Here

  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
   
    String urlString = url.toString();
    try {
      URL u = new URL(urlString);
      Response response = getResponse(u, datum, false); // make a request
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
View Full Code Here

   
    if (robotRules == null) {                     // cache miss
      URL redir = null;
      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
      try {
        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
                                             new CrawlDatum(), true);
        // try one level of redirection ?
        if (response.getCode() == 301 || response.getCode() == 302) {
          String redirection = response.getHeader("Location");
          if (redirection == null) {
            // some versions of MS IIS are known to mangle this header
            redirection = response.getHeader("location");
          }
          if (redirection != null) {
            if (!redirection.startsWith("http")) {
              // RFC says it should be absolute, but apparently it isn't
              redir = new URL(url, redirection);
            } else {
              redir = new URL(redirection);
            }
           
            response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true);
          }
        }

        if (response.getCode() == 200)               // found rules: parse them
          robotRules =  parseRules(url.toString(), response.getContent(),
                                   response.getHeader("Content-Type"),
                                   agentNames);

        else if ( (response.getCode() == 403) && (!allowForbidden) )
          robotRules = FORBID_ALL_RULES;            // use forbid all
        else if (response.getCode() >= 500) {
          cacheRule = false;
          robotRules = EMPTY_RULES;
        }else                                       
          robotRules = EMPTY_RULES;                 // use default rules
      } catch (Throwable t) {
View Full Code Here

   * @throws Exception    When an error occurs or test case fails.
   */
  private void fetchPage(String page, int expectedCode)
      throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    Response response = null;
    response = http.getResponse(url, new CrawlDatum(), true);

    int code = response.getCode();
    assertEquals("HTTP Status Code for " + url, expectedCode, code);
  }
View Full Code Here

      }
     
      long crawlDelay = robots.getCrawlDelay(this, u);
      long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
      String host = blockAddr(u, delay);
      Response response;
      try {
        response = getResponse(u, datum, false); // make a request
      } finally {
        unblockAddr(host, delay);
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
View Full Code Here

   * @throws Exception    When an error occurs or test case fails.
   */
  private void fetchPage(String page, int expectedCode)
      throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    Response response = null;
    response = http.getResponse(url, new CrawlDatum(), true);

    int code = response.getCode();
    assertEquals("HTTP Status Code for " + url, expectedCode, code);
  }
View Full Code Here

   
    String urlString = url.toString();
    try {
      URL u = new URL(urlString);
      String host = null;
      Response response = getResponse(u, datum, false); // make a request
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
View Full Code Here

TOP

Related Classes of org.apache.nutch.net.protocols.Response

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.