Package org.apache.nutch.net.protocols

Examples of org.apache.nutch.net.protocols.Response


          host = blockAddr(u, delay);
        } catch (BlockedException be) {
          return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
        }
      }
      Response response;
      try {
        response = getResponse(u, datum, false); // make a request
      } finally {
        if (checkBlocking) unblockAddr(host, delay);
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code == 410) { // page is gone
        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
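The fragment above appears to be from HttpBase.getProtocolOutput(): it issues the request, wraps the returned bytes and headers in a Content object, and then branches on the HTTP status code. The sketch below is hypothetical and uses only the Response accessors visible in these fragments (getCode(), getContent(), getHeader()); the class and method names are illustrative, not part of Nutch.

import org.apache.nutch.net.protocols.Response;

// Hypothetical helper: summarize a Response by its HTTP status code,
// using only the accessors shown in the fragments on this page.
public class ResponseClassifier {

  public static String classify(Response response) {
    int code = response.getCode();
    byte[] content = response.getContent();
    String contentType = response.getHeader("Content-Type");

    if (code == 200) {                       // good response
      int length = (content == null) ? 0 : content.length;
      return "OK (" + length + " bytes, " + contentType + ")";
    } else if (code == 410) {                // page is gone
      return "GONE";
    } else if (code >= 300 && code < 400) {  // redirect
      String location = response.getHeader("Location");
      // some broken servers use a lowercase header name
      if (location == null) location = response.getHeader("location");
      return "REDIRECT to " + location;
    }
    return "OTHER (" + code + ")";
  }
}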


    boolean cacheRule = true;
   
    if (robotRules == null) {                     // cache miss
      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
      try {
        Response response = http.getResponse(new URL(url, "/robots.txt"),
                                             new CrawlDatum(), true);

        if (response.getCode() == 200)               // found rules: parse them
          robotRules = parseRules(response.getContent());
        else if ((response.getCode() == 403) && (!allowForbidden))
          robotRules = FORBID_ALL_RULES;            // use forbid all
        else if (response.getCode() >= 500) {
          cacheRule = false;                        // transient server error: do not cache
          robotRules = EMPTY_RULES;
        } else
          robotRules = EMPTY_RULES;                 // use default rules
      } catch (Throwable t) {
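This fragment fetches /robots.txt with http.getResponse(...) and maps the status code to a rule set: 200 is parsed, 403 becomes FORBID_ALL_RULES unless allowForbidden is set, 5xx yields EMPTY_RULES with cacheRule = false so the transient failure is not cached, and anything else yields EMPTY_RULES. The sketch below shows one way such a cacheRule flag could feed a per-host cache; the RULE_CACHE map, cacheKey() helper, and Object-typed rules are assumptions for illustration, not Nutch's actual cache.

import java.net.URL;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: cache parsed robots rules per host, honoring a
// cacheRule flag like the one in the fragment above.
public class RobotsCacheSketch {

  private static final Map<String, Object> RULE_CACHE = new ConcurrentHashMap<>();

  static String cacheKey(URL url) {
    // One entry per protocol + host + port, matching the scope of a robots.txt file.
    return url.getProtocol() + ":" + url.getHost() + ":" + url.getPort();
  }

  static void store(URL url, Object robotRules, boolean cacheRule) {
    if (cacheRule) {                                // 5xx responses set cacheRule = false above,
      RULE_CACHE.put(cacheKey(url), robotRules);    // so transient errors are retried later
    }
  }

  static Object lookup(URL url) {
    return RULE_CACHE.get(cacheKey(url));           // null means "cache miss"
  }
}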

    String urlString = url.toString();
    try {
      URL u = new URL(urlString);
     
      long startTime = System.currentTimeMillis();
      Response response = getResponse(u, datum, false); // make a request
     
      if (this.responseTime) {
        int elapsedTime = (int) (System.currentTimeMillis() - startTime);
        datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
      }
     
      int code = response.getCode();
      byte[] content = response.getContent();
      Content c = new Content(u.toString(), u.toString(),
                              (content == null ? EMPTY_CONTENT : content),
                              response.getHeader("Content-Type"),
                              response.getHeaders(), this.conf);
     
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
       
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        if (location == null) location = "";
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
          case 300:   // multiple choices, preferred value in Location
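This variant of getProtocolOutput() additionally times the request and, when responseTime is enabled, stores the elapsed milliseconds in the CrawlDatum metadata under the RESPONSE_TIME key. The sketch below shows how that value might be read back later; it assumes the caller passes in the same Writable key used above, since the key's exact type is not visible in the fragment.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical sketch: read back the response time recorded in the fragment above.
public class ResponseTimeReader {

  public static int responseTimeMillis(CrawlDatum datum, Writable responseTimeKey) {
    Writable value = datum.getMetaData().get(responseTimeKey);
    if (value instanceof IntWritable) {
      return ((IntWritable) value).get();   // elapsed milliseconds, as written above
    }
    return -1;                              // not recorded for this datum
  }
}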

   
    if (robotRules == null) {                     // cache miss
      URL redir = null;
      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
      try {
        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
                                             new CrawlDatum(), true);
        // try one level of redirection ?
        if (response.getCode() == 301 || response.getCode() == 302) {
          String redirection = response.getHeader("Location");
          if (redirection == null) {
            // some versions of MS IIS are known to mangle this header
            redirection = response.getHeader("location");
          }
          if (redirection != null) {
            if (!redirection.startsWith("http")) {
              // RFC says it should be absolute, but apparently it isn't
              redir = new URL(url, redirection);
            } else {
              redir = new URL(redirection);
            }
           
            response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true);
          }
        }

        if (response.getCode() == 200)               // found rules: parse them
          robotRules = parseRules(url.toString(), response.getContent(),
                                  response.getHeader("Content-Type"),
                                  agentNames);
        else if ((response.getCode() == 403) && (!allowForbidden))
          robotRules = FORBID_ALL_RULES;            // use forbid all
        else if (response.getCode() >= 500) {
          cacheRule = false;
          robotRules = EMPTY_RULES;
        } else
          robotRules = EMPTY_RULES;                 // use default rules
      } catch (Throwable t) {
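Here the robots parser follows at most one redirect while fetching robots.txt: a Location value that does not start with "http" is resolved against the original URL, while an absolute one is used as-is. That resolution is plain java.net.URL behaviour, illustrated below with made-up URLs.

import java.net.MalformedURLException;
import java.net.URL;

// Plain-JDK illustration of the Location resolution used above:
// a relative Location is resolved against the requesting URL,
// an absolute one is taken as-is. (Example URLs are made up.)
public class RedirectResolution {
  public static void main(String[] args) throws MalformedURLException {
    URL base = new URL("http://example.com/robots.txt");

    // Relative Location header (the RFC says it should be absolute, but it often isn't):
    URL relative = new URL(base, "/rules/robots.txt");
    System.out.println(relative);   // http://example.com/rules/robots.txt

    // Absolute Location header:
    URL absolute = new URL("http://other.example.org/robots.txt");
    System.out.println(absolute);   // http://other.example.org/robots.txt
  }
}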

   * @throws Exception    When an error occurs or test case fails.
   */
  private void fetchPage(String page, int expectedCode)
      throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    Response response = null;
    response = http.getResponse(url, new CrawlDatum(), true);

    int code = response.getCode();
    Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
  }
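fetchPage() above builds a URL against a test server on 127.0.0.1, fetches it through http.getResponse(), and asserts the HTTP status code. A hypothetical call from the same test class might look like the following; the page paths and expected codes are made-up examples.

  // Hypothetical JUnit test method in the same class as fetchPage();
  // paths and expected status codes are illustrative only.
  @Test
  public void testStatusCodes() throws Exception {
    fetchPage("/basic-http.html", 200);    // an existing page should return 200
    fetchPage("/nonexistent.html", 404);   // a missing page should return 404
  }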
