Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.ProtocolOutput


  public void setContentType(String testTextFile) throws ProtocolNotFound {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    assertNotNull(urlString);
    WebPage datum = new WebPage();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(urlString,datum);
    assertNotNull(output);

    assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatusCodes.SUCCESS + "]: args: ["
        + output.getStatus().getArgs() + "]", ProtocolStatusCodes.SUCCESS, output
        .getStatus().getCode());
    assertNotNull(output.getContent());
    assertNotNull(output.getContent().getContentType());
    assertEquals(expectedMimeType, output.getContent().getContentType());
    assertNotNull(output.getContent().getMetadata());
    assertEquals(expectedMimeType, output.getContent().getMetadata().get(
        Response.CONTENT_TYPE));

  }
View Full Code Here


      sUrl = new URL(urlStr);
      channelSftp = getChannelSftp(sUrl);

      String urlFile = sUrl.getFile();
      if (urlFile.endsWith(".htm") || urlFile.endsWith(".html")) {
        ProtocolOutput po = getFileProtocolOutput(sUrl, channelSftp,
            "text/html");
        return po;
      } else if (urlFile.endsWith(".pdf")) {
        ProtocolOutput po = getFileProtocolOutput(sUrl, channelSftp,
            "application/pdf");
        return po;
      } else {
        ProtocolOutput po = getDirectoryProtocolOutput(sUrl, channelSftp);
        return po;
      }
    } catch (MalformedURLException e) {
      logger.error("Bad URL String: " + urlStr, e);
      return null;
View Full Code Here

        .getMtimeString());
    metadata.set(Response.LOCATION, urlStr);

    Content content = new Content(urlStr, urlStr, bytes, contentType, metadata,
        configuration);
    ProtocolOutput po = new ProtocolOutput(content);
    return po;
  }
View Full Code Here

          .getMtimeString());
      metadata.set(Response.LOCATION, url.toExternalForm());

      Content content = new Content(url.toExternalForm(), url.toExternalForm(),
          directoryList.getBytes(), "text/html", metadata, configuration);
      ProtocolOutput po = new ProtocolOutput(content);
      return po;
    } catch (SftpException e) {
      logger.error("SftpException in getDirectoryProtocolOutput()", e);
      throw e;
    }
View Full Code Here

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);

    WebPage page = new WebPage();
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
    page.setProtocolStatus(protocolOutput.getStatus());
    if (protocolOutput.getStatus().getCode() == ProtocolStatusCodes.SUCCESS) {
      page.setStatus(CrawlStatus.STATUS_FETCHED);
      page.setFetchTime(System.currentTimeMillis());
    } else {
      LOG.error("Fetch failed with protocol status: "
          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
      return -1;
    }
   
    Content content = protocolOutput.getContent();
    if (content == null) {
      LOG.warn("No content for " + url);
      return 0;
    }
View Full Code Here

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = new WebPage();
   
    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
   
    if(!protocolOutput.getStatus().isSuccess()) {
      LOG.error("Fetch failed with protocol status: "
          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
      return (-1);
    }
    Content content = protocolOutput.getContent();
   
    if (content == null) {
      LOG.error("No content for " + url);
      return (-1);
    }
View Full Code Here

          (content == null ? EMPTY_CONTENT : content),
          response.getHeader("Content-Type"),
          response.getHeaders(), mimeTypes);

      // Map the HTTP response code to a ProtocolOutput. Every branch wraps the
      // same content object `c`; all branches except 200 also attach an
      // explicit protocol status built with ProtocolStatusUtils.makeStatus.
      if (code == 200) { // got a good response
        return new ProtocolOutput(c); // return it
      } else if (code == 410) { // page is gone
        // The status argument here is a message string built from `code` and
        // `url`, unlike the later branches that pass `u` directly.
        return new ProtocolOutput(c,
            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " + code + " url=" + url));
      } else if (code >= 300 && code < 400) { // handle redirect
        String location = response.getHeader("Location");
        // some broken servers, such as MS IIS, use lowercase header name...
        if (location == null) location = response.getHeader("location");
        // Fall back to an empty string so the URL constructor below is never
        // handed a null spec.
        if (location == null) location = "";
        // Resolve the Location value against the current URL; `u` is
        // reassigned and is what the returned status carries.
        u = new URL(u, location);
        int protocolStatusCode;
        switch (code) {
        case 300:   // multiple choices, preferred value in Location
          protocolStatusCode = ProtocolStatusCodes.MOVED;
          break;
        case 301:   // moved permanently
        case 305:   // use proxy (Location is URL of proxy)
          protocolStatusCode = ProtocolStatusCodes.MOVED;
          break;
        case 302:   // found (temporarily moved)
        case 303:   // see other (redirect after POST)
        case 307:   // temporary redirect
          // NOTE(review): this and the following cases read the constant from
          // ProtocolStatusUtils while the cases above use ProtocolStatusCodes;
          // presumably both expose the same values -- confirm.
          protocolStatusCode = ProtocolStatusUtils.TEMP_MOVED;
          break;
        case 304:   // not modified
          // 304 falls inside the 300-399 range, so it is handled here and its
          // status also carries the URL resolved from Location above.
          protocolStatusCode = ProtocolStatusUtils.NOTMODIFIED;
          break;
        default:
          // Any other 3xx code is reported as MOVED.
          protocolStatusCode = ProtocolStatusUtils.MOVED;
        }
        // handle this in the higher layer.
        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(protocolStatusCode, u));
      } else if (code == 400) { // bad request, mark as GONE
        if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
        return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, u));
      } else if (code == 401) { // requires authorization, but no valid auth provided.
        if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
        return new ProtocolOutput(c,
            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.ACCESS_DENIED,
                "Authentication required: "+ url));
      } else if (code == 404) {
        return new ProtocolOutput(c,
            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.NOTFOUND, u));
      } else if (code == 410) { // permanently GONE
        // NOTE(review): unreachable -- code == 410 is already handled by the
        // second branch of this chain, so this branch never runs.
        return new ProtocolOutput(c,
            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, u));
      } else {
        // Every remaining code is reported as EXCEPTION with the code and URL
        // in the message.
        return new ProtocolOutput(c,
            ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url="
                + u));
      }
    } catch (Throwable e) {
      logger.error("Failed with the following error: ", e);
      return new ProtocolOutput(null,
          ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, e.toString()));
    }
  }
View Full Code Here

        System.exit(-1);
      } else // root is required parameter
        url = args[i];
    }

    ProtocolOutput out = http.getProtocolOutput(url, new WebPage());
    Content content = out.getContent();

    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());
      System.out.println("Content Length: " +
          content.getMetadata().get(Response.CONTENT_LENGTH));
      System.out.println("Content:");
View Full Code Here

        FileResponse response;
        response = new FileResponse(u, page, this, getConf()); // make a request
        int code = response.getCode();

        if (code == 200) { // got a good response
          return new ProtocolOutput(response.toContent()); // return it
        } else if (code >= 300 && code < 400) { // handle redirect
          if (redirects == MAX_REDIRECTS)
            throw new FileException("Too many redirects: " + url);
          u = new URL(response.getHeader("Location"));
          redirects++;
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }

        } else { // convert to exception
          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      ProtocolStatus ps = ProtocolStatusUtils.makeStatus(
          ProtocolStatusCodes.EXCEPTION, e.toString());
      return new ProtocolOutput(null, ps);
    }
  }
View Full Code Here

        response = new FtpResponse(u, page, this, getConf()); // make a request

        int code = response.getCode();

        if (code == 200) { // got a good response
          return new ProtocolOutput(response.toContent()); // return it

        } else if (code >= 300 && code < 400) { // handle redirect
          if (redirects == MAX_REDIRECTS)
            throw new FtpException("Too many redirects: " + url);
          u = new URL(response.getHeader("Location"));
          redirects++;
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }
        } else { // convert to exception
          throw new FtpError(code);
        }
      }
    } catch (Exception e) {
      ProtocolStatus ps = ProtocolStatusUtils.makeStatus(
          ProtocolStatusCodes.EXCEPTION, e.toString());
      return new ProtocolOutput(null, ps);
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.ProtocolOutput

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.