Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.ProtocolOutput


      if (LOG.isTraceEnabled())
        LOG.trace("cache miss " + url);

      try {
        String robotsUrl = new URL(url, "/robots.txt").toString();       
        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new WebPage());
        int statusCode = output.getStatus().getCode();

        if (statusCode == ProtocolStatusCodes.SUCCESS) {
          robotRules =  parseRules(url.toString(), output.getContent().getContent(),
                                  CONTENT_TYPE, agentNames);
        } else {                                      
          robotRules = EMPTY_RULES;                 // use default rules
        }
      } catch (Throwable t) {
View Full Code Here


    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
   
    if (!output.getStatus().isSuccess()) {
      System.err.println("Fetch failed with protocol status: " + output.getStatus());
      return (-1);
    }
   
    Content content = output.getContent();

    if (content == null) {
      LOG.error("No content for " + url);
      return (-1);
    }
View Full Code Here

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();

    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
   
    if (!output.getStatus().isSuccess()) {
      System.out.println("Fetch failed with protocol status: " + output.getStatus());
      return 0;
    }
        
    Content content = output.getContent();

    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }
View Full Code Here

   */
  private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    CrawlDatum crawlDatum = new CrawlDatum();
    Response response = http.getResponse(url, crawlDatum, true);
    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
        crawlDatum);
    Content content = out.getContent();
    assertEquals("HTTP Status Code for " + url, expectedCode,
        response.getCode());

    if (page.compareTo("/nonexists.html") != 0
        && page.compareTo("/brokenpage.jsp") != 0
View Full Code Here

   */
  public void setContentType(String testTextFile) throws ProtocolException {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    Assert.assertNotNull(urlString);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
        datum);
    Assert.assertNotNull(output);
    Assert.assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
        + output.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, output
        .getStatus().getCode());
    Assert.assertNotNull(output.getContent());
    Assert.assertNotNull(output.getContent().getContentType());
    Assert.assertEquals(expectedMimeType, output.getContent().getContentType());
    Assert.assertNotNull(output.getContent().getMetadata());
    Assert.assertEquals(expectedMimeType,
        output.getContent().getMetadata().get(Response.CONTENT_TYPE));

  }
View Full Code Here

        response = new FileResponse(u, datum, this, getConf());   // make a request
 
        int code = response.getCode();
 
        if (code == 200) {                          // got a good response
          return new ProtocolOutput(response.toContent());              // return it
 
        } else if (code == 304) {                   // got not modified
          return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED);

        } else if (code == 401) {                   // access denied / no read permissions
          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED));

        } else if (code == 404) {                   // no such file
          return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);

        } else if (code >= 300 && code < 400) {     // handle redirect
          u = new URL(response.getHeader("Location"));
          if (LOG.isTraceEnabled()) {
            LOG.trace("redirect to " + u);
          }
          if (symlinksAsRedirects) {
            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
                ProtocolStatus.MOVED, u));
          } else if (redirects == MAX_REDIRECTS) {
            LOG.trace("Too many redirects: {}", url);
            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
                ProtocolStatus.REDIR_EXCEEDED, u));
          }
          redirects++;
 
        } else {                                    // convert to exception
          throw new FileError(code);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      return new ProtocolOutput(null, new ProtocolStatus(e));
    }
  }
View Full Code Here

      file.setMaxContentLength(maxContentLength);

    // set log level
    //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));

    ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
    Content content = output.getContent();

    System.err.println("URL: " + content.getUrl());
    System.err.println("Status: " + output.getStatus());
    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " +
                       content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " +
                       content.getMetadata().get(Response.LAST_MODIFIED));
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.ProtocolOutput

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.