Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.Protocol


  protected void tearDown() {}

  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      assertTrue(sampleTexts[i].equals(text));
View Full Code Here


          try {
            LOG.info("fetching " + fit.url + " (queue crawl delay=" +
                      fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)");

            // fetch the page
            final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
            final BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.page);
            if (!rules.isAllowed(fit.u.toString())) {
              // unblock
              fetchQueues.finishFetchItem(fit, true);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Denied by robots.txt: " + fit.url);
              }
              output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                  CrawlStatus.STATUS_GONE);
              continue;
            }
            if (rules.getCrawlDelay() > 0) {
              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                continue;
              } else {
                final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
                if (LOG.isDebugEnabled()) {
                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                }
              }
            }
            final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
            final ProtocolStatus status = output.getStatus();
            final Content content = output.getContent();
            // unblock queue
            fetchQueues.finishFetchItem(fit);
View Full Code Here

   */
  public void setContentType(String testTextFile) throws ProtocolNotFound {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    assertNotNull(urlString);
    WebPage datum = new WebPage();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(urlString,datum);
    assertNotNull(output);

    assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatusCodes.SUCCESS + "]: args: ["
        + output.getStatus().getArgs() + "]", ProtocolStatusCodes.SUCCESS, output
View Full Code Here

   *           If the {@link Parser}Layer cannot be loaded.
   */
  @Test
  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();

      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);

      assertEquals(3, parseResult.size());
View Full Code Here

    }

    IndexingFilters indexers = new IndexingFilters(conf);

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);

    WebPage page = new WebPage();
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
    page.setProtocolStatus(protocolOutput.getStatus());
    if (protocolOutput.getStatus().getCode() == ProtocolStatusCodes.SUCCESS) {
      page.setStatus(CrawlStatus.STATUS_FETCHED);
      page.setFetchTime(System.currentTimeMillis());
    } else {
View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = new WebPage();
   
    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
   
    if(!protocolOutput.getStatus().isSuccess()) {
      LOG.error("Fetch failed with protocol status: "
          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
View Full Code Here

  protected void tearDown() {}

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      assertTrue(sampleTexts[i].equals(text));
View Full Code Here

    FileOutputStream fos = new FileOutputStream(tempFile);
    fos.write(expectedText.getBytes());
    fos.close();

    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    protocol = null;
  }
View Full Code Here

  protected void tearDown() {}

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

        value = "";
      cd.getMetaData().put(new Text(key), new Text(value));
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
   
    if (!output.getStatus().isSuccess()) {
      System.err.println("Fetch failed with protocol status: " + output.getStatus());
      return (-1);
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.Protocol

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.