Examples of org.apache.nutch.protocol.Protocol

org.apache.nutch.protocol.Protocol
A retriever of URL content. Implemented by protocol extensions.

    }


    IndexingFilters indexers = new IndexingFilters(conf);


    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();


    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();


    if (content == null) {
      System.out.println("No content for " + url);
      return 0;

View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }


    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url),
        new CrawlDatum()).getContent();


    if (content == null) {
      System.err.println("Can't fetch URL successfully");
      return (-1);

View Full Code Here

    Configuration conf = NutchConfiguration.create();
    
    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
    
    try {
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      
      // check that we get the same values

View Full Code Here

    //LOG.setLevel(Level.FINE);
    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content);
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }

View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }


    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = new WebPage();
    Content content = protocol.getProtocolOutput(url, page).getContent();
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    page.setContent(ByteBuffer.wrap(content.getContent()));


    if (force) {
      content.setContentType(contentType);

View Full Code Here

            do {
              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
              final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
              if (!rules.isAllowed(fit.u)) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Denied by robots.txt: " + fit.url);
                }
                output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED,
                    CrawlStatus.STATUS_GONE);
                continue;
              }
              if (rules.getCrawlDelay() > 0) {
                if (rules.getCrawlDelay() > maxCrawlDelay) {
                  // unblock
                  fetchQueues.finishFetchItem(fit, true);
                  LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                  output(fit, null, ProtocolStatusUtils.STATUS_ROBOTS_DENIED, CrawlStatus.STATUS_GONE);
                  continue;
                } else {
                  final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                }
              }
              final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
              final ProtocolStatus status = output.getStatus();
              final Content content = output.getContent();
              // unblock queue
              fetchQueues.finishFetchItem(fit);

View Full Code Here

   */
  public void setContentType(String testTextFile) throws ProtocolNotFound {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    assertNotNull(urlString);
    WebPage datum = new WebPage();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(urlString,datum);
    assertNotNull(output);


    assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatusCodes.SUCCESS + "]: args: ["
        + output.getStatus().getArgs() + "]", ProtocolStatusCodes.SUCCESS, output

View Full Code Here

    FileOutputStream fos = new FileOutputStream(tempFile);
    fos.write(expectedText.getBytes());
    fos.close();


    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    protocol = null;
  }

View Full Code Here

   * @throws ParseException
   *           If the {@link Parser}Layer cannot be loaded.
   */
  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;


    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');


      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();


      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);


      assertEquals(3, parseResult.size());

View Full Code Here

   * file</li>
   * </ul>
   */
  public void testIt()throws ProtocolException, ParseException, IOException {
    String urlString;
    Protocol protocol;
    Parse parse;


    Configuration conf = NutchConfiguration.create();
    MimeUtil mimeutil = new MimeUtil(conf);
    for (int i = 0; i < sampleFiles.length; i++) {

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.protocol.Protocol

org.apache.nutch.analysis.lang.LanguageIdentifier

org.apache.nutch.fetcher.FetcherReducer$FetcherThread

org.apache.nutch.indexer.IndexingFiltersChecker

org.apache.nutch.parse.ext.TestExtParser

org.apache.nutch.parse.feed.TestFeedParser

org.apache.nutch.parse.html.TestMetatagParser

org.apache.nutch.parse.metatags.TestMetatagParser

org.apache.nutch.parse.mp3.TestMP3Parser

org.apache.nutch.parse.msexcel.TestMSExcelParser

org.apache.nutch.parse.msword.TestMSWordParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.