Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.ProtocolFactory


    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
          content).get(content.getUrl());
View Full Code Here


      LOG.info("fetching: " + url);
    }
       
    IndexingFilters indexers = new IndexingFilters(conf);
   
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();
   
    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();
   
View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url),
        new CrawlDatum()).getContent();

    if (content == null) {
      System.err.println("Can't fetch URL successfully");
View Full Code Here

      LOG.info("fetching: " + url);
    }

    IndexingFilters indexers = new IndexingFilters(conf);

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();

    Content content = protocol.getProtocolOutput(new Text(url), datum)
        .getContent();
View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url),
        new CrawlDatum()).getContent();

    if (content == null) {
      System.err.println("Can't fetch URL successfully");
View Full Code Here

    Configuration conf = NutchConfiguration.create();
   
    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
   
    try {
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
     
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
     
View Full Code Here

    //LOG.setLevel(Level.FINE);
    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content);
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }
View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("fetching: " + url);
    }

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    WebPage page = new WebPage();
    Content content = protocol.getProtocolOutput(url, page).getContent();
    page.setBaseUrl(new org.apache.avro.util.Utf8(url));
    page.setContent(ByteBuffer.wrap(content.getContent()));
View Full Code Here

      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread" + num);        // use an informative name
      this.context = context;
      Configuration conf = context.getConfiguration();
      this.urlFilters = new URLFilters(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
      this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
      // backward-compatible default setting
      this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
      this.maxRedirect = conf.getInt("http.redirect.max", 3);
View Full Code Here

    Collection<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
    if (job.getConfiguration().getBoolean(PARSE_KEY, false)) {
      ParserJob parserJob = new ParserJob();
      fields.addAll(parserJob.getFields(job));
    }
    ProtocolFactory protocolFactory = new ProtocolFactory(job.getConfiguration());
    fields.addAll(protocolFactory.getFields());

    return fields;
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.ProtocolFactory

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.