Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.ProtocolFactory


    }

    if (LOG.isInfoEnabled()) { LOG.info("fetching: "+url); }

    Configuration conf = NutchConfiguration.create();
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();

    if (force) {
      content.setContentType(contentType);
    } else {
View Full Code Here


   *
   */
  public void setContentType(String testTextFile) throws ProtocolException {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    assertNotNull(urlString);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
        datum);
    assertNotNull(output);
    assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

  protected void tearDown() {}

  public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    return parse.getText();
  }
View Full Code Here

    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString),
        new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
        .get(content.getUrl());
    String text = parse.getText();
View Full Code Here

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
          content).get(content.getUrl());
View Full Code Here

   * @return contents of url
   */
  private static String getUrlContent(String url, Configuration conf) {
    Protocol protocol;
    try {
      protocol = new ProtocolFactory(conf).getProtocol(url);
      Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      System.out.println("text:" + parse.getText());
      return parse.getText();

View Full Code Here

    }

    if (LOG.isInfoEnabled()) { LOG.info("fetching: "+url); }

    Configuration conf = NutchConfiguration.create();
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();

    if (force) {
      content.setContentType(contentType);
    } else {
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();

      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.ProtocolFactory

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.