Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.ProtocolFactory


    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    ParseUtil parser = new ParseUtil(conf);
    ProtocolFactory factory = new ProtocolFactory(conf);
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
                                           new CrawlDatum()).getContent();
      parse = parser.parseByExtensionId("parse-msexcel", content).get(content.getUrl());

      assertTrue(parse.getText().equals(expectedText));
View Full Code Here


    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content).get(content.getUrl());

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();

      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
View Full Code Here

    Configuration conf = NutchConfiguration.create();

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      Assert.assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

      LOG.info("fetching: " + url);
    }

    IndexingFilters indexers = new IndexingFilters(conf);

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    CrawlDatum datum = new CrawlDatum();

    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
   
    IndexWriters writers = new IndexWriters(getConf());
View Full Code Here

   *
   */
  public void setContentType(String testTextFile) throws ProtocolException {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    Assert.assertNotNull(urlString);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString),
        datum);
    Assert.assertNotNull(output);
    Assert.assertEquals("Status code: [" + output.getStatus().getCode()
        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
View Full Code Here

  public Metadata parseMeta(String fileName, Configuration conf) {
    Metadata metadata = null;
    try {
      String urlString = "file:" + sampleDir + fileSeparator + fileName;
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      metadata = parse.getData().getParseMeta();
    } catch (Exception e) {
View Full Code Here

    conf.set("file.content.limit", "-1");
  }

  public String getTextContent(String fileName) throws ProtocolException, ParseException {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    return parse.getText();
  }
View Full Code Here

    FileOutputStream fos = new FileOutputStream(tempFile);
    fos.write(expectedText.getBytes());
    fos.close();

    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    protocol = null;
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.ProtocolFactory

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.