Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.Content


    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    Parse parse = parser.getParse(
            new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: "+parse.getData());

    System.out.println("text: "+parse.getText());
   
  }
View Full Code Here


  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      Assert.assertTrue(sampleTexts[i].equals(text));
    }
  }
View Full Code Here

  }

  protected Parse parse(byte[] contentBytes) {
    String dummyUrl = "http://dummy.url/";
    return parser.getParse(
        new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(),
            conf)).get(dummyUrl);
  }
View Full Code Here

  @Test
  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
      Assert.assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

    if (!output.getStatus().isSuccess()) {
      System.out.println("Fetch failed with protocol status: " + output.getStatus());
      return 0;
    }
        
    Content content = output.getContent();

    if (content == null) {
      System.out.println("No content for " + url);
      return 0;
    }

    contentType = content.getContentType();

    if (contentType == null) {
      return -1;
    }
View Full Code Here

  public void reduce(Text key, Iterator<MetaWrapper> values,
      OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
View Full Code Here

          }
          addMyHeader(res, "ProtocolStatus", ps.toString());
        } else {
          res.setStatus(HttpServletResponse.SC_OK);         
        }
        Content c = seg.getContent(url);
        if (c == null) { // missing content
          req.setHandled(true);
          res.addHeader("X-Handled-By", getClass().getSimpleName());
          return;
        }
        byte[] data = c.getContent();
        LOG.debug("-data len=" + data.length);
        Metadata meta = c.getMetadata();
        String[] names = meta.names();
        LOG.debug("- " + names.length + " meta");
        for (int i = 0; i < names.length; i++) {
          boolean my = true;
          char ch = names[i].charAt(0);
View Full Code Here

    public Content getContent(Text url) throws IOException {
      synchronized (cLock) {
        if (content == null)
          content = getReaders(Content.DIR_NAME);
      }
      return (Content)getEntry(content, url, new Content());
    }
View Full Code Here

      file.setMaxContentLength(maxContentLength);

    // set log level
    //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));

    Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " +
                       content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " +
                       content.getMetadata().get(Response.LAST_MODIFIED));
    if (dumpContent) {
      System.out.print(new String(content.getContent()));
    }

    file = null;
  }
View Full Code Here

  public byte[] getContent() {
    return content;
  }

  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
        getHeader(Response.CONTENT_TYPE), headers, this.conf);
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.Content

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.