Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.WebPage


      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
View Full Code Here


      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      // check that there are 2 outlinks:
View Full Code Here

    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    // TikaParser parser = new TikaParser();
    // parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);

    Parse parse = new ParseUtil(conf).parse(url, page);

    System.out.println("content type: " + mtype);
View Full Code Here

      ftp.setTimeout(timeout);

    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
      ftp.setMaxContentLength(maxContentLength);

    Content content = ftp.getProtocolOutput(urlString, new WebPage())
        .getContent();

    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: "
        + content.getMetadata().get(Response.CONTENT_LENGTH));
View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));

    new ParseUtil(conf).parse(url, page);

    ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
    assertEquals(license, new String(bb.array()));
    bb = page.getFromMetadata(new Utf8("License-Location"));
    assertEquals(location, new String(bb.array()));
    bb = page.getFromMetadata(new Utf8("Work-Type"));
    if (bb == null)
      assertEquals(type, null);
    else
      assertEquals(type, new String(bb.array()));
  }
View Full Code Here

      if (LOG.isTraceEnabled())
        LOG.trace("cache miss " + url);

      try {
        String robotsUrl = new URL(url, "/robots.txt").toString();       
        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new WebPage());
        int statusCode = output.getStatus().getCode();

        if (statusCode == ProtocolStatusCodes.SUCCESS) {
          robotRules =  parseRules(url.toString(), output.getContent().getContent(),
                                  CONTENT_TYPE, agentNames);
View Full Code Here

  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();
  Parse parse;
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8("file:"+urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  // set the content type?
  MimeUtil mimeutil = new MimeUtil(conf);
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));
   
  parse = new ParseUtil(conf).parse("file:"+urlString, page);
  return parse.getText();
    }
View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
View Full Code Here

    byte[] bytes = new byte[(int) file.length()];
    DataInputStream dip = new DataInputStream(new FileInputStream(file));
    dip.readFully(bytes);
    dip.close();
   
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(urlString));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mutil = new MimeUtil(conf);
    String mime = mutil.getMimeType(file);
    page.setContentType(new Utf8(mime));
 
    parse = new ParseUtil(conf).parse(urlString, page);
    return parse.getOutlinks();
  }
View Full Code Here

    try {
      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {
        WebPage page = getPage(docs[t]);
        parser.parse(URL.toString(), page);
        ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
        String lang = null;
        if (blang != null)
          lang = Bytes.toString(blang.array());
        assertEquals(metalanguages[t], lang);
      }
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.WebPage

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.