Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.WebPage


  }
 
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    WebPage page = new WebPage();
    String url = "http://www.example.com/";
    page.setContent(ByteBuffer.wrap("text".getBytes()));
    page.setTitle(new Utf8("title"));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
    NutchDocument doc = filter.filter(new NutchDocument(), url, page);
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here


      file.setMaxContentLength(maxContentLength);

    // set log level
    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));

    Content content = file.getProtocolOutput(urlString, new WebPage())
        .getContent();

    System.out.println("Content-Type: " + content.getContentType());
    System.out.println("Content-Length: "
        + content.getMetadata().get(Response.CONTENT_LENGTH));
View Full Code Here

  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();
  Parse parse;
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8("file:"+urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  // set the content type?
  MimeUtil mimeutil = new MimeUtil(conf);
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));
   
  parse = new ParseUtil(conf).parse("file:"+urlString, page);
  return parse.getText();
    }
View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));

    new ParseUtil(conf).parse(url, page);

    ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
    assertEquals(license, new String(bb.array()));
    bb = page.getFromMetadata(new Utf8("License-Location"));
    assertEquals(location, new String(bb.array()));
    bb = page.getFromMetadata(new Utf8("Work-Type"));
    if (bb == null)
      assertEquals(type, null);
    else
      assertEquals(type, new String(bb.array()));
  }
View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();

      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));

      parse = new ParseUtil(conf).parse(urlString, page);

      // check that there are 2 outlinks:
View Full Code Here

    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    // TikaParser parser = new TikaParser();
    // parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);

    Parse parse = new ParseUtil(conf).parse(url, page);

    System.out.println("content type: " + mtype);
View Full Code Here

    //    if (verbose) {
    //      LOGGER.setLevel(Level.FINE);
    //    }

    ProtocolOutput out = http.getProtocolOutput(url, new WebPage());
    Content content = out.getContent();

    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());
View Full Code Here

    //Metadata metadata = new Metadata();
    EncodingDetector detector;
    // Content content;
    String encoding;

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));

    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    // no information is available, so it should return default encoding
    assertEquals("windows-1252", encoding.toLowerCase());

    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));
   
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("utf-16", encoding.toLowerCase());

    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
   
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("windows-1254", encoding.toLowerCase());

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToMetadata(new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));
   
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("utf-32", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.WebPage

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.