Examples of org.apache.nutch.storage.WebPage

org.apache.nutch.storage.WebPage

  }
  
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    WebPage page = new WebPage();
    String url = "http://www.example.com/";
    page.setContent(ByteBuffer.wrap("text".getBytes()));
    page.setTitle(new Utf8("title"));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
    NutchDocument doc = filter.filter(new NutchDocument(), url, page);
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

      file.setMaxContentLength(maxContentLength);


    // set log level
    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));


    Content content = file.getProtocolOutput(urlString, new WebPage())
        .getContent();


    System.out.println("Content-Type: " + content.getContentType());
    System.out.println("Content-Length: "
        + content.getMetadata().get(Response.CONTENT_LENGTH));

View Full Code Here

  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  in.readFully(bytes);
  in.close();
  Parse parse;
  WebPage page = new WebPage();
  page.setBaseUrl(new Utf8("file:"+urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  // set the content type?
  MimeUtil mimeutil = new MimeUtil(conf);
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));
    
  parse = new ParseUtil(conf).parse("file:"+urlString, page);
  return parse.getText();
    }

View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();


    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));


    new ParseUtil(conf).parse(url, page);


    ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
    assertEquals(license, new String(bb.array()));
    bb = page.getFromMetadata(new Utf8("License-Location"));
    assertEquals(location, new String(bb.array()));
    bb = page.getFromMetadata(new Utf8("Work-Type"));
    if (bb == null)
      assertEquals(type, null);
    else
      assertEquals(type, new String(bb.array()));
  }

View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();


      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));


      parse = new ParseUtil(conf).parse(urlString, page);


      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();


      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));


      parse = new ParseUtil(conf).parse(urlString, page);


      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);

View Full Code Here

      byte[] bytes = new byte[(int) file.length()];
      DataInputStream in = new DataInputStream(new FileInputStream(file));
      in.readFully(bytes);
      in.close();


      WebPage page = new WebPage();
      page.setBaseUrl(new Utf8(urlString));
      page.setContent(ByteBuffer.wrap(bytes));
      String mtype = mimeutil.getMimeType(file);
      page.setContentType(new Utf8(mtype));


      parse = new ParseUtil(conf).parse(urlString, page);


      // check that there are 2 outlinks:

View Full Code Here

    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    // TikaParser parser = new TikaParser();
    // parser.setConf(conf);
    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8(url));
    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);


    Parse parse = new ParseUtil(conf).parse(url, page);


    System.out.println("content type: " + mtype);

View Full Code Here


    //    if (verbose) {
    //      LOGGER.setLevel(Level.FINE);
    //    }


    ProtocolOutput out = http.getProtocolOutput(url, new WebPage());
    Content content = out.getContent();


    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());

View Full Code Here

    //Metadata metadata = new Metadata();
    EncodingDetector detector;
    // Content content;
    String encoding;


    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));


    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    // no information is available, so it should return default encoding
    assertEquals("windows-1252", encoding.toLowerCase());


    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));
    
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("utf-16", encoding.toLowerCase());


    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("windows-1254", encoding.toLowerCase());


    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToMetadata(new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));
    
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("utf-32", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.storage.WebPage

org.apache.gora.persistency.impl.StateManagerImpl

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.api.DbReader$DbIterator

org.apache.nutch.crawl.DbUpdateReducer

org.apache.nutch.crawl.InjectorJob$UrlMapper

org.apache.nutch.crawl.TestGenerator

org.apache.nutch.crawl.TestInjector

org.apache.nutch.crawl.TestURLPartitioner

org.apache.nutch.crawl.WebTableReader

org.apache.nutch.fetcher.FetcherReducer$FetcherThread

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.