Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.WebPage


      fail(e.toString());
    }
  }

  private WebPage getPage(String text) {
    WebPage page = new WebPage();
    page.setBaseUrl(BASE);
    page.setContent(ByteBuffer.wrap(text.getBytes()));
    page.setContentType(new Utf8("text/html"));
    page
        .putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/html"));
    return page;
  }
View Full Code Here


    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    WebPage page = new WebPage();
    page.setText(new Utf8("text"));
    page.setTitle(new Utf8("title"));
    filters.filter(new NutchDocument(),"http://www.example.com/",page);
  }
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");

    IndexingFilters filters = new IndexingFilters(conf);
    WebPage page = new WebPage();
    page.setText(new Utf8("text"));
    page.setTitle(new Utf8("title"));
    NutchDocument doc = filters.filter(null,"http://www.example.com/",page);

    assertNull(doc);
  }
View Full Code Here

    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);

    IndexingFilters filters1 = new IndexingFilters(conf);
    WebPage page = new WebPage();
    page.setText(new Utf8("text"));
    page.setTitle(new Utf8("title"));
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),"http://www.example.com/",page);

    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
View Full Code Here

    //Metadata metadata = new Metadata();
    EncodingDetector detector;
    // Content content;
    String encoding;

    WebPage page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));

    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    // no information is available, so it should return default encoding
    assertEquals("windows-1252", encoding.toLowerCase());

    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));
   
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("utf-16", encoding.toLowerCase());

    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
   
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
    assertEquals("windows-1254", encoding.toLowerCase());

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    page = new WebPage();
    page.setBaseUrl(new Utf8("http://www.example.com/"));
    page.setContentType(new Utf8("text/plain"));
    page.setContent(ByteBuffer.wrap(contentInOctets));
    page.putToMetadata(new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));
   
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(page, true);
    detector.addClue("utf-32", "sniffed");
    encoding = detector.guessEncoding(page, "windows-1252");
View Full Code Here

    int numReduceTasks = 100;
   
    int partitionFromRef = refPartitioner.getPartition("http://www.example.org/", numReduceTasks);
    //init selector entry (score shouldn't matter)
    SelectorEntry selectorEntry = new SelectorEntry("http://www.example.org/", 1337);
    WebPage page = new WebPage();
    int partitionFromSig = sigPartitioner.getPartition(selectorEntry, page, numReduceTasks);
   
    assertEquals("partitions should be same",
        partitionFromRef, partitionFromSig);
   
View Full Code Here

   
    int numReduceTasks = 100;
   
    int partitionFromRef = refPartitioner.getPartition("http://www.example.org/", numReduceTasks);
    IntWritable intWritable = new IntWritable(1337); //doesn't matter
    WebPage page = new WebPage();
    String key = TableUtil.reverseUrl("http://www.example.org/");
    FetchEntry fetchEntry = new FetchEntry(conf, key, page);
    int partitionFromSig = sigPartitioner.getPartition(intWritable, fetchEntry, numReduceTasks);
   
    assertEquals("partitions should be same",
View Full Code Here

   * @param score
   * @return Constructed object
   */
  private URLWebPage createURLWebPage(final String url,
      final int fetchInterval, final float score) {
    WebPage page = new WebPage();
    page.setFetchInterval(fetchInterval);
    page.setScore(score);
    page.setStatus(CrawlStatus.STATUS_UNFETCHED);
    return new URLWebPage(url, page);
  }
View Full Code Here

    }

    Result<String, WebPage> results = store.execute(query);
    while (results.next()) {
      try {
        WebPage page = results.get();
        String url = results.getKey();

        if (page == null)
          continue;

        if (requiredMark != null && requiredMark.checkMark(page) == null)
          continue;

        l.add(new URLWebPage(TableUtil.unreverseUrl(url), (WebPage)page.clone()));
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    return l;
View Full Code Here

  private List<String> readDb() throws Exception {
    List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore, null, fields);
    ArrayList<String> read = new ArrayList<String>();
    for (URLWebPage up : pages) {
      WebPage page = up.getDatum();
      String representation = up.getUrl();
      representation += "\tnutch.score=" + (int)page.getScore();
      ByteBuffer bb = page.getFromMetadata(new Utf8("custom.attribute"));
      if (bb != null) {
        representation += "\tcustom.attribute=" + ByteUtils.toString(bb.array());
      }
      read.add(representation);
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.WebPage

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.