Package org.apache.nutch.storage

Examples of org.apache.nutch.storage.WebPage


  private List<String> readDb() throws Exception {
    List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore, null, fields);
    ArrayList<String> read = new ArrayList<String>();
    for (URLWebPage up : pages) {
      WebPage page = up.getDatum();
      String representation = up.getUrl();
      representation += "\tnutch.score=" + (int)page.getScore();
      ByteBuffer bb = page.getFromMetadata(new Utf8("custom.attribute"));
      if (bb != null) {
        representation += "\tcustom.attribute=" + ByteUtils.toString(bb.array());
      }
      read.add(representation);
    }
View Full Code Here


   
    private void advance() throws IOException {
      hasNext = res.next();
      if (hasNext && batchId != null) {
        do {
          WebPage page = res.get();
          Utf8 mark = Mark.UPDATEDB_MARK.checkMark(page);
          if (NutchJob.shouldProcess(mark, batchId)) {
            return;
          } else {
            if (LOG.isDebugEnabled()) {
View Full Code Here

implements Writable {

  private WebPage webPage;

  /**
   * No-argument constructor: delegates to the two-argument constructor,
   * passing {@code null} as the first argument and a newly created
   * {@link WebPage} as the second.
   */
  // NOTE(review): presumably this exists so the Writable can be instantiated
  // reflectively before its fields are read — confirm against callers.
  public WebPageWritable() {
    this(null, new WebPage());
  }
View Full Code Here

          return redirectedPage;
        }
        if (newUrl == null || newUrl.equals(url)) {
          String reprUrl = URLUtil.chooseRepr(url, newUrl,
              refreshTime < FetcherJob.PERM_REFRESH_TIME);
          WebPage newWebPage = new WebPage();
          if (reprUrl == null) {
            LOG.warn("reprUrl==null for " + url);
            return redirectedPage;
          } else {
            page.setReprUrl(new Utf8(reprUrl));
View Full Code Here

      if (newUrl == null || newUrl.equals(url)) {
        return;
      }
      reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
      final String reversedNewUrl = TableUtil.reverseUrl(newUrl);
      WebPage newWebPage = new WebPage();
      if (!reprUrl.equals(url)) {
        newWebPage.setReprUrl(new Utf8(reprUrl));
      }
      newWebPage.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
      context.write(reversedNewUrl, newWebPage);
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect to " +
            reprUrl + " (fetching later)");
      }
View Full Code Here

    try {
      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {
        WebPage page = getPage(docs[t]);
        parser.parse(URL.toString(), page);
        ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
        String lang = null;
        if (blang != null)
          lang = Bytes.toString(blang.array());
        assertEquals(metalanguages[t], lang);
      }
View Full Code Here

      fail(e.toString());
    }
  }

  private WebPage getPage(String text) {
    WebPage page = new WebPage();
    page.setBaseUrl(BASE);
    page.setContent(ByteBuffer.wrap(text.getBytes()));
    page.setContentType(new Utf8("text/html"));
    page
        .putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/html"));
    return page;
  }
View Full Code Here

   *             When an error occurs or test case fails.
   */
  private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    Response response = null;
    response = http.getResponse(url, new WebPage(), true);

    int code = response.getCode();
    assertEquals("HTTP Status Code for " + url, expectedCode, code);
  }
View Full Code Here

   *
   */
  public void setContentType(String testTextFile) throws ProtocolNotFound {
    String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
    assertNotNull(urlString);
    WebPage datum = new WebPage();
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    ProtocolOutput output = protocol.getProtocolOutput(urlString,datum);
    assertNotNull(output);

    assertEquals("Status code: [" + output.getStatus().getCode()
View Full Code Here

     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
     assertNotNull(filter);
     NutchDocument doc = new NutchDocument();
     try{
       filter.filter(doc, "http://nutch.apache.org/index.html", new WebPage());
     }
     catch(Exception e){
       e.printStackTrace();
       fail(e.getMessage());
     }
View Full Code Here

TOP

Related Classes of org.apache.nutch.storage.WebPage

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.