Examples of WebPage


Examples of org.apache.gora.examples.generated.WebPage

    store.deleteByQuery(query);//don't you love that HBase sometimes does not delete arbitrarily
    assertNumResults(store.newQuery(), URLS.length);

    //assert that data is deleted
    for (int i = 0; i < SORTED_URLS.length; i++) {
      WebPage page = store.get(SORTED_URLS[i]);
      Assert.assertNotNull(page);

      Assert.assertNotNull(page.getUrl());
      Assert.assertEquals(page.getUrl().toString(), SORTED_URLS[i]);
      Assert.assertEquals(0, page.getOutlinks().size());
      Assert.assertEquals(0, page.getParsedContent().size());
      if(page.getContent() != null) {
        System.out.println("url:" + page.getUrl().toString());
        System.out.println( "limit:" + page.getContent().limit());
      } else {
        Assert.assertNull(page.getContent());
      }
    }

    //test 6 - delete some with some fields
    WebPageDataCreator.createWebPageData(store);

    query = store.newQuery();
    query.setFields(WebPage.Field.URL.getName());
    String startKey = SORTED_URLS[NUM_KEYS];
    String endKey = SORTED_URLS[SORTED_URLS.length - NUM_KEYS];
    query.setStartKey(startKey);
    query.setEndKey(endKey);

    assertNumResults(store.newQuery(), URLS.length);
    store.deleteByQuery(query);
    store.deleteByQuery(query);
    store.deleteByQuery(query);//don't you love that HBase sometimes does not delete arbitrarily

    assertNumResults(store.newQuery(), URLS.length);

    //assert that data is deleted
    for (int i = 0; i < URLS.length; i++) {
      WebPage page = store.get(URLS[i]);
      Assert.assertNotNull(page);
      if( URLS[i].compareTo(startKey) < 0 || URLS[i].compareTo(endKey) >= 0) {
        //not deleted
        assertWebPage(page, i);
      } else {
        //deleted
        Assert.assertNull(page.getUrl());
        Assert.assertNotNull(page.getOutlinks());
        Assert.assertNotNull(page.getParsedContent());
        Assert.assertNotNull(page.getContent());
        Assert.assertTrue(page.getOutlinks().size() > 0);
        Assert.assertTrue(page.getParsedContent().size() > 0);
      }
    }

  }
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

    Arrays.sort(SORTED_URLS);
  }
 
  public static void createWebPageData(DataStore<String, WebPage> dataStore)
  throws IOException {
    WebPage page;
    log.info("creating web page data");
   
    for(int i=0; i<URLS.length; i++) {
      page = new WebPage();
      page.setUrl(new Utf8(URLS[i]));
      page.setContent(ByteBuffer.wrap(CONTENTS[i].getBytes()));
      for(String token : CONTENTS[i].split(" ")) {
        page.addToParsedContent(new Utf8(token));
      }
     
      for(int j=0; j<LINKS[i].length; j++) {
        page.putToOutlinks(new Utf8(URLS[LINKS[i][j]]), new Utf8(ANCHORS[i][j]));
      }
     
      Metadata metadata = new Metadata();
      metadata.setVersion(1);
      metadata.putToData(new Utf8("metakey"), new Utf8("metavalue"));
      page.setMetadata(metadata);
     
      dataStore.put(URLS[i], page);
    }
    dataStore.flush();
    log.info("finished creating web page data");
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

    String revUrl = "foo.com:http/";
    String url = "http://foo.com/";

    webPageStore.createSchema();
    WebPage page = webPageStore.newPersistent();
    Metadata metadata = new Metadata();
    metadata.setVersion(1);
    metadata.putToData(new Utf8("foo"), new Utf8("baz"));

    page.setMetadata(metadata);
    page.setUrl(new Utf8(url));

    webPageStore.put(revUrl, page);
    webPageStore.flush();

    page = webPageStore.get(revUrl);
    metadata = page.getMetadata();
    Assert.assertNotNull(metadata);
    Assert.assertEquals(1, metadata.getVersion());
    Assert.assertEquals(new Utf8("baz"), metadata.getData().get(new Utf8("foo")));
  }
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

  @Test
  public void testPutArray() throws IOException {
    log.info("test method: testPutArray");
    webPageStore.createSchema();
    WebPage page = webPageStore.newPersistent();

    String[] tokens = {"example", "content", "in", "example.com"};

    for(String token: tokens) {
      page.addToParsedContent(new Utf8(token));
    }

    webPageStore.put("com.example/http", page);
    webPageStore.close();
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

  @Test
  public void testPutBytes() throws IOException {
    log.info("test method: testPutBytes");
    webPageStore.createSchema();
    WebPage page = webPageStore.newPersistent();
    page.setUrl(new Utf8("http://example.com"));
    byte[] contentBytes = "example content in example.com".getBytes();
    ByteBuffer buff = ByteBuffer.wrap(contentBytes);
    page.setContent(buff);

    webPageStore.put("com.example/http", page);
    webPageStore.close();

    assertPutBytes(contentBytes);
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

  @Test
  public void testPutMap() throws IOException {
    log.info("test method: testPutMap");
    webPageStore.createSchema();

    WebPage page = webPageStore.newPersistent();

    page.setUrl(new Utf8("http://example.com"));
    page.putToOutlinks(new Utf8("http://example2.com"), new Utf8("anchor2"));
    page.putToOutlinks(new Utf8("http://example3.com"), new Utf8("anchor3"));
    page.putToOutlinks(new Utf8("http://example3.com"), new Utf8("anchor4"));
    webPageStore.put("com.example/http", page);
    webPageStore.close();

    assertPutMap();
  }
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

    Result<String, WebPage> result = store.newQuery().execute();

    int i=0;
    while(result.next()) {
      WebPage page = result.get();
      TestIOUtils.testSerializeDeserialize(page);
      i++;
    }
    Assert.assertEquals(WebPageDataCreator.URLS.length, i);
  }
View Full Code Here

Examples of org.apache.gora.examples.generated.WebPage

    Assert.assertEquals(WebPageDataCreator.URLS.length, i);
  }

  @Test
  public void testSerdeMultipleWebPages() throws Exception {
    WebPage page1 = new WebPage();
    WebPage page2 = new WebPage();
    WebPage page3 = new WebPage();

    page1.setUrl(new Utf8("foo"));
    page2.setUrl(new Utf8("baz"));
    page3.setUrl(new Utf8("bar"));

    page1.addToParsedContent(new Utf8("coo"));

    page2.putToOutlinks(new Utf8("a"), new Utf8("b"));
View Full Code Here

Examples of org.apache.nutch.storage.WebPage

      }
      if (url == null)
        return;

      String reversedUrl = TableUtil.reverseUrl(url);
      WebPage row = new WebPage();
      row.setFetchTime(curTime);
      row.setFetchInterval(customInterval);

      // now add the metadata
      Iterator<String> keysIter = metadata.keySet().iterator();
      while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          row.putToMetadata(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
      }

      if (customScore != -1)
        row.setScore(customScore);
      else
        row.setScore(scoreInjected);

      try {
        scfilters.injectedScore(url, row);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
View Full Code Here

Examples of org.apache.nutch.storage.WebPage

  @Override
  protected void reduce(UrlWithScore key, Iterable<NutchWritable> values,
      Context context) throws IOException, InterruptedException {
    String keyUrl = key.getUrl().toString();

    WebPage page = null;
    inlinkedScoreData.clear();
   
    for (NutchWritable nutchWritable : values) {
      Writable val = nutchWritable.get();
      if (val instanceof WebPageWritable) {
        page = ((WebPageWritable) val).getWebPage();
      } else {
        inlinkedScoreData.add((ScoreDatum) val);
        if (inlinkedScoreData.size() >= maxLinks) {
          LOG.info("Limit reached, skipping further inlinks for " + keyUrl);
          break;
        }
      }
    }
    String url;
    try {
      url = TableUtil.unreverseUrl(keyUrl);
    } catch (Exception e) {
      // this can happen because a newly discovered malformed link
      // may slip by url filters
      // TODO: Find a better solution
      return;
    }

    if (page == null) { // new row
      if (!additionsAllowed) {
        return;
      }
      page = new WebPage();
      schedule.initializeSchedule(url, page);
      page.setStatus(CrawlStatus.STATUS_UNFETCHED);
      try {
        scoringFilters.initialScore(url, page);
      } catch (ScoringFilterException e) {
        page.setScore(0.0f);
      }
    } else {
      if (page.getMetadata().containsKey(FetcherJob.REDIRECT_DISCOVERED)
            && !page.isReadable(WebPage.Field.STATUS.getIndex())) {
        // this row is marked during fetch as the destination of a redirect
        // but does not contain anything else, so we initialize it.
        page.setStatus(CrawlStatus.STATUS_UNFETCHED);
        schedule.initializeSchedule(url, page);
        try {
          scoringFilters.initialScore(url, page);
        } catch (ScoringFilterException e) {
          page.setScore(0.0f);
        }
      } else { // update row
        byte status = (byte)page.getStatus();
        switch (status) {
        case CrawlStatus.STATUS_FETCHED:         // successful fetch
        case CrawlStatus.STATUS_REDIR_TEMP:      // successful fetch, redirected
        case CrawlStatus.STATUS_REDIR_PERM:
        case CrawlStatus.STATUS_NOTMODIFIED:     // successful fetch, notmodified
          int modified = FetchSchedule.STATUS_UNKNOWN;
          if (status == CrawlStatus.STATUS_NOTMODIFIED) {
            modified = FetchSchedule.STATUS_NOTMODIFIED;
          }
          ByteBuffer prevSig = page.getPrevSignature();
          ByteBuffer signature = page.getSignature();
          if (prevSig != null && signature != null) {
            if (SignatureComparator.compare(prevSig.array(), signature.array()) != 0) {
              modified = FetchSchedule.STATUS_MODIFIED;
            } else {
              modified = FetchSchedule.STATUS_NOTMODIFIED;
            }
          }
          long fetchTime = page.getFetchTime();
          long prevFetchTime = page.getPrevFetchTime();
          long modifiedTime = page.getModifiedTime();

          schedule.setFetchSchedule(url, page, prevFetchTime, 0L,
              fetchTime, modifiedTime, modified);
          if (maxInterval < page.getFetchInterval())
            schedule.forceRefetch(url, page, false);
          break;
        case CrawlStatus.STATUS_RETRY:
          schedule.setPageRetrySchedule(url, page, 0L, 0L, page.getFetchTime());
          if (page.getRetriesSinceFetch() < retryMax) {
            page.setStatus(CrawlStatus.STATUS_UNFETCHED);
          } else {
            page.setStatus(CrawlStatus.STATUS_GONE);
          }
          break;
        case CrawlStatus.STATUS_GONE:
          schedule.setPageGoneSchedule(url, page, 0L, 0L, page.getFetchTime());
          break;
        }
      }
    }

    if (page.getInlinks() != null) {
      page.getInlinks().clear();
    }
    for (ScoreDatum inlink : inlinkedScoreData) {
      page.putToInlinks(new Utf8(inlink.getUrl()), new Utf8(inlink.getAnchor()));
    }

    try {
      scoringFilters.updateScore(url, page, inlinkedScoreData);
    } catch (ScoringFilterException e) {
      LOG.warn("Scoring filters failed with exception " +
                StringUtils.stringifyException(e));
    }

    // clear markers
    // But only delete when they exist. This is much faster for the underlying
    // store. The markers are on the input anyway.
    if (page.getFromMetadata(FetcherJob.REDIRECT_DISCOVERED) != null) {
      page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED);
    }
    Mark.GENERATE_MARK.removeMarkIfExist(page);
    Mark.FETCH_MARK.removeMarkIfExist(page);
    Utf8 mark = Mark.PARSE_MARK.removeMarkIfExist(page);
    if (mark != null) {
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.