Package org.archive.wayback.core

Examples of org.archive.wayback.core.CaptureSearchResult


   * @param results
   */
  public void filter(CaptureSearchResults results) {
    Iterator<CaptureSearchResult> itr = results.iterator();
    while(itr.hasNext()) {
      CaptureSearchResult result = itr.next();
      String captureDate = result.getCaptureTimestamp();
      if((captureDate.compareTo(startDateStr) >= 0)
          && (captureDate.compareTo(endDateStr) < 0)) {
        matches.add(result);
      }   
    }
View Full Code Here


    } catch (IllegalArgumentException e) {
      LOGGER.warning("Skipping unrecognized record type : " + typeStr);
      return null;
    }

    CaptureSearchResult result = genericResult(rec);

    switch (type) {
    case response:
      String mime = annotater.transformHTTPMime(header.getMimetype());
      if(mime != null && mime.equals("text/dns")) {
        // close to complete reading, then the digest is legit
        // TODO: Do we want to use the WARC header digest for this?
        rec.close();
        result.setDigest(transformWARCDigest(rec.getDigestStr()));
        result.setMimeType(mime);
      } else {
        result = adaptWARCHTTPResponse(result,rec);
      }
      break;

     
    case revisit:
      // also set the mime type:
      result.setMimeType("warc/revisit");
      break;
     
    case request:
      if(processAll) {
        // also set the mime type:
        result.setMimeType("warc/request");
      } else {
        result = null;
      }
      break;
     
    case metadata:
      if(processAll) {
        // also set the mime type:
        result.setMimeType("warc/metadata");
      } else {
        result = null;
      }
      break;
     
    case warcinfo:
      result.setMimeType(WARC_FILEDESC_VERSION);
      break;
     
    default:
      LOGGER.info("Skipping record type : " + type);
      break;
View Full Code Here

  
   *    file, offset, timestamp, digest, urlKey, originalUrl
   */
  private CaptureSearchResult genericResult(WARCRecord rec) {

    CaptureSearchResult result = new CaptureSearchResult();

    result.setMimeType(DEFAULT_VALUE);
    result.setHttpCode(DEFAULT_VALUE);
    result.setRedirectUrl(DEFAULT_VALUE);

    ArchiveRecordHeader header = rec.getHeader();

    String file = transformWARCFilename(header.getReaderIdentifier());
    long offset = header.getOffset();
   
    result.setCaptureTimestamp(transformWARCDate(header.getDate()));
    result.setFile(file);
    result.setOffset(offset);
    result.setDigest(transformWARCDigest(header.getHeaderValue(
        WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
   
    String origUrl = header.getUrl();
    if(origUrl == null) {
      String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
      if(type.equals(WARCConstants.WARCRecordType.warcinfo)) {
        String filename = header.getHeaderValue(
            WARCConstants.HEADER_KEY_FILENAME).toString();
        result.setOriginalUrl("filedesc:"+filename);
        result.setUrlKey("filedesc:"+filename);       
      } else {
        result.setOriginalUrl(DEFAULT_VALUE);
        result.setUrlKey(DEFAULT_VALUE);
      }

     
    } else {
      result.setOriginalUrl(origUrl);
      try {
        String urlKey = canonicalizer.urlStringToKey(origUrl);
        result.setUrlKey(urlKey);
      } catch (URIException e) {
        String shortUrl =
          (origUrl.length() < 100)
          ? origUrl
          :origUrl.substring(0,100);
        LOGGER.warning("FAILED canonicalize(" + shortUrl + "):" +
            file + " " + offset);
        result.setUrlKey(origUrl);
      }
    }
    return result;
  }
View Full Code Here

        Iterator<CaptureSearchResult> itr = results.iterator();
        previous = null;
        next = null;
        this.result = result;
        while(itr.hasNext()) {
                CaptureSearchResult cur = itr.next();
                if(cur.isClosest()) {
                        break;
                }
                previous = cur;
        }
        if(itr.hasNext()) {
View Full Code Here

  private void createNext() {
    if(cachedNext == null) {
      if(peek.hasNext()) {
        // populate
        CaptureSearchResult captureResult = peek.next();
        String currentKey = captureResult.getUrlKey();
        String originalUrl = captureResult.getOriginalUrl();
        String firstCapture = captureResult.getCaptureTimestamp();
        LOGGER.info("Creating new UrlResult:" + currentKey + " " +
            firstCapture);
        String lastCapture = firstCapture;
        HashMap<String,Object> digests = new HashMap<String,Object>();
        digests.put(captureResult.getDigest(),null);
        int numCaptures = 1;

        cachedNext = new UrlSearchResult();
        cachedNext.setUrlKey(currentKey);
        cachedNext.setOriginalUrl(originalUrl);

        // now rip through the rest until we find either the last
        // in the iterator, or the first having a different urlKey:
        while((captureResult = peek.peekNext()) != null) {
          String urlKey = captureResult.getUrlKey();
          if(currentKey.equals(urlKey)) {
            // remove from iterator, and accumulate:
            peek.next();
            numCaptures++;
            digests.put(captureResult.getDigest(), null);

            String captureTS = captureResult.getCaptureTimestamp();
            if(captureTS.compareTo(firstCapture) < 0) {
              firstCapture = captureTS;
            }
            if(captureTS.compareTo(lastCapture) > 0) {
              lastCapture = captureTS;
View Full Code Here

        Resource payloadResource = createTestHtmlResource("20100601000000", "hogheogehoge\n".getBytes("UTF-8"));
        CaptureSearchResults results = setupCaptures(
                0,
                payloadResource
                );
        CaptureSearchResult closest = results.getClosest();
       
        // when closest's timestamp == request's timestamp,
        // it gets ReplayRenderer with replay.getRenderer(wbRequest, closest, httpHeaderResource, payloadResource),
        // and calls renderResource() on it.
        EasyMock.expect(replay.getRenderer(wbRequest, closest, payloadResource, payloadResource)).andReturn(replayRenderer);
View Full Code Here

        byte[] payload = "hogehogehogehoge\n".getBytes("UTF-8");
        Resource payloadResource = createTestHtmlResource("20100501000001", payload);
        Resource headerResource = createTestRevisitResource("20100601000000", payload.length, true);
        CaptureSearchResults results = setupCaptures(1, payloadResource, headerResource);

        CaptureSearchResult previous = results.getResults().get(0);
        CaptureSearchResult closest = results.getClosest();
//        previous.setFile("aaa.warc.gz");
//        previous.setOffset(0);
        closest.flagDuplicateDigest(previous); // right? TODO: could be done in setupCaptures()
        assertTrue(closest.isDuplicateDigest());
        assertTrue(closest.getDuplicatePayloadFile() != null);
        assertTrue(closest.getDuplicatePayloadOffset() != null);

        EasyMock.expect(replay.getRenderer(wbRequest, closest, headerResource, payloadResource)).andReturn(replayRenderer);
        // calls replayRenderer.renderResource(...)
        replayRenderer.renderResource(httpRequest, httpResponse, wbRequest,
                closest, headerResource, payloadResource, cut.getUriConverter(),
View Full Code Here

                1,
                createTestHtmlResource("http://test.example.com/style.css",
                        "20100501000000", "hogheogehoge\n".getBytes("UTF-8")),
                createTest502Resource()
                );
        CaptureSearchResult closest = results.getClosest();
        assertTrue(closest.isHttpError());
       
        // or wbRequest.setBestLatestReplayRequest();
        final String expectedRedirectURI = "/web/20100501000000cs_/http://test.example.com/style.css";
        httpResponse.setHeader("Location", expectedRedirectURI);
        httpResponse.setStatus(302);
View Full Code Here

        // - calls queryIndex(), which calls collection.resourceIndex.query(),
        //     which returns CaptureSearchResults
        //   (unexpected object from queryIndex() results in WaybackException("Unknown index format").
        //    this is considered to be a programming/configuration error. not tested.)
        CaptureSearchResults results = new CaptureSearchResults();
        CaptureSearchResult result = new CaptureSearchResult();
        results.setClosest(result);
        EasyMock.expect(resourceIndex.query(wbRequest)).andReturn(results);
        // - calls MementoUtils.printTimemapResponse(results, wbRequest, httpResponse) instead
        //     if wbRequest.isMementoTimemapRequest() (N/A here) (TODO: can we move this to
        //     QueryRenderer implementation?)
        // - calls query.renderCaptureResults(...)
        query.renderCaptureResults(httpRequest, httpResponse, wbRequest, results, cut.getUriConverter());
       
        EasyMock.replay(httpRequest, httpResponse, resourceIndex, query);
       
        cut.init();
        boolean r = cut.handleRequest(httpRequest, httpResponse);
       
        EasyMock.verify(query);
       
        // result shall have closest flag set (Firefox proxy plugin expects this)
        assertTrue("closest flag", result.isClosest());
       
        assertTrue("handleRequest return value", r);
       
    }
View Full Code Here

    assertTrue("subpath",isBlocked(filter,"http://www.peagreenboat.com/foo"));
    assertTrue("emptypath",isBlocked(filter,"http://www.peagreenboat.com/"));
  }
 
  private boolean isBlocked(ObjectFilter<CaptureSearchResult> filter, String url) throws URIException {
    CaptureSearchResult result = new CaptureSearchResult();
    result.setOriginalUrl(url);
    result.setUrlKey(canonicalizer.urlStringToKey(url));
    int filterResult = filter.filterObject(result);
    if(filterResult == ObjectFilter.FILTER_EXCLUDE) {
      return true;
    }
    return false;
View Full Code Here

TOP

Related Classes of org.archive.wayback.core.CaptureSearchResult

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.