Package org.archive.wayback.core

Examples of org.archive.wayback.core.CaptureSearchResult


    // First result that is greater/less than target
    if (results.isEmpty()) {
      return nextResult;
    }
   
    CaptureSearchResult lastResult = getLastAdded();
   
   
    // Now compare date diff
    long nextTime = nextResult.getCaptureDate().getTime();
    long lastTime = lastResult.getCaptureDate().getTime();
   
    long targetTime = Timestamp.parseAfter(targetTimestamp).getDate().getTime();
   
    if (Math.abs(nextTime - targetTime) < Math.abs(lastTime - targetTime)) {
      return nextResult;
View Full Code Here


  public static CaptureSearchResult[] getResults(
      List<Partition<Partition<CaptureSearchResult>>> years) {

    int count = years.size();
    CaptureSearchResult results[] = new CaptureSearchResult[count];
    for(int i = 0; i < count; i++) {
      Partition<Partition<CaptureSearchResult>> year = years.get(i);
      CaptureSearchResult first = null;
      if(year.getTotal() > 0) {
        for(Partition<CaptureSearchResult> month : year.list()) {
          if(month.getTotal() > 0) {
            first = month.list().get(0);
            break;
View Full Code Here

 

  public static String getFirstUrlMonth(Partition<CaptureSearchResult> month,
      ResultURIConverter c) {
    if(month.getTotal() > 0) {
      CaptureSearchResult first = month.list().get(0);
      return c.makeReplayURI(first.getCaptureTimestamp(),
          first.getOriginalUrl());
    }
    return null;
  }
View Full Code Here

  }

  public static String getLastUrlMonth(Partition<CaptureSearchResult> month,
      ResultURIConverter c) {
    if(month.getTotal() > 0) {
      CaptureSearchResult last = month.list().get(month.list().size()-1);
      return c.makeReplayURI(last.getCaptureTimestamp(),
          last.getOriginalUrl());
    }
    return null;
  }
View Full Code Here

      // assume whatever we found for next Month is OK, even if null:
      navs[NAV_NEXT_DAY] = navs[NAV_NEXT_MONTH];
   
   
    // FINALLY! We just need the next and prev links:
    CaptureSearchResult prevResult = null;
    CaptureSearchResult curResult = null;
    CaptureSearchResult nextResult = null;
    for(CaptureSearchResult result : curDay.list()) {
      if(result.isClosest()) {
        curResult = result;
      } else {
        // have we seen the current one?
        if(curResult == null) {
          // no, track this as the "prev", and continue:
          prevResult = result;
        } else {
          // yes, this is the "next", remember and break:
          nextResult = result;
          break;
        }
      }
    }
    if(prevResult != null) {
      navs[NAV_PREV_CAPTURE] =
        uc.makeReplayURI(prevResult.getCaptureTimestamp(),
            prevResult.getOriginalUrl());
    } else {
      // assume whatever we found for prev Day is OK, even if null:
      navs[NAV_PREV_CAPTURE] = navs[NAV_PREV_DAY];
    }
    if(nextResult != null) {
      navs[NAV_NEXT_CAPTURE] =
        uc.makeReplayURI(nextResult.getCaptureTimestamp(),
            nextResult.getOriginalUrl());
    } else {
      // assume whatever we found for next Day is OK, even if null:
      navs[NAV_NEXT_CAPTURE] = navs[NAV_NEXT_DAY];
    }
View Full Code Here

    }
    CaptureSearchResults captureResults =
      (CaptureSearchResults) results;

   
    CaptureSearchResult closest = null;
   
    closest =
      getReplay().getClosest(wbRequest, captureResults);
   
    //CaptureSearchResult originalClosest = closest;
   
    int counter = 0;
   
    //TODO: parameterize
    //int maxTimeouts = 2;
    //int maxMissingRevisits = 2;
   
    Set<String> skipFiles = null;
    //boolean isRevisit = false;
   
    while (true) {   
      // Support for redirect from the CDX redirectUrl field
      // This was the intended use of the redirect field, but has not actually been tested
      // To enable this functionality, uncomment the lines below
      // This is an optimization that allows for redirects to be handled without loading the original content
      //
      //String redir = closest.getRedirectUrl();
      //if ((redir != null) && !redir.equals("-")) {
      //  String fullRedirect = getUriConverter().makeReplayURI(closest.getCaptureTimestamp(), redir);
      //  throw new BetterRequestException(fullRedirect, Integer.valueOf(closest.getHttpCode()));
      //}
     
      Resource httpHeadersResource = null;
      Resource payloadResource = null;
      boolean isRevisit = false;
     
      try {
        counter++;
       
        if (closest == null) {
          throw new ResourceNotAvailableException("Self-Redirect: No Closest Match Found", 404);
        }
       
        closest.setClosest(true);
        checkAnchorWindow(wbRequest,closest);
       
       
        // Attempt to resolve any not-found embedded content with next-best
        // For "best last" capture, skip not-founds and redirects, hoping to find the best 200 response.
        if ((wbRequest.isAnyEmbeddedContext() && closest.isHttpError()) ||
          (wbRequest.isBestLatestReplayRequest() && !closest.isHttpSuccess())) {
          CaptureSearchResult nextClosest = closest;
         
          while ((nextClosest = findNextClosest(nextClosest, captureResults, requestMS)) != null) {
            // If redirect, save but keep looking -- if no better match, will use the redirect
            if (nextClosest.isHttpRedirect()) {
              closest = nextClosest;
            // If success, pick that one!
            } else if (nextClosest.isHttpSuccess()) {
              closest = nextClosest;
              break;
            }
          }
        }
       
        // Redirect to url for the actual closest capture, if not a retry
        if (counter == 1) {
          handleReplayRedirect(wbRequest, httpResponse, captureResults, closest);
        }     
       
        // If revisit, may load two resources separately
        if (closest.isDuplicateDigest()) {
          isRevisit = true;
         
          // If the payload record is known and it failed before with this payload, don't try
          // loading the header resource even.. outcome will likely be same
          if ((closest.getDuplicatePayloadFile() != null) &&
            (skipFiles != null) && skipFiles.contains(closest.getDuplicatePayloadFile())) {
            counter--; //don't really count this as we're not even checking the file anymore
            throw new ResourceNotAvailableException("Revisit: Skipping already failed " + closest.getDuplicatePayloadFile());
         
          } else if ((closest.getDuplicatePayloadFile() == null) && wbRequest.isTimestampSearchKey()) {
            // If a missing revisit and loaded optimized, try loading the entire timeline again
           
            wbRequest.setTimestampSearchKey(false);
           
            results = queryIndex(wbRequest);
           
            captureResults = (CaptureSearchResults)results;
           
            closest = getReplay().getClosest(wbRequest, captureResults);
            //originalClosest = closest;
            //maxTimeouts *= 2;
            //maxMissingRevisits *= 2;
           
            continue;
          }
         
          // If old-style arc revisit (no mimetype, filename is '-'), then don't load
          // headersResource = payloadResource
          if (EMPTY_VALUE.equals(closest.getFile())) {
            closest.setFile(closest.getDuplicatePayloadFile());
            closest.setOffset(closest.getDuplicatePayloadOffset());
           
            // See that this is successful
            httpHeadersResource = getResource(closest, skipFiles);
           
            // Hmm, since this is a revisit it should not redirect -- was: if both headers and payload are from a different timestamp, redirect to that timestamp
//            if (!closest.getCaptureTimestamp().equals(closest.getDuplicateDigestStoredTimestamp())) {
//              throwRedirect(wbRequest, httpResponse, captureResults, closest.getDuplicateDigestStoredTimestamp(), closest.getOriginalUrl(), closest.getHttpCode());
//            }
           
            payloadResource = httpHeadersResource;
           
          } else {
            httpHeadersResource = getResource(closest, skipFiles);
           
            CaptureSearchResult payloadLocation = retrievePayloadForIdenticalContentRevisit(wbRequest, httpHeadersResource, closest);
           
            if (payloadLocation == null) {
              throw new ResourceNotAvailableException("Revisit: Missing original for revisit record " + closest.toString(), 404);
            }
           
            payloadResource = getResource(payloadLocation, skipFiles);
           
            // If zero length old-style revisit with no headers, then must use payloadResource as headersResource
            if (httpHeadersResource.getRecordLength() <= 0) {
              httpHeadersResource.close();
              httpHeadersResource = payloadResource;
            }
          }
        } else {
          httpHeadersResource = getResource(closest, skipFiles);
          payloadResource = httpHeadersResource;
        }
       
        // Ensure that we are not self-redirecting!
        // If the status is a redirect, check that the location or url date's are different from the current request
        // Otherwise, replay the previous matched capture.
        // This chain is unlikely to go past one previous capture, but is possible
        if (isSelfRedirect(httpHeadersResource, closest, wbRequest, requestURL)) {
          LOGGER.info("Self-Redirect: Skipping " + closest.getCaptureTimestamp() + "/" + closest.getOriginalUrl());
          closest = findNextClosest(closest, captureResults, requestMS);
          continue;
        }
       
        if (counter > 1) {
          handleReplayRedirect(wbRequest, httpResponse, captureResults, closest);
        }
                 
        p.retrieved();
       
        ReplayRenderer renderer =
          getReplay().getRenderer(wbRequest, closest, httpHeadersResource, payloadResource);
       
        if (this.isEnableWarcFileHeader() && (warcFileHeader != null)) {
          if (isRevisit && (closest.getDuplicatePayloadFile() != null)) {
            httpResponse.addHeader(warcFileHeader, closest.getDuplicatePayloadFile());
          } else {
            httpResponse.addHeader(warcFileHeader, closest.getFile());
          }
        }
       
        // Memento URL-M response
        if (this.isEnableMemento()) {
          MementoUtils.addMementoHeaders(httpResponse, captureResults, closest, wbRequest);
        }
   
        renderer.renderResource(httpRequest, httpResponse, wbRequest,
            closest, httpHeadersResource, payloadResource, getUriConverter(), captureResults);
     
        p.rendered();
        p.write(wbRequest.getReplayTimestamp() + " " +
            wbRequest.getRequestUrl());
     
        break;
       
      } catch (SpecificCaptureReplayException scre) {
       
        //final String SOCKET_TIMEOUT_MSG = "java.net.SocketTimeoutException: Read timed out";
       
        CaptureSearchResult nextClosest = null;
       
        // if exceed maxRedirectAttempts, stop
        if ((counter > maxRedirectAttempts) && ((this.getLiveWebPrefix() == null) || !isWaybackReferer(wbRequest, this.getLiveWebPrefix()))) {
          LOGGER.info("LOADFAIL: Timeout: Too many retries, limited to " + maxRedirectAttempts);
        } else if ((closest != null) && !wbRequest.isIdentityContext()) {
          nextClosest = findNextClosest(closest, captureResults, requestMS);
        }
       
        // Skip any nextClosest that has the same exact filename?
        // Removing in case skip something that works..
        // while ((nextClosest != null) && closest.getFile().equals(nextClosest.getFile())) {
        //  nextClosest = findNextClosest(nextClosest, captureResults, requestMS);
        //}
       
        String msg = null;
       
        if (closest != null) {
          msg = scre.getMessage() + " /" + closest.getCaptureTimestamp() + "/" + closest.getOriginalUrl();
        } else {
          msg = scre.getMessage() + " /" + wbRequest.getReplayTimestamp() + "/" + wbRequest.getRequestUrl();
        }
       
        if (nextClosest != null) {
       
          // Store failed filename for revisits, as they may be repeated
          if (isRevisit) {
            if (scre.getDetails() != null) {
              if (skipFiles == null) {
                skipFiles = new HashSet<String>();
              }
              // Details should contain the failed filename from the ResourceStore
              skipFiles.add(scre.getDetails());
            }           
          }
         
          if (msg.startsWith("Self-Redirect")) {         
            LOGGER.info("(" + counter + ")LOADFAIL-> " + msg + " -> " + nextClosest.getCaptureTimestamp());
          } else {
            LOGGER.warning("(" + counter + ")LOADFAIL-> " + msg + " -> " + nextClosest.getCaptureTimestamp());
          }
         
          closest = nextClosest;
        } else if (wbRequest.isTimestampSearchKey()) {
          wbRequest.setTimestampSearchKey(false);
View Full Code Here

    }
  }
 
  protected CaptureSearchResult findNextClosest(CaptureSearchResult currentClosest, CaptureSearchResults results, long requestMS)
  {
    CaptureSearchResult prev = currentClosest.getPrevResult();
    CaptureSearchResult next = currentClosest.getNextResult();
   
    currentClosest.removeFromList();
   
    if (prev == null) {
      return next;
    } else if (next == null) {
      return prev;
    }
   
    long prevMS = prev.getCaptureDate().getTime();
    long nextMS = next.getCaptureDate().getTime();
    long prevDiff = Math.abs(prevMS - requestMS);
    long nextDiff = Math.abs(requestMS - nextMS);
   
    if (prevDiff == 0) {
      return prev;
    } else if (nextDiff == 0) {
      return next;
    }   
   
    String currHash = currentClosest.getDigest();
    String prevHash = prev.getDigest();
    String nextHash = next.getDigest();
    boolean prevSameHash = (prevHash.equals(currHash));
    boolean nextSameHash = (nextHash.equals(currHash));
   
    if (prevSameHash != nextSameHash) {
      return prevSameHash ? prev : next;
    }
   
    String prevStatus = prev.getHttpCode();
    String nextStatus = next.getHttpCode();
    boolean prev200 = (prevStatus != null) && prevStatus.equals("200");
    boolean next200 = (nextStatus != null) && nextStatus.equals("200");
   
    // If only one is a 200, prefer the entry with the 200
    if (prev200 != next200) {
View Full Code Here

    if (!closest.isDuplicateDigest()) {
      LOGGER.warning("Revisit: record is not a revisit by identical content digest " + closest.getCaptureTimestamp() + " " + closest.getOriginalUrl());
      return null;
    }
   
    CaptureSearchResult payloadLocation = null;
   
    // Revisit from same url -- should have been found by the loader
   
    if (closest.getDuplicatePayloadFile() != null && closest.getDuplicatePayloadOffset() != null) {
      payloadLocation = new CaptureSearchResult();
      payloadLocation.setFile(closest.getDuplicatePayloadFile());
      payloadLocation.setOffset(closest.getDuplicatePayloadOffset());
      payloadLocation.setCompressedLength(closest.getDuplicatePayloadCompressedLength());
      return payloadLocation;
    }

    // Url Agnostic Revisit with target-uri and refers-to-date
   
View Full Code Here

      CaptureSearchResults cResults = (CaptureSearchResults) results;
     
      // The Firefox proxy plugin makes an XML request to populate the
      // list of available captures, and needs the closest result to
      // the one being replayed to be flagged as such:
      CaptureSearchResult closest = cResults.getClosest();
      if(closest != null) {
        closest.setClosest(true);
      }
     
      getQuery().renderCaptureResults(httpRequest,httpResponse,wbRequest,
            cResults,getUriConverter());
View Full Code Here

     * @return CaptureSearchResults built
     */
    protected CaptureSearchResults setupCaptures(int closestIndex, Resource... resources) throws Exception {
        CaptureSearchResults results = new CaptureSearchResults();
        for (Resource res : resources) {
            CaptureSearchResult result = new CaptureSearchResult();
            // TODO: Resource should have methods for accessing URI and date
            if (res instanceof WarcResource) {
                // TODO: want to use WARCRecordToSearchResultAdapter? WarcResource
                // has no method to retrieve underlining WARCRecord.
                ArchiveRecordHeader h = ((WarcResource)res).getWarcHeaders();
                String originalUrl = h.getUrl();
                String ts = (String)h.getHeaderValue("WARC-Date");
                // WARC-Date is in ISOZ format.
                ts = transformWARCDate(ts);
                result.setOriginalUrl(originalUrl);
                result.setCaptureTimestamp(ts);
                result.setOffset(0);
                // this is (W)ARC file name in real practice. here we use
                // DT14 timestamp as pseudo filename (.warc.gz suffix is not
                // essential).
                result.setFile(ts + ".warc.gz");
            } else if (res instanceof ArcResource) {
                // TODO: should use ARCRecordToSearchResultAdapter? ArcResource has
                // getArcRecord() methods whose result may be cast to ARCRecord.
                // NB: ArcResource#getARCMetadata() creates a new Map object.
                Map<String, String> meta = ((ArcResource)res).getARCMetadata();
                String originalUrl = meta.get(ArchiveFileConstants.URL_FIELD_KEY);
                String ts = meta.get(ArchiveFileConstants.DATE_FIELD_KEY);
                result.setOriginalUrl(originalUrl);
                result.setCaptureTimestamp(ts);
            } else {
                throw new AssertionError("unexpected Resource type: " + res.getClass());
            }
            result.setHttpCode(Integer.toString(res.getStatusCode()));
            // CaptureSearchResultMatcher fails without this, but actual value does not
            // matter. so set it to 0.
            result.setOffset(0);
            assertTrue("invalid timestamp " + result.getCaptureTimestamp(),
                    validTimestamp(result.getCaptureTimestamp()));
            if (closestIndex == 0) {
                result.setClosest(true);
                results.setClosest(result);
                EasyMock.expect(replay.getClosest(wbRequest, results)).andReturn(result);
            }
           
            // Note AccessPoint passes a copy of CaptureSearchResult in some case (ex. Replay_Revisit() test).
View Full Code Here

TOP

Related Classes of org.archive.wayback.core.CaptureSearchResult

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.