Package org.archive.wayback.core

Examples of org.archive.wayback.core.SearchResult


  }

  private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec)
  throws IOException {

    SearchResult result = getBlankSearchResult();

    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        transformDate(header.getDate()));
    result.put(WaybackConstants.RESULT_ARC_FILE,
        transformWarcFilename(header.getReaderIdentifier()));
    result.put(WaybackConstants.RESULT_OFFSET,
        String.valueOf(header.getOffset()));
   
    String uriStr = header.getUrl();
   
    String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
        .length());
    result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype());

    result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
    result.put(WaybackConstants.RESULT_URL, uriStr);
    result.put(WaybackConstants.RESULT_URL_KEY, uriStr);

    rec.close();
    result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());

    return result;
  }
View Full Code Here


  }

  private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec)
  throws IOException {

    SearchResult result = getBlankSearchResult();

    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        transformDate(header.getDate()));
    result.put(WaybackConstants.RESULT_MD5_DIGEST,
        transformDigest(header.getHeaderValue(
            WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
   
    addUrlDataToSearchResult(result,header.getUrl());
View Full Code Here

    }
 
  private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec)
  throws IOException {

    SearchResult result = getBlankSearchResult();

    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        transformDate(header.getDate()));
    result.put(WaybackConstants.RESULT_ARC_FILE,
        transformWarcFilename(header.getReaderIdentifier()));
    result.put(WaybackConstants.RESULT_OFFSET,
        String.valueOf(header.getOffset()));
   
    String origUrl = header.getUrl();
    UURI uri = addUrlDataToSearchResult(result,origUrl);

    // need to parse the documents HTTP message and headers here: WARCReader
    // does not implement this... yet..
   
        byte [] statusBytes = HttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one " +
                " was expected: " + new String(statusBytes));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0,
            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) ||
                !StatusLine.startsWithHTTP(statusLine)) {
           throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);
    result.put(WaybackConstants.RESULT_HTTP_CODE,
        String.valueOf(status.getStatusCode()));
       
    Header[] headers = HttpParser.parseHeaders(rec,
                ARCConstants.DEFAULT_ENCODING);

    rec.close();
    result.put(WaybackConstants.RESULT_MD5_DIGEST,
        transformDigest(header.getHeaderValue(
            WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));

    if (headers != null) {
 
      for (Header httpHeader : headers) {
        if (httpHeader.getName().equals(
            WaybackConstants.LOCATION_HTTP_HEADER)) {
 
          String locationStr = httpHeader.getValue();
          // TODO: "Location" is supposed to be absolute:
          // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
          // (section 14.30) but Content-Location can be
          // relative.
          // is it correct to resolve a relative Location, as
          // we are?
          // it's also possible to have both in the HTTP
          // headers...
          // should we prefer one over the other?
          // right now, we're ignoring "Content-Location"
          try {
            UURI uriRedirect = UURIFactory.getInstance(uri,
                locationStr);
            result.put(WaybackConstants.RESULT_REDIRECT_URL,
                uriRedirect.getEscapedURI());
          } catch (URIException e) {
            LOGGER.info("Bad Location: " + locationStr
                + " for " + origUrl + " in "
                + header.getReaderIdentifier() + " Skipped");
          }
        } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
          result.put(WaybackConstants.RESULT_MIME_TYPE,
              transformHTTPMime(httpHeader.getValue()));
        }
      }
    }
    return result;
View Full Code Here

    return result;
  }
 
  private SearchResult adaptInner(WARCRecord rec) throws IOException {
   
    SearchResult result = null;
    ArchiveRecordHeader header = rec.getHeader();
    String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
    if(type.equals(WARCConstants.RESPONSE)) {
      String mime = header.getMimetype();
      if(mime.equals("text/dns")) {
View Full Code Here

 
  private SearchResult adaptInner(ARCRecord rec) throws IOException {
    rec.close();
    ARCRecordMetaData meta = rec.getMetaData();
   
    SearchResult result = new SearchResult();
    String arcName = meta.getArc();
    int index = arcName.lastIndexOf(File.separator);
    if (index > 0 && (index + 1) < arcName.length()) {
        arcName = arcName.substring(index + 1);
    }
    result.put(WaybackConstants.RESULT_ARC_FILE, arcName);
    result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta
        .getOffset()));
   
    // initialize with default HTTP code...
    result.put(WaybackConstants.RESULT_HTTP_CODE, "-");
   
    result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
    result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype());
    result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate());
   
    String uriStr = meta.getUrl();
    if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
      // skip filedesc record altogether...
      return null;
    }
    if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
      // skip URL + HTTP header processing for dns records...
   
      String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
          .length());
      result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
      result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
      result.put(WaybackConstants.RESULT_URL, uriStr);
      result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
   
    } else {
   
      UURI uri = UURIFactory.getInstance(uriStr);
      result.put(WaybackConstants.RESULT_URL, uriStr);
   
      String uriHost = uri.getHost();
      if (uriHost == null) {
        LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
      } else {
        result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
   
        String statusCode = (meta.getStatusCode() == null) ? "-" : meta
            .getStatusCode();
        result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode);
   
        String redirectUrl = "-";
        Header[] headers = rec.getHttpHeaders();
        if (headers != null) {
   
          for (int i = 0; i < headers.length; i++) {
            if (headers[i].getName().equals(
                WaybackConstants.LOCATION_HTTP_HEADER)) {

              String locationStr = headers[i].getValue();
              // TODO: "Location" is supposed to be absolute:
              // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
              // (section 14.30) but Content-Location can be
              // relative.
              // is it correct to resolve a relative Location, as
              // we are?
              // it's also possible to have both in the HTTP
              // headers...
              // should we prefer one over the other?
              // right now, we're ignoring "Content-Location"
              try {
                UURI uriRedirect = UURIFactory.getInstance(uri,
                    locationStr);
                redirectUrl = uriRedirect.getEscapedURI();
   
              } catch (URIException e) {
                LOGGER.info("Bad Location: " + locationStr
                    + " for " + uriStr + " in "
                    + meta.getArc() + " Skipped");
              }
              break;
            }
          }
        }
        result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl);
   
        String indexUrl = canonicalizer.urlStringToKey(meta.getUrl());
        result.put(WaybackConstants.RESULT_URL_KEY, indexUrl);
      }
   
    }
    return result;
  }
View Full Code Here

   * @param urlString
   * @param timestamp
   * @return true if the url-timestamp should not be shown to end users
   */
  public boolean isExcluded(String urlString, String timestamp) {
    SearchResult sr = new SearchResult();

    LaxURI url = null;
    String host = null;
    try {
      url = new LaxURI(urlString,true);
      host = url.getHost();
    } catch (URIException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
      return true;
    }
    sr.put(WaybackConstants.RESULT_ORIG_HOST, host);
    sr.put(WaybackConstants.RESULT_URL, urlString);
   
    int ruling = filter.filterObject(sr);
    return (ruling != ObjectFilter.FILTER_INCLUDE);
  }
View Full Code Here

        throw new ResourceNotAvailableException("Bad results...");
      }
      CaptureSearchResults captureResults = (CaptureSearchResults) results;
 
      // TODO: check which versions are actually accessible right now?
      SearchResult closest = captureResults.getClosest(wbRequest);
      resource = collection.getResourceStore().retrieveResource(closest);
 
      replay.renderResource(httpRequest, httpResponse, wbRequest,
          closest, resource, uriConverter, captureResults);
    } catch(WaybackException e) {
View Full Code Here

    try {
      SearchResults results = collection.getResourceIndex().query(wbRequest);
      if(results.getResultsType().equals(
          WaybackConstants.RESULTS_TYPE_CAPTURE)) {
        CaptureSearchResults cResults = (CaptureSearchResults) results;
        SearchResult closest = cResults.getClosest(wbRequest);
        closest.put(WaybackConstants.RESULT_CLOSEST_INDICATOR,
            WaybackConstants.RESULT_CLOSEST_VALUE);
        query.renderUrlResults(httpRequest,httpResponse,wbRequest,
            results,uriConverter);

      } else {
View Full Code Here

   * @param results
   */
  public void filter(SearchResults results) {
    Iterator<SearchResult> itr = results.iterator();
    while(itr.hasNext()) {
      SearchResult result = itr.next();
      String captureDate = result.get(
          WaybackConstants.RESULT_CAPTURE_DATE);
      if((captureDate.compareTo(startDateStr) >= 0)
          && (captureDate.compareTo(endDateStr) < 0)) {
        matches.add(result);
      }   
View Full Code Here

       for (int i = 0; i < nodes.getLength(); i++) {
        
           Element e = (Element) nodes.item(i);

           SearchResult result = elementToSearchResult(e);
           results.addSearchResult(result);
       }
       Element channelElement = (Element) channel.item(0);
      
       results.putFilter(WaybackConstants.RESULTS_FIRST_RETURNED,
View Full Code Here

TOP

Related Classes of org.archive.wayback.core.SearchResult

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.