Package org.archive.wayback.archivalurl

Source Code of org.archive.wayback.archivalurl.ArchivalUrlSAXRewriteReplayRenderer

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.archivalurl;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.archive.wayback.ReplayRenderer;
import org.archive.wayback.ResultURIConverter;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.WaybackException;
import org.archive.wayback.replay.HttpHeaderOperation;
import org.archive.wayback.replay.HttpHeaderProcessor;
import org.archive.wayback.replay.JSPExecutor;
import org.archive.wayback.replay.TagMagix;
import org.archive.wayback.replay.TextReplayRenderer;
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.charset.StandardCharsetDetector;
import org.archive.wayback.replay.html.ContextResultURIConverterFactory;
import org.archive.wayback.replay.html.IdentityResultURIConverterFactory;
import org.archive.wayback.replay.html.ReplayParseContext;
import org.archive.wayback.util.htmllex.ContextAwareLexer;
import org.archive.wayback.util.htmllex.ParseEventHandler;
import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;

/**
* ReplayRenderer which attempts to rewrite text/html documents so URLs
* references within the document load from the correct ArchivalURL AccessPoint.
*
* @author brad
*
*/
public class ArchivalUrlSAXRewriteReplayRenderer implements ReplayRenderer {
  private ParseEventHandler delegator = null;
  private HttpHeaderProcessor httpHeaderProcessor;
  private CharsetDetector charsetDetector = new StandardCharsetDetector();
  private ContextResultURIConverterFactory converterFactory = null;
  private boolean rewriteHttpsOnly;
 
  private final static String OUTPUT_CHARSET = "utf-8";
  private static int FRAMESET_SCAN_BUFFER_SIZE = 16 * 1024;
  private static ReplayRenderer frameWrappingRenderer = null;
  public static ReplayRenderer getFrameWrappingRenderer() {
    return frameWrappingRenderer;
  }

  public static void setFrameWrappingRenderer(ReplayRenderer frameWrappingRenderer) {
    ArchivalUrlSAXRewriteReplayRenderer.frameWrappingRenderer = frameWrappingRenderer;
  }

  /**
   * @param httpHeaderProcessor which should process HTTP headers
   */
  public ArchivalUrlSAXRewriteReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) {
    this.httpHeaderProcessor = httpHeaderProcessor;
  }

  // assume this is only called for appropriate doc types: html
  public void renderResource(HttpServletRequest httpRequest,
      HttpServletResponse httpResponse, WaybackRequest wbRequest,
      CaptureSearchResult result, Resource resource,
      ResultURIConverter uriConverter, CaptureSearchResults results)
          throws ServletException, IOException, WaybackException {
    renderResource(httpRequest, httpResponse, wbRequest, result, resource,
        resource, uriConverter, results);
  }

  @Override
  public void renderResource(HttpServletRequest httpRequest,
      HttpServletResponse httpResponse, WaybackRequest wbRequest,
      CaptureSearchResult result, Resource httpHeadersResource,
      Resource payloadResource, ResultURIConverter uriConverter,
      CaptureSearchResults results) throws ServletException, IOException,
      WaybackException {

    Resource decodedResource = TextReplayRenderer.decodeResource(httpHeadersResource, payloadResource);

    // The URL of the page, for resolving in-page relative URLs:
    URL url = null;
    try {
      url = new URL(result.getOriginalUrl());
    } catch (MalformedURLException e1) {
      // TODO: this shouldn't happen...
      e1.printStackTrace();
      throw new IOException(e1.getMessage());
    }
    // determine the character set used to encode the document bytes:
    String charSet = charsetDetector.getCharset(httpHeadersResource, decodedResource, wbRequest);

    ContextResultURIConverterFactory fact = createConverterFactory(uriConverter, httpRequest, wbRequest);
   
    // set up the context:
    ReplayParseContext context =
        new ReplayParseContext(fact,url,result.getCaptureTimestamp());
   
    context.setRewriteHttpsOnly(rewriteHttpsOnly);

    if(!wbRequest.isFrameWrapperContext()) {
      // in case this is an HTML page with FRAMEs, peek ahead an look:
      // TODO: make ThreadLocal:
      byte buffer[] = new byte[FRAMESET_SCAN_BUFFER_SIZE];

      decodedResource.mark(FRAMESET_SCAN_BUFFER_SIZE);
      int amtRead = decodedResource.read(buffer);
      decodedResource.reset();

      if(amtRead > 0) {
        StringBuilder foo = new StringBuilder(new String(buffer,charSet));
        int frameIdx = TagMagix.getEndOfFirstTag(foo, "FRAMESET");
        if(frameIdx != -1) {
          // insert flag so we don't add FRAMESET:
          context.putData(FastArchivalUrlReplayParseEventHandler.FERRET_DONE_KEY,"");

//          // top-level Frameset: Draw the frame wrapper thingy:
//          frameWrappingRenderer.renderResource(httpRequest,
//              httpResponse, wbRequest, result, resource,
//              uriConverter, results);
//          return;
        }
      }
    }


    // copy the HTTP response code:
    HttpHeaderOperation.copyHTTPMessageHeader(httpHeadersResource, httpResponse);

    // transform the original headers according to our headerProcessor:
    Map<String,String> headers = HttpHeaderOperation.processHeaders(
        httpHeadersResource, result, uriConverter, httpHeaderProcessor);

    // prepare several objects for the parse:

    // a JSPExecutor:
    JSPExecutor jspExec = new JSPExecutor(uriConverter, httpRequest,
        httpResponse, wbRequest, results, result, decodedResource);


    // To make sure we get the length, we have to buffer it all up...
    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    context.setOutputCharset(OUTPUT_CHARSET);
    context.setOutputStream(baos);
    context.setJspExec(jspExec);


    // and finally, parse, using the special lexer that knows how to
    // handle javascript blocks containing unescaped HTML entities:
    Page lexPage = new Page(decodedResource,charSet);
    Lexer lexer = new Lexer(lexPage);
    Lexer.STRICT_REMARKS = false;
    ContextAwareLexer lex = new ContextAwareLexer(lexer, context);
    Node node;
    try {
      delegator.handleParseStart(context);
      while((node = lex.nextNode()) != null) {
        delegator.handleNode(context, node);
      }
      delegator.handleParseComplete(context);
    } catch (ParserException e) {
      e.printStackTrace();
      throw new IOException(e.getMessage());
    }

    // At this point, baos contains the utf-8 encoded bytes of our result:
    byte[] utf8Bytes = baos.toByteArray();
    // set the corrected length:
    headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER,
        String.valueOf(utf8Bytes.length));
    headers.put(TextReplayRenderer.GUESSED_CHARSET_HEADER, charSet);

    // send back the headers:
    HttpHeaderOperation.sendHeaders(headers, httpResponse);
    // Tomcat will always send a charset... It's trying to be smarter than
    // we are. If the original page didn't include a "charset" as part of
    // the "Content-Type" HTTP header, then Tomcat will use the default..
    // who knows what that is, or what that will do to the page..
    // let's try explicitly setting it to what we used:
    httpResponse.setCharacterEncoding(OUTPUT_CHARSET);
    httpResponse.getOutputStream().write(utf8Bytes);
  }
 
  protected ContextResultURIConverterFactory createConverterFactory(ResultURIConverter uriConverter, HttpServletRequest httpRequest, WaybackRequest wbRequest)
  {
    // sam ecode in ArchivalURLJSStringTransformerReplayRenderer
    ContextResultURIConverterFactory fact = null;
   
    if (uriConverter instanceof ArchivalUrlResultURIConverter) {
      fact = new ArchivalUrlContextResultURIConverterFactory(
          (ArchivalUrlResultURIConverter) uriConverter);
    } else if (converterFactory != null) {
      fact = converterFactory;
    } else {
      fact = new IdentityResultURIConverterFactory(uriConverter);     
    }
   
    return fact;
  }

  /**
   * @return the charsetDetector
   */
  public CharsetDetector getCharsetDetector() {
    return charsetDetector;
  }

  /**
   * @param charsetDetector the charsetDetector to set
   */
  public void setCharsetDetector(CharsetDetector charsetDetector) {
    this.charsetDetector = charsetDetector;
  }

  /**
   * @return the delegator
   */
  public ParseEventHandler getDelegator() {
    return delegator;
  }

  /**
   * @param delegator the delegator to set
   */
  public void setDelegator(ParseEventHandler delegator) {
    this.delegator = delegator;
  }

  public ContextResultURIConverterFactory getConverterFactory() {
    return converterFactory;
  }

  public void setConverterFactory(
      ContextResultURIConverterFactory converterFactory) {
    this.converterFactory = converterFactory;
  }

  public boolean isRewriteHttpsOnly() {
    return rewriteHttpsOnly;
  }

  public void setRewriteHttpsOnly(boolean rewriteHttpsOnly) {
    this.rewriteHttpsOnly = rewriteHttpsOnly;
  }
}
TOP

Related Classes of org.archive.wayback.archivalurl.ArchivalUrlSAXRewriteReplayRenderer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.