Package com.antonytrupe.server

Source Code of com.antonytrupe.server.CrawlFilter

/**
* Copyright [2010] OzDroid Pty Ltd Licensed under the Apache License,
* Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package com.antonytrupe.server;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.cache.Cache;
import javax.cache.CacheException;
import javax.cache.CacheManager;
import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.UrlFetchWebConnection;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.appengine.api.memcache.stdimpl.GCacheFactory;

/**
* @author Geoff Bruckner
* @1/10/2010
*
*/
public final class CrawlFilter implements Filter {
  private WebClient webClient;

  public static final String SCHEME = "http";
  public static final long PUMP_TIME = 5000;

  public static String rewriteQueryString(String url_with_escaped_fragment) {
    try {
      String decoded = URLDecoder.decode(url_with_escaped_fragment,
          "UTF-8");

      // this helps run on development mode
      String gwt = decoded.replace("gwt", "?gwt");

      String unescapedAmp = gwt.replace("&_escaped_fragment_=", "#!");
      String result = unescapedAmp.replace("_escaped_fragment_=", "#!");

      return result;

    } catch (UnsupportedEncodingException e) {
      // catch exception here
      return "";
    }
  }

  /**
   * Performs clean up of WebClient
   */
  @Override
  public void destroy() {
    if (this.webClient != null) {
      this.webClient.closeAllWindows();
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void doFilter(ServletRequest request, ServletResponse response,
      FilterChain chain) throws IOException {

    HttpServletRequest req = (HttpServletRequest) request;

    String queryString = req.getQueryString();
    if ((queryString != null)
        && (queryString.contains("_escaped_fragment_="))) {

      String uri = req.getRequestURI();
      int port = req.getServerPort();
      String domain = req.getServerName();

      // rewrite the URL back to the original #! version
      // remember to unescape any %XX characters
      String url_with_hash_fragment = uri
          + rewriteQueryString(queryString);

      // check memcache first

      Cache cache = null;
      String pageSource = null;

      Map<Object, Object> props = new HashMap<Object, Object>();
      props.put(GCacheFactory.EXPIRATION_DELTA, new Integer(3600));

      try {
        cache = CacheManager.getInstance().getCacheFactory()
            .createCache(props);
        // Get the value from the cache.
        pageSource = (String) cache.get(url_with_hash_fragment);
      } catch (CacheException e) {
        // ...
      }

      // ////////////
      if (pageSource == null) {

        // use the headless browser to obtain an HTML snapshot
        URL url = new URL(SCHEME, domain, port, url_with_hash_fragment);
        this.webClient.getOptions().setTimeout(0);
        HtmlPage page = null;

        try {
          page = this.webClient.getPage(url);

          // gae hack because its single threaded
          this.webClient.getJavaScriptEngine().pumpEventLoop(
              PUMP_TIME);

        } catch (IOException ioe) {
          Logger.getLogger("")
              .warning(
                  "Failed to let google index "
                      + url
                      + " Work on increasing server side performance.");
        }
        if (page != null) {
          pageSource = page.asXml();
        }
      }

      ServletOutputStream out = response.getOutputStream();
      out.println(new String(pageSource));
      out.flush();

      // update cache
      if (cache != null) {
        cache.put(url_with_hash_fragment, pageSource);
      }
    } else {
      try {
        /*
         * not an _escaped_fragment_ URL, so move up the chain of
         * servlet (filters)
         */
        chain.doFilter(request, response);

      } catch (ServletException e) {
        // Handle server errors here
      }
    }
  }

  /**
   * Initialise com.gargoylesoftware.htmlunit.WebClient
   */
  @Override
  public void init(FilterConfig arg0) throws ServletException {

    // Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF);

    this.webClient = new WebClient(BrowserVersion.CHROME);
    this.webClient.setWebConnection(new UrlFetchWebConnection(
        this.webClient));
  }
}
TOP

Related Classes of com.antonytrupe.server.CrawlFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle. Contact coftware#gmail.com.