/**
* Copyright [2010] OzDroid Pty Ltd Licensed under the Apache License,
* Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package com.antonytrupe.server;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.cache.Cache;
import javax.cache.CacheException;
import javax.cache.CacheManager;
import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.UrlFetchWebConnection;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.appengine.api.memcache.stdimpl.GCacheFactory;
/**
 * Servlet filter that serves HTML snapshots for "escaped fragment" URLs.
 * <p>
 * When a request's query string contains {@code _escaped_fragment_=}, the
 * filter rebuilds the original {@code #!} URL, loads it with a headless
 * HtmlUnit {@link WebClient}, lets the page's JavaScript run for
 * {@link #PUMP_TIME} milliseconds and writes the resulting markup to the
 * response. Rendered snapshots are kept in the cache for one hour. All other
 * requests are passed down the filter chain untouched.
 * <p>
 * NOTE(review): a single {@link WebClient} instance is shared by every
 * request handled by this filter; confirm the container never invokes the
 * filter concurrently.
 *
 * @author Geoff Bruckner
 * @1/10/2010
 *
 */
public final class CrawlFilter implements Filter {

    private static final Logger LOG = Logger.getLogger(CrawlFilter.class
            .getName());

    /** Query-string key that marks an escaped-fragment request. */
    private static final String FRAGMENT_KEY = "_escaped_fragment_=";

    /** Lifetime of a cached snapshot, in seconds. */
    private static final int CACHE_TTL_SECONDS = 3600;

    /** Character encoding used when writing the snapshot to the response. */
    private static final String RESPONSE_ENCODING = "UTF-8";

    /** Scheme used when the filter fetches the page from its own server. */
    public static final String SCHEME = "http";

    /** Time, in milliseconds, the JavaScript event loop is pumped per page. */
    public static final long PUMP_TIME = 5000;

    private WebClient webClient;

    /**
     * Converts an escaped-fragment query string back into the query and
     * {@code #!} fragment of the original URL.
     * <p>
     * The input is URL-decoded as UTF-8. Everything before the first
     * {@code _escaped_fragment_=} becomes the query part (prefixed with
     * {@code ?} when non-empty); everything after it becomes the fragment
     * (prefixed with {@code #!}). For example
     * {@code "gwt.codesvr=127.0.0.1:9997&_escaped_fragment_=home"} yields
     * {@code "?gwt.codesvr=127.0.0.1:9997#!home"} and
     * {@code "_escaped_fragment_=key%3Dvalue"} yields {@code "#!key=value"}.
     *
     * @param url_with_escaped_fragment
     *            the raw (still percent-encoded) query string; may be
     *            {@code null}
     * @return the text to append to the request URI; an empty string when
     *         the input is {@code null} or empty, or when UTF-8 is not
     *         available
     * @throws IllegalArgumentException
     *             if the input contains a malformed percent escape (raised
     *             by {@link URLDecoder#decode(String, String)})
     */
    public static String rewriteQueryString(String url_with_escaped_fragment) {
        if (url_with_escaped_fragment == null) {
            return "";
        }
        final String decoded;
        try {
            decoded = URLDecoder.decode(url_with_escaped_fragment, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            LOG.log(Level.WARNING, "UTF-8 decoding is not available", e);
            return "";
        }

        final int keyAt = decoded.indexOf(FRAGMENT_KEY);
        if (keyAt < 0) {
            // no fragment marker: the whole input is an ordinary query string
            return decoded.isEmpty() ? "" : "?" + decoded;
        }

        String query = decoded.substring(0, keyAt);
        if (query.endsWith("&")) {
            // drop the separator that joined the marker to the other params
            query = query.substring(0, query.length() - 1);
        }
        final String fragment = decoded.substring(keyAt
                + FRAGMENT_KEY.length());

        final StringBuilder rewritten = new StringBuilder();
        if (!query.isEmpty()) {
            rewritten.append('?').append(query);
        }
        rewritten.append("#!").append(fragment);
        return rewritten.toString();
    }

    /**
     * Performs clean up of WebClient
     */
    @Override
    public void destroy() {
        if (this.webClient != null) {
            this.webClient.closeAllWindows();
        }
    }

    /**
     * Serves a rendered snapshot for escaped-fragment requests and delegates
     * every other request to the rest of the chain.
     * <p>
     * Responds with 400 when the query string cannot be decoded and with 503
     * when no snapshot could be produced.
     *
     * @param request
     *            the incoming request
     * @param response
     *            the response the snapshot is written to
     * @param chain
     *            the remaining filter chain
     * @throws IOException
     *             on I/O failure, or wrapping a {@link ServletException}
     *             raised further down the chain
     */
    @SuppressWarnings("unchecked")
    @Override
    public void doFilter(ServletRequest request, ServletResponse response,
            FilterChain chain) throws IOException {
        final String queryString = (request instanceof HttpServletRequest)
                ? ((HttpServletRequest) request).getQueryString()
                : null;
        if (queryString == null || !queryString.contains(FRAGMENT_KEY)) {
            /*
             * not an _escaped_fragment_ URL, so move up the chain of servlet
             * (filters)
             */
            passThrough(request, response, chain);
            return;
        }

        final HttpServletRequest req = (HttpServletRequest) request;

        // rewrite the URL back to the original #! version
        final String url_with_hash_fragment;
        try {
            url_with_hash_fragment = req.getRequestURI()
                    + rewriteQueryString(queryString);
        } catch (IllegalArgumentException e) {
            LOG.log(Level.WARNING,
                    "Malformed escape sequence in query string", e);
            sendError(response, HttpServletResponse.SC_BAD_REQUEST,
                    "Malformed _escaped_fragment_ value");
            return;
        }

        // check memcache first
        final Cache cache = openCache();
        String pageSource = null;
        if (cache != null) {
            pageSource = (String) cache.get(url_with_hash_fragment);
        }

        boolean freshlyRendered = false;
        if (pageSource == null) {
            // use the headless browser to obtain an HTML snapshot
            final URL url = new URL(SCHEME, req.getServerName(),
                    req.getServerPort(), url_with_hash_fragment);
            pageSource = renderSnapshot(url);
            freshlyRendered = (pageSource != null);
        }

        if (pageSource == null) {
            // nothing to serve; signal a temporary failure instead of
            // dereferencing a null snapshot
            sendError(response, HttpServletResponse.SC_SERVICE_UNAVAILABLE,
                    "Snapshot could not be generated");
            return;
        }

        // Write bytes with an explicit charset: the String-based print
        // methods of ServletOutputStream are not reliable for characters
        // outside ISO-8859-1.
        response.setContentType("text/html; charset=" + RESPONSE_ENCODING);
        final ServletOutputStream out = response.getOutputStream();
        out.write(pageSource.getBytes(RESPONSE_ENCODING));
        out.flush();

        // update cache only with newly rendered pages so that cache hits do
        // not keep pushing the expiry of an old snapshot forward
        if (freshlyRendered && cache != null) {
            cache.put(url_with_hash_fragment, pageSource);
        }
    }

    /**
     * Initialise com.gargoylesoftware.htmlunit.WebClient
     */
    @Override
    public void init(FilterConfig arg0) throws ServletException {
        // Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF);
        this.webClient = new WebClient(BrowserVersion.CHROME);
        this.webClient.setWebConnection(new UrlFetchWebConnection(
                this.webClient));
        this.webClient.getOptions().setTimeout(0);
    }

    /** Hands the request to the rest of the chain, surfacing its failures. */
    private static void passThrough(ServletRequest request,
            ServletResponse response, FilterChain chain) throws IOException {
        try {
            chain.doFilter(request, response);
        } catch (ServletException e) {
            // doFilter only declares IOException, so wrap rather than swallow
            throw new IOException("Downstream filter or servlet failed", e);
        }
    }

    /** Creates the snapshot cache, or returns null when it is unavailable. */
    private static Cache openCache() {
        final Map<Object, Object> props = new HashMap<Object, Object>();
        props.put(GCacheFactory.EXPIRATION_DELTA,
                Integer.valueOf(CACHE_TTL_SECONDS));
        try {
            return CacheManager.getInstance().getCacheFactory()
                    .createCache(props);
        } catch (CacheException e) {
            LOG.log(Level.WARNING,
                    "Snapshot cache unavailable; rendering without it", e);
            return null;
        }
    }

    /** Loads the URL headlessly and returns its markup, or null on failure. */
    private String renderSnapshot(URL url) {
        HtmlPage page = null;
        try {
            page = this.webClient.getPage(url);
            // gae hack because its single threaded
            this.webClient.getJavaScriptEngine().pumpEventLoop(PUMP_TIME);
        } catch (IOException ioe) {
            LOG.log(Level.WARNING, "Failed to let google index " + url
                    + " Work on increasing server side performance.", ioe);
        }
        return (page == null) ? null : page.asXml();
    }

    /** Sends an HTTP error status when the response is an HTTP response. */
    private static void sendError(ServletResponse response, int status,
            String message) throws IOException {
        if (response instanceof HttpServletResponse) {
            ((HttpServletResponse) response).sendError(status, message);
        }
    }
}