Source Code of com.gwtplatform.crawlerservice.server.CrawlServiceServlet

/**
 * Copyright 2011 ArcBees Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */


package com.gwtplatform.crawlerservice.server;


import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import java.util.List;
import java.util.Map;


import javax.inject.Provider;
import javax.inject.Singleton;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;


import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.inject.Inject;
import com.googlecode.objectify.Key;
import com.gwtplatform.crawlerservice.server.domain.CachedPage;
import com.gwtplatform.crawlerservice.server.service.CachedPageDao;


/**
 * Servlet that makes it possible to fetch an external page, renders it using HTMLUnit and returns
 * the HTML page.
 */
@Singleton
public class CrawlServiceServlet extends HttpServlet {


    private static final String CHAR_ENCODING = "UTF-8";


    private static final long serialVersionUID = -6129110224710383122L;


    @Inject(optional = true)
    @HtmlUnitTimeoutMillis
    private long timeoutMillis = 12000;


    @Inject(optional = true)
    @CachedPageTimeoutSec
    private long cachedPageTimeoutSec = 15 * 60;


    private final Provider<WebClient> webClientProvider;


    private final String key;


    private final CachedPageDao cachedPageDao;


    @Inject
    CrawlServiceServlet(final Provider<WebClient> webClientProvider,
            @ServiceKey String key,
            CachedPageDao cachedPageDao) {
        this.webClientProvider = webClientProvider;
        this.key = key;
        this.cachedPageDao = cachedPageDao;
    }


    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp) {


        PrintWriter out = null;
        try {
            resp.setCharacterEncoding(CHAR_ENCODING);
            resp.setHeader("Content-Type", "text/plain; charset=" + CHAR_ENCODING);


            out = resp.getWriter();


            String receivedKey = URLDecoder.decode(req.getParameter("key"), CHAR_ENCODING);
            if (!key.equals(receivedKey)) {
                out.println("<h3>The service key received does not match the desired key.</h3>");
            } else {
                String url = URLDecoder.decode(req.getParameter("url"), CHAR_ENCODING);


                List<Key<CachedPage>> keys = cachedPageDao.listKeysByProperty("url", url);
                Map<Key<CachedPage>, CachedPage> deprecatedPages = cachedPageDao.get(keys);


                Date currDate = new Date();


                CachedPage matchingPage = extractMatchingPage(deprecatedPages, currDate);
                cachedPageDao.deleteKeys(deprecatedPages.keySet());


                if (needToFetchPage(matchingPage, currDate, out)) {
                    CachedPage cachedPage = createPlaceholderPage(url, currDate);
                    StringBuilder renderedHtml = renderPage(url);
                    storeFetchedPage(cachedPage, renderedHtml);
                    out.println(renderedHtml.toString());
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }


    private void storeFetchedPage(CachedPage cachedPage,
            StringBuilder stringBuilder) {
        cachedPage.setContent(stringBuilder.toString());
        cachedPage.setFetchInProgress(false);
        cachedPageDao.put(cachedPage);
    }


    /**
     * Checks if the page {@link matchingPage} needs to be fetched. If it does not need to be fetched,
     * but a fetch is already in progress, then it prints out {@code FETCH_IN_PROGRESS} to the
     * specified {@link PrintWriter}.
     *
     * @param matchingPage The matching page, can be {@code null} if no page matches.
     * @param currDate     The current date.
     * @param out          The {@link PrintWriter} to write to, if needed.
     * @return {@code true} if the page needs to be fetched, {@code false} otherwise.
     */
    private boolean needToFetchPage(CachedPage matchingPage,
            Date currDate, PrintWriter out) {


        if (matchingPage == null) {
            return true;
        }


        if (matchingPage.isFetchInProgress()) {
            // If fetch is in progress since more than 60 seconds, we consider something went wrong and fetch again.
            if (currDate.getTime() > matchingPage.getFetchDate().getTime() + 60000) {
                cachedPageDao.delete(matchingPage);
                return true;
            } else {
                out.println("FETCH_IN_PROGRESS");
                return false;
            }
        } else {
            out.println(matchingPage.getContent());
            return false;
        }
    }


    /**
     * Creates a placeholder page for the given {@code url} and stores it in the datastore.
     *
     * @param url      The URL of the page for which to create a placeholder.
     * @param currDate The current date, to mark the page.
     * @return The newly created placeholder page.
     */
    private CachedPage createPlaceholderPage(String url, Date currDate) {
        CachedPage result = new CachedPage();
        result.setUrl(url);
        result.setFetchDate(currDate);
        result.setFetchInProgress(true);
        cachedPageDao.put(result);
        return result;
    }


    /**
     * Fetches the page at {@code url} and renders the page in a {@link StringBuilder}. The rendered
     * page is prefixed with a message indicating this is a non-interactive version.
     *
     * @param url The URL of the page to render.
     * @return The rendered page, in a {@link StringBuilder}.
     * @throws IOException
     * @throws MalformedURLException
     */
    private StringBuilder renderPage(String url) throws IOException,
            MalformedURLException {
        WebClient webClient = webClientProvider.get();


        webClient.setCssEnabled(false);
        webClient.setJavaScriptTimeout(0);
        webClient.setJavaScriptTimeout(0);
        webClient.setThrowExceptionOnScriptError(false);
        webClient.setThrowExceptionOnFailingStatusCode(false);
        webClient.setJavaScriptEnabled(true);
        HtmlPage page = webClient.getPage(url);
        webClient.getJavaScriptEngine().pumpEventLoop(timeoutMillis);


        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("<hr />\n");
        stringBuilder.append("<center><h3>You are viewing a non-interactive page that is intended for the crawler.  ");
        stringBuilder.append("You probably want to see this page: <a href=\"" + url + "\">" + url +
                "</a></h3></center>\n");
        stringBuilder.append("<hr />\n");


        stringBuilder.append(page.asXml());
        webClient.closeAllWindows();
        return stringBuilder;
    }


    /**
     * Checks if there is a page from {@code deprecatedPages} that is not expired. If there is
     * more than one, choose the most recent. If one is found it is removed from the
     * {@code deprecatedPages} list.
     *
     * @param deprecatedPages The list of pages that match the URL but that are expected to be.
     * @param currDate        The current date, to check for expiration.
     * @return The non-expired matching page if found, {@code null} otherwise.
     */
    private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecatedPages,
            Date currDate) {
        CachedPage matchingPage = findMostRecentPage(deprecatedPages);


        // Keep the matching page only if it has not expired
        if (matchingPage == null ||
                currDate.getTime() >
                        matchingPage.getFetchDate().getTime() + cachedPageTimeoutSec * 1000) {
            matchingPage = null;
        } else {
            deprecatedPages.remove(Key.create(CachedPage.class, matchingPage.getId()));
        }


        return matchingPage;
    }


    private CachedPage findMostRecentPage(Map<Key<CachedPage>, CachedPage> pages) {
        CachedPage result = null;
        for (CachedPage page : pages.values()) {
            if (result == null ||
                    page.getFetchDate().after(result.getFetchDate())) {
                result = page;
            }
        }
        return result;
    }


}
Source Code of com.gwtplatform.crawlerservice.server.CrawlServiceServlet

Related Classes of com.gwtplatform.crawlerservice.server.CrawlServiceServlet