Package org.cruxframework.crux.tools.crawling

Source Code of org.cruxframework.crux.tools.crawling.CrawlingTool$PageInfo

/*
* Copyright 2011 cruxframework.org.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.cruxframework.crux.tools.crawling;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.cruxframework.crux.core.server.crawling.CrawlingException;
import org.cruxframework.crux.core.server.crawling.CrawlingUtils;
import org.cruxframework.crux.core.utils.StreamUtils;
import org.cruxframework.crux.tools.parameters.ConsoleParameter;
import org.cruxframework.crux.tools.parameters.ConsoleParameterOption;
import org.cruxframework.crux.tools.parameters.ConsoleParametersProcessingException;
import org.cruxframework.crux.tools.parameters.ConsoleParametersProcessor;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
* Creates static snapshots of DHTML pages to serve to search engines. Follows the specification described at
* <a href="https://developers.google.com/webmasters/ajax-crawling/">https://developers.google.com/webmasters/ajax-crawling/</a>.
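* <p>
* For illustration only, a typical invocation might look like the example below. The directory,
* URL, wait time, and file name are placeholders, and the exact argument syntax depends on
* {@code ConsoleParametersProcessor}:
* <pre>
* java org.cruxframework.crux.tools.crawling.CrawlingTool
*      outputDir /path/to/snapshots
*      applicationBaseURL http://localhost:8080/myapp/
*      javascriptTime 5000
*      urls urls.txt
*      stopOnErrors true
* </pre>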
*
* @author Thiago da Rosa de Bustamante
*
*/
public class CrawlingTool
{
  private static final Log logger = LogFactory.getLog(CrawlingTool.class);

  private File outputDir;
  private WebClient webClient;
  private String applicationBaseURL;
  private List<PageInfo> pages = new ArrayList<CrawlingTool.PageInfo>();
  private boolean stopOnErrors = false;
 
  private final int javascriptTime;
 
  /**
   * Constructor
   * @param outputDir folder where the snapshots will be written (must be an existing directory)
   * @param javascriptTime time, in milliseconds, to wait for background JavaScript before taking each snapshot
   * @param applicationBaseURL base URL of the web application being crawled
   */
  public CrawlingTool(File outputDir, int javascriptTime, String applicationBaseURL)
  {
    if (outputDir == null || !outputDir.exists() || !outputDir.isDirectory())
    {
      throw new CrawlingException("Invalid output directory");
    }
    this.outputDir = outputDir;
    this.applicationBaseURL = applicationBaseURL;
    this.javascriptTime = javascriptTime;
    webClient = new WebClient(BrowserVersion.FIREFOX_17);
    webClient.getOptions().setCssEnabled(true);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
  }

  /**
   * Indicates whether snapshot generation stops when an error occurs.
   * @return true if the process stops on the first error
   */
  public boolean isStopOnErrors()
  {
    return stopOnErrors;
  }

  /**
   * Defines whether snapshot generation stops when an error occurs.
   * @param stopOnErrors true to stop the process on the first error
   */
  public void setStopOnErrors(boolean stopOnErrors)
  {
    this.stopOnErrors = stopOnErrors;
  }

  /**
   * Creates a static snapshot for the given page and escaped fragment, writing the result under the output directory.
   * @param page the application page to render
   * @param escapedFragment the escaped fragment that identifies the page state
   */
  public void createSnapshot(String page, String escapedFragment)
  {
    try
    {
      String pagePath = CrawlingUtils.getStaticPageFor(page, escapedFragment);
      if (pagePath != null)
      {
        logger.info("Creating snapshot for page ["+page+"], escapedFragment ["+escapedFragment+"]");
        HtmlPage htmlPage = webClient.getPage(CrawlingUtils.rewriteUrl(applicationBaseURL, page, escapedFragment));
        webClient.waitForBackgroundJavaScript(javascriptTime);
        File outputFile = new File(outputDir, pagePath);
        // htmlPage.save(outputFile) creates the whole site structure, with images and CSS files locally. So use asXml() instead
        StreamUtils.write(new ByteArrayInputStream(htmlPage.asXml().getBytes("UTF-8")), new FileOutputStream(outputFile), true);
        webClient.closeAllWindows();
      }
    }
    catch (Exception e)
    {
      throw new CrawlingException("Error generating snapshot for page ["+page+"], with escapedFragment ["+escapedFragment+"].", e);
    }
  }
 
  public void createSnapshots()
  {
    for (PageInfo pageInfo : pages)
    {
      try
      {
        createSnapshot(pageInfo.page, pageInfo.escapedFragment);
      }
      catch (Exception e)
      {
        logger.error("Error creating snapshot for page ["+pageInfo.page+"], escapedFragment ["+pageInfo.escapedFragment+"]", e);
        if (stopOnErrors)
        {
          break;
        }
      }
    }
  }
 
  /**
   * Adds a page/escapedFragment pair to the list of snapshots to be generated by {@link #createSnapshots()}.
   * @param page the application page
   * @param escapedFragment the escaped fragment that identifies the page state
   */
  public void addSnaphot(String page, String escapedFragment)
  {
    pages.add(new PageInfo(page, escapedFragment));
  }
 
  /**
   * Loads page/escapedFragment pairs from the given file, one per line, in the form page:escapedFragment.
   * @param urls file containing the application URLs for snapshot generation
   * @throws IOException if the file cannot be read
   */
  public void loadUrls(File urls) throws IOException
  {
    BufferedReader reader = new BufferedReader(new FileReader(urls));
    try
    {
      String line = reader.readLine();
      while (line != null)
      {
        loadURL(line);
        line = reader.readLine();
      }
    }
    finally
    {
      reader.close();
    }
  }
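
  /*
   * A minimal sketch of the urls file consumed by loadUrls; the page names and fragments below
   * are placeholders, not values from this project. loadURL splits each line on ':' and only
   * accepts lines that yield exactly two parts:
   *
   *   home:state=products
   *   home:state=contact
   */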

  protected void loadURL(String line)
  {
    // Lines are expected in the form page:escapedFragment; anything else is ignored
    String[] parts = line.split(":");
    if (parts.length == 2)
    {
      addSnaphot(parts[0], parts[1]);
    }
  }

  /**
   * Creates the processor responsible for parsing this tool's console parameters.
   * @return the configured parameters processor
   */
  protected static ConsoleParametersProcessor createParametersProcessor()
  {
    ConsoleParameter parameter;
    ConsoleParametersProcessor parametersProcessor = new ConsoleParametersProcessor("CrawlingTool");

    parameter = new ConsoleParameter("outputDir", "The folder where the snapshots will be created.", true, true);
    parameter.addParameterOption(new ConsoleParameterOption("dirName", "Folder name"));
    parametersProcessor.addSupportedParameter(parameter);
   
    parameter = new ConsoleParameter("applicationBaseURL", "Web application base URL.", true, true);
    parameter.addParameterOption(new ConsoleParameterOption("baseURL", "web application root URLĖ›"));
    parametersProcessor.addSupportedParameter(parameter);
   
    parameter = new ConsoleParameter("javascriptTime", "Time to wait for page rendering before takes the snapshot.", true, true);
    parameter.addParameterOption(new ConsoleParameterOption("time", "Time in miliseconds"));
    parametersProcessor.addSupportedParameter(parameter);

    parameter = new ConsoleParameter("urls", "A file containing the application urls for snapshot generation.", false, true);
    parameter.addParameterOption(new ConsoleParameterOption("fileName", "File name"));
    parametersProcessor.addSupportedParameter(parameter);

    parameter = new ConsoleParameter("stopOnErrors", "Inform if the process must stop if an error occurs.", false, true);
    parameter.addParameterOption(new ConsoleParameterOption("stop", "True if the process must stop."));
    parametersProcessor.addSupportedParameter(parameter);

    parametersProcessor.addSupportedParameter(new ConsoleParameter("-help", "Display the usage screen.", false, true));
    parametersProcessor.addSupportedParameter(new ConsoleParameter("-h", "Display the usage screen.", false, true));
    return parametersProcessor; 
  }
 
  public static void main(String[] args)
  {
    try
    {
      ConsoleParametersProcessor parametersProcessor = CrawlingTool.createParametersProcessor();
      Map<String, ConsoleParameter> parameters = parametersProcessor.processConsoleParameters(args);

      if (parameters.containsKey("-help") || parameters.containsKey("-h"))
      {
        parametersProcessor.showsUsageScreen();
      }
      else
      {
        File outputDir = new File(parameters.get("outputDir").getValue());
        int javascriptTime = Integer.parseInt(parameters.get("javascriptTime").getValue());
        String applicationBaseURL = parameters.get("applicationBaseURL").getValue();
        File urls = new File(parameters.get("urls").getValue());
        boolean stopOnErrors = Boolean.parseBoolean(parameters.get("stopOnErrors").getValue());
       
        CrawlingTool crawlingTool = new CrawlingTool(outputDir, javascriptTime, applicationBaseURL);
        crawlingTool.setStopOnErrors(stopOnErrors);
        crawlingTool.loadUrls(urls);
        crawlingTool.createSnapshots();
      }
      System.exit(0);
    }
    catch (ConsoleParametersProcessingException e)
    {
      logger.error("Error processing program parameters: "+e.getLocalizedMessage()+". Program aborted.", e);
    }
    catch (CrawlingException e)
    {
      logger.error("Error generating files: "+e.getLocalizedMessage()+". Program aborted.", e);
    }
    catch (IOException e)
    {
      logger.error("Error loading URLs from file. Program aborted.", e);
    }
    System.exit(1);
  }

  private static class PageInfo
  {
    private String page;
    private String escapedFragment;
   
    public PageInfo(String page, String escapedFragment)
    {
      this.page = page;
      this.escapedFragment = escapedFragment;
    }
  }
}
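
/*
 * A minimal programmatic usage sketch; the directory, URL, wait time, and page/fragment values
 * are placeholders, not values from this project. The output directory must already exist,
 * since the constructor rejects missing directories:
 *
 *   CrawlingTool tool = new CrawlingTool(new File("/tmp/snapshots"), 5000,
 *       "http://localhost:8080/myapp/");
 *   tool.setStopOnErrors(true);
 *   tool.addSnaphot("home", "state=products");
 *   tool.createSnapshots();
 */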