Package org.dtk.analysis.page

Source Code of org.dtk.analysis.page.RecursiveWebPage

package org.dtk.analysis.page;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;

import org.dtk.analysis.ModuleAnalysis;
import org.dtk.analysis.ModuleFormat;
import org.dtk.analysis.RecursiveModuleAnalysis;
import org.dtk.analysis.exceptions.FatalAnalysisError;
import org.dtk.analysis.exceptions.ModuleSourceNotAvailable;
import org.dtk.analysis.exceptions.UnknownModuleIdentifier;
import org.dtk.analysis.script.config.DojoConfigAttrs;
import org.dtk.analysis.script.config.LoaderConfigParser;
import org.dtk.analysis.script.config.ScriptConfigParser;
import org.dtk.analysis.script.dependency.AMDScriptParser;
import org.dtk.analysis.script.dependency.NonAMDScriptParser;
import org.dtk.analysis.script.dependency.ScriptDependencyParser;
import org.dtk.analysis.script.node.ArrayLiteral;
import org.dtk.analysis.script.node.ObjectLiteral;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
* Base class for recursively analysing web pages for module dependencies; discovered module
* dependencies are retrieved and parsed. Each instance is initialised with a Document representing
* the parsed HTML page and a remote location for that page. The location is used to construct the
* remote paths for any discovered module dependencies.
*
* Once the Dojo loader has been discovered, each subsequent script tag is analysed for module dependencies.
* Each dependency is converted to a remote location and its source retrieved. This module source is then
* recursively analysed for any dependencies specified. Module dependencies will be ignored if the containing
* package is already set to "ignore" or we have previously seen this module dependency.
*
* Retrieved module source is made available through the "getModuleSource" and "isModuleSourceAvailable" methods.
*
* @author James Thomas
*/

public abstract class RecursiveWebPage extends WebPage implements RecursiveModuleAnalysis {
 
  /**
   * Collection of packages whose modules should not be recursively
   * retrieved.
   */
  protected Set<String> ignoredPackages = new HashSet<String>();
 
  /**
   * Retrieved module source contents, look up by absolute module identifier.
   */
  protected Map<String, String> moduleSource = new HashMap<String, String>();

  /**
   * Loader configuration, resolved module paths discovered.
   */
  protected Map<String, String> modulePaths = new HashMap<String, String>();

  /**
   * Configuration for URL path from current page location that module identifiers are
   * resolved relative to.
   */
  protected String baseUrl = null;
 
  /**
   * URL containing page location being analysed. Used to resolve linked
   * resource paths.
   */
  protected URL location = null;
 
  /**
   * Static logging instance.
   */
  protected static final Logger logger = Logger.getLogger(RecursiveWebPage.class.getName())
 
  /**
   * Default constructor, pass parsed HTML page and original location.
   * 
   * @param document - Parsed HTML page
   * @param location - Page location
   * @throws IOException - Unable to analyse HTML page
   */ 
  protected RecursiveWebPage(Document document, URL location) {
    super(document);   
    this.location = location;
    this.document.setBaseUri(location.toString());
  }
 
  /**
   * Parse document script tag for all module identifiers listed as
   * application dependencies. Each discovered identifier will update the
   * global package directory with the absolute module identifier.
   *
   * If there are problems retrieving script contents, ignore and return.
   *
   * @param script - Document script tag
   */
  @Override
  protected void parsePostDojoScript(Element script) {
    String scriptContents = retrieveScriptContents(script);

    if (scriptContents != null) {
      recursivelyAnalyseScriptDependencies(scriptContents);
    }
  }
 
  /**
   * Analyse module dependencies from a JavaScript source string. May be either
   * in AMD or non-AMD format. Any discovered modules, that haven't already been seen
   * and don't belong to an ignored package, are recursively retrieved and analysed for
   * their dependencies.
   *
   * @param scriptSource - JavaScript source text to analyse for module dependencies
   */
  protected void recursivelyAnalyseScriptDependencies(String scriptSource) {
    List<String> moduleDependencies = analyseModuleDependencies(scriptSource);
   
    for(String moduleIdentifier: moduleDependencies) {
      String absoluteModuleIdentifier = getAbsoluteModuleIdentifier(moduleIdentifier),
        packageName = getPackageIdentifier(absoluteModuleIdentifier);

      if (shouldIncludeDiscoveredModule(packageName, absoluteModuleIdentifier)) {
        updateDiscoveredModules(packageName, absoluteModuleIdentifier);      
         
        if (shouldAnalyseForDependencies(packageName, absoluteModuleIdentifier)) {
          String moduleContents = retrieveModuleSource(absoluteModuleIdentifier);   
          if (moduleContents != null) {
            moduleSource.put(absoluteModuleIdentifier, moduleContents);
            recursivelyAnalyseScriptDependencies(moduleContents);
          }
        }
       }     
     }
  }
 
  /**
   * Should the package module has its module dependencies analysed?
   * Ignore any modules for pre-specified packages or those we have already
   * analysed.
   *
   * @param packageName - Module package identifer
   * @param absoluteModuleIdentifier - Absolute module identifier
   * @return Module should be analysed for dependencies
   */
  protected boolean shouldAnalyseForDependencies(String packageName, String absoluteModuleIdentifier) {
    try {
      return !isPackageIgnored(packageName) && !isModuleSourceAvailable(absoluteModuleIdentifier);
    } catch (UnknownModuleIdentifier e) {
      logger.warning(String.format("Unable to analyse module dependencies for unknown package & module identifier (%s) %s",
        packageName, absoluteModuleIdentifier));
      return false;
    }
  }
 
  /**
   * Override parent loader configuration to support parsing module paths.
   * Used to calculate absolute module locations from relative identifiers and
   * find any custom base URL parameters set.
   *
   * @param parsedScriptConfig - Converted JavaScript object value containing loader config.
   */
  @Override 
  protected void updateInternalLoaderConfig(Map<String, Object> parsedScriptConfig) {
    super.updateInternalLoaderConfig(parsedScriptConfig);
   
    if (parsedScriptConfig.containsKey(DojoConfigAttrs.MODULE_PATHS_CONFIG_FLAG)) {
      updateInternalPathsConfig((ObjectLiteral) parsedScriptConfig.get(DojoConfigAttrs.MODULE_PATHS_CONFIG_FLAG));
    }
   
    if (parsedScriptConfig.containsKey(DojoConfigAttrs.PATHS_CONFIG_FLAG)) {
      updateInternalPathsConfig((ObjectLiteral) parsedScriptConfig.get(DojoConfigAttrs.PATHS_CONFIG_FLAG));
    }
   
    if (parsedScriptConfig.containsKey(DojoConfigAttrs.PACKAGES_CONFIG_FLAG)) {
      updateInternalPathsConfig((ArrayLiteral) parsedScriptConfig.get(DojoConfigAttrs.PACKAGES_CONFIG_FLAG));
    }   
   
    if (parsedScriptConfig.containsKey(DojoConfigAttrs.BASE_URL_CONFIG_FLAG)) {
      baseUrl = (String) parsedScriptConfig.get(DojoConfigAttrs.BASE_URL_CONFIG_FLAG);
    }
  }
 
  /**
   * Update the internal loader configuration paths for each value found.
   * Must be a string value.
   *
   * @param paths - Module paths
   */
  protected void updateInternalPathsConfig(ObjectLiteral paths) {
    for(String packageName: paths.getKeys()) {
      Object packagePath = paths.getValue(packageName);
      if (packagePath instanceof String) {
        modulePaths.put(packageName, (String) packagePath);
      }
    }
  }
  /**
   * Update the internal loader configuration paths for packages declaration, 
   * contains list of object literals with "name" and "location" values.
   *
   * Will ignore any non-string values referenced.
   *
   * @param packages - Module packages list
   */ 
  protected void updateInternalPathsConfig(ArrayLiteral packages) {
    for(Object packageInfo: packages.getValueList()) {
      if (packageInfo instanceof ObjectLiteral) {
        ObjectLiteral packageInfoLit = (ObjectLiteral) packageInfo;
        Object packagePath = packageInfoLit.getValue(DojoConfigAttrs.PACKAGES_LOCATION_CONFIG_FLAG),
           packageName = packageInfoLit.getValue(DojoConfigAttrs.PACKAGES_NAME_CONFIG_FLAG);
       
        if (packagePath instanceof String && packageName instanceof String) {
          modulePaths.put((String) packageName, (String) packagePath);
        }
      }
    }   
  }
 
  /**
   * Is this package being ignored for recursively module analysis?
   *
   * @param packageIdentifier - Global package identifier
   * @return Package is ignored
   */
  protected boolean isPackageIgnored(String packageIdentifier) {
    return ignoredPackages.contains(packageIdentifier);
  }

  /**
   * Have we seen this module identifier during analysis?
   * 
   * @param packageName - Package identifier
   * @param moduleIdentifier - Module identifier
   * @return Module discovered during analysis
   */
  protected boolean hasSeenModuleIdentifier(String packageName, String moduleIdentifier) {
    List<String> packageModules = discoveredModules.get(packageName);
    return packageModules != null && packageModules.contains(moduleIdentifier);       
  }
 
  /**
   * Retrieve and return associated module source for an identifier.
   * Implementation responsible for details of where and how module
   * source is available.
   *
   * @param moduleIdentifier - Absolute module identifier
   * @return Module source
   */
  abstract protected String retrieveModuleSource(String moduleIdentifier)
 
  /**
   * Return module source that was parsed during recursive analysis.
   *
   * @param moduleIdentifier - Module identifier for source
   * @return Module's definition source
   * @throws ModuleSourceNotAvailable - Module identifier belongs to a package that had
   * recursive parsing turned off
   * @throws UnknownModuleIdentifier - Module identifier provided wasn't discovered during analysis
   */
  public String getModuleSource(String moduleIdentifier) throws ModuleSourceNotAvailable, UnknownModuleIdentifier {
    String packageName = getPackageIdentifier(moduleIdentifier);
   
    if (!hasSeenModuleIdentifier(packageName, moduleIdentifier)) {
      throw new UnknownModuleIdentifier();
    } else if (!moduleSource.containsKey(moduleIdentifier)) {
      throw new ModuleSourceNotAvailable();
    }
   
    return moduleSource.get(moduleIdentifier);
  }
 
  /**
   * Has the referenced module being recursively analysed during parsing? If so,
   * the module source will be available for reading.
   *
   * @param moduleIdentifier - Module identifier discovered during analysis
   * @return Module source contents
   * @throws UnknownModuleIdentifier - Module identifier wasn't discovered during analysis
   */
  public boolean isModuleSourceAvailable(String moduleIdentifier) throws UnknownModuleIdentifier {
    String packageName = getPackageIdentifier(moduleIdentifier);
   
    if (!hasSeenModuleIdentifier(packageName, moduleIdentifier)) {
      throw new UnknownModuleIdentifier();
    }
   
    return moduleSource.containsKey(moduleIdentifier);
 
 
  /**
   * Set a list of explicit package identifiers whose modules shouldn't
   * be recursively parsing during analysis. These modules will be identified
   * but source files won't be pulled down and analysed for module dependencies.
   *
   * @param packagesToIgnore - Package identifiers
   */
  public void setIgnoredPackages(Set<String> packagesToIgnore) {
    this.ignoredPackages = packagesToIgnore;
  }
 
  /**
   * Retrieve the set of package identifiers whose modules will be ignored
   * during recursively module analysis.  These modules will be identified
   * but source files won't be pulled down and analysed for module dependencies.
   *
   * @return List of package identifiers
   */
  public Set<String> getIgnoredPackages() {
    return ignoredPackages;   
  }
}
TOP

Related Classes of org.dtk.analysis.page.RecursiveWebPage

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.