Package org.webharvest.runtime

Source Code of org.webharvest.runtime.Scraper

/*  Copyright (c) 2006-2007, Vladimir Nikic
    All rights reserved.

    Redistribution and use of this software in source and binary forms,
    with or without modification, are permitted provided that the following
    conditions are met:

    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimer.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimer in the documentation and/or other
      materials provided with the distribution.

    * The name of Web-Harvest may not be used to endorse or promote
      products derived from this software without specific prior
      written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.

    You can contact Vladimir Nikic by sending e-mail to
    nikic_vladimir@yahoo.com. Please include the word "Web-Harvest" in the
    subject line.
*/
package org.webharvest.runtime;

import org.apache.log4j.Logger;
import org.webharvest.definition.IElementDef;
import org.webharvest.definition.ScraperConfiguration;
import org.webharvest.runtime.processors.BaseProcessor;
import org.webharvest.runtime.processors.CallProcessor;
import org.webharvest.runtime.processors.HttpProcessor;
import org.webharvest.runtime.processors.ProcessorResolver;
import org.webharvest.runtime.scripting.ScriptEngine;
import org.webharvest.runtime.variables.Variable;
import org.webharvest.runtime.variables.NodeVariable;
import org.webharvest.runtime.variables.EmptyVariable;
import org.webharvest.runtime.web.HttpClientManager;
import org.webharvest.utils.CommonUtil;
import org.webharvest.utils.Stack;
import org.webharvest.utils.ClassLoaderUtil;
import org.webharvest.exception.DatabaseException;

import java.util.*;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

/**
* Basic runtime class.
*/
public class Scraper {

    public static final int STATUS_READY = 0;
    public static final int STATUS_RUNNING = 1;
    public static final int STATUS_PAUSED = 2;
    public static final int STATUS_FINISHED = 3;
    public static final int STATUS_STOPPED = 4;
    public static final int STATUS_ERROR = 5;
    public static final int STATUS_EXIT = 6;

    private Logger logger = Logger.getLogger("" + System.currentTimeMillis());
    private ScraperConfiguration configuration;
    private String workingDir;
    private ScraperContext context;

    private RuntimeConfig runtimeConfig;

    private transient boolean isDebugMode = false;

    private HttpClientManager httpClientManager;

    // stack of running processors
    private transient Stack runningProcessors = new Stack();

    // stack of running functions
    private transient Stack runningFunctions = new Stack();

    // params that are proceeded to calling function
    private transient Map functionParams = new HashMap();

    // stack of running http processors
    private transient Stack runningHttpProcessors = new Stack();

    // default script engine used throughout the configuration execution
    private ScriptEngine scriptEngine = null;

    // all used script engines in this scraper
    private Map usedScriptEngines = new HashMap();

    // pool of used database connections
    Map dbPool = new HashMap();

    private List<ScraperRuntimeListener> scraperRuntimeListeners = new LinkedList<ScraperRuntimeListener>();

    private int status = STATUS_READY;

    private String message = null;

    /**
     * Constructor.
     * @param configuration
     * @param workingDir
     */
    public Scraper(ScraperConfiguration configuration, String workingDir) {
        this.configuration = configuration;
        this.runtimeConfig = new RuntimeConfig();
        this.workingDir = CommonUtil.adaptFilename(workingDir);

        this.httpClientManager = new HttpClientManager();

        this.context = new ScraperContext(this);
        this.scriptEngine = configuration.createScriptEngine(this.context);
        this.usedScriptEngines.put(configuration.getDefaultScriptEngine(), this.scriptEngine);
    }

    /**
     * Adds parameter with specified name and value to the context.
     * This way some predefined variables can be put in runtime context
     * before execution starts.
     * @param name
     * @param value
     */
    public void addVariableToContext(String name, Object value) {
        this.context.put(name, new NodeVariable(value));
    }

    /**
     * Add all map values to the context.
     * @param map
     */
    public void addVariablesToContext(Map map) {
        if (map != null) {
            Iterator iterator = map.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry entry = (Map.Entry) iterator.next();
                this.context.put( entry.getKey(), new NodeVariable(entry.getValue()) );
            }
        }
    }

    public Variable execute(List<IElementDef> ops) {
        this.setStatus(STATUS_RUNNING);

        // inform al listeners that execution is just about to start
        for (ScraperRuntimeListener listener: scraperRuntimeListeners) {
            listener.onExecutionStart(this);
        }

        try {
            for (IElementDef elementDef: ops) {
                BaseProcessor processor = ProcessorResolver.createProcessor(elementDef, this.configuration, this);
                if (processor != null) {
                    processor.run(this, context);
                }
            }
        } finally {
            releaseDBConnections();
        }

        return new EmptyVariable();
    }

    public void execute() {
      long startTime = System.currentTimeMillis();

        execute( configuration.getOperations() );

        if ( this.status == STATUS_RUNNING ) {
            this.setStatus(STATUS_FINISHED);
        }

        // inform al listeners that execution is finished
        Iterator listenersIterator = this.scraperRuntimeListeners.iterator();
        while (listenersIterator.hasNext()) {
            ScraperRuntimeListener listener = (ScraperRuntimeListener) listenersIterator.next();
            listener.onExecutionEnd(this);
        }

        if ( logger.isInfoEnabled() ) {
            if (this.status == STATUS_FINISHED) {
                logger.info("Configuration executed in " + (System.currentTimeMillis() - startTime) + "ms.");
            } else if (this.status == STATUS_STOPPED) {
                logger.info("Configuration stopped!");
            }
        }
    }
   
    public ScraperContext getContext() {
    return context;
  }

  public ScraperConfiguration getConfiguration() {
        return configuration;
    }

    public String getWorkingDir() {
        return this.workingDir;
    }

    public HttpClientManager getHttpClientManager() {
        return httpClientManager;
    }

    public void addRunningFunction(CallProcessor callProcessor) {
        runningFunctions.push(callProcessor);
    }

    public CallProcessor getRunningFunction() {
        return runningFunctions.isEmpty() ? null : (CallProcessor) runningFunctions.peek();
    }

    public void clearFunctionParams() {
        this.functionParams.clear();
    }

    public void addFunctionParam(String name, Variable value) {
        this.functionParams.put(name, value);
    }

    public Map getFunctionParams() {
        return functionParams;
    }

    public void removeRunningFunction() {
        if (runningFunctions.size() > 0) {
            runningFunctions.pop();
        }
    }
   
    public HttpProcessor getRunningHttpProcessor() {
      return (HttpProcessor) runningHttpProcessors.peek();
    }
   
    public void setRunningHttpProcessor(HttpProcessor httpProcessor) {
      runningHttpProcessors.push(httpProcessor);
    }

    public void removeRunningHttpProcessor() {
        if (runningHttpProcessors.size() > 0) {
            runningHttpProcessors.pop();
        }
    }

    public int getRunningLevel() {
        return runningProcessors.size() + 1;
    }

    public boolean isDebugMode() {
        return isDebugMode;
    }

    public void setDebug(boolean debug) {
        this.isDebugMode = debug;
    }

    public ScriptEngine getScriptEngine() {
        return runningFunctions.size() > 0 ? getRunningFunction().getScriptEngine() : this.scriptEngine;
    }

    public synchronized ScriptEngine getScriptEngine(String engineType) {
        ScriptEngine engine = (ScriptEngine) this.usedScriptEngines.get(engineType);
        if (engine == null) {
            engine = configuration.createScriptEngine(this.context, engineType);
            this.usedScriptEngines.put(engineType, engine);
        }

        return engine;
    }

    public Logger getLogger() {
        return logger;
    }

    public BaseProcessor getRunningProcessor() {
        return (BaseProcessor) runningProcessors.peek();
    }

    /**
     * @param processor Processor whose parent is needed.
     * @return Parent running processor of the specified running processor, or null if processor is
     * not currently running or if it is top running processor.
     */
    public BaseProcessor getParentRunningProcessor(BaseProcessor processor) {
        List runningProcessorList = runningProcessors.getList();
        int index = CommonUtil.findValueInCollection(runningProcessorList, processor);
        return index > 0 ? (BaseProcessor) runningProcessorList.get(index - 1) : null;
    }

    /**
     * @param processorClazz Class of enclosing running processor.
     * @return Parent running processor in the tree of specified class, or null if it doesn't exist.
     */
    public BaseProcessor getRunningProcessorOfType(Class processorClazz) {
        List runningProcessorList = runningProcessors.getList();
        ListIterator listIterator = runningProcessorList.listIterator(runningProcessors.size());
        while (listIterator.hasPrevious()) {
            BaseProcessor curr = (BaseProcessor) listIterator.previous();
            if (processorClazz.equals(curr.getClass())) {
                return curr;
            }
        }
        return null;
    }

    public RuntimeConfig getRuntimeConfig() {
        return runtimeConfig;
    }

    /**
     * Get connection from the connection pool, and first create one if necessery
     * @param jdbc Name of JDBC class
     * @param connection JDBC connection string
     * @param username Username
     * @param password Password
     * @return JDBC connection used to access database
     */
    public Connection getConnection(String jdbc, String connection, String username, String password) {
        try {
            String poolKey = jdbc + "-" + connection + "-" + username + "-" + password;
            Connection conn = (Connection) dbPool.get(poolKey);
            if (conn == null) {
                ClassLoaderUtil.registerJDBCDriver(jdbc);
                conn = DriverManager.getConnection(connection, username, password);
                dbPool.put(poolKey, conn);
            }
            return conn;
        }
        catch (Exception e) {
            throw new DatabaseException(e);
        }
    }

    public void setExecutingProcessor(BaseProcessor processor) {
        this.runningProcessors.push(processor);
        Iterator iterator = this.scraperRuntimeListeners.iterator();
        while (iterator.hasNext()) {
            ScraperRuntimeListener listener = (ScraperRuntimeListener) iterator.next();
            listener.onNewProcessorExecution(this, processor);
        }
    }

    public void finishExecutingProcessor() {
        if (this.runningProcessors.size() > 0) {
            this.runningProcessors.pop();
        }
    }

    public void processorFinishedExecution(BaseProcessor processor, Map properties) {
        Iterator iterator = this.scraperRuntimeListeners.iterator();
        while (iterator.hasNext()) {
            ScraperRuntimeListener listener = (ScraperRuntimeListener) iterator.next();
            listener.onProcessorExecutionFinished(this, processor, properties);
        }
    }

    public void addRuntimeListener(ScraperRuntimeListener listener) {
        this.scraperRuntimeListeners.add(listener);
    }

    public void removeRuntimeListener(ScraperRuntimeListener listener) {
        this.scraperRuntimeListeners.remove(listener);
    }

    public synchronized int getStatus() {
        return status;
    }

    private synchronized void setStatus(int status) {
        this.status = status;
    }

    public void stopExecution() {
        setStatus(STATUS_STOPPED);
    }

    public void exitExecution(String message) {
        setStatus(STATUS_EXIT);
        this.message = message;
    }

    public String getMessage() {
        return message;
    }

    public void pauseExecution() {
        if (this.status == STATUS_RUNNING) {
            setStatus(STATUS_PAUSED);

            // inform al listeners that execution is paused
            Iterator listenersIterator = this.scraperRuntimeListeners.iterator();
            while (listenersIterator.hasNext()) {
                ScraperRuntimeListener listener = (ScraperRuntimeListener) listenersIterator.next();
                listener.onExecutionPaused(this);
            }
        }
    }

    public void continueExecution() {
        if (this.status == STATUS_PAUSED) {
            setStatus(STATUS_RUNNING);

            // inform al listeners that execution is continued
            Iterator listenersIterator = this.scraperRuntimeListeners.iterator();
            while (listenersIterator.hasNext()) {
                ScraperRuntimeListener listener = (ScraperRuntimeListener) listenersIterator.next();
                listener.onExecutionContinued(this);
            }
        }
    }

    /**
     * Inform all scraper listeners that an error has occured during scraper execution.
     */
    public void informListenersAboutError(Exception e) {
        setStatus(STATUS_ERROR);

        // inform al listeners that execution is continued
        Iterator listenersIterator = this.scraperRuntimeListeners.iterator();
        while (listenersIterator.hasNext()) {
            ScraperRuntimeListener listener = (ScraperRuntimeListener) listenersIterator.next();
            listener.onExecutionError(this, e);
        }
    }

    /**
     * Releases all DB connections from the pool.
     */
    public void releaseDBConnections() {
        Iterator iterator = dbPool.values().iterator();
        while (iterator.hasNext()) {
            Connection conn = (Connection) iterator.next();
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException e) {
                    throw new DatabaseException(e);
                }
            }
        }
    }

    public void dispose() {
        // empty scraper's variable context
        this.context.clear();

        // free connection with context
        this.context.dispose();

        // releases script engines
        if (this.usedScriptEngines != null) {
            Iterator iterator = this.usedScriptEngines.values().iterator();
            while (iterator.hasNext()) {
                ScriptEngine engine = (ScriptEngine) iterator.next();
                if (engine != null) {
                    engine.dispose();
                }
            }
        }

        this.logger.removeAllAppenders();
       
        Iterator iterator = usedScriptEngines.values().iterator();
        while (iterator.hasNext()) {
            ScriptEngine engine = (ScriptEngine) iterator.next();
            engine.dispose();
        }
    }

}
TOP

Related Classes of org.webharvest.runtime.Scraper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.