Package com.google.gsa.proxy

Source Code of com.google.gsa.proxy.Crawler

/**
  * Copyright (C) 2008 Google - Enterprise EMEA SE
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License. You may obtain a copy of
  * the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  * License for the specific language governing permissions and limitations under
  * the License.
  */

package com.google.gsa.proxy;

import com.google.gsa.AuthorizationProcessImpl;
import com.google.gsa.Credential;
import com.google.gsa.Credentials;

import com.google.gsa.proxy.auth.session.CrawlingSession;

import com.google.gsa.proxy.auth.CrawlingUtils;
import com.google.gsa.proxy.auth.ipaddress.IPAddressChecker;
import com.google.gsa.proxy.auth.ipaddress.IPAddresses;
import com.google.gsa.proxy.config.Config;
import com.google.gsa.sessions.UserSession;

import com.google.gsa.sessions.nonValidSessionException;
import com.google.gsa.valve.configuration.ValveConfiguration;
import com.google.gsa.valve.configuration.ValveConfigurationException;

import com.google.gsa.valve.configuration.ValveConfigurationInstance;

import com.google.gsa.valve.modules.utils.AuthorizationUtils;

import java.io.IOException;

import java.util.Vector;

import javax.naming.NamingException;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;

import javax.servlet.http.Cookie;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

/**
* This is the class that implements the main crawl process, serving content
* from the remote repositories back to the appliance. It acts as a proxy that
* can be used in the middle between the search appliance and the remote
* content sources.
* <p>
* It's implemented as a Java filter that gets the crawl requests and validate if
* they are OK and sends back to the appliance the result of accessing the
* remote document. It uses some security rules like IP address restriction.
*
*/
public class Crawler implements Filter {

    private FilterConfig _filterConfig = null;

    //Logger
    private Logger logger = Logger.getLogger(Crawler.class);

    //Basic
    private static final String HEADER_BASIC = "Basic";

    //HTTP Request and Response objects
    private HttpServletRequest httpRequest = null;
    private HttpServletResponse httpResponse = null;

    //Crawling Session instance
    private static CrawlingSession crawlSession = null;

    //Authorization Process
    private static String authorizationProcessClsName = null;

    //Valve Configuration instance
    private ValveConfiguration valveConf = null;

    //Configuration file pointers
    private static String gsaValveConfigPath = null;
    private static String crawlerConfigPath = null;


    /**
     * Init method
     *
     * @param filterConfig filter config
     *
     * @throws ServletException
     */
    public void init(FilterConfig filterConfig) throws ServletException {
        _filterConfig = filterConfig;
    }


    /**
     * Destroy method
     */
    public void destroy() {
        _filterConfig = null;
    }


    /**
     * This is the main method of the Filter invoked by the Java application
     * whenever the request matches with the rules configured at web.xml file.
     * It checks the response is OK and then processes the request sending back
     * the result to the appliance (crawler).
     *
     * @param request servlet HTTP request
     * @param response servlet HTTP response
     * @param chain servlet chain
     *
     * @throws IOException
     * @throws ServletException
     */
    public void doFilter(ServletRequest request, ServletResponse response,
                         FilterChain chain) throws IOException,
                                                   ServletException {

        int responseCode = HttpServletResponse.SC_UNAUTHORIZED;

        //Translate Request and Response objects               
        try {
            httpRequest = (HttpServletRequest)request;
            httpResponse = (HttpServletResponse)response;
        } catch (Exception ex) {
            logger.error("Error when class casting Request and Response: " +
                         ex.getMessage(), ex);
        } finally {
        }

        //Process request
        try {
            //Set Config file paths
            setConfigFilePaths();

            //Set Valve config
            initializeValveConfiguration();

            //Execute doProcess
            responseCode = doProcess(httpRequest, httpResponse);
        } catch (Exception e) {
            logger.error("Error doing crawling process: " + e.getMessage(), e);
            responseCode = HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
        }

        //Set error
        httpResponse.setStatus(responseCode);
    }

    /**
     * Processes the http request and responds with the authn/authz result
     *
     * @param httpRequest servlet HTTP request
     * @param httpResponse servlet HTTP response
     *
     * @return the HTTP error code
     *
     * @throws IOException
     */
    public int doProcess(HttpServletRequest httpRequest,
                         HttpServletResponse httpResponse) throws IOException {

        //doProcess variable initialization
        String realm = null; //Domain realm
        String timeout = null; //SAML timeout      
        String url = null; //URL coming in the request
        String credID = null; //Credential ID

        logger.debug("Starting doProcess");

        //Get realm and check it's OK
        try {
            realm =
                    Config.getConfig(crawlerConfigPath).getString(".crawler.realm");
        } catch (ConfigurationException e) {
            logger.error("Error when reading realm: " + e.getMessage(), e);
        }
        if (realm == null) {
            logger.error("Realm could not be read from the config file");
            return HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
        }

        //Check if the request contains the HTTP Basic credentials. If not, send back the unauthorized error with the WWW-Authenticate header                     
        if (!CrawlingUtils.doesContainBasicHeader(httpRequest)) {
            logger.debug("The request does not contain Basic credentials");
            CrawlingUtils.sendAuthenticateHeader(httpResponse, realm);
            return HttpServletResponse.SC_UNAUTHORIZED;
        }

        //Check if it is the Crawler user. If not, send back unauthorized error
        if (!CrawlingUtils.isCrawler(httpRequest)) {
            logger.warn("The user is not crawler");
            return HttpServletResponse.SC_UNAUTHORIZED;
        }

        //Reading the valid IP Addresses from the config file
        IPAddressChecker ipAddressChecker = null;
        try {
            ipAddressChecker = CrawlingUtils.getIPAddresses();
        } catch (Exception e) {
            logger.error("Error when getting IP Addresses: " + e.getMessage(),
                         e);
        }
        //Check if the sender is in between the IP Addresses
        String remoteAddress = httpRequest.getRemoteAddr();
        logger.debug("The remote address is: " + remoteAddress);
        IPAddresses sourceIPAddress = new IPAddresses(remoteAddress);

        if (!ipAddressChecker.isIPAddressIncluded(sourceIPAddress)) {
            logger.warn("The remote IP Address is not included in the permitted list");
            return HttpServletResponse.SC_UNAUTHORIZED;
        }

        //Set the
        String authZHeader = CrawlingUtils.getAuthorizationHeader(httpRequest);
        //This is the credential string and Session ID as well
        String credStr = authZHeader.substring(HEADER_BASIC.length());

        //Get timeout (maxSessionAge) and check it's OK       
        try {
            timeout = Config.getConfig().getString(".session.maxSessionAge");
        } catch (ConfigurationException e) {
            logger.error("Error when reading timeout: " + e.getMessage(), e);
        }
        if (timeout == null) {
            logger.error("Session Timeout could not be read from the config file");
            return HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
        }

        //Get URL
        //url = httpRequest.getRequestURL().toString();
        url = getUrl(httpRequest);
        logger.debug("URL is: " + url);

        //Get CredID       
        try {
            credID = Config.getConfig().getString(".crawler.credentialID");
        } catch (ConfigurationException e) {
            logger.error("Error when reading Credential ID: " + e.getMessage(),
                         e);
        }
        if (credID == null) {
            logger.error("Credential ID could not be read from the config file");
            return HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
        }

        Vector<Cookie> authCookies = new Vector<Cookie>();

        //Credentials
        Credential cred =
            CrawlingUtils.getBasicCredentials(authZHeader, realm, credID);

        //Protection: check credentials have the right format
        if (cred == null) {
            logger.error("Basic Credentials does not have the correct format");
            return HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
        }

        //Add credentials
        Credentials creds = new Credentials();
        creds.add(cred);

        //Session status: set default value
        int resultCode = HttpServletResponse.SC_UNAUTHORIZED;

        //Instantiate Crawling session       
        crawlSession = new CrawlingSession(new Long(timeout).longValue());
        //Set Valve Config
        crawlSession.setValveConf(valveConf);

        //Get session
        logger.debug("Session ID to be seeked: " + credStr);
        UserSession userSession = crawlSession.getSession(credStr);

        //Check if session exists. If not, create/recreate it
        if (userSession == null) {

            resultCode =
                    crawlSession.authenticate(credStr, cred.getUsername(), httpRequest,
                                              httpResponse, authCookies, url,
                                              creds, credID);

            //Check resultCode
            if (resultCode != HttpServletResponse.SC_OK) {
                logger.error("Authentication result is not OK: " + resultCode);
                return resultCode;
            } else {
                userSession = crawlSession.getSession(credStr);
            }

        } else {

            //Check the session is valid
            if (!crawlSession.isValidSession(credStr)) {
                //reauthenticate
                resultCode =
                        crawlSession.reauthenticate(credStr, cred.getUsername(),
                                                    httpRequest, httpResponse,
                                                    authCookies, url, creds,
                                                    credID);

                //Check resultCode
                if (resultCode != HttpServletResponse.SC_OK) {
                    logger.error("Authentication result is not OK: " +
                                 resultCode);
                    return resultCode;
                } else {
                    userSession = crawlSession.getSession(credStr);
                }

            }
        }

        //Authorization
        AuthorizationProcessImpl authorizationProcessCls;
        try {
            authorizationProcessCls = setAuthorizationProcessImpl();
        } catch (ValveConfigurationException e) {
            logger.error("Valve configuration error: " + e.getMessage(), e);
            resultCode = HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
            return resultCode;
        }

        if (authorizationProcessCls != null) {

            //Avoid HTML processing (URL rewriting)
            AuthorizationUtils.setProcessHTML(false);

            try {
                logger.debug("Authorization process [" + url + "]");
                //
                //Launch authorization process               
                resultCode =
                        authorizationProcessCls.authorize(httpRequest, httpResponse,
                                                          userSession.getCookies(),
                                                          url, credID);
                //Check if result is -1 (there is no pattern in the config file that matches with the URL)
                if (resultCode == -1) {
                    logger.debug("Auth pattern not found for such URL. Setting 401");
                    resultCode = HttpServletResponse.SC_UNAUTHORIZED;
                }

            } catch (nonValidSessionException e) {
                logger.error("Session is not longer valid: " + e.getMessage(),
                             e);
                resultCode = HttpServletResponse.SC_UNAUTHORIZED;
            } catch (Exception e) {
                logger.error("Exception when authorizing: " + e.getMessage(),
                             e);
                resultCode = HttpServletResponse.SC_UNAUTHORIZED;
            }
        } else {
            logger.error("Authorization class is NULL");
            resultCode = HttpServletResponse.SC_INTERNAL_SERVER_ERROR;
        }

        logger.debug("Response status is: " + resultCode);

        return resultCode;
    }

    /**
     * Sets the authorization class that drives the authorization
     * process
     *
     * @return the name of the class that drives the authorization process
     *
     * @throws ValveConfigurationException
     */
    public AuthorizationProcessImpl setAuthorizationProcessImpl() throws ValveConfigurationException {

        AuthorizationProcessImpl authorizationProcessImpl = null;

        //Set authorizationProcessClsName if it has not been done yet
        if (authorizationProcessClsName == null) {
            //read the authorization class name from Valve Config
            if (valveConf != null) {
                authorizationProcessClsName =
                        valveConf.getAuthorizationProcessImpl();
                logger.debug("Setting authorizationProcessClsName: " +
                             authorizationProcessClsName);
            } else {
                // Throw Configuration Exception
                throw new ValveConfigurationException("Valve Configuration file has not been set correctly");
            }

        }

        // Protection
        if ((authorizationProcessClsName == null) ||
            (authorizationProcessClsName.equals(""))) {

            // Throw Configuration Exception
            throw new ValveConfigurationException("Configuration parameter [authorizationProcessImpl] has not been set correctly");

        }

        try {

            // Instantiate the authorization process class

            authorizationProcessImpl =
                    (AuthorizationProcessImpl)Class.forName(authorizationProcessClsName).newInstance();
            authorizationProcessImpl.setValveConfiguration(valveConf);

        } catch (InstantiationException ie) {

            // Throw Configuration Exception
            throw new ValveConfigurationException("Configuration parameter [authorizationProcessImpl] has not been set correctly - InstantiationException");


        } catch (IllegalAccessException iae) {

            // Throw Configuration Exception
            throw new ValveConfigurationException("Configuration parameter [authorizationProcessImpl] has not been set correctly - IllegalAccessException");

        } catch (ClassNotFoundException cnfe) {

            // Throw Configuration Exception
            throw new ValveConfigurationException("Configuration parameter [authorizationProcessImpl] has not been set correctly - ClassNotFoundException");

        }

        return authorizationProcessImpl;

    }


    /**
     * Sets the config file location (if they were not set yet)
     *
     */
    private void setConfigFilePaths() {

        //Only process if either Valve or Crawler config file paths are not set yet
        if ((gsaValveConfigPath == null) || (crawlerConfigPath == null)) {

            //Get Context vars
            javax.naming.Context ctx = null;
            javax.naming.Context env = null;
            try {
                ctx = new javax.naming.InitialContext();
                env = (javax.naming.Context)ctx.lookup("java:comp/env");
            } catch (NamingException e) {
                logger.error("Error when setting the Naming Context (Config paths): " +
                             e.getMessage(), e);
            }


            //If Valve config file path is null: read the pointer from environment (web.xml)
            if (gsaValveConfigPath == null) {
                try {
                    //Set gsaValveConfigPath
                    gsaValveConfigPath =
                            (String)env.lookup("gsaValveConfigPath");
                } catch (NamingException e) {
                    logger.error("Error when setting Valve config path: " +
                                 e.getMessage(), e);
                }

                logger.debug("gsaValveConfigPath is: " + gsaValveConfigPath);
            }

            //If Crawler config file path is null: read the pointer from environment (web.xml)
            if (crawlerConfigPath == null) {
                try {
                    //Set crawlerConfigPath
                    crawlerConfigPath =
                            (String)env.lookup("crawlerConfigPath");
                } catch (NamingException e) {
                    logger.error("Error when setting Crawler config path: " +
                                 e.getMessage(), e);
                }

                logger.debug("crawlerConfigPath is: " + crawlerConfigPath);
            }

        }
    }

    /**
     * Sets the ValveConf instance from the config file.
     * It only processes it if this var has not been set yet
     *
     */
    private void initializeValveConfiguration() {
        //Initialize valveConf only if it was not set yet
        if (valveConf == null) {
            //Check if the pointer is already defined
            if (gsaValveConfigPath != null) {
                try {
                    valveConf =
                            ValveConfigurationInstance.getValveConfig(gsaValveConfigPath);
                } catch (ValveConfigurationException e) {
                    logger.error("Error when setting Valve configuration instance: " +
                                 e.getMessage(), e);
                }
            } else {
                logger.error("The Valve config path is not readable. Check web.xml to set it properly");
            }
        }
    }

    /**
     * Gets the complete URL that is being requested by the crawler
     *
     * @param request servlet HTTP request
     *
     * @return the complete URL
     */
    public static String getUrl(HttpServletRequest request) {
        String reqUrl = request.getRequestURL().toString();
       
        //Support for HTTP 1.1
        if (request.getHeader("X-Forwarded-Host") != null){
            reqUrl = reqUrl.replace(request.getHeader("Host"),request.getHeader("X-Forwarded-Host"));
        }
       
        String queryString = request.getQueryString();
        if (queryString != null) {
            reqUrl += "?" + queryString;
        }
        return reqUrl;
    }


}
TOP

Related Classes of com.google.gsa.proxy.Crawler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.