/***********************************************************************
*
* $CVSHeader$
*
* This file is part of WebScarab, an Open Web Application Security
* Project utility. For details, please see http://www.owasp.org/
*
* Copyright (c) 2002 - 2004 Rogan Dawes
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Getting Source
* ==============
*
* Source for this application is maintained at Sourceforge.net, a
* repository for free software projects.
*
* For details, please see http://www.sourceforge.net/projects/owasp
*
*/
/*
* Spider.java
*
* Created on August 5, 2003, 10:52 PM
*/
package org.owasp.webscarab.plugin.spider;
import org.owasp.webscarab.httpclient.ConversationHandler;
import org.owasp.webscarab.model.StoreException;
import org.owasp.webscarab.model.HttpUrl;
import org.owasp.webscarab.model.ConversationID;
import org.owasp.webscarab.model.Cookie;
import org.owasp.webscarab.model.NamedValue;
import org.owasp.webscarab.model.Request;
import org.owasp.webscarab.model.Response;
import org.owasp.webscarab.model.UrlModel;
import org.owasp.webscarab.parser.Parser;
import org.owasp.webscarab.plugin.Framework;
import org.owasp.webscarab.plugin.Plugin;
import org.owasp.webscarab.plugin.Hook;
import org.owasp.webscarab.httpclient.FetcherQueue;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.NodeFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.HasAttributeFilter;
import java.util.List;
import java.util.LinkedList;
import java.util.Date;
import java.util.logging.Logger;
import java.net.MalformedURLException;
import java.lang.Thread;
import java.io.IOException;
/**
 * Plugin that fetches links queued in a {@link SpiderModel} and records newly
 * discovered links found in the conversations it is asked to analyse.
 *
 * <p>The {@link #run()} loop moves links from the model's link queue into a
 * {@link FetcherQueue}; responses come back through
 * {@link #responseReceived(Response)} and are handed to the framework.
 * {@link #analyse(ConversationID, Request, Response, String)} extracts
 * {@code href}/{@code src} targets and redirect locations and registers them
 * with the model as unseen links.
 *
 * @author rdawes
 */
public class Spider implements Plugin, ConversationHandler {

    /** Length of the {@code javascript:} pseudo-scheme prefix. */
    private static final String JAVASCRIPT_SCHEME = "javascript:";

    /** Prefix of links that are mail addresses and are never fetched. */
    private static final String MAILTO_SCHEME = "mailto:";

    private final SpiderModel _model;
    private final Framework _framework;
    private final FetcherQueue _fetcherQueue;

    /** Upper bound used when deciding whether more requests may be queued. */
    private final int _threads = 4;

    /**
     * The thread currently executing {@link #run()}, or null when not running.
     * Volatile because it is written by the run thread and read by whichever
     * thread calls {@link #stop()}.
     */
    private volatile Thread _runThread = null;

    private final Logger _logger = Logger.getLogger(getClass().getName());

    /**
     * Creates a new instance of Spider.
     *
     * @param framework the framework whose model backs the spider model and
     *                  to which fetched conversations are added
     */
    public Spider(Framework framework) {
        _framework = framework;
        _model = new SpiderModel(_framework.getModel());
        // Third argument was a literal 4 matching _threads; presumably it is
        // the number of fetcher threads - confirm against FetcherQueue.
        _fetcherQueue = new FetcherQueue("Spider", this, _threads, 0);
    }

    /**
     * @return the model holding the spider's state
     */
    public SpiderModel getModel() {
        return _model;
    }

    /**
     * @return the name of this plugin, {@code "Spider"}
     */
    public String getPluginName() {
        return "Spider";
    }

    /**
     * Main loop of the plugin. Repeatedly submits queued links for fetching
     * until the model reports that it is stopping, then clears the request
     * queue and marks the model as no longer running.
     */
    public void run() {
        _model.setStatus("Started");
        _model.setStopping(false);
        _runThread = Thread.currentThread();
        _model.setRunning(true);
        try {
            while (!_model.isStopping()) {
                // queue them as fast as they come, sleep a bit otherwise
                if (queueRequests()) {
                    Thread.yield();
                } else {
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException ignored) {
                        // Deliberately ignored: the loop is terminated via the
                        // model's stopping flag, which is re-checked right away.
                    }
                }
            }
        } finally {
            // Always reset the running state, even if queueing failed
            // unexpectedly, so that stop() and isBusy() do not report a
            // spider that is no longer alive.
            _fetcherQueue.clearRequestQueue();
            _model.setRunning(false);
            _runThread = null;
            _model.setStatus("Stopped");
        }
    }

    /**
     * Submits queued links to the fetcher queue for as long as links are
     * available and the number of queued requests does not exceed
     * {@code _threads}.
     *
     * @return true if the submission loop was entered and ran to completion,
     *         false if there was nothing to do, the fetcher queue was already
     *         full, or a null link was dequeued
     */
    private boolean queueRequests() {
        if (_model.getQueuedLinkCount() == 0) {
            return false;
        }
        if (_fetcherQueue.getRequestsQueued() > _threads) {
            return false;
        }
        while (_model.getQueuedLinkCount() > 0 && _fetcherQueue.getRequestsQueued() <= _threads) {
            Link link = _model.dequeueLink();
            if (link == null) {
                _logger.warning("Got a null link from the link queue");
                return false;
            }
            Request request = newGetRequest(link);
            if (_model.getCookieSync()) {
                // add the latest cookies known for this URL to the request
                Cookie[] cookies = _model.getCookiesForUrl(request.getURL());
                if (cookies != null && cookies.length > 0) {
                    request.setHeader("Cookie", buildCookieHeader(cookies));
                }
            }
            _fetcherQueue.submit(request);
        }
        return true;
    }

    /**
     * Joins the supplied cookies into a single {@code Cookie} header value of
     * the form {@code name=value; name=value}.
     *
     * @param cookies the cookies to join; must not be null
     * @return the header value, empty if no cookies were supplied
     */
    private static String buildCookieHeader(Cookie[] cookies) {
        StringBuilder buff = new StringBuilder();
        for (int i = 0; i < cookies.length; i++) {
            if (i > 0) {
                buff.append("; ");
            }
            buff.append(cookies[i].getName()).append("=").append(cookies[i].getValue());
        }
        return buff.toString();
    }

    /**
     * Handles a response delivered for a request this plugin submitted.
     * Responses whose status starts with {@code 401} are recorded as requiring
     * authentication and are not stored; all others are added to the framework
     * and, if cookie synchronisation is enabled, their {@code Set-Cookie} and
     * {@code Set-Cookie2} headers are added to the model.
     *
     * @param response the response that was received
     */
    public void responseReceived(Response response) {
        Request request = response.getRequest();
        if (request == null) {
            _logger.warning("Got a null request from the response!");
            return;
        }
        String status = response.getStatus();
        if (status != null && status.startsWith("401")) {
            _logger.info("Invalid credentials or authentication required for " + request.getURL());
            _model.setAuthRequired(request.getURL());
            return;
        }
        _framework.addConversation(request, response, "Spider");
        if (_model.getCookieSync()) {
            NamedValue[] headers = response.getHeaders();
            for (int i = 0; i < headers.length; i++) {
                String name = headers[i].getName();
                if (name.equalsIgnoreCase("Set-Cookie") || name.equalsIgnoreCase("Set-Cookie2")) {
                    Cookie cookie = new Cookie(new Date(), request.getURL(), headers[i].getValue());
                    _model.addCookie(cookie);
                }
            }
        }
    }

    /**
     * Logs a request that failed with an I/O error.
     *
     * @param request the request that failed
     * @param ioe     the error that occurred
     */
    public void requestError(Request request, IOException ioe) {
        _logger.info("Requested " + request.getURL() + " got IOException " + ioe.getMessage());
    }

    /**
     * @return true if the model is running and still has links queued
     */
    public boolean isBusy() {
        return _model.isRunning() && _model.getQueuedLinkCount() > 0;
    }

    /**
     * Decides whether a URL may be fetched by automated recursive spidering:
     * its host must match the allowed domains and it must not be forbidden by
     * the model. Links requested explicitly by the operator are not subject
     * to this check.
     *
     * @param url the URL to test
     * @return true if the URL is in an allowed domain and not forbidden
     */
    private boolean allowedURL(HttpUrl url) {
        return isAllowedDomain(url) && !_model.isForbidden(url);
    }

    /**
     * Tests the host of a URL against the model's allowed-domains regular
     * expression.
     *
     * @param url the URL to test
     * @return true if an allowed-domains expression is set and the host
     *         matches it; false otherwise, including when evaluation fails
     */
    private boolean isAllowedDomain(HttpUrl url) {
        String allowedDomains = _model.getAllowedDomains();
        if (allowedDomains == null || allowedDomains.equals("")) {
            return false;
        }
        try {
            return url.getHost().matches(allowedDomains);
        } catch (Exception e) {
            // e.g. an invalid regular expression; treat as "not allowed"
            return false;
        }
    }

    /**
     * Queues up to 50 unseen links found at or below the given URL.
     *
     * @param url the root of the subtree to request
     */
    public void requestLinksUnder(HttpUrl url) {
        List<Link> links = new LinkedList<Link>();
        // build up a list of links
        queueLinksUnder(url, links, 50);
        // queue them
        for (Link link : links) {
            _model.queueLink(link);
        }
    }

    /**
     * Recursively collects unseen links at or below {@code url} in depth-first
     * order, skipping those that are forbidden or match the framework's drop
     * pattern, until {@code max} links have been collected.
     *
     * @param url   the URL to start from
     * @param links the list that collected links are appended to
     * @param max   the number of links at which collection stops
     */
    private void queueLinksUnder(HttpUrl url, List<Link> links, int max) {
        if (_model.isUnseen(url)) {
            if (!_model.isForbidden(url) && !url.toString().matches(_framework.getDropPattern())) {
                String referer = _model.getReferer(url);
                links.add(new Link(url, referer));
            } else {
                _logger.warning("Skipping forbidden path " + url);
            }
        }
        if (links.size() == max) {
            return;
        }
        UrlModel urlModel = _model.getUrlModel();
        int count = urlModel.getChildCount(url);
        for (int i = 0; i < count; i++) {
            HttpUrl child = urlModel.getChildAt(url, i);
            queueLinksUnder(child, links, max);
            if (links.size() == max) {
                return;
            }
        }
    }

    /**
     * Queues each of the supplied URLs, using the referer recorded for it in
     * the model.
     *
     * @param urls the URLs to queue
     */
    public void requestLinks(HttpUrl[] urls) {
        for (int i = 0; i < urls.length; i++) {
            String referer = _model.getReferer(urls[i]);
            _model.queueLink(new Link(urls[i], referer));
        }
    }

    /**
     * Removes all links from the model's link queue.
     */
    public void clearQueue() {
        _model.clearLinkQueue();
    }

    /**
     * Builds an HTTP/1.0 GET request for a link, including the Referer (when
     * known), Host and Connection headers plus any extra headers configured
     * in the model.
     *
     * @param link the link to request
     * @return the populated request
     */
    private Request newGetRequest(Link link) {
        HttpUrl url = link.getURL();
        String referer = link.getReferer();
        Request req = new Request();
        req.setMethod("GET");
        req.setURL(url);
        req.setVersion("HTTP/1.0"); // 1.1 or 1.0?
        if (referer != null) {
            req.setHeader("Referer", referer);
        }
        req.setHeader("Host", url.getHost() + ":" + url.getPort());
        if (req.getVersion().equals("HTTP/1.0")) {
            req.setHeader("Connection", "Keep-Alive");
        }
        NamedValue[] headers = _model.getExtraHeaders();
        if (headers != null) {
            for (int i = 0; i < headers.length; i++) {
                if (headers[i] != null) {
                    req.addHeader(headers[i]);
                }
            }
        }
        return req;
    }

    /**
     * @param headers the extra headers to add to every spider request
     */
    public void setExtraHeaders(NamedValue[] headers) {
        _model.setExtraHeaders(headers);
    }

    /**
     * @return the extra headers stored in the model
     */
    public NamedValue[] getExtraHeaders() {
        return _model.getExtraHeaders();
    }

    /**
     * Does nothing.
     *
     * @throws StoreException never thrown by this implementation
     */
    public void flush() throws StoreException {
        // we do not manage our own store
    }

    /**
     * Asks the run loop to terminate and waits up to five seconds for it to
     * do so. Refuses to stop while the spider is busy.
     *
     * @return true if the model is no longer running when this method
     *         returns, false if the spider is busy or did not stop in time
     */
    public boolean stop() {
        if (isBusy()) {
            return false;
        }
        _model.setStopping(true);
        // Take a snapshot: run() clears the field when it exits, and it is
        // null if run() was never started.
        Thread runThread = _runThread;
        if (runThread != null) {
            try {
                runThread.join(5000);
            } catch (InterruptedException ie) {
                _logger.severe("Interrupted stopping " + getPluginName());
                // preserve the interrupt for the caller
                Thread.currentThread().interrupt();
            }
        }
        return !_model.isRunning();
    }

    /**
     * @return the status string stored in the model
     */
    public String getStatus() {
        return _model.getStatus();
    }

    /**
     * Extracts links from a conversation and registers them with the model as
     * unseen. For a {@code 302} response the Location header is used;
     * otherwise the response is parsed and, if the result is HTML, its
     * {@code src} and {@code href} attributes are processed.
     *
     * @param id       the conversation id (unused here)
     * @param request  the request; its URL is the base for relative links
     * @param response the response to analyse
     * @param origin   the origin of the conversation (unused here)
     */
    public void analyse(ConversationID id, Request request, Response response, String origin) {
        HttpUrl base = request.getURL();
        if ("302".equals(response.getStatus())) {
            String location = response.getHeader("Location");
            if (location != null) {
                try {
                    HttpUrl url = new HttpUrl(base, location);
                    _model.addUnseenLink(url, base);
                } catch (MalformedURLException mue) {
                    _logger.warning("Badly formed Location header : " + location);
                }
            } else {
                _logger.warning("302 received, but no Location header!");
            }
            return;
        }
        Object parsed = Parser.parse(base, response);
        if (parsed instanceof NodeList) { // the parsed content is HTML
            processHtml(base, (NodeList) parsed);
        } // else maybe it is a parsed Flash document? Anyone? :-)
    }

    /**
     * Finds all tags carrying an {@code href}, {@code src}, {@code onclick} or
     * {@code onblur} attribute and processes their {@code src} and
     * {@code href} values as links. The event-handler attributes are matched
     * by the filter but their contents are not currently examined.
     *
     * @param base     the URL of the document, used to resolve relative links
     * @param nodelist the parsed HTML nodes
     */
    private void processHtml(HttpUrl base, NodeList nodelist) {
        NodeFilter filter = new HasAttributeFilter("href");
        filter = new OrFilter(filter, new HasAttributeFilter("src"));
        filter = new OrFilter(filter, new HasAttributeFilter("onclick"));
        filter = new OrFilter(filter, new HasAttributeFilter("onblur"));
        try {
            NodeList links = nodelist.extractAllNodesThatMatch(filter);
            for (NodeIterator ni = links.elements(); ni.hasMoreNodes(); ) {
                Node node = ni.nextNode();
                if (!(node instanceof Tag)) {
                    continue;
                }
                Tag tag = (Tag) node;
                String src = tag.getAttribute("src");
                if (src != null) {
                    processLink(base, src);
                }
                String href = tag.getAttribute("href");
                if (href != null) {
                    processLink(base, href);
                }
            }
        } catch (ParserException pe) {
            _logger.warning("ParserException : " + pe);
        }
    }

    /**
     * Classifies a link found in a document and, where it is fetchable,
     * registers it with the model as unseen. Absolute http/https links are
     * used as-is, {@code mailto:} links are ignored, {@code javascript:}
     * links are handed to {@link #processScript(HttpUrl, String)}, other
     * schemes are logged, and everything else is resolved relative to
     * {@code base}.
     *
     * @param base the URL of the document the link was found in
     * @param link the raw attribute value
     */
    private void processLink(HttpUrl base, String link) {
        if (link.startsWith("http://") || link.startsWith("https://")) {
            try {
                HttpUrl url = new HttpUrl(link);
                _model.addUnseenLink(url, base);
            } catch (MalformedURLException mue) {
                _logger.warning("Malformed link : " + link);
            }
        } else if (startsWithIgnoreCase(link, MAILTO_SCHEME)) {
            // do nothing
        } else if (startsWithIgnoreCase(link, JAVASCRIPT_SCHEME)) {
            // Strip the whole "javascript:" prefix (11 characters) so that the
            // script itself, not ":script", is examined.
            processScript(base, link.substring(JAVASCRIPT_SCHEME.length()));
        } else if (link.matches("^[a-zA-Z]+://.*")) {
            _logger.info("Encountered an unhandled url scheme " + link);
        } else {
            _logger.fine("Creating a new relative URL with " + base + " and " + link + " '");
            try {
                HttpUrl url = new HttpUrl(base, link);
                _model.addUnseenLink(url, base);
            } catch (MalformedURLException mue) {
                _logger.warning("Bad relative URL (" + base.toString() + ") : " + link);
            }
        }
    }

    /**
     * Locale-independent, case-insensitive prefix test.
     *
     * @param text   the string to examine
     * @param prefix the prefix to look for
     * @return true if {@code text} starts with {@code prefix}, ignoring case
     */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Logs scripts that open a window or set the location. No links are
     * extracted from the script.
     *
     * @param base   the URL of the document the script was found in (unused)
     * @param script the script text, without the {@code javascript:} prefix
     */
    private void processScript(HttpUrl base, String script) {
        if (script.startsWith("window.open(")) {
            _logger.info("Script opens a window : " + script);
        } else if (script.startsWith("location.href")) {
            _logger.info("Script sets location : " + script);
        }
    }

    /**
     * @return always false
     */
    public boolean isModified() {
        return false; // our modifications are kept in the SiteModel
    }

    /**
     * @return true if the model reports that the spider is running
     */
    public boolean isRunning() {
        return _model.isRunning();
    }

    /**
     * Does nothing.
     *
     * @param type    ignored
     * @param store   ignored
     * @param session ignored
     * @throws StoreException never thrown by this implementation
     */
    public void setSession(String type, Object store, String session) throws StoreException {
    }

    /**
     * @return always null
     */
    public Object getScriptableObject() {
        return null;
    }

    /**
     * @return an empty array
     */
    public Hook[] getScriptingHooks() {
        return new Hook[0];
    }
}