import java.util.*;
import com.quiotix.html.parser.*;
import com.quiotix.html.parser.HtmlDocument.Attribute;
import com.quiotix.html.parser.HtmlDocument.AttributeList;
/**
 * HTML visitor that inspects anchor tags and considers following those
 * whose CLASS attribute equals {@link #MAGIC_CLASS}.
 */
class HTMLSpider extends HtmlVisitor {
/** CLASS attribute value (compared case-insensitively, quotes stripped) that marks a link of interest. */
public static final String MAGIC_CLASS = "includeDoc";
/**
 * Sets that shouldFollow() queries with a normalized URL string; a URL
 * present in either set is not followed. Recursive instances share the
 * same seenList object but receive their own copy of deferralList.
 */
Set seenList, deferralList;
/** URL most recently passed to openURL(); also the base against which link targets are resolved. */
URL documentURL;
/** Parse result stored by openURL(). */
HtmlDocument document;
/**
 * Static entry point that receives the URL to start from.
 *
 * NOTE(review): only the construction of the spider is visible in this
 * block; presumably it goes on to open and traverse {@code u} - confirm
 * against the complete source.
 *
 * @param u the starting URL
 */
public static void runSpider(URL u) {
// A fresh spider begins with empty seen/deferral sets (see the no-arg constructor).
HTMLSpider spider = new HTMLSpider();
/**
 * Builds a root spider whose seen and deferral sets both start out empty.
 */
protected HTMLSpider() {
    deferralList = new HashSet();
    seenList = new HashSet();
}
/**
 * Builds a spider derived from an existing one.
 *
 * The seen set is shared by reference with {@code base}, so additions made
 * by either spider are visible to both. The deferral set is copied, so
 * later changes to it stay local to the new instance.
 *
 * @param base the spider whose state is inherited
 */
protected HTMLSpider(HTMLSpider base) {
    Set deferralSnapshot = new HashSet(base.deferralList);
    seenList = base.seenList;
    deferralList = deferralSnapshot;
}
protected HTMLSpider getRecursiveInstance(HtmlDocument.Tag t) {
return new HTMLSpider(this);
public void openURL(URL u) {
documentURL = u;
try {
URLConnection conn = u.openConnection();
InputStream in = conn.getInputStream();
if (isXML(u, conn))
in = transformXML(in);
HtmlParser parser = new HtmlParser(in);
document = parser.HtmlDocument();
} catch (Exception e) {
System.err.println("Couldn't open " + u);
System.err.println("\t" + e);
public boolean isXML(URL u, URLConnection conn) {
if (u.getFile().toLowerCase().endsWith(".xml"))
return true;
String contentType = conn.getContentType();
if (contentType != null &&
contentType.toLowerCase().indexOf("xml") != -1)
return true;
return false;
public InputStream transformXML(InputStream xmlDocStream) {
System.err.println("XML transformation is not yet implemented - "+
"ask David Tuma to write it.");
// FIXME: open the document, find its stylesheet parameter,
// and transform it
return xmlDocStream;
public Document styleDocument(Document document,
String stylesheet) throws Exception {
// load the transformer using JAXP
TransformerFactory factory = TransformerFactory.newInstance();
Transformer transformer = factory.newTransformer
(new StreamSource( stylesheet ));
// now lets style the given document
DocumentSource source = new DocumentSource( document );
DocumentResult result = new DocumentResult();
transformer.transform( source, result );
// return the transformed document
Document transformedDoc = result.getDocument();
return transformedDoc;
/**
 * Hook for deferring a URL.
 *
 * NOTE(review): no body is visible for this method in this view;
 * presumably it records {@code u} in deferralList - confirm against the
 * complete source.
 *
 * @param u the URL to defer
 */
public void deferURL(URL u) {
public boolean shouldFollow(URL u) {
if (u == null) return false;
String norm = normalizeURL(u);
if (seenList.contains(norm) || deferralList.contains(norm))
return false;
return true;
public void visit(HtmlDocument.Tag t) {
if (t.tagName.equalsIgnoreCase("A")) {
Attribute href = getAttribute(t, "HREF");
if (href != null)
visitHref(t, href);
/**
 * Handles the HREF attribute of an anchor tag.
 *
 * Only anchors whose CLASS attribute, with surrounding quotes removed,
 * equals MAGIC_CLASS (case-insensitively) are considered. For those, the
 * HREF is resolved against the current document URL and, if
 * shouldFollow() accepts it, a recursive spider instance is created.
 *
 * NOTE(review): nothing is done with the sub-spider in the lines visible
 * here; presumably it is used to open and traverse the URL - confirm
 * against the complete source.
 *
 * @param t    the anchor tag
 * @param href the tag's HREF attribute
 */
public void visitHref(HtmlDocument.Tag t, Attribute href) {
Attribute cssClass = getAttribute(t, "CLASS");
// Anchors without a CLASS attribute are not candidates.
if (cssClass == null) return;
// Neither are anchors whose class differs from the magic marker.
if (!MAGIC_CLASS.equalsIgnoreCase(deQuote(cssClass.value))) return;
// resolveURL returns null for a malformed target; shouldFollow rejects null.
URL url = resolveURL(href);
if (shouldFollow(url)) {
HTMLSpider subspider = getRecursiveInstance(t);
public URL resolveURL(Attribute attr) {
return resolveURL(deQuote(attr.value));
public URL resolveHash(Attribute attr) {
return resolveURL("#" + deQuote(attr.value));
public URL resolveURL(String url) {
try {
return new URL(documentURL, url);
} catch (MalformedURLException mue) {
return null;
/**
 * Walks the attribute list of a tag and returns an attribute from it, or
 * null when the loop finishes without returning.
 *
 * NOTE(review): presumably the attribute whose name matches
 * {@code attrName} is the one returned, but the selection condition is
 * not visible here - confirm against the complete source.
 *
 * @param t        the tag whose attributes are searched
 * @param attrName the attribute name being looked for
 * @return the selected attribute, or null if none was returned by the loop
 */
public static Attribute getAttribute(HtmlDocument.Tag t, String attrName) {
Iterator i=t.attributeList.attributes.iterator();
while (i.hasNext()) {
// NOTE(review): as shown, the cast has no operand and the if has no
// condition; presumably these read the next element from the iterator
// and compare its name with attrName - restore from the original file.
Attribute attr = (Attribute);
if (
return attr;
// Reached only when the loop did not return an attribute.
return null;
public static void setAttribute(Attribute attr, String val) {
// FIXME: escape quotes in val?
attr.value = "\"" + val + "\"";
public static String deQuote(String s) {
if (s.startsWith("\"") && s.endsWith("\"") ||
s.startsWith("'") && s.endsWith("'"))
return s.substring(1, s.length() - 1);
return s;
public static String normalizeURL(URL u) {
return normalizeURL(u.toString());
public static String normalizeURL(String url) {
if (url.startsWith("file:"))
return url.toLowerCase();
return url;