import java.util.*;
import java.io.*;
import java.net.*;
import com.quiotix.html.parser.*;
import com.quiotix.html.parser.HtmlDocument.Attribute;
import com.quiotix.html.parser.HtmlDocument.AttributeList;
/**
 * Recursive HTML crawler built on the quiotix HTML parser visitor.
 *
 * <p>Starting from a URL, the spider parses the document and, for every
 * <code>&lt;a href=...&gt;</code> tag whose <code>class</code> attribute equals
 * {@link #MAGIC_CLASS} (ignoring case), opens the link target with a fresh
 * spider instance. URLs that have already been opened, or that were
 * explicitly deferred, are not followed again.
 *
 * <p>The set of seen URLs is shared by all spiders descending from the same
 * root; the set of deferred URLs is copied for each child spider.
 */
class HTMLSpider extends HtmlVisitor {

    /** Value of the <code>class</code> attribute that marks a link to follow. */
    public static final String MAGIC_CLASS = "includeDoc";

    /** Normalized URLs (Strings) that were opened / that must not be opened. */
    Set seenList, deferralList;

    /** URL of the document this spider is processing; base for relative links. */
    URL documentURL;

    /** Parsed document; not assigned by the streaming code path used here. */
    HtmlDocument document;

    /**
     * Convenience entry point: creates a root spider and opens <code>u</code>.
     *
     * @param u the URL of the first document to crawl
     */
    public static void runSpider(URL u) {
        HTMLSpider spider = new HTMLSpider();
        spider.openURL(u);
    }

    /** Creates a root spider with empty seen and deferral sets. */
    protected HTMLSpider() {
        this.seenList = new HashSet();
        this.deferralList = new HashSet();
    }

    /**
     * Creates a child spider.
     *
     * <p>The seen set is shared with <code>base</code> (same object), while the
     * deferral set is copied, so deferrals added by the child do not leak
     * back into <code>base</code>.
     *
     * @param base the spider that is spawning this one
     */
    protected HTMLSpider(HTMLSpider base) {
        this.seenList = base.seenList;
        this.deferralList = new HashSet(base.deferralList);
    }

    /**
     * Factory for the spider used to follow a link; subclasses can override
     * it to return an instance of their own type.
     *
     * @param t the tag containing the link about to be followed (unused here)
     * @return a new child spider derived from this one
     */
    protected HTMLSpider getRecursiveInstance(HtmlDocument.Tag t) {
        return new HTMLSpider(this);
    }

    /**
     * Opens <code>u</code>, records it as seen, and feeds its content through
     * the HTML parser with this spider as the visitor.
     *
     * <p>Failures are reported on <code>System.err</code> and otherwise
     * ignored, so one unreachable document does not abort the crawl. The
     * input stream is always closed once processing ends.
     *
     * @param u the URL of the document to process
     */
    public void openURL(URL u) {
        documentURL = u;
        // Mark as seen before parsing so that links back to this document
        // found while parsing are not followed again.
        seenList.add(normalizeURL(u));

        InputStream in = null;
        try {
            URLConnection conn = u.openConnection();
            conn.connect();
            in = conn.getInputStream();
            if (isXML(u, conn)) {
                in = transformXML(in);
            }

            HtmlParser parser = new HtmlParser(in);
            parser.streamAccept(this);
            // Alternative (non-streaming) approach, kept for reference:
            //   document = parser.HtmlDocument();
            //   document.accept(this);
        } catch (Exception e) {
            System.err.println("Couldn't open " + u);
            System.err.println("\t" + e);
        } finally {
            closeQuietly(in);
        }
    }

    /** Closes <code>in</code> if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close fails; the document has
            // already been processed (or its failure already reported).
        }
    }

    /**
     * Decides whether a document should be treated as XML.
     *
     * <p>A document is XML if the path component of its URL ends in
     * <code>.xml</code>, or if the reported content type contains
     * <code>xml</code> (both checks ignore case). Only the path is examined,
     * so a query string such as <code>?src=a.xml</code> does not influence
     * the result.
     *
     * @param u    the URL of the document
     * @param conn the connection opened for <code>u</code>
     * @return <code>true</code> if the document looks like XML
     */
    public boolean isXML(URL u, URLConnection conn) {
        // A fixed locale keeps the comparison independent of the platform
        // default (e.g. Turkish casing rules).
        String path = u.getPath();
        if (path != null && path.toLowerCase(Locale.ENGLISH).endsWith(".xml")) {
            return true;
        }

        String contentType = conn.getContentType();
        return contentType != null
            && contentType.toLowerCase(Locale.ENGLISH).indexOf("xml") != -1;
    }

    /**
     * Placeholder for XML-to-HTML transformation.
     *
     * <p>Currently prints a notice on <code>System.err</code> and returns the
     * stream unchanged.
     *
     * @param xmlDocStream the raw XML document stream
     * @return the stream to hand to the HTML parser (currently the input)
     */
    public InputStream transformXML(InputStream xmlDocStream) {
        System.err.println("XML transformation is not yet implemented - "+
                           "ask David Tuma to write it.");
        // FIXME: open the document, find its stylesheet parameter,
        // and transform it
        return xmlDocStream;
    }

    // Sketch of a JAXP-based implementation, kept for reference:
    //
    // public Document styleDocument(Document document,
    //                               String stylesheet) throws Exception {
    //     // load the transformer using JAXP
    //     TransformerFactory factory = TransformerFactory.newInstance();
    //     Transformer transformer = factory.newTransformer
    //         (new StreamSource( stylesheet ));
    //     // now lets style the given document
    //     DocumentSource source = new DocumentSource( document );
    //     DocumentResult result = new DocumentResult();
    //     transformer.transform( source, result );
    //     // return the transformed document
    //     Document transformedDoc = result.getDocument();
    //     return transformedDoc;
    // }

    /**
     * Marks <code>u</code> as deferred so that this spider (and children
     * created afterwards) will not follow it.
     *
     * @param u the URL to defer
     */
    public void deferURL(URL u) {
        deferralList.add(normalizeURL(u));
    }

    /**
     * Tells whether a link target should be opened.
     *
     * @param u the resolved link target; may be <code>null</code>
     * @return <code>false</code> if <code>u</code> is <code>null</code>, already
     *         seen, or deferred; <code>true</code> otherwise
     */
    public boolean shouldFollow(URL u) {
        if (u == null) {
            return false;
        }
        String norm = normalizeURL(u);
        return !seenList.contains(norm) && !deferralList.contains(norm);
    }

    /**
     * Visitor callback for tags: dispatches anchor tags that carry an
     * <code>HREF</code> attribute to {@link #visitHref}.
     *
     * @param t the tag being visited
     */
    public void visit(HtmlDocument.Tag t) {
        if (t.tagName.equalsIgnoreCase("A")) {
            Attribute href = getAttribute(t, "HREF");
            if (href != null) {
                visitHref(t, href);
            }
        }
    }

    /**
     * Handles an anchor with an <code>HREF</code>: if its <code>CLASS</code>
     * attribute equals {@link #MAGIC_CLASS} and the target passes
     * {@link #shouldFollow}, the target is opened with a child spider.
     *
     * @param t    the anchor tag
     * @param href the tag's <code>HREF</code> attribute
     */
    public void visitHref(HtmlDocument.Tag t, Attribute href) {
        Attribute cssClass = getAttribute(t, "CLASS");
        if (cssClass == null) {
            return;
        }
        if (!MAGIC_CLASS.equalsIgnoreCase(deQuote(cssClass.value))) {
            return;
        }

        URL url = resolveURL(href);
        if (shouldFollow(url)) {
            HTMLSpider subspider = getRecursiveInstance(t);
            subspider.openURL(url);
        }
    }

    /**
     * Resolves the (de-quoted) value of <code>attr</code> against the current
     * document URL.
     *
     * @param attr the attribute holding a URL reference
     * @return the resolved URL, or <code>null</code> if it is malformed
     */
    public URL resolveURL(Attribute attr) {
        return resolveURL(deQuote(attr.value));
    }

    /**
     * Resolves the (de-quoted) value of <code>attr</code> as a fragment
     * (<code>#value</code>) of the current document URL.
     *
     * @param attr the attribute holding a fragment name
     * @return the resolved URL, or <code>null</code> if it is malformed
     */
    public URL resolveHash(Attribute attr) {
        return resolveURL("#" + deQuote(attr.value));
    }

    /**
     * Resolves <code>url</code> against the current document URL.
     *
     * @param url an absolute or relative URL specification
     * @return the resolved URL, or <code>null</code> if it is malformed
     */
    public URL resolveURL(String url) {
        try {
            return new URL(documentURL, url);
        } catch (MalformedURLException mue) {
            // Callers treat null as "do not follow" (see shouldFollow).
            return null;
        }
    }

    /**
     * Finds an attribute of a tag by name, ignoring case.
     *
     * @param t        the tag to search
     * @param attrName the attribute name to look for
     * @return the first matching attribute, or <code>null</code> if none
     */
    public static Attribute getAttribute(HtmlDocument.Tag t, String attrName) {
        Iterator i = t.attributeList.attributes.iterator();
        while (i.hasNext()) {
            Attribute attr = (Attribute) i.next();
            if (attr.name.equalsIgnoreCase(attrName)) {
                return attr;
            }
        }
        return null;
    }

    /**
     * Sets the value of <code>attr</code> to <code>val</code> wrapped in double
     * quotes.
     *
     * @param attr the attribute to modify
     * @param val  the new, unquoted value
     */
    public static void setAttribute(Attribute attr, String val) {
        // FIXME: escape quotes in val?
        attr.value = "\"" + val + "\"";
    }

    /**
     * Strips one pair of matching surrounding quotes (double or single)
     * from <code>s</code>.
     *
     * <p>Strings that are <code>null</code>, shorter than two characters, or
     * not enclosed in a matching pair of quotes are returned unchanged. In
     * particular, a string consisting of a single quote character is
     * returned as is rather than causing an index error.
     *
     * @param s the possibly quoted string; may be <code>null</code>
     * @return <code>s</code> without its surrounding quotes
     */
    public static String deQuote(String s) {
        if (s == null || s.length() < 2) {
            return s;
        }
        char first = s.charAt(0);
        char last = s.charAt(s.length() - 1);
        if ((first == '"' || first == '\'') && first == last) {
            return s.substring(1, s.length() - 1);
        }
        return s;
    }

    /**
     * Produces the key under which <code>u</code> is stored in the seen and
     * deferral sets.
     *
     * @param u the URL to normalize
     * @return the normalized string form of <code>u</code>
     */
    public static String normalizeURL(URL u) {
        return normalizeURL(u.toString());
    }

    /**
     * Normalizes a URL string: <code>file:</code> URLs are lower-cased, all
     * other URLs are returned unchanged.
     *
     * @param url the URL string to normalize
     * @return the normalized URL string
     */
    public static String normalizeURL(String url) {
        if (url.startsWith("file:")) {
            // Fixed locale so the key does not depend on the platform default.
            return url.toLowerCase(Locale.ENGLISH);
        }
        return url;
    }
}