Package

Source Code of HTMLSpider

import java.util.*;
import java.io.*;
import java.net.*;
import com.quiotix.html.parser.*;
import com.quiotix.html.parser.HtmlDocument.Attribute;
import com.quiotix.html.parser.HtmlDocument.AttributeList;


class HTMLSpider extends HtmlVisitor {

    public static final String MAGIC_CLASS = "includeDoc";

    Set seenList, deferralList;
    URL documentURL;
    HtmlDocument document;


    public static void runSpider(URL u) {
        HTMLSpider spider = new HTMLSpider();
        spider.openURL(u);
    }

    protected HTMLSpider() {
        this.seenList = new HashSet();
        this.deferralList = new HashSet();
    }

    protected HTMLSpider(HTMLSpider base) {
        this.seenList = base.seenList;
        this.deferralList = new HashSet(base.deferralList);
    }

    protected HTMLSpider getRecursiveInstance(HtmlDocument.Tag t) {
        return new HTMLSpider(this);
    }

    public void openURL(URL u) {
        documentURL = u;
        seenList.add(normalizeURL(u));

        try {
            URLConnection conn = u.openConnection();
            conn.connect();
            InputStream in = conn.getInputStream();

            if (isXML(u, conn))
                in = transformXML(in);

            HtmlParser parser = new HtmlParser(in);
            parser.streamAccept(this);
            /*
            document = parser.HtmlDocument();
            document.accept(this);
            */
        } catch (Exception e) {
            System.err.println("Couldn't open " + u);
            System.err.println("\t" + e);
        }
    }

    public boolean isXML(URL u, URLConnection conn) {
        if (u.getFile().toLowerCase().endsWith(".xml"))
            return true;
        String contentType = conn.getContentType();
        if (contentType != null &&
            contentType.toLowerCase().indexOf("xml") != -1)
            return true;
        return false;
    }
    public InputStream transformXML(InputStream xmlDocStream) {
        System.err.println("XML transformation is not yet implemented - "+
                           "ask David Tuma to write it.");
        // FIXME: open the document, find its stylesheet parameter,
        // and transform it
        return xmlDocStream;
    }

    /*
    public Document styleDocument(Document document,
                                  String stylesheet) throws Exception {

        // load the transformer using JAXP
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer transformer = factory.newTransformer
            (new StreamSource( stylesheet ));

        // now lets style the given document
        DocumentSource source = new DocumentSource( document );
        DocumentResult result = new DocumentResult();
        transformer.transform( source, result );

        // return the transformed document
        Document transformedDoc = result.getDocument();
        return transformedDoc;
    }
    */

    public void deferURL(URL u) {
        deferralList.add(normalizeURL(u));
    }

    public boolean shouldFollow(URL u) {
        if (u == null) return false;
        String norm = normalizeURL(u);
        if (seenList.contains(norm) || deferralList.contains(norm))
            return false;
        else
            return true;
    }

    public void visit(HtmlDocument.Tag t) {
        if (t.tagName.equalsIgnoreCase("A")) {
            Attribute href = getAttribute(t, "HREF");
            if (href != null)
                visitHref(t, href);
        }
    }

    public void visitHref(HtmlDocument.Tag t, Attribute href) {

        Attribute cssClass = getAttribute(t, "CLASS");
        if (cssClass == null) return;
        if (!MAGIC_CLASS.equalsIgnoreCase(deQuote(cssClass.value))) return;

        URL url = resolveURL(href);
        if (shouldFollow(url)) {
            HTMLSpider subspider = getRecursiveInstance(t);
            subspider.openURL(url);
        }
    }

    public URL resolveURL(Attribute attr) {
        return resolveURL(deQuote(attr.value));
    }
    public URL resolveHash(Attribute attr) {
        return resolveURL("#" + deQuote(attr.value));
    }
    public URL resolveURL(String url) {
        try {
            return new URL(documentURL, url);
        } catch (MalformedURLException mue) {
            return null;
        }
    }

    public static Attribute getAttribute(HtmlDocument.Tag t, String attrName) {
        Iterator i=t.attributeList.attributes.iterator();
        while (i.hasNext()) {
            Attribute attr = (Attribute) i.next();
            if (attr.name.equalsIgnoreCase(attrName))
                return attr;
        }
        return null;
    }

    public static void setAttribute(Attribute attr, String val) {
        // FIXME: escape quotes in val?
        attr.value = "\"" + val + "\"";
    }

    public static String deQuote(String s) {
        if (s.startsWith("\"") && s.endsWith("\"") ||
            s.startsWith("'") && s.endsWith("'"))
            return s.substring(1, s.length() - 1);
        else
            return s;
    }
    public static String normalizeURL(URL u) {
        return normalizeURL(u.toString());
    }
    public static String normalizeURL(String url) {
        if (url.startsWith("file:"))
            return url.toLowerCase();
        else
            return url;
    }

}
TOP

Related Classes of HTMLSpider

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.