package net.matuschek.html;
/************************************************
Copyright (c) 2001/2002 by Daniel Matuschek
*************************************************/
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.Vector;
import net.matuschek.util.AttribValuePair;
import org.apache.log4j.Category;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
/**
 * This class implements an HTML document.
 *
 * <p>It uses JTidy to parse the given HTML code to an internal DOM
 * representation. Every public constructor parses the content and
 * extracts the links immediately; the image and element queries walk
 * the same DOM tree on demand.
 *
 * @author Daniel Matuschek
 * @version $Id $
 */
public class HtmlDocument
{
  /** URL of this document */
  private URL url = null;

  /** Content text as an array of bytes (this is how we get it from HTTP !) */
  private byte[] content = null;

  /** the DOM representation of this HTML document */
  private Document domDoc = null;

  /** Log4J category for logging purposes */
  private Category log;

  /** encoding used when converting extracted link strings (may be null) */
  private String encoding;

  /** Base URL, set when a BASE element with a valid HREF is found */
  private URL baseURL=null;

  /** All links found by {@link #parse()} */
  Vector<URL> links;


  /**
   * Initializes an HTML document without content.
   *
   * @param url the URL of this document
   */
  private HtmlDocument(URL url) {
    log = Category.getInstance(getClass().getName());
    this.url = url;
  }


  /**
   * Initializes an HTML document with the given content.
   *
   * @param url the URL of this document. Needed for link extraction.
   * @param content some HTML text as an array of bytes, must not be null
   */
  public HtmlDocument(URL url, byte[] content) {
    this(url);
    this.content = content;
    parse();
  }


  /**
   * Initializes an HTML document with the given content.
   *
   * @param url the URL of this document. Needed for link extraction.
   * @param content some HTML text as an array of bytes, must not be null
   * @param newEncoding Is the encoding of the content.
   */
  public HtmlDocument(URL url, byte[] content, String newEncoding) {
    this(url);
    this.content = content;
    encoding = newEncoding;
    parse();
  }


  /**
   * Initializes an HTML document from a String. Every character is
   * stored as a single byte holding its low 8 bits, so characters up
   * to U+00FF are kept unchanged and anything above is truncated.
   *
   * @param url the URL of this document. Needed for link extraction.
   * @param contentStr the HTML text, must not be null
   */
  public HtmlDocument(URL url, String contentStr) {
    this(url);
    // exactly one byte per character; an extra slot would hand a
    // trailing NUL byte to the parser
    final int length = contentStr.length();
    this.content = new byte[length];
    for (int i=0; i<length; i++) {
      this.content[i] = (byte)contentStr.charAt(i);
    }
    parse();
  }


  /**
   * Builds the DOM tree (if not done yet) and collects all links of
   * this document into {@link #links}.
   *
   * <p>NOTE(review): this runs from the constructors and calls the
   * overridable {@link #extractLinks(Element, Vector)}, so a subclass
   * override is invoked before the subclass constructor body has run.
   */
  private void parse() {
    if (domDoc == null) {
      parseToDOM();
    }
    this.links = new Vector<URL>();
    extractLinks(domDoc.getDocumentElement(),links);
  }


  /**
   * Gets all links to other documents that were extracted from this
   * HTML document during construction.
   *
   * @return a Vector of URLs containing the included links. This is
   * the internal vector, not a copy.
   */
  public Vector<URL> getLinks() {
    return this.links;
  }


  /**
   * Extracts all links to included images from this HTML document.
   *
   * @return a new Vector of URLs containing the included image links
   */
  public Vector<URL> getImageLinks() {
    if (domDoc == null) {
      parseToDOM();
    }
    Vector<URL> imageLinks = new Vector<URL>();
    extractImageLinks(domDoc.getDocumentElement(),imageLinks);
    return imageLinks;
  }


  /**
   * Gets all Element nodes of a given type as a Vector.
   *
   * @param type the type of elements to return. e.g. type="a"
   * will return all {@code <a>} tags. The name is matched
   * case-insensitively.
   * @return a new Vector containing all element nodes of the given type
   */
  public Vector<Element> getElements(String type) {
    if (domDoc == null) {
      parseToDOM();
    }
    Vector<Element> elements = new Vector<Element>();
    extractElements(domDoc.getDocumentElement(),type,elements);
    return elements;
  }


  /**
   * Extract links from the given DOM subtree and put them into the
   * given vector. A BASE element with a valid HREF additionally sets
   * the base URL used to resolve the links that are added afterwards.
   *
   * @param element the top level DOM element of the DOM tree to parse
   * @param links the vector that will store the links
   */
  protected void extractLinks(Element element, Vector <URL>links) {

    // this should not happen !
    if (element==null) {
      log.error("got a null element");
      return;
    }

    // tag names are plain ASCII: use a fixed locale so that e.g. a
    // Turkish default locale does not turn "IMG" into a dotless-i name
    String name = element.getNodeName().toLowerCase(Locale.ENGLISH);

    if (name.equals("a") || name.equals("area")) {

      // A HREF= / AREA HREF=
      addLink(element.getAttribute("href"),links);

    } else if (name.equals("base")) {

      // BASE HREF=
      try {
        this.baseURL = new URL(element.getAttribute("href"));
        log.info("baseUR="+baseURL);
      } catch (MalformedURLException e) {
        // keep the previous base URL, but leave a trace
        log.debug("ignoring malformed base href: "+e.getMessage());
      }

    } else if (name.equals("frame")
               || name.equals("iframe")
               || name.equals("image")
               || name.equals("img")) {

      // FRAME SRC= / IFRAME SRC= / IMG SRC=
      // IMAGE SRC= is incorrect, but seems to work in some browsers :(
      addLink(element.getAttribute("src"),links);

    } else if (name.equals("meta")) {

      // META HTTP-EQUIV=REFRESH
      addMetaRefreshLinks(element,links);

    } else if (name.equals("body")) {

      // BODY BACKGROUND=
      String background = element.getAttribute("background");
      if ((background != null) && (! background.equals(""))) {
        addLink(background,links);
      }

    } else {
      log.info("Ignore tag name: "+name);
    }

    // recursive travel through all children
    NodeList children = element.getChildNodes();
    for (int i=0; i<children.getLength(); i++) {
      if (children.item(i) instanceof Element) {
        extractLinks((Element)children.item(i),links);
      }
    }

  }


  /**
   * Adds the target of a META HTTP-EQUIV=REFRESH element to the given
   * vector. The CONTENT attribute is split at ';' and every part whose
   * attribute name is "url" (in any case) contributes its value.
   *
   * @param meta a META element
   * @param links the vector that will store the links
   */
  private void addMetaRefreshLinks(Element meta, Vector<URL> links) {
    String equiv = meta.getAttribute("http-equiv");
    if (! "refresh".equalsIgnoreCase(equiv)) {
      return;
    }

    String refreshContent = meta.getAttribute("content");
    if (refreshContent == null) {
      refreshContent = "";
    }

    StringTokenizer st = new StringTokenizer(refreshContent,";");
    while (st.hasMoreTokens()) {
      String token = st.nextToken().trim();
      AttribValuePair av = new AttribValuePair(token);
      // constant first: tolerates "URL=" as well as a null attribute name
      if ("url".equalsIgnoreCase(av.getAttrib())) {
        addLink(av.getValue(),links);
      }
    }
  }


  /**
   * Extract links to included images (IMG SRC= and IMAGE SRC=) from
   * the given DOM subtree and put them into the given vector.
   *
   * @param element the top level DOM element of the DOM tree to parse
   * @param links the vector that will store the links
   */
  protected void extractImageLinks(Element element, Vector<URL> links) {

    // this should not happen !
    if (element==null) {
      log.error("got a null element");
      return;
    }

    // same case handling as in extractLinks
    String name = element.getNodeName().toLowerCase(Locale.ENGLISH);

    if (name.equals("img") || name.equals("image")) {
      // IMG SRC= / IMAGE SRC=
      addLink(element.getAttribute("src"),links);
    }

    // recursive travel through all children
    NodeList children = element.getChildNodes();
    for (int i=0; i<children.getLength(); i++) {
      if (children.item(i) instanceof Element) {
        extractImageLinks((Element)children.item(i),links);
      }
    }

  }


  /**
   * Extract elements from the given DOM subtree and put them into the
   * given vector.
   *
   * @param element the top level DOM element of the DOM tree to parse
   * @param type HTML tag to extract (e.g. "a", "form", "head" ...),
   * matched case-insensitively
   * @param elementList the vector that will store the elements
   */
  protected void extractElements(Element element,
                                 String type,
                                 Vector <Element>elementList) {

    // this should not happen !
    if (element==null) {
      log.error("got a null element");
      return;
    }

    String name = element.getNodeName();

    // a null type matches nothing
    if (name.equalsIgnoreCase(type)) {
      elementList.add(element);
    }

    // recursive travel through all children
    NodeList children = element.getChildNodes();
    for (int i=0; i<children.getLength(); i++) {
      if (children.item(i) instanceof Element) {
        extractElements((Element)children.item(i),type,elementList);
      }
    }

  }


  /**
   * Parses the document content to a DOM tree using Tidy. Tidy is
   * configured for lower case tag and attribute names and reports its
   * messages on System.err.
   */
  private void parseToDOM() {
    ByteArrayInputStream is = new ByteArrayInputStream(content);

    // set tidy parameters
    // NOTE(review): the encoding field is not passed on to Tidy here,
    // so Tidy decodes the bytes with its own default - confirm intended
    Tidy tidy = new Tidy();
    tidy.setUpperCaseTags(false);
    tidy.setUpperCaseAttrs(false);
    tidy.setErrout(new PrintWriter(System.err));

    domDoc = tidy.parseDOM(is,null);
  }


  /**
   * Adds a link to the given vector. Null and empty links are skipped,
   * the part after '#' is removed and the link is resolved against the
   * base URL if one is known, otherwise against the document URL.
   * Ignores (but logs) possible errors.
   *
   * @param newURL the link as found in the document, may be null
   * @param links the vector that will store the link
   */
  private void addLink(String newURL, Vector<URL> links) {

    if ((newURL == null) || (newURL.equals(""))) return;

    // remove part after # from the URL
    // thanks to Johannes Christen for bug fix.
    int pos = newURL.indexOf("#");
    if (pos >=0 ) {
      newURL = newURL.substring(0,pos);
    }

    // NOTE(review): getBytes() uses the platform default charset, so
    // the result of this conversion depends on the JVM configuration
    // for non-ASCII links - confirm the intended source charset
    String charset = (encoding != null) ? encoding : "ISO-8859-1";
    try {
      newURL = new String(newURL.getBytes(), charset);
    } catch (UnsupportedEncodingException e) {
      // the link is used unconverted
      log.debug("unsupported encoding for link conversion: "+charset);
    }

    try {
      URL context = (this.baseURL != null) ? this.baseURL : url;
      links.add(new URL(context,newURL));
    } catch (Exception e) {
      log.debug("error during link extraction: "+e.getMessage()+" "+newURL);
    }
  }


  /**
   * Gets the base URL of this document.
   *
   * @return the URL taken from a BASE HREF= found during link
   * extraction, or null if no valid one was found
   */
  public URL getBaseURL() {
    return baseURL;
  }

}