package org.untmpl;
import org.htmlcleaner.BaseToken;
import org.htmlcleaner.ContentToken;
import org.htmlcleaner.TagNode;
/**
* Bottom-up traversal of an HTML document, producing hashes based on the
* subtrees of each node as it goes. For every encountered node a call-back
* function will be called.
*
* @author eskil.andreen
*
*/
public class Traverser {

    /**
     * Receiver for traversal events. An implementation is notified once for
     * every {@link TagNode} that is visited, after all of that tag's children
     * have been processed. Text content is never reported here; it only
     * contributes to the hash of its parent tag.
     */
    public interface Callback {
        void handle(TagNode tn, Node n);
    }

    /**
     * Walks the document rooted at {@code root} and reports every tag node to
     * {@code callback}. The root itself is visited at depth 0.
     *
     * @param <T>
     *            The concrete call-back type, so the caller gets it back
     *            without casting.
     * @param root
     *            The tag node to start from.
     * @param callback
     *            The call-back to notify for each tag node.
     * @return The same {@code callback} instance that was passed in.
     */
    public static <T extends Callback> T traverse(TagNode root, T callback) {
        traverse(root, 0, callback);
        return callback;
    }

    /**
     * Dispatches on the kind of token: tags are handled recursively, text
     * content is summarised directly, anything else is ignored.
     *
     * @param root
     *            The token to summarise.
     * @param depth
     *            Distance of {@code root} from the traversal start.
     * @param callback
     *            The call-back to notify for each tag node.
     * @return The summary of {@code root}, or {@code null} when it is neither
     *         a {@link TagNode} nor a {@link ContentToken}.
     */
    private static Node traverse(BaseToken root, int depth, Callback callback) {
        if (root instanceof TagNode) {
            return summarizeTag((TagNode) root, depth, callback);
        }
        if (root instanceof ContentToken) {
            return summarizeContent((ContentToken) root, depth);
        }
        return null;
    }

    /**
     * Summarises a tag bottom-up. The hash mixes, in this order: the tag
     * name, the string form of its attributes, the summary of each traversable
     * child, the depth, and the greatest child height (-1 when there is no
     * traversable child). The resulting height is one more than that greatest
     * child height, so a tag without traversable children has height 0.
     */
    private static Node summarizeTag(TagNode tag, int depth, Callback callback) {
        int digest = HashCodeUtil.SEED;
        digest = HashCodeUtil.hash(digest, tag.getName());
        digest = HashCodeUtil.hash(digest, tag.getAttributes().toString());

        int tallestChild = -1;
        for (Object child : tag.getChildren()) {
            if (!isTraversable(child)) {
                continue;
            }
            // Children are fully processed (and reported) before their parent.
            Node childSummary = traverse((BaseToken) child, depth + 1, callback);
            digest = HashCodeUtil.hash(digest, childSummary);
            tallestChild = Math.max(tallestChild, childSummary.height);
        }

        digest = HashCodeUtil.hash(digest, depth);
        digest = HashCodeUtil.hash(digest, tallestChild);

        Node summary = new Node(digest, depth, tallestChild + 1);
        callback.handle(tag, summary);
        return summary;
    }

    /**
     * Summarises a piece of text content. The hash mixes the content, the
     * depth and a constant 0; the height is always 0. The call-back is not
     * notified for content.
     */
    private static Node summarizeContent(ContentToken content, int depth) {
        int digest = HashCodeUtil.SEED;
        digest = HashCodeUtil.hash(digest, content.getContent());
        digest = HashCodeUtil.hash(digest, depth);
        digest = HashCodeUtil.hash(digest, 0);
        return new Node(digest, depth, 0);
    }

    /**
     * Tells whether a child object takes part in the traversal: only tags
     * and text content do, every other kind of child is skipped.
     */
    private static boolean isTraversable(Object candidate) {
        if (candidate instanceof TagNode) {
            return true;
        }
        return candidate instanceof ContentToken;
    }
}