package com.cardence.lawshelf.html;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.CharUtils;
import org.apache.commons.logging.Log;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
/**
 * {@link HtmlParser} implementation backed by jsoup.
 *
 * <p>Usage is two-phase: first load a document with one of the
 * {@code parse*} methods, then call {@link #processStructure()} to walk the
 * parsed tree and report what is found to the configured
 * {@link HtmlParserHandler}.
 *
 * <p>The parsed document is kept in an instance field; each {@code parse*}
 * call replaces the previously held document.
 */
@Component
public class JsoupHtmlParserImpl implements HtmlParser {

    /** Timeout, in milliseconds, passed to jsoup when fetching a URL. */
    private static final int URL_TIMEOUT_MILLIS = 10000;

    @Autowired
    private Log log;

    private HtmlParserHandler parserHandler;

    private Document document;

    /**
     * Sets the handler that receives pre-processing hooks and structure
     * callbacks. Must be called before any {@code parse*} method or
     * {@link #processStructure()}.
     *
     * @param handler the handler to notify
     */
    public void setParserHandler(HtmlParserHandler handler) {
        this.parserHandler = handler;
    }

    /**
     * Parses the file at the given path.
     *
     * @param filepath path of the HTML file
     * @throws IOException if the file cannot be read
     * @throws IllegalStateException if no handler has been set
     */
    public void parseFile(String filepath) throws IOException {
        parseFile(new File(filepath));
    }

    /**
     * Parses the given file. The handler's {@code willProcessFile} hook is
     * consulted first: if it returns non-null HTML, that HTML is parsed
     * instead of the file content; otherwise jsoup reads the file itself
     * (with a {@code null} charset name, i.e. jsoup picks the encoding).
     *
     * @param file the HTML file
     * @throws IOException if the file cannot be read
     * @throws IllegalStateException if no handler has been set
     */
    public void parseFile(File file) throws IOException {
        String html = requireHandler().willProcessFile(file);
        if (html != null) {
            this.document = Jsoup.parse(html);
        } else {
            this.document = Jsoup.parse(file, null);
        }
    }

    /**
     * Parses the document at the given URL. The handler's
     * {@code willProcessUrl} hook is consulted first: if it returns non-null
     * HTML, that HTML is parsed and no fetch is made by this class;
     * otherwise jsoup fetches the URL using {@link #URL_TIMEOUT_MILLIS}.
     *
     * @param url the location of the HTML document
     * @throws IOException if the URL cannot be fetched
     * @throws IllegalStateException if no handler has been set
     */
    public void parseUrl(URL url) throws IOException {
        String html = requireHandler().willProcessUrl(url);
        if (html != null) {
            this.document = Jsoup.parse(html);
        } else {
            this.document = Jsoup.parse(url, URL_TIMEOUT_MILLIS);
        }
    }

    /**
     * Parses an HTML string. The handler's {@code willProcessHtml} hook may
     * return a replacement string; a {@code null} return means the original
     * input is parsed unchanged.
     *
     * @param html the HTML markup to parse
     * @throws IllegalStateException if no handler has been set
     */
    public void parseHtmlString(String html) {
        String modhtml = requireHandler().willProcessHtml(html);
        if (modhtml != null) {
            html = modhtml;
        }
        this.document = Jsoup.parse(html);
    }

    /**
     * Walks the previously parsed document and reports it to the handler in
     * this order: {@code beginDocument}, {@code beginHead}, the children of
     * the first {@code head} element, {@code endHead}, {@code beginBody},
     * the children of the first {@code body} element, {@code endBody},
     * {@code endDocument}.
     *
     * @throws IllegalStateException if no handler has been set or no
     *         document has been parsed yet
     */
    public void processStructure() {
        HtmlParserHandler handler = requireHandler();
        // Fail before the first callback so the handler is never left with
        // a begun-but-never-ended document.
        if (this.document == null) {
            throw new IllegalStateException(
                    "No document has been parsed; call one of the parse methods first");
        }

        handler.beginDocument();

        Elements headElements = document.getElementsByTag("head");
        handler.beginHead();
        this.handleElements(headElements);
        handler.endHead();

        Elements bodyElements = document.getElementsByTag("body");
        handler.beginBody();
        this.handleElements(bodyElements);
        handler.endBody();

        handler.endDocument();
    }

    /**
     * Returns the configured handler, or fails with a descriptive message
     * instead of a bare NullPointerException when none has been set.
     */
    private HtmlParserHandler requireHandler() {
        if (this.parserHandler == null) {
            throw new IllegalStateException(
                    "No HtmlParserHandler has been set; call setParserHandler first");
        }
        return this.parserHandler;
    }

    /**
     * Processes the child nodes of the first element in the selection.
     * Only the first match is used; an empty selection is a no-op.
     */
    private void handleElements(Elements elements) {
        // Elements.first() yields null for an empty selection.
        if (elements == null || elements.isEmpty()) {
            return;
        }
        processNodeList(elements.first().childNodes());
    }

    /** Processes each node of the list in order; a null list is ignored. */
    private void processNodeList(List<Node> nodelist) {
        if (nodelist == null) {
            return;
        }
        for (Node n : nodelist) {
            processNode(n);
        }
    }

    /**
     * Reports a single node to the handler and recurses into its children.
     *
     * <ul>
     * <li>Comments are reported via {@code foundComment} with their outer
     * HTML (not cleaned of control characters).</li>
     * <li>Elements are reported via {@code foundElement} with tag name,
     * inner HTML, outer HTML, text and an attribute map, followed by
     * {@code beginElementChildren}, the element's children, and
     * {@code endElementChildren}.</li>
     * <li>Any other node type is ignored.</li>
     * </ul>
     */
    private void processNode(Node node) {
        if (node == null) {
            return;
        }
        if (node instanceof Comment) {
            this.parserHandler.foundComment(node.outerHtml());
        } else if (node instanceof Element) {
            Element enode = (Element) node;
            String tagname = enode.tagName();
            String text = cleanOutControlChars(enode.text());
            String outerHTML = cleanOutControlChars(enode.outerHtml());
            String innerHTML = cleanOutControlChars(enode.html());

            Attributes attrs = enode.attributes();
            Map<String, String> attributeMap = new HashMap<String, String>();
            for (Attribute attr : attrs) {
                attributeMap.put(attr.getKey(), attr.getValue());
            }

            this.parserHandler.foundElement(tagname, innerHTML, outerHTML,
                    text, attributeMap);
            this.parserHandler.beginElementChildren();
            processNodeList(node.childNodes());
            this.parserHandler.endElementChildren();
        }
    }

    /**
     * Returns {@code s} with every character for which
     * {@link CharUtils#isAsciiControl(char)} is true removed, and logs the
     * removed characters (unicode-escaped) at info level.
     *
     * <p>NOTE(review): per the Commons Lang documentation the ASCII control
     * range includes tab, line feed and carriage return, so line breaks are
     * stripped as well - confirm this is intended.
     *
     * @param s the string to clean; may be {@code null}
     * @return the cleaned string; {@code s} itself when nothing had to be
     *         removed; {@code null} when {@code s} is {@code null}
     */
    private String cleanOutControlChars(String s) {
        if (s == null) {
            return null;
        }

        // Both buffers are allocated lazily: this method runs three times
        // per element, and most strings need no change at all.
        StringBuilder cleaned = null;
        StringBuilder foundControlChars = null;
        final int length = s.length();

        for (int i = 0; i < length; i++) {
            char c = s.charAt(i);
            if (CharUtils.isAsciiControl(c)) {
                if (cleaned == null) {
                    // First hit: keep everything seen so far.
                    cleaned = new StringBuilder(length);
                    cleaned.append(s, 0, i);
                    foundControlChars = new StringBuilder();
                }
                foundControlChars.append(CharUtils.unicodeEscaped(c)).append(" ");
            } else if (cleaned != null) {
                cleaned.append(c);
            }
        }

        if (cleaned == null) {
            return s;
        }
        if (log != null) {
            log.info("Found a control chars: " + foundControlChars.toString());
        }
        return cleaned.toString();
    }
}