Package com.cardence.lawshelf.html

Source Code of com.cardence.lawshelf.html.JsoupHtmlParserImpl

package com.cardence.lawshelf.html;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.CharUtils;
import org.apache.commons.logging.Log;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

@Component
public class JsoupHtmlParserImpl implements HtmlParser {

  @Autowired
  private Log log;

  private HtmlParserHandler parserHandler;

  private Document document;

  public void setParserHandler(HtmlParserHandler handler) {
    this.parserHandler = handler;
  }

  public void parseFile(String filepath) throws IOException {
    File file = new File(filepath);
    parseFile(file);
  }

  public void parseFile(File file) throws IOException {
    String html = parserHandler.willProcessFile(file);
    if (html != null) {
      this.document = Jsoup.parse(html);
    } else {
      this.document = Jsoup.parse(file, null);
    }
  }

  public void parseUrl(URL url) throws IOException {
    String html = parserHandler.willProcessUrl(url);
    if (html != null) {
      this.document = Jsoup.parse(html);
    } else {
      this.document = Jsoup.parse(url, 10000);
    }
  }

  public void parseHtmlString(String html) {
    String modhtml = parserHandler.willProcessHtml(html);
    if (modhtml != null) {
      html = modhtml;
    }
    this.document = Jsoup.parse(html);
  }

  private void handleElements(Elements elements) {
    List<Node> allNodes = elements.first().childNodes();
    processNodeList(allNodes);

  }

  public void processStructure() {
    this.parserHandler.beginDocument();

    Elements headElements = document.getElementsByTag("head");
    this.parserHandler.beginHead();
    this.handleElements(headElements);
    this.parserHandler.endHead();

    Elements bodyElements = document.getElementsByTag("body");
    this.parserHandler.beginBody();
    this.handleElements(bodyElements);
    this.parserHandler.endBody();

    this.parserHandler.endDocument();
  }

  private void processNodeList(List<Node> nodelist) {
    if (nodelist == null) {
      return;
    }
    for (Node n : nodelist) {
      processNode(n);
    }
  }

  private void processNode(Node node) {
    if (node == null) {
      return;
    }

    if (node instanceof Comment) {
      this.parserHandler.foundComment(node.outerHtml());
    } else if (node instanceof Element) {
      Element enode = (Element) node;
      String tagname = enode.tagName();
      String text = cleanOutControlChars(enode.text());
      String outerHTML = cleanOutControlChars(enode.outerHtml());
      String innerHTML = cleanOutControlChars(enode.html());
      Attributes attrs = enode.attributes();
      Map<String, String> attributeMap = new HashMap<String, String>();
      for (Attribute attr : attrs) {
        attributeMap.put(attr.getKey(), attr.getValue());
      }

      this.parserHandler.foundElement(tagname, innerHTML, outerHTML,
          text, attributeMap);

      this.parserHandler.beginElementChildren();
      processNodeList(node.childNodes());
      this.parserHandler.endElementChildren();
    }
  }

  private String cleanOutControlChars(String s) {
    StringBuffer sb = new StringBuffer();
    StringBuffer foundControlChars = new StringBuffer();
    char[] chars = s.toCharArray();
    for (char c : chars) {
      if (CharUtils.isAsciiControl(c)) {
        foundControlChars.append(CharUtils.unicodeEscaped(c)).append(" ");
      } else {
        sb.append(c);
      }
    }

    if (foundControlChars.length() > 0) {
      log.info("Found a control chars: " + foundControlChars.toString());
    }
    return sb.toString();
  }
}
TOP

Related Classes of com.cardence.lawshelf.html.JsoupHtmlParserImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.