// Package com.cardence.lawshelf.handler
//
// Source code of com.cardence.lawshelf.handler.UscParserHandler

package com.cardence.lawshelf.handler;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang.BooleanUtils;
import org.apache.commons.logging.Log;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import org.w3c.tidy.Tidy;

import com.cardence.lawshelf.html.HtmlParserHandler;
import com.cardence.lawshelf.html.MetaNode;
import com.cardence.lawshelf.model.Attribute;
import com.cardence.lawshelf.model.AttributeDao;
import com.cardence.lawshelf.model.BaseDatabaseObject;
import com.cardence.lawshelf.model.Code;
import com.cardence.lawshelf.model.CodeAlias;
import com.cardence.lawshelf.model.CodeAliasDao;
import com.cardence.lawshelf.model.CodeCollection;
import com.cardence.lawshelf.model.CodeCollectionDao;
import com.cardence.lawshelf.model.CodeDao;
import com.cardence.lawshelf.model.ContentFull;
import com.cardence.lawshelf.model.ContentFullDao;
import com.cardence.lawshelf.model.ContentPart;
import com.cardence.lawshelf.model.ContentPartDao;
import com.cardence.lawshelf.model.Section;
import com.cardence.lawshelf.model.SectionDao;
import com.cardence.lawshelf.model.UscCode;
import com.cardence.lawshelf.model.UscCollection;
import com.cardence.lawshelf.model.UscPrelimCollection;
import com.cardence.lawshelf.model.UscSection;
import com.cardence.lawshelf.model.helper.EntityPersistenceHelper;

@Component
public class UscParserHandler implements HtmlParserHandler {

  private static final String REGEX_1s = "[0-9]";
  private static final String REGEX_10s = "[1-9][0-9]";
  private static final String REGEX_100s = "[1-9][1-9][0-9]";
  private static final String REGEX_1000s = "[1-9][1-9][1-9][0-9]";
  private static final String REGEX_10000s = "[1-9][1-9][1-9][1-9][0-9]";
  private static final String REGEX_100000s = "[1-9][1-9][1-9][1-9][1-9][0-9]";
  private static final String REGEX_PDF_PAGENUM = //
  "(" + org.apache.commons.lang.StringUtils.join(new Object[] { //
      REGEX_100000s, //
          REGEX_10000s, //
          REGEX_1000s, //
          REGEX_100s, //
          REGEX_10s, //
          REGEX_1s //
      }, '|') //
      + ")";
  private static final String REGEX_PDF_START = "<!\\-\\- PDFPage:";
  private static final String REGEX_PDF_END = " \\-\\->";
  private static final String REGEX_PDF_PATTERN = REGEX_PDF_START + REGEX_PDF_PAGENUM + REGEX_PDF_END;
  // String regex =
  // "<!\\-\\- PDFPage:([1-9][0-9][0-9]|[1-9][0-9]|[0-9]) \\-\\->";

  /* Injected logger used for info/debug/warn/error output throughout this handler. */
  @Autowired
  private Log log;
 
  /* Injected helper through which code collections, codes, sections and content are stored. */
  @Autowired
  private EntityPersistenceHelper persistence;
 
  /*
   * Set through setIsUscPrelim(String); stays null until that setter is
   * called. Selects UscPrelimCollection vs UscCollection while the head is
   * processed.
   */
  private Boolean isUscPrelim;

  /* Gates the extra diagnostic output; no setter is visible in this class. */
  private boolean debugMode;

  /* True between beginHead()/endHead() and beginBody()/endBody() respectively. */
  private boolean isHead;
  private boolean isBody;

  /* Item path -> last sequence number handed out to a child stored under that path. */
  private Map<String, Integer> sectionSequenceTracker;
  /* Field keys for which UscField.isKnownFieldKey() was false; reported in endDocument(). */
  private Set<String> unrecognizedKeySet;

  /*
   * Reset to 0 by fieldStart(), to -1 by fieldEnd(), and moved up/down by
   * beginElementChildren()/endElementChildren(). addContentElement() only
   * records elements while this is 0.
   */
  private int elementDepthFromField = -1;

  /* META NODE VARIABLES */
  /* Distinct "tagname, classname" strings seen while the current meta node is named "statute". */
  private Set<String> statuteElementMetaNodeClassPairs = new HashSet<String>();
  /* One "section" meta node per documentid comment, in encounter order. */
  private List<MetaNode> sectionMetaNodeList = new ArrayList<MetaNode>();
  /* Open field-start comments that have not yet seen their field-end. */
  private Deque<MetaNode> workingCommentStack = new ArrayDeque<MetaNode>();

  /*
   * Only Top Level UscFields (nothing nested) <br> <!-- field-start: --> and
   * <!-- field-end: -->
   * Created lazily by getWorkingUscFieldList() and cleared in sectionEnd().
   */
  private List<UscField> workingUscFieldList = null;

  /*
   * The most current one being worked on. This is important since the stack
   * will only contain the top level elements, which would in turn have this
   * object; however, it may prove difficult to find.
   */
  private UscField currentUscField = null;

  /* Closed meta nodes that had no enclosing open node; handed to the section node in addMetaNodesToSection(). */
  private List<MetaNode> toplevelList = new ArrayList<MetaNode>();
  /* Per-document tracker for the current collection/code/section and the path lookups; created in beginDocument(). */
  private ModelTracker model = null;

  /* Static counter used only to sample debug output (every 25th element) in foundMetaNodeElement(). */
  private static int elementcount;

  public void beginDocument() {
    log.info(" -- DOCUMENT START --");

    this.model = new ModelTracker();
    this.sectionSequenceTracker = new HashMap<String, Integer>();
    this.unrecognizedKeySet = new TreeSet<String>();
  }

  public void endDocument() {
    if (debugMode) {
      debugMetaNodePrint();
    }

    log.info(" -- DOCUMENT END --");
    log.info("");
    log.info(" ---- PRINTING OUT INFORMATION ----");
    for (String s : this.unrecognizedKeySet) {
      log.info("  Found unrecognized field-start/field-end comment pair: " + s);
    }
  }

  public void foundComment(String comment) {
    processCommentType(comment);
  }

  public void foundElement(String tagname, String innerHTML, String outerHTML, String text,
      Map<String, String> attributeMap) {

    TagElement tagelement = new TagElement();
    tagelement.setTagname(tagname);
    tagelement.setTagvalue(text);
    tagelement.setInnerHTML(innerHTML);
    tagelement.setOuterHTML(outerHTML);
    tagelement.setAttributeMap(attributeMap);
    addContentElement(tagelement);

    foundMetaNodeElement(tagname, outerHTML, text, attributeMap);
  }

  public void beginElementChildren() {
    this.elementDepthFromField++;
  }

  public void endElementChildren() {
    this.elementDepthFromField--;
  }

  public void beginHead() {
    this.isHead = true;
  }

  public void endHead() {
    CodeCollection codeCollection = model.getCodeCollection();
    Code code = model.getCode();

    persistence.storeCodeCollection(codeCollection);

    code.setCodeCollection(codeCollection);
    persistence.storeCode(code);

    this.isHead = false;
  }

  public void beginBody() {
    this.isBody = true;
  }

  public void endBody() {

    Section priorSection = this.model.getSection();
    if (priorSection != null) {
      sectionEnd();
    }

    this.isBody = false;
    addMetaNodesToSection(null);
  }

  private void processCommentType(String commentHtml) {
    String[] bothSides = StringUtils.split(commentHtml, ":");
    if (bothSides == null) {
      return;
    }

    String type = StringUtils.trimWhitespace(StringUtils.replace(bothSides[0], "<!--", ""));
    String value = StringUtils.trimWhitespace(StringUtils.replace(bothSides[1], "-->", ""));

    if (this.isHead) {
      CodeCollection codeCollection = model.getCodeCollection();
      Code code = model.getCode();

      if (codeCollection == null) {
        if (isUscPrelim) {
          codeCollection = new UscPrelimCollection()
        } else {
          codeCollection = new UscCollection();
        }
        model.setCodeCollection(codeCollection);
      }
      if (code == null) {
        code = new UscCode();
        model.setCode(code);
      }

      if (type.equals("AUTHORITIES-PUBLICATION-NAME")) {
        code.addAlias(value);
        code.addAttribute(type, value);
      } else if (type.equals("AUTHORITIES-PUBLICATION-ID")) {
        code.addAlias(value);
        code.addAttribute(type, value);
      } else if (type.equals("AUTHORITIES-PUBLICATION-YEAR")) {
        if (!isUscPrelim) {
          codeCollection.setYear(Integer.parseInt(value));
        }
        code.addAttribute(type, value);
      } else if (type.equals("AUTHORITIES-LAWS-ENACTED-THROUGH-DATE")) {
        // ignore... should be caught again on the first documentid
        // element
        code.addAttribute(type, value);
      } else if (type.equals("AUTHORITIES-USC-TITLE-NAME")) {
        String[] valueParts = StringUtils.split(value, "-");
        code.setName(valueParts[1].trim());
        code.addAttribute(type, value);
      } else if (type.equals("AUTHORITIES-USC-TITLE-ENUM")) {
        code.setCodeSequence(Integer.parseInt(value));
        code.addAttribute(type, value);
      } else if (type.equals("AUTHORITIES-USC-TITLE-STATUS")) {
        code.setStatus(value);
        code.addAttribute(type, value);
      }
    } else if (this.isBody) {
      Section section = this.model.getSection();

      if (type.equals("documentid")) {

        this.addMetaNodesToSection(value);
        this.sectionStart(commentHtml);

      } else if (type.equals("field-start")) {
        this.fieldStart(value);
        this.startMetaNode(new MetaNode(value, "comment"));

      } else if (type.equals("field-end")) {
        this.fieldEnd(value);
        this.endMetaNode(value);

      } else if (type.equals("expcite")) {
        this.updateMetaNodeSectionTitle(value);
        if (section != null) {
          section.addAttribute(type, value);
        }

      } else if (type.equals("itempath")) {
        this.updateMetaNodeSectionTitle(value);
        if (section != null) {
          section.setSourceReference(value);
          section.addAttribute(type, value);
        }

      } else if (type.equals("itemsortkey")) {
        this.updateMetaNodeSectionTitle(value);
        if (section != null) {
          section.addAttribute(type, value);
        }
      }
    }
  }

  private void fieldStart(String fieldKey) {
    this.elementDepthFromField = 0;

    UscField field = new UscField();
    field.setFieldName(fieldKey);
    if (!field.isKnownFieldKey()) {
      unrecognizedKeySet.add(fieldKey);
    }

    if (this.currentUscField != null) {
      // we are not gonna store nested elements on the stack.
      this.currentUscField.addChild(field);
    } else {
      this.getWorkingUscFieldList().add(field);
    }

    // this most current one will always be here
    this.currentUscField = field;
  }

  private void fieldEnd(String fieldKey) {
    this.elementDepthFromField = -1;

    if (this.currentUscField != null) {
      if (this.currentUscField.getParent() != null) {
        this.currentUscField = this.currentUscField.getParent();
      } else {
        this.currentUscField = null;
      }
    }
  }

  /**
   * RESPONSIBILITIES:
   *
   * - IF PREVIOUS EXISTS... CALL ENDSECTION FOR IT <br>
   * - Since itemkey has not been read yet (only the docuemntid, we need to
   * wait till the sectionEnd() method to find the parent
   *
   * @param documentString
   */
  private void sectionStart(String documentString) {
    Section priorSection = this.model.getSection();

    if (priorSection != null) {
      // NOT THE FIRST - CLOSE THE PREVIOUS ONE
      sectionEnd();
    }
    // create a new one
    Section newSection = new UscSection();
    newSection.setCode(model.getCode());

    // process document text from comment
    String[] tokenString = StringUtils.tokenizeToStringArray(documentString, " ", true, true);
    for (String str : tokenString) {
      String[] documentElements = StringUtils.split(str, ":");
      if (documentElements == null) {
        log.debug("documentElements does not contain a ':' for string: " + str);
      } else {
        newSection.addAttribute(documentElements[0], documentElements[1]);
      }
    }

    // Is the previous element my parent???

    // store in the model tracker
    this.model.setSection(newSection);
  }

  /**
   * Finalizes and persists the section currently held by the model tracker.
   *
   * Responsibilities:
   *
   * - Derive the parent item path from the section's source reference (the
   * "itempath" comment value)<br>
   * - Look up the parent's primary key in the model tracker; if the parent
   * was never seen, synthesize it via buildMissingParentSection()<br>
   * - Assign parent id, parent level position and the next sequence number
   * under that parent<br>
   * - DB WRITE: SECTION PLUS ALL CONTENT (FULL AND PARTS)<br>
   * - Register the stored section's primary key and level position under its
   * own item path, and reset the working field state
   *
   * NOTE(review): assumes a current section exists; both visible callers
   * check for null before calling.
   */
  private void sectionEnd() {
    Section currentSection = this.model.getSection();
    String itempath = currentSection.getSourceReference();

    // get the primary key for the "parent" section
    String[] pathelements = getItemPathElements(itempath);

    // the last condition skips item paths that consist of a single token
    // with no "/" at all
    if (pathelements != null && pathelements.length > 0 && !pathelements[0].equals(itempath)) {
      // drop the last element; what remains are the parent's path elements
      // (only the length of this array is used below)
      pathelements = Arrays.copyOf(pathelements, pathelements.length - 1);

      // String parentItempath = "/"
      // + StringUtils.arrayToDelimitedString(pathelements, "/");

      String parentItempath = getParentItemPath(itempath);
      Integer parentPrimaryKey = this.model.findPrimaryKey(parentItempath);

      if (parentPrimaryKey == null && pathelements.length > 0) {
        // BUILD THE MISSING PARENT!!!
        this.buildMissingParentSection(parentItempath);
        // re-read: the call above registers the new parent's key
         parentPrimaryKey = this.model.findPrimaryKey(parentItempath);
      }
      String parentLevelPosition = this.model.findLevelPosition(parentItempath);
      // sequence under the parent: one past the last value handed out, or 1
      // when nothing is recorded for the parent yet
      Integer currentSequence = this.sectionSequenceTracker.get(parentItempath);
      Integer nextSequence = 1;
      if (currentSequence != null) {
        nextSequence = currentSequence + 1;
      }

      log.debug("looking for parent primary key: used itempath [" + itempath
          + "], converted to parent item path [" + parentItempath + "] found pk [" + parentPrimaryKey + "]");

      // add parent's primary key
      currentSection.setParentSectionId(parentPrimaryKey);
      currentSection.setSectionSequence(nextSequence);
      currentSection.setParentLevelPosition(parentLevelPosition);
      this.sectionSequenceTracker.put(parentItempath, nextSequence);
    }
    // SAVE SECTION
    persistence.storeSection(currentSection);

    // CONFIRM: ADD PK TO MODEL TRACKER
    // (the assert is only evaluated when the JVM runs with assertions enabled)
    assert currentSection.getId() != null : "Attempting to add a section's primary key, but it is null after a save";
    log.debug("Adding Primary Key for itempath->sectionId: " + itempath + "," + currentSection.getId());
    this.model.addPrimaryKeyMapping(itempath, currentSection.getId());
    this.model.addLevelPositionMapping(itempath, currentSection.getLevelPosition());
    this.sectionSequenceTracker.put(itempath, 0); // this is for any
                            // children

    // SAVE CONTENTS
    ContentFull contentFull = this.convertUscFieldsToContentFull(currentSection);
    List<ContentPart> contentPartList = this.convertUscFieldsToContentPartList(currentSection);

    // RULE: If content is ONLY NOTES, it almost always
    // (I have not seen an exception yet)
    // is all DIVS and TABLES...
    // VERY difficult to chop up into content parts... only store full for
    // now.
    // '!' binds tighter than '==', so the flag passed below is true exactly
    // when the section is NOT a new record.
    // NOTE(review): presumably this tells the helper to replace existing
    // content - confirm against EntityPersistenceHelper.
    persistence.storeContentFull(contentFull, (!currentSection.getIsNewRecord() == Boolean.TRUE));
    if (!contentFull.isNotes()) {
      persistence.storeContentParts(currentSection.getId(), contentFull.getId(), contentPartList,
          (!currentSection.getIsNewRecord() == Boolean.TRUE));
    }

    // reset the per-section working state
    this.workingUscFieldList = null;
    this.currentUscField = null;

    this.model.setSection(null);
  }

  private String[] getItemPathElements(String itempath) {
    return StringUtils.tokenizeToStringArray(itempath, "/", true, true);
  }

  private String getItemPathString(String[] pathelements) {
    if (pathelements != null && pathelements.length > 0) {
      return "/" + StringUtils.arrayToDelimitedString(pathelements, "/");
    } else {
      return "/";
    }
  }

  private String getParentItemPath(String itempath) {
    return getParentItemPath(itempath, 1);
  }

  private String getParentItemPath(String itempath, Integer levelsUp) {
    if (levelsUp == null || levelsUp < 1) {
      levelsUp = 1;
    }

    String[] pathelements = getItemPathElements(itempath);

    if (pathelements != null && pathelements.length > 0 && pathelements.length > levelsUp
        && !pathelements[0].equals(itempath)) {

      pathelements = Arrays.copyOf(pathelements, pathelements.length - levelsUp);
    }

    return getItemPathString(pathelements);
  }

  private String extractHeadingFromItemPath(String itempath, Integer levelsUp) {
    return extractHeadingFromItemPath(getItemPathElements(itempath), levelsUp);
  }

  private String extractHeadingFromItemPath(String[] pathelements, Integer levelsUp) {
    if (levelsUp == null || levelsUp < 0) {
      levelsUp = 0;
    }

    int lengthOffset = levelsUp + 1;

    if (pathelements != null && pathelements.length > 0 && pathelements.length > levelsUp) {

      return pathelements[pathelements.length - lengthOffset];
    } else {
      return "";
    }
  }

  private Section buildMissingParentSection(String parentItemPath) {
    // NOTE: THIS RECURSES AS NEEDED
    //
    // rare circumstance!
    // this does happen where the parent does not have its own
    // section
    // ex. /title 10/part 4/subpart 5/Sec. 123
    // Current = 123
    // Parent = subpart 5, but there is not a subpart 5
    // create subpart 5

    String[] pathelements = getItemPathElements(parentItemPath);
    String onlyHeadingWeCouldFigureOut = extractHeadingFromItemPath(pathelements, 0);
    Section newParentSection = new UscSection();
    newParentSection.setCode(model.getCode());
    newParentSection.setHeading(onlyHeadingWeCouldFigureOut);
    newParentSection.setShortHeading(onlyHeadingWeCouldFigureOut);
    newParentSection.setSourceReference(parentItemPath);
    newParentSection.addAttribute("itempath", parentItemPath);

    // in order to make this work right, we NEED the parent's parent
    // information to "fit" this in
    // LOOKUP PARENT'S PARENT
    String grampsItemPath = getParentItemPath(parentItemPath);
    String[] grampsPathElements = getItemPathElements(grampsItemPath);
    Integer grampsPrimaryKey = this.model.findPrimaryKey(grampsItemPath);

    if (grampsPrimaryKey == null && grampsPathElements.length > 0) {
      buildMissingParentSection(grampsItemPath);
    }

    String grampsLevelPosition = this.model.findLevelPosition(grampsItemPath);
    Integer grampsCurrentSequence = this.sectionSequenceTracker.get(grampsItemPath);
    Integer grampsNextSequence = 1;
    if (grampsCurrentSequence != null) {
      grampsNextSequence = grampsCurrentSequence + 1;
    }

    // add parent's parent information
    newParentSection.setParentSectionId(grampsPrimaryKey);
    newParentSection.setSectionSequence(grampsNextSequence);
    newParentSection.setParentLevelPosition(grampsLevelPosition);

    // STORE NEW SIMULATED PARENT
    persistence.storeSection(newParentSection);

    assert newParentSection.getId() != null : "Attempting to add a new parent section's primary key, but it is null after a save";
    log.debug("Adding Primary Key for itempath->sectionId: " + parentItemPath + "," + newParentSection.getId());
    this.model.addPrimaryKeyMapping(parentItemPath, newParentSection.getId());
    this.model.addLevelPositionMapping(parentItemPath, newParentSection.getLevelPosition());
    this.sectionSequenceTracker.put(parentItemPath, 0);
    // NEW PARENT IS NOW READY

    return newParentSection;
  }

  private void addContentElement(TagElement el) {

    if (this.elementDepthFromField == 0) {
      if (this.currentUscField != null) {
        // we only care if this exists

        if (el.isHeading() && this.model.getSection() != null && this.model.getSection().getHeading() == null) {
          this.model.getSection().setHeading(el.getTagvalue());
          // 9.20.12 - changed from getinnerhtml to gettagvalue
        }

        this.currentUscField.addTagElement(el);
      }
    } else {
      if (this.currentUscField != null) {
        if (this.currentUscField.isHeading()) {
          if ("strong".equals(el.getTagname()) || "cap-smallcap".equals(el.getTagname())) {
            String s = this.model.getSection().getHeading();
            s = StringUtils.replace(s, "<" + el.getTagname() + ">", "");
            s = StringUtils.replace(s, "</" + el.getTagname() + ">", "");
            this.model.getSection().setHeading(s);
          } else if ("sup".equals(el.getTagname())) {

            String s = this.model.getSection().getHeading();
            int start = s.indexOf("<sup>");
            int end = s.indexOf("</sup>") + 6;

            if (start == -1 || end == -1) {
              log.warn("Could not find <sup> or </sup> in heading: " + s);
            } else {
              s = s.substring(0, start) + s.substring(end);
              this.model.getSection().setHeading(s);
            }
          }
        }
      }
    }

  }


  /* ****
   *
   * ### USC FIELD METHODS ###
   *
   * ***
   */

  private List<UscField> getWorkingUscFieldList() {
    if (this.workingUscFieldList == null) {
      this.workingUscFieldList = new ArrayList<UscField>();
    }
    return workingUscFieldList;
  }

  /* ****
   *
   * ### META NODE METHODS ###
   *
   * ***
   */

  private MetaNode getCurrentMetaNode() {
    if (this.workingCommentStack == null) {
      return null;
    }
    return this.workingCommentStack.peek();
  }

  private void startMetaNode(MetaNode meta) {
    this.workingCommentStack.push(meta);
  }

  private void endMetaNode(String name) {
    if (this.workingCommentStack.isEmpty()) {
      if (debugMode) {
        System.out.println("ERROR: Why is the working stack empty? need to end node [" + name + "]");
      }
    } else {
      MetaNode thisNode = this.workingCommentStack.pop();
      if (!thisNode.getName().equals(name)) {
        if (debugMode) {
          System.out.println("ERROR: Why is the last item on the stack not me? me=[" + name
              + "]; last item = [" + thisNode.getName() + "]");
        }
      } else {
        // looks good!
        MetaNode priorNode = this.workingCommentStack.peek();
        if (priorNode == null) {
          // add to toplevel list
          this.toplevelList.add(thisNode);
        } else {
          // add to prior node as child
          priorNode.addChild(thisNode);
        }
      }
    }
  }

  private void updateMetaNodeSectionTitle(String newTitle) {
    if (this.sectionMetaNodeList.isEmpty()) {
      // this only happens at the beginning of the document
    } else {
      if (newTitle == null) {
        // this only happens at the end of the document
      } else {
        // init the next one
        MetaNode sectionNode = this.sectionMetaNodeList.get((this.sectionMetaNodeList.size() - 1));
        if (sectionNode.getType().equals("section")) {
          sectionNode.setName(newTitle);
        }
      }

    }
  }

  private void addMetaNodesToSection(String nextTitle) {
    if (this.sectionMetaNodeList.isEmpty()) {
      // this only happens at the beginning of the document
    } else {
      // not the first one
      // copy the working stack into the
      // previously created node
      MetaNode sectionToFinish = this.sectionMetaNodeList.get(this.sectionMetaNodeList.size() - 1);
      sectionToFinish.setChildren(this.toplevelList);
      this.toplevelList = new ArrayList<MetaNode>();
    }

    if (nextTitle == null) {
      // this only happens at the end of the document
    } else {
      // init the next one
      MetaNode sectionNode = new MetaNode("", "section");
      this.sectionMetaNodeList.add(sectionNode);
    }

  }

  private void foundMetaNodeElement(String tagname, String html, String text, Map<String, String> attributeMap) {
    MetaNode node = this.getCurrentMetaNode();
    if (node == null) {
      if (debugMode) {
        log.debug("Why is this node null for tagname: " + tagname);
      }
    } else {
      if (node.getName().equals("statute")) {
        String classname = attributeMap.get("class");
        if (classname != null) {
          this.statuteElementMetaNodeClassPairs.add(tagname + ", " + classname);

          if (debugMode && elementcount++ % 25 == 0) {
            System.out.println("Text: " + text);
            System.out.println("HTML: " + html);
          }
        } else {
          if (debugMode) {
            log.debug("Tag " + tagname + " doesnt have a class attribute");
          }
        }
      }
    }
  }

  private void debugMetaNodePrint() {
    if (debugMode) {
      StringBuffer sb = new StringBuffer();
      for (MetaNode meta : sectionMetaNodeList) {
        sb.append(meta.toString());
      }

      sb.append("\n");

      for (String classname : this.statuteElementMetaNodeClassPairs) {
        sb.append("\n");
        sb.append("<classname>");
        sb.append(classname);
        sb.append("</classname>");
      }

      Tidy tidy = new Tidy(); // obtain a new Tidy instance
      tidy.setXmlOut(true);
      tidy.setXmlTags(true);
      tidy.setXmlSpace(true);

      ByteArrayInputStream bis = new ByteArrayInputStream(sb.toString().getBytes());
      tidy.parse(bis, System.out); // run tidy, providing an input and
                      // output
                      // stream
    }
  }

  private ContentFull convertUscFieldsToContentFull(Section section) {
    ContentFull content = new ContentFull();
    content.setNotes(true);

    for (Iterator<UscField> it = this.getWorkingUscFieldList().iterator(); it.hasNext();) {

      content = convertUscFieldToContentFull(it.next(), content);
    }

    content.setFormatType(UscTags.FORMATTYPE_HTML);
    content.setSection(section);

    return content;
  }

  private List<ContentPart> convertUscFieldsToContentPartList(Section section) {
    ArrayList<ContentPart> list = new ArrayList<ContentPart>();

    int counter = 0;
    for (Iterator<UscField> it = this.getWorkingUscFieldList().iterator(); it.hasNext();) {

      List<ContentPart> convertedItems = convertUscFieldToContentPartList(it.next(), counter, section);
      if (convertedItems != null) {
        counter += convertedItems.size();
      }

      list.addAll(convertedItems);
    }

    return list;
  }

  private ContentFull convertUscFieldToContentFull(UscField field, ContentFull content) {

    if (field.isPartOfStructural() || field.isPartOfCode() || field.isPartOfNotes()) {
      // content.setNotes(true);
    } else {
      content.setNotes(false);
    }

    for (Iterator<TagElement> it = field.getTagElementList().iterator(); it.hasNext();) {
      TagElement el = it.next();

      content = convertTagElementToContentFull(el, content);
    }

    List<UscField> children = field.getChildren();
    if (children != null) {
      for (Iterator<UscField> chIt = children.iterator(); chIt.hasNext();) {
        content = convertUscFieldToContentFull(chIt.next(), content);
      }
    }

    return content;
  }

  private List<ContentPart> convertUscFieldToContentPartList(UscField field, int startSequence, Section section) {
    ArrayList<ContentPart> contentPartList = new ArrayList<ContentPart>();

    int counter = startSequence;
    for (Iterator<TagElement> it = field.getTagElementList().iterator(); it.hasNext();) {
      TagElement el = it.next();
      ContentPart content = convertTagElementToContentPart(el);
      content.setContentType(field.getTopLevelFieldName());
      content.setContentSequence(counter++);
      content.setSection(section);

      if (field.isPartOfStructural() || field.isPartOfCode() || field.isPartOfNotes()) {
        content.setNotes(true);
        content.setNotesType(field.getFieldName());
      } else {
        content.setNotes(false);
      }

      contentPartList.add(content);
    }

    List<UscField> children = field.getChildren();
    if (children != null) {
      for (Iterator<UscField> chIt = children.iterator(); chIt.hasNext();) {
        List<ContentPart> convertedItems = convertUscFieldToContentPartList(chIt.next(), counter, section);
        if (convertedItems != null) {
          counter += convertedItems.size();
        }

        contentPartList.addAll(convertedItems);
      }
    }

    return contentPartList;
  }

  private ContentFull convertTagElementToContentFull(TagElement el, ContentFull content) {
    content.addContent(el.getOuterHTML());
    // addAttributes(el.getAttributeMap(), content);
    return content;
  }

  private ContentPart convertTagElementToContentPart(TagElement el) {
    ContentPart content = new ContentPart();
    if (el.isDiv() || el.isTable()) {
      content.setContent(el.getOuterHTML());
    } else {
      content.setContent(el.getInnerHTML());
    }
    content.setHeader(el.isHeading());
    content.setFormatType(UscTags.FORMATTYPE_HTML);

    addAttributes(el.getAttributeMap(), content);

    return content;
  }

  private void addAttributes(Map<String, String> attrs, BaseDatabaseObject compatibleObject) {
    if (attrs == null || compatibleObject == null) {
      return;
    }

    for (Iterator<Entry<String, String>> it = attrs.entrySet().iterator(); it.hasNext();) {
      Entry<String, String> attr = it.next();
      compatibleObject.addAttribute(attr.getKey(), attr.getValue());
    }
  }

  public String willProcessFile(File file) {
    FileInputStream fin = null;
    BufferedInputStream bin = null;

    try {
      fin = new FileInputStream(file);
      bin = new BufferedInputStream(fin);

      StringBuffer sb = new StringBuffer();

      byte[] bytes = new byte[102400];
      int len = -1;
      while ((len = bin.read(bytes)) != -1) {
        sb.append(new String(bytes, 0, len));
      }

      return willProcessHtml(sb.toString());
    } catch (Throwable t) {
      log.error("Could not open up file: " + t.getLocalizedMessage());
    } finally {
      try {
        if (bin != null) {
          bin.close();
          bin = null;
        }
        if (fin != null) {
          fin.close();
          fin = null;
        }
      } catch (Throwable t) {
      }
    }
    return null;
  }

  public String willProcessUrl(URL url) {
    try {
      return willProcessFile(new File(url.getFile()));
    } catch (Throwable t) {
      log.error("Could not convert url to file: " + t.getLocalizedMessage());
      return null;
    }
  }

  public String willProcessHtml(String html) {
    try {
      /*
       * Pattern pattern = Pattern.compile(REGEX_PDF_PATTERN); Matcher
       * match = pattern.matcher(html); while (match.find()) {
       * System.out.println("FOUND MATCH [" + match.group() + "]"); }
       */

      return html.replaceAll(REGEX_PDF_PATTERN, "");
    } catch (Throwable t) {
      log.error("Could not filter html with regex [" + REGEX_PDF_PATTERN + "]: " + t.getLocalizedMessage());
      return null;
    }
  }

  public void setIsUscPrelim(String isUscPrelim) {
    this.isUscPrelim =BooleanUtils.toBoolean(isUscPrelim);
  }

}
// TOP
//
// Related Classes of com.cardence.lawshelf.handler.UscParserHandler
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.