package com.cardence.lawshelf.handler;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.lang.BooleanUtils;
import org.apache.commons.logging.Log;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import org.w3c.tidy.Tidy;
import com.cardence.lawshelf.html.HtmlParserHandler;
import com.cardence.lawshelf.html.MetaNode;
import com.cardence.lawshelf.model.Attribute;
import com.cardence.lawshelf.model.AttributeDao;
import com.cardence.lawshelf.model.BaseDatabaseObject;
import com.cardence.lawshelf.model.Code;
import com.cardence.lawshelf.model.CodeAlias;
import com.cardence.lawshelf.model.CodeAliasDao;
import com.cardence.lawshelf.model.CodeCollection;
import com.cardence.lawshelf.model.CodeCollectionDao;
import com.cardence.lawshelf.model.CodeDao;
import com.cardence.lawshelf.model.ContentFull;
import com.cardence.lawshelf.model.ContentFullDao;
import com.cardence.lawshelf.model.ContentPart;
import com.cardence.lawshelf.model.ContentPartDao;
import com.cardence.lawshelf.model.Section;
import com.cardence.lawshelf.model.SectionDao;
import com.cardence.lawshelf.model.UscCode;
import com.cardence.lawshelf.model.UscCollection;
import com.cardence.lawshelf.model.UscPrelimCollection;
import com.cardence.lawshelf.model.UscSection;
import com.cardence.lawshelf.model.helper.EntityPersistenceHelper;
@Component
public class UscParserHandler implements HtmlParserHandler {
/*
 * Building blocks for a PDF page number of one to six digits. Only the
 * leading digit of a multi-digit number is restricted to 1-9; every
 * following digit may be 0-9 (otherwise numbers such as 100 or 2005 would
 * never match).
 */
private static final String REGEX_1s = "[0-9]";
private static final String REGEX_10s = "[1-9][0-9]";
private static final String REGEX_100s = "[1-9][0-9]{2}";
private static final String REGEX_1000s = "[1-9][0-9]{3}";
private static final String REGEX_10000s = "[1-9][0-9]{4}";
private static final String REGEX_100000s = "[1-9][0-9]{5}";
/* Alternation of the page-number forms above, longest first. */
private static final String REGEX_PDF_PAGENUM = //
"(" + org.apache.commons.lang.StringUtils.join(new Object[] { //
REGEX_100000s, //
REGEX_10000s, //
REGEX_1000s, //
REGEX_100s, //
REGEX_10s, //
REGEX_1s //
}, '|') //
+ ")";
private static final String REGEX_PDF_START = "<!\\-\\- PDFPage:";
private static final String REGEX_PDF_END = " \\-\\->";
/* Full pattern for a page marker comment, e.g. "<!-- PDFPage:105 -->". */
private static final String REGEX_PDF_PATTERN = REGEX_PDF_START + REGEX_PDF_PAGENUM + REGEX_PDF_END;
/* Logger, injected by Spring. */
@Autowired
private Log log;
/* Persistence facade used to store collections, codes, sections and content; injected by Spring. */
@Autowired
private EntityPersistenceHelper persistence;
/* Set through setIsUscPrelim(String); remains null until that setter has been called. */
private Boolean isUscPrelim;
/* Enables the extra diagnostic output. NOTE(review): no code in this class assigns it. */
private boolean debugMode;
/* True between beginHead() and endHead(). */
private boolean isHead;
/* True between beginBody() and endBody(). */
private boolean isBody;
/* Item path -> last sequence number handed out to a child stored under that path. */
private Map<String, Integer> sectionSequenceTracker;
/* Field keys that UscField did not recognize; reported in endDocument(). */
private Set<String> unrecognizedKeySet;
/*
 * Element nesting depth relative to the current field: set to 0 by
 * fieldStart(), to -1 by fieldEnd(), and incremented/decremented by
 * beginElementChildren()/endElementChildren().
 */
private int elementDepthFromField = -1;
/* META NODE VARIABLES */
/* "tagname, classname" pairs seen on elements while the current meta node is named "statute". */
private Set<String> statuteElementMetaNodeClassPairs = new HashSet<String>();
/* One "section" meta node per documentid comment, in document order. */
private List<MetaNode> sectionMetaNodeList = new ArrayList<MetaNode>();
/* Stack of meta nodes opened by field-start comments and not yet closed by field-end. */
private Deque<MetaNode> workingCommentStack = new ArrayDeque<MetaNode>();
/*
 * Only Top Level UscFields (nothing nested) <br> <!-- field-start: --> and
 * <!-- field-end: -->
 */
private List<UscField> workingUscFieldList = null;
/*
 * The most current one being worked on. This is important since the stack
 * will only contain the top level elements, which would in turn have this
 * object; however, it may prove difficult to find.
 */
private UscField currentUscField = null;
/* Closed meta nodes that had no enclosing node; attached to the section node by addMetaNodesToSection(). */
private List<MetaNode> toplevelList = new ArrayList<MetaNode>();
/* Per-document tracker for the current collection, code and section; recreated in beginDocument(). */
private ModelTracker model = null;
/* Shared counter used in foundMetaNodeElement() to print only every 25th element in debug mode. */
private static int elementcount;
/**
 * Called when parsing of a document starts: logs the event and creates
 * fresh per-document tracking state.
 */
public void beginDocument() {
    log.info(" -- DOCUMENT START --");
    model = new ModelTracker();
    sectionSequenceTracker = new HashMap<String, Integer>();
    unrecognizedKeySet = new TreeSet<String>();
}
/**
 * Called when parsing of a document ends: optionally dumps the meta node
 * tree (debug mode) and logs every field key that was not recognized.
 */
public void endDocument() {
    if (debugMode) {
        debugMetaNodePrint();
    }
    log.info(" -- DOCUMENT END --");
    log.info("");
    log.info(" ---- PRINTING OUT INFORMATION ----");
    Iterator<String> unrecognized = unrecognizedKeySet.iterator();
    while (unrecognized.hasNext()) {
        log.info(" Found unrecognized field-start/field-end comment pair: " + unrecognized.next());
    }
}
/**
 * Parser callback for an HTML comment; delegates to the comment dispatcher.
 *
 * @param comment the comment text as delivered by the parser
 */
public void foundComment(String comment) {
    this.processCommentType(comment);
}
/**
 * Parser callback for an element: wraps the element data in a TagElement,
 * offers it to the current field, then records it for the meta node
 * diagnostics.
 *
 * @param tagname the element's tag name
 * @param innerHTML markup inside the element
 * @param outerHTML markup including the element itself
 * @param text the element's text, stored as the tag value
 * @param attributeMap the element's attributes
 */
public void foundElement(String tagname, String innerHTML, String outerHTML, String text,
        Map<String, String> attributeMap) {
    TagElement element = new TagElement();
    element.setTagname(tagname);
    element.setTagvalue(text);
    element.setInnerHTML(innerHTML);
    element.setOuterHTML(outerHTML);
    element.setAttributeMap(attributeMap);

    this.addContentElement(element);
    this.foundMetaNodeElement(tagname, outerHTML, text, attributeMap);
}
/** Parser callback: descending into an element's children, one level deeper. */
public void beginElementChildren() {
    elementDepthFromField += 1;
}
/** Parser callback: finished an element's children, back up one level. */
public void endElementChildren() {
    elementDepthFromField -= 1;
}
/** Parser callback: the document head starts; comments are now treated as head metadata. */
public void beginHead() {
    isHead = true;
}
/**
 * Parser callback: the document head ends. Stores the collection gathered
 * from the head comments, links the code to it, stores the code, and
 * leaves head mode.
 */
public void endHead() {
    CodeCollection parsedCollection = this.model.getCodeCollection();
    Code parsedCode = this.model.getCode();

    // the collection is stored first, then attached to the code before the code is stored
    this.persistence.storeCodeCollection(parsedCollection);
    parsedCode.setCodeCollection(parsedCollection);
    this.persistence.storeCode(parsedCode);

    isHead = false;
}
/** Parser callback: the document body starts; comments are now treated as section markers. */
public void beginBody() {
    isBody = true;
}
/**
 * Parser callback: the document body ends. Closes the section that is
 * still open (if any), leaves body mode and attaches the remaining meta
 * nodes to the last section node.
 */
public void endBody() {
    if (model.getSection() != null) {
        sectionEnd();
    }
    isBody = false;
    addMetaNodesToSection(null);
}
/**
 * Dispatches one HTML comment of the form {@code <!-- type:value -->}.
 * Comments without a ":" are ignored. While in the head the comment is
 * treated as code/collection metadata; while in the body it is treated as
 * a section or field marker.
 *
 * @param commentHtml the raw comment text
 */
private void processCommentType(String commentHtml) {
    // Spring's split() cuts at the first ":" only and returns null when there is none.
    String[] bothSides = StringUtils.split(commentHtml, ":");
    if (bothSides == null) {
        return;
    }
    String type = StringUtils.trimWhitespace(StringUtils.replace(bothSides[0], "<!--", ""));
    String value = StringUtils.trimWhitespace(StringUtils.replace(bothSides[1], "-->", ""));
    if (this.isHead) {
        processHeadComment(type, value);
    } else if (this.isBody) {
        processBodyComment(type, value, commentHtml);
    }
}

/**
 * Applies a head comment to the code and code collection, creating both
 * lazily on the first head comment.
 *
 * @throws NumberFormatException if a year or title-enum value is not an integer
 */
private void processHeadComment(String type, String value) {
    // isUscPrelim is null until setIsUscPrelim() is called; treat "unset" as "not preliminary"
    // instead of failing on unboxing.
    boolean prelim = Boolean.TRUE.equals(this.isUscPrelim);
    CodeCollection codeCollection = model.getCodeCollection();
    Code code = model.getCode();
    if (codeCollection == null) {
        if (prelim) {
            codeCollection = new UscPrelimCollection();
        } else {
            codeCollection = new UscCollection();
        }
        model.setCodeCollection(codeCollection);
    }
    if (code == null) {
        code = new UscCode();
        model.setCode(code);
    }
    if (type.equals("AUTHORITIES-PUBLICATION-NAME")) {
        code.addAlias(value);
        code.addAttribute(type, value);
    } else if (type.equals("AUTHORITIES-PUBLICATION-ID")) {
        code.addAlias(value);
        code.addAttribute(type, value);
    } else if (type.equals("AUTHORITIES-PUBLICATION-YEAR")) {
        if (!prelim) {
            codeCollection.setYear(Integer.parseInt(value));
        }
        code.addAttribute(type, value);
    } else if (type.equals("AUTHORITIES-LAWS-ENACTED-THROUGH-DATE")) {
        // ignore... should be caught again on the first documentid
        // element
        code.addAttribute(type, value);
    } else if (type.equals("AUTHORITIES-USC-TITLE-NAME")) {
        // The name is whatever follows the first "-". split() returns null when
        // there is no "-"; in that case use the whole value as the name.
        String[] valueParts = StringUtils.split(value, "-");
        code.setName(valueParts != null ? valueParts[1].trim() : value);
        code.addAttribute(type, value);
    } else if (type.equals("AUTHORITIES-USC-TITLE-ENUM")) {
        code.setCodeSequence(Integer.parseInt(value));
        code.addAttribute(type, value);
    } else if (type.equals("AUTHORITIES-USC-TITLE-STATUS")) {
        code.setStatus(value);
        code.addAttribute(type, value);
    }
}

/**
 * Applies a body comment: starts a new section on "documentid", opens or
 * closes a field, or records section metadata on the section that is
 * currently open.
 */
private void processBodyComment(String type, String value, String commentHtml) {
    Section section = this.model.getSection();
    if (type.equals("documentid")) {
        this.addMetaNodesToSection(value);
        this.sectionStart(commentHtml);
    } else if (type.equals("field-start")) {
        this.fieldStart(value);
        this.startMetaNode(new MetaNode(value, "comment"));
    } else if (type.equals("field-end")) {
        this.fieldEnd(value);
        this.endMetaNode(value);
    } else if (type.equals("expcite")) {
        this.updateMetaNodeSectionTitle(value);
        if (section != null) {
            section.addAttribute(type, value);
        }
    } else if (type.equals("itempath")) {
        this.updateMetaNodeSectionTitle(value);
        if (section != null) {
            // the item path doubles as the section's source reference
            section.setSourceReference(value);
            section.addAttribute(type, value);
        }
    } else if (type.equals("itemsortkey")) {
        this.updateMetaNodeSectionTitle(value);
        if (section != null) {
            section.addAttribute(type, value);
        }
    }
}
/**
 * Handles a field-start comment: resets the element depth, creates the
 * field, and hangs it under the field that is currently open or, when
 * none is open, appends it to the top-level working list.
 *
 * @param fieldKey the field name taken from the comment
 */
private void fieldStart(String fieldKey) {
    elementDepthFromField = 0;

    UscField opened = new UscField();
    opened.setFieldName(fieldKey);
    if (!opened.isKnownFieldKey()) {
        unrecognizedKeySet.add(fieldKey);
    }

    UscField enclosing = currentUscField;
    if (enclosing == null) {
        getWorkingUscFieldList().add(opened);
    } else {
        // nested fields live under their parent, not in the top-level list
        enclosing.addChild(opened);
    }

    // the newest field is always the current one
    currentUscField = opened;
}
/**
 * Handles a field-end comment: marks the element depth as "outside a
 * field" and makes the closed field's parent (possibly none) current.
 *
 * @param fieldKey the field name taken from the comment (not used here)
 */
private void fieldEnd(String fieldKey) {
    elementDepthFromField = -1;
    if (currentUscField == null) {
        return;
    }
    // a top-level field has no parent, which leaves no field open
    currentUscField = currentUscField.getParent();
}
/**
 * Starts a new section from a documentid comment.
 * <p>
 * If a section is still open it is closed (and stored) first. The parent
 * cannot be resolved here because only the documentid has been read so
 * far; that happens in {@code sectionEnd()}.
 *
 * @param documentString
 *            the whole comment; every whitespace-separated token of the
 *            form {@code key:value} becomes a section attribute
 */
private void sectionStart(String documentString) {
    if (this.model.getSection() != null) {
        // not the first section: finish the previous one
        sectionEnd();
    }

    Section opened = new UscSection();
    opened.setCode(this.model.getCode());

    String[] tokens = StringUtils.tokenizeToStringArray(documentString, " ", true, true);
    for (int i = 0; i < tokens.length; i++) {
        String token = tokens[i];
        String[] keyAndValue = StringUtils.split(token, ":");
        if (keyAndValue != null) {
            opened.addAttribute(keyAndValue[0], keyAndValue[1]);
            continue;
        }
        log.debug("documentElements does not contain a ':' for string: " + token);
    }

    // remember it as the section being worked on
    this.model.setSection(opened);
}
/**
 * Finishes the section that is currently open.
 * <ul>
 * <li>Derives the parent item path from the section's source reference
 * and looks up the parent's primary key, building a placeholder parent
 * when none has been stored yet.</li>
 * <li>Assigns parent id, sequence number and parent level position.</li>
 * <li>Stores the section, registers its key and level position, then
 * stores its full content and - unless the content is notes only - its
 * content parts.</li>
 * <li>Clears the working field state and the tracked section.</li>
 * </ul>
 */
private void sectionEnd() {
    Section closing = this.model.getSection();
    String itempath = closing.getSourceReference();

    String[] segments = getItemPathElements(itempath);
    boolean pathHasSegments = segments != null && segments.length > 0 && !segments[0].equals(itempath);
    if (pathHasSegments) {
        // number of segments left once the section's own segment is dropped
        int parentSegmentCount = segments.length - 1;
        String parentItempath = getParentItemPath(itempath);

        Integer parentPrimaryKey = this.model.findPrimaryKey(parentItempath);
        if (parentPrimaryKey == null && parentSegmentCount > 0) {
            // the parent never appeared as a section of its own: build it
            this.buildMissingParentSection(parentItempath);
            parentPrimaryKey = this.model.findPrimaryKey(parentItempath);
        }

        String parentLevelPosition = this.model.findLevelPosition(parentItempath);
        Integer lastSequence = this.sectionSequenceTracker.get(parentItempath);
        Integer nextSequence = Integer.valueOf(lastSequence == null ? 1 : lastSequence + 1);

        log.debug("looking for parent primary key: used itempath [" + itempath
                + "], converted to parent item path [" + parentItempath + "] found pk [" + parentPrimaryKey + "]");

        closing.setParentSectionId(parentPrimaryKey);
        closing.setSectionSequence(nextSequence);
        closing.setParentLevelPosition(parentLevelPosition);
        this.sectionSequenceTracker.put(parentItempath, nextSequence);
    }

    // store the section itself
    persistence.storeSection(closing);

    // register the new key so later children can find this section
    assert closing.getId() != null : "Attempting to add a section's primary key, but it is null after a save";
    log.debug("Adding Primary Key for itempath->sectionId: " + itempath + "," + closing.getId());
    this.model.addPrimaryKeyMapping(itempath, closing.getId());
    this.model.addLevelPositionMapping(itempath, closing.getLevelPosition());
    // children of this section start counting from here
    this.sectionSequenceTracker.put(itempath, 0);

    // store the content
    ContentFull fullContent = this.convertUscFieldsToContentFull(closing);
    List<ContentPart> parts = this.convertUscFieldsToContentPartList(closing);

    // Notes-only content is nearly always DIVs and TABLES, which are very
    // hard to chop into parts, so only the full content is stored for it.
    persistence.storeContentFull(fullContent, !closing.getIsNewRecord());
    if (!fullContent.isNotes()) {
        persistence.storeContentParts(closing.getId(), fullContent.getId(), parts, !closing.getIsNewRecord());
    }

    // reset the working state for the next section
    this.workingUscFieldList = null;
    this.currentUscField = null;
    this.model.setSection(null);
}
/**
 * Splits an item path on "/" into its trimmed, non-empty segments.
 *
 * @param itempath the path to split
 * @return the segments, as produced by Spring's tokenizer
 */
private String[] getItemPathElements(String itempath) {
    String[] segments = StringUtils.tokenizeToStringArray(itempath, "/", true, true);
    return segments;
}
/**
 * Joins path segments back into an item path with a leading "/".
 *
 * @param pathelements the segments; may be null or empty
 * @return "/" followed by the segments joined with "/", or just "/" when there are none
 */
private String getItemPathString(String[] pathelements) {
    if (pathelements == null || pathelements.length == 0) {
        return "/";
    }
    return "/" + StringUtils.arrayToDelimitedString(pathelements, "/");
}
/**
 * Returns the item path one level above the given one.
 *
 * @param itempath the child path
 * @return the path with its last segment removed, where possible
 */
private String getParentItemPath(String itempath) {
    return this.getParentItemPath(itempath, Integer.valueOf(1));
}
/**
 * Returns the item path a number of levels above the given one.
 * <p>
 * The trailing segments are only removed when the path has more segments
 * than levels requested and its first segment differs from the whole path;
 * otherwise the path is returned re-joined but not shortened.
 *
 * @param itempath the child path
 * @param levelsUp how many segments to drop; null or values below 1 count as 1
 * @return the resulting path with a leading "/"
 */
private String getParentItemPath(String itempath, Integer levelsUp) {
    int levels = (levelsUp == null || levelsUp < 1) ? 1 : levelsUp;
    String[] segments = getItemPathElements(itempath);
    boolean canShorten = segments != null && segments.length > levels && !segments[0].equals(itempath);
    if (canShorten) {
        segments = Arrays.copyOf(segments, segments.length - levels);
    }
    return getItemPathString(segments);
}
/**
 * Convenience overload: splits the item path and picks a segment from it.
 *
 * @param itempath the path to inspect
 * @param levelsUp how many segments to count back from the last one
 * @return the selected segment, or "" when there is none
 */
private String extractHeadingFromItemPath(String itempath, Integer levelsUp) {
    String[] segments = getItemPathElements(itempath);
    return extractHeadingFromItemPath(segments, levelsUp);
}
/**
 * Picks the segment that lies {@code levelsUp} positions before the last
 * one (0 selects the last segment).
 *
 * @param pathelements the path segments; may be null
 * @param levelsUp how far back to go; null or negative counts as 0
 * @return the selected segment, or "" when the array is null or too short
 */
private String extractHeadingFromItemPath(String[] pathelements, Integer levelsUp) {
    int levels = (levelsUp == null || levelsUp < 0) ? 0 : levelsUp;
    if (pathelements == null || pathelements.length <= levels) {
        return "";
    }
    return pathelements[pathelements.length - (levels + 1)];
}
/**
 * Creates and stores a placeholder section for an item path that never
 * appeared as a section of its own, e.g. for
 * {@code /title 10/part 4/subpart 5/Sec. 123} when "subpart 5" has no
 * section. Recurses upwards when the placeholder's own parent is missing
 * as well.
 * <p>
 * The only heading that can be derived is the last path segment.
 *
 * @param parentItemPath item path of the section to synthesize
 * @return the stored placeholder section
 */
private Section buildMissingParentSection(String parentItemPath) {
    String[] pathelements = getItemPathElements(parentItemPath);
    String onlyHeadingWeCouldFigureOut = extractHeadingFromItemPath(pathelements, 0);
    Section newParentSection = new UscSection();
    newParentSection.setCode(model.getCode());
    newParentSection.setHeading(onlyHeadingWeCouldFigureOut);
    newParentSection.setShortHeading(onlyHeadingWeCouldFigureOut);
    newParentSection.setSourceReference(parentItemPath);
    newParentSection.addAttribute("itempath", parentItemPath);

    // To fit the placeholder into the tree we need its own parent ("gramps").
    String grampsItemPath = getParentItemPath(parentItemPath);
    String[] grampsPathElements = getItemPathElements(grampsItemPath);
    Integer grampsPrimaryKey = this.model.findPrimaryKey(grampsItemPath);

    // getParentItemPath() hands a single-segment path back unshortened, so a
    // top-level path would be its own "parent". Only treat gramps as a real
    // ancestor when it is strictly shorter; this also bounds the recursion.
    boolean grampsIsRealAncestor = pathelements != null && grampsPathElements != null
            && grampsPathElements.length > 0 && grampsPathElements.length < pathelements.length;

    if (grampsPrimaryKey == null && grampsIsRealAncestor) {
        buildMissingParentSection(grampsItemPath);
        // re-read the key: the recursive call has just stored gramps
        grampsPrimaryKey = this.model.findPrimaryKey(grampsItemPath);
    }

    String grampsLevelPosition = this.model.findLevelPosition(grampsItemPath);
    Integer grampsCurrentSequence = this.sectionSequenceTracker.get(grampsItemPath);
    Integer grampsNextSequence = 1;
    if (grampsCurrentSequence != null) {
        grampsNextSequence = grampsCurrentSequence + 1;
    }

    // add parent's parent information
    newParentSection.setParentSectionId(grampsPrimaryKey);
    newParentSection.setSectionSequence(grampsNextSequence);
    newParentSection.setParentLevelPosition(grampsLevelPosition);
    if (grampsIsRealAncestor) {
        // consume the sequence number, as sectionEnd() does, so the next
        // sibling under gramps does not receive the same one
        this.sectionSequenceTracker.put(grampsItemPath, grampsNextSequence);
    }

    // STORE NEW SIMULATED PARENT
    persistence.storeSection(newParentSection);
    assert newParentSection.getId() != null : "Attempting to add a new parent section's primary key, but it is null after a save";
    log.debug("Adding Primary Key for itempath->sectionId: " + parentItemPath + "," + newParentSection.getId());
    this.model.addPrimaryKeyMapping(parentItemPath, newParentSection.getId());
    this.model.addLevelPositionMapping(parentItemPath, newParentSection.getLevelPosition());
    // children of the placeholder start counting from here
    this.sectionSequenceTracker.put(parentItemPath, 0);

    // NEW PARENT IS NOW READY
    return newParentSection;
}
/**
 * Offers a parsed element to the field that is currently open.
 * <ul>
 * <li>Directly inside a field (depth 0): the element is added to the
 * field; a heading element also supplies the section heading when the
 * section has none yet.</li>
 * <li>At any other depth inside a heading field: inline markup is
 * cleaned out of the section heading - {@code strong} and
 * {@code cap-smallcap} tags are unwrapped, a {@code sup} element is
 * removed together with its content.</li>
 * </ul>
 * Nothing happens when no field is open.
 *
 * @param el the element to process
 */
private void addContentElement(TagElement el) {
    if (this.currentUscField == null) {
        return;
    }
    Section section = this.model.getSection();

    if (this.elementDepthFromField == 0) {
        if (el.isHeading() && section != null && section.getHeading() == null) {
            // 9.20.12 - changed from getinnerhtml to gettagvalue
            section.setHeading(el.getTagvalue());
        }
        this.currentUscField.addTagElement(el);
        return;
    }

    if (!this.currentUscField.isHeading() || section == null) {
        return;
    }
    String heading = section.getHeading();
    if (heading == null) {
        // nothing to clean up yet
        return;
    }

    String tag = el.getTagname();
    if ("strong".equals(tag) || "cap-smallcap".equals(tag)) {
        heading = StringUtils.replace(heading, "<" + tag + ">", "");
        heading = StringUtils.replace(heading, "</" + tag + ">", "");
        section.setHeading(heading);
    } else if ("sup".equals(tag)) {
        int start = heading.indexOf("<sup>");
        // look for the closing tag only after the opening one, and test for
        // "not found" before adding the tag length
        int close = (start == -1) ? -1 : heading.indexOf("</sup>", start);
        if (close == -1) {
            log.warn("Could not find <sup> or </sup> in heading: " + heading);
        } else {
            int end = close + "</sup>".length();
            section.setHeading(heading.substring(0, start) + heading.substring(end));
        }
    }
}
/* ****
 *
 * ### USC FIELD METHODS ###
 *
 * ***
 */
/**
 * Returns the list of top-level fields of the current section, creating
 * it on first use.
 */
private List<UscField> getWorkingUscFieldList() {
    if (workingUscFieldList != null) {
        return workingUscFieldList;
    }
    workingUscFieldList = new ArrayList<UscField>();
    return workingUscFieldList;
}
/* ****
 *
 * ### META NODE METHODS ###
 *
 * ***
 */
/**
 * Returns the innermost open meta node without removing it, or null when
 * there is no stack or the stack is empty.
 */
private MetaNode getCurrentMetaNode() {
    return (workingCommentStack == null) ? null : workingCommentStack.peek();
}
/**
 * Opens a meta node by pushing it onto the working stack.
 *
 * @param meta the node to open
 */
private void startMetaNode(MetaNode meta) {
    workingCommentStack.push(meta);
}
/**
 * Closes the innermost open meta node. The node is popped from the stack
 * and, when its name matches, attached to the node now on top of the
 * stack or - if the stack is empty - to the top-level list. A popped node
 * whose name does not match is not attached anywhere.
 *
 * @param name the name carried by the field-end comment
 */
private void endMetaNode(String name) {
    if (workingCommentStack.isEmpty()) {
        if (debugMode) {
            System.out.println("ERROR: Why is the working stack empty? need to end node [" + name + "]");
        }
        return;
    }

    MetaNode closed = workingCommentStack.pop();
    if (!closed.getName().equals(name)) {
        if (debugMode) {
            System.out.println("ERROR: Why is the last item on the stack not me? me=[" + name
                    + "]; last item = [" + closed.getName() + "]");
        }
        return;
    }

    MetaNode enclosing = workingCommentStack.peek();
    if (enclosing != null) {
        enclosing.addChild(closed);
    } else {
        toplevelList.add(closed);
    }
}
/**
 * Renames the most recently added section meta node. Does nothing when no
 * section node exists yet, when the title is null, or when the latest
 * node is not of type "section".
 *
 * @param newTitle the name to give the latest section node
 */
private void updateMetaNodeSectionTitle(String newTitle) {
    if (sectionMetaNodeList.isEmpty() || newTitle == null) {
        return;
    }
    int lastIndex = sectionMetaNodeList.size() - 1;
    MetaNode latest = sectionMetaNodeList.get(lastIndex);
    if (latest.getType().equals("section")) {
        latest.setName(newTitle);
    }
}
/**
 * Hands the collected top-level meta nodes to the latest section node
 * (if there is one) and, unless {@code nextTitle} is null, opens an empty
 * section node for the section that follows.
 *
 * @param nextTitle non-null when another section follows; null at the end of the body
 */
private void addMetaNodesToSection(String nextTitle) {
    boolean hasOpenSection = !sectionMetaNodeList.isEmpty();
    if (hasOpenSection) {
        MetaNode finishing = sectionMetaNodeList.get(sectionMetaNodeList.size() - 1);
        finishing.setChildren(toplevelList);
        // the section node now owns the old list; start a fresh one
        toplevelList = new ArrayList<MetaNode>();
    }
    if (nextTitle != null) {
        sectionMetaNodeList.add(new MetaNode("", "section"));
    }
}
/**
 * Diagnostic bookkeeping for an element: while the innermost open meta
 * node is named "statute", records the element's "tagname, class" pair
 * and, in debug mode, prints every 25th such element.
 *
 * @param tagname the element's tag name
 * @param html the element's markup (printed in debug mode)
 * @param text the element's text (printed in debug mode)
 * @param attributeMap the element's attributes; the "class" entry is read
 */
private void foundMetaNodeElement(String tagname, String html, String text, Map<String, String> attributeMap) {
    MetaNode current = getCurrentMetaNode();
    if (current == null) {
        if (debugMode) {
            log.debug("Why is this node null for tagname: " + tagname);
        }
        return;
    }
    if (!current.getName().equals("statute")) {
        return;
    }

    String classname = attributeMap.get("class");
    if (classname == null) {
        if (debugMode) {
            log.debug("Tag " + tagname + " doesnt have a class attribute");
        }
        return;
    }

    statuteElementMetaNodeClassPairs.add(tagname + ", " + classname);
    // the counter only advances while debug mode is on
    if (debugMode && elementcount++ % 25 == 0) {
        System.out.println("Text: " + text);
        System.out.println("HTML: " + html);
    }
}
/**
 * Debug helper: concatenates the string form of every section meta node
 * and a {@code <classname>} entry per recorded tag/class pair, then runs
 * the result through Tidy in XML mode, writing to standard output. Does
 * nothing unless debug mode is on.
 */
private void debugMetaNodePrint() {
    if (!debugMode) {
        return;
    }

    StringBuilder dump = new StringBuilder();
    for (MetaNode sectionNode : sectionMetaNodeList) {
        dump.append(sectionNode.toString());
    }
    dump.append("\n");
    for (String pair : statuteElementMetaNodeClassPairs) {
        dump.append("\n").append("<classname>").append(pair).append("</classname>");
    }

    Tidy formatter = new Tidy();
    formatter.setXmlOut(true);
    formatter.setXmlTags(true);
    formatter.setXmlSpace(true);

    ByteArrayInputStream input = new ByteArrayInputStream(dump.toString().getBytes());
    // tidy reads the dump and writes the formatted result to stdout
    formatter.parse(input, System.out);
}
/**
 * Builds the full HTML content of a section from all working fields.
 * The content starts out flagged as notes; converting a field that is
 * not structural/code/notes clears the flag.
 *
 * @param section the section the content belongs to
 * @return the assembled content
 */
private ContentFull convertUscFieldsToContentFull(Section section) {
    ContentFull full = new ContentFull();
    full.setNotes(true);
    for (UscField topLevelField : getWorkingUscFieldList()) {
        full = convertUscFieldToContentFull(topLevelField, full);
    }
    full.setFormatType(UscTags.FORMATTYPE_HTML);
    full.setSection(section);
    return full;
}
/**
 * Converts all working fields of a section into one flat list of content
 * parts whose sequence numbers continue across fields.
 *
 * @param section the section the parts belong to
 * @return the parts in document order; never null
 */
private List<ContentPart> convertUscFieldsToContentPartList(Section section) {
    List<ContentPart> list = new ArrayList<ContentPart>();
    int counter = 0;
    for (UscField field : this.getWorkingUscFieldList()) {
        List<ContentPart> convertedItems = convertUscFieldToContentPartList(field, counter, section);
        // keep the append inside the guard: addAll(null) would throw
        if (convertedItems != null) {
            counter += convertedItems.size();
            list.addAll(convertedItems);
        }
    }
    return list;
}
/**
 * Appends one field - its own elements first, then its child fields
 * recursively - to the full content. A field that is neither structural,
 * code nor notes clears the content's notes flag.
 *
 * @param field the field to append
 * @param content the content being assembled
 * @return the content after appending
 */
private ContentFull convertUscFieldToContentFull(UscField field, ContentFull content) {
    boolean noteLike = field.isPartOfStructural() || field.isPartOfCode() || field.isPartOfNotes();
    if (!noteLike) {
        content.setNotes(false);
    }

    Iterator<TagElement> elements = field.getTagElementList().iterator();
    while (elements.hasNext()) {
        content = convertTagElementToContentFull(elements.next(), content);
    }

    List<UscField> nested = field.getChildren();
    if (nested == null) {
        return content;
    }
    for (UscField child : nested) {
        content = convertUscFieldToContentFull(child, content);
    }
    return content;
}
/**
 * Converts one field - its own elements first, then its child fields
 * recursively - into content parts numbered from {@code startSequence}.
 *
 * @param field the field to convert
 * @param startSequence sequence number for the first part produced
 * @param section the section the parts belong to
 * @return the parts in document order; never null
 */
private List<ContentPart> convertUscFieldToContentPartList(UscField field, int startSequence, Section section) {
    List<ContentPart> contentPartList = new ArrayList<ContentPart>();
    int counter = startSequence;
    // the classification depends on the field only, so decide it once
    boolean noteLike = field.isPartOfStructural() || field.isPartOfCode() || field.isPartOfNotes();
    for (Iterator<TagElement> it = field.getTagElementList().iterator(); it.hasNext();) {
        TagElement el = it.next();
        ContentPart content = convertTagElementToContentPart(el);
        content.setContentType(field.getTopLevelFieldName());
        content.setContentSequence(counter++);
        content.setSection(section);
        if (noteLike) {
            content.setNotes(true);
            content.setNotesType(field.getFieldName());
        } else {
            content.setNotes(false);
        }
        contentPartList.add(content);
    }
    List<UscField> children = field.getChildren();
    if (children != null) {
        for (UscField child : children) {
            List<ContentPart> convertedItems = convertUscFieldToContentPartList(child, counter, section);
            // keep the append inside the guard: addAll(null) would throw
            if (convertedItems != null) {
                counter += convertedItems.size();
                contentPartList.addAll(convertedItems);
            }
        }
    }
    return contentPartList;
}
/**
 * Appends an element's outer HTML to the full content. The element's
 * attributes are not copied.
 *
 * @param el the element to append
 * @param content the content being assembled
 * @return the same content object
 */
private ContentFull convertTagElementToContentFull(TagElement el, ContentFull content) {
    String markup = el.getOuterHTML();
    content.addContent(markup);
    return content;
}
/**
 * Turns one element into a content part. DIV and TABLE elements keep
 * their outer HTML; everything else contributes only its inner HTML. The
 * element's attributes are copied onto the part.
 *
 * @param el the element to convert
 * @return the new content part
 */
private ContentPart convertTagElementToContentPart(TagElement el) {
    ContentPart part = new ContentPart();
    boolean keepWrapper = el.isDiv() || el.isTable();
    part.setContent(keepWrapper ? el.getOuterHTML() : el.getInnerHTML());
    part.setHeader(el.isHeading());
    part.setFormatType(UscTags.FORMATTYPE_HTML);
    addAttributes(el.getAttributeMap(), part);
    return part;
}
/**
 * Copies every entry of an attribute map onto a database object. Does
 * nothing when either argument is null.
 *
 * @param attrs attribute names and values; may be null
 * @param compatibleObject the object receiving the attributes; may be null
 */
private void addAttributes(Map<String, String> attrs, BaseDatabaseObject compatibleObject) {
    if (attrs != null && compatibleObject != null) {
        for (Entry<String, String> attribute : attrs.entrySet()) {
            compatibleObject.addAttribute(attribute.getKey(), attribute.getValue());
        }
    }
}
/**
 * Reads a whole file and returns its content after the HTML pre-filter.
 * <p>
 * All bytes are collected first and decoded once (platform default
 * charset), so a multi-byte character that straddles a read-buffer
 * boundary is no longer corrupted.
 *
 * @param file the file to read
 * @return the filtered content, or null when the file could not be read
 */
public String willProcessFile(File file) {
    FileInputStream fin = null;
    BufferedInputStream bin = null;
    try {
        fin = new FileInputStream(file);
        bin = new BufferedInputStream(fin);
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        byte[] bytes = new byte[102400];
        int len;
        while ((len = bin.read(bytes)) != -1) {
            collected.write(bytes, 0, len);
        }
        return willProcessHtml(new String(collected.toByteArray()));
    } catch (Throwable t) {
        // best effort by design: report and fall through to the null return
        log.error("Could not open up file: " + t.getLocalizedMessage(), t);
    } finally {
        // closing the buffered wrapper also closes the file stream beneath it;
        // the file stream is closed directly only if the wrapper was never created
        java.io.Closeable toClose = (bin != null) ? (java.io.Closeable) bin : fin;
        if (toClose != null) {
            try {
                toClose.close();
            } catch (java.io.IOException ignored) {
                log.debug("Could not close file: " + ignored.getLocalizedMessage());
            }
        }
    }
    return null;
}
/**
 * Resolves a URL to a local file and returns its pre-filtered content.
 * <p>
 * The raw path from {@code url.getFile()} is tried first. When no such
 * file exists, the URL is converted through {@code toURI()} so that
 * percent-escapes (for example {@code %20}) are decoded, and the decoded
 * location is used if it exists.
 *
 * @param url the URL of the file to read
 * @return the filtered content, or null when the URL could not be processed
 */
public String willProcessUrl(URL url) {
    try {
        File file = new File(url.getFile());
        if (!file.exists()) {
            try {
                File decoded = new File(url.toURI());
                if (decoded.exists()) {
                    file = decoded;
                }
            } catch (java.net.URISyntaxException ignored) {
                // URL is not a valid URI (e.g. unescaped spaces): keep the raw path
            } catch (IllegalArgumentException ignored) {
                // URI does not denote a plain local file: keep the raw path
            }
        }
        return willProcessFile(file);
    } catch (Throwable t) {
        log.error("Could not convert url to file: " + t.getLocalizedMessage());
        return null;
    }
}
/**
 * Pre-filters raw HTML by removing every PDF page marker comment.
 *
 * @param html the raw markup
 * @return the markup without page markers, or null when filtering failed
 */
public String willProcessHtml(String html) {
    String filtered = null;
    try {
        filtered = html.replaceAll(REGEX_PDF_PATTERN, "");
    } catch (Throwable t) {
        log.error("Could not filter html with regex [" + REGEX_PDF_PATTERN + "]: " + t.getLocalizedMessage());
    }
    return filtered;
}
/**
 * Sets the preliminary-collection flag from its textual form, as
 * interpreted by {@code BooleanUtils.toBoolean(String)}.
 *
 * @param isUscPrelim textual boolean
 */
public void setIsUscPrelim(String isUscPrelim) {
    boolean parsed = BooleanUtils.toBoolean(isUscPrelim);
    this.isUscPrelim = Boolean.valueOf(parsed);
}
}