package net.sf.jpluck.plucker.parsing.html;
import java.awt.Color;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import net.sf.jpluck.jxl.URIRewriter;
import net.sf.jpluck.plucker.Bookmark;
import net.sf.jpluck.plucker.DataRecord;
import net.sf.jpluck.plucker.Document;
import net.sf.jpluck.plucker.Paragraph;
import net.sf.jpluck.plucker.TableCell;
import net.sf.jpluck.plucker.TableRecord;
import net.sf.jpluck.plucker.TableRow;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.plucker.parsing.DataRecordSerializer;
import net.sf.jpluck.spider.LinkFilter;
import net.sf.jpluck.util.ColorUtil;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A {@link net.sf.jpluck.plucker.parsing.DataRecordSerializer} implementation that populates a
 * {@link net.sf.jpluck.plucker.TextRecord} from the SAX events of an HTML document.
 *
 * <p>The serializer keeps track of the element currently being parsed, the paragraph that
 * receives content, and which text styles have already been written to that paragraph, so that
 * {@link #applyStyle(Style)} only emits the style changes that are actually needed. Per-tag
 * behaviour is delegated to the {@code TagHandler} returned by {@code TagHandlerFactory}.</p>
 *
 * <p>Instances hold mutable parsing state and are not synchronized.</p>
 */
public class HTMLSerializer extends DefaultHandler
    implements DataRecordSerializer {
    /** Non-breaking space inserted into preformatted lines that would otherwise be empty. */
    private static final String NBSP = String.valueOf((char) 160);

    // Shared by all instances and used to build the URI of each table record.
    // NOTE(review): incremented without synchronization -- confirm serializers never run concurrently.
    private static int tableCount = 0;

    private Color currentColor = Color.BLACK;
    // NOTE(review): never assigned in this class, so getImageFilter() always returns null.
    private LinkFilter imageFilter;
    private List embeddedImageURLList = new ArrayList();
    private List invalidURLList = new ArrayList();
    private List linkURLList = new ArrayList();
    private ListStack listStack = new ListStack();
    private Paragraph currentParagraph;
    private StringBuffer title = new StringBuffer();
    private StyledElement element;
    private TextRecord textRecord;
    private URI baseURI;
    private URIRewriter uriRewriter;
    private boolean boldApplied;
    private boolean colorApplied;
    private boolean fixedWidthApplied;
    private boolean inLink;
    private boolean includeImages = true;
    private boolean italicApplied;
    private boolean marginApplied;
    private boolean strikeThroughApplied;
    private boolean subscriptApplied;
    private boolean superscriptApplied;
    private boolean underlineApplied;
    private int brightness;
    private int currentAlignment;
    private int currentHeading;
    private boolean parseTables;
    private Document document;
    private TableRecord table;
    private TableRow row;
    private List bookmarks = new ArrayList();

    /**
     * Creates a serializer that writes into the specified text record.
     *
     * @param textRecord the record to populate; its URI becomes the initial base URI
     * @param uriRewriter rewriter applied to every link resolved through {@link #resolveLink(String)}
     * @param brightness brightness adjustment passed to {@code ColorUtil.changeBrightness};
     *        {@code 0} leaves colors untouched
     * @param parseTables value reported by {@link #isParseTables()}
     * @param document the document that receives table records created by {@link #addTable(int)}
     */
    public HTMLSerializer(TextRecord textRecord, URIRewriter uriRewriter,
        int brightness, boolean parseTables, Document document) {
        this.textRecord = textRecord;
        this.uriRewriter = uriRewriter;
        this.brightness = brightness;
        this.parseTables = parseTables;
        this.document = document;
        baseURI = URI.create(textRecord.getURI());
        currentParagraph = textRecord.addParagraph(Paragraph.DEFAULT_SPACING);
    }

    /**
     * Starts a new table: begins a paragraph, adds a reference to the table in it and registers
     * a new {@link TableRecord} with the document.
     *
     * @param border the border value passed to the table record
     * @return the newly created table record
     */
    public TableRecord addTable(int border) {
        addParagraph();
        String uri = "table" + tableCount++;
        currentParagraph.addTable(uri);
        table = new TableRecord(uri, border);
        document.addRecord(table);
        return table;
    }

    /**
     * Ends the current table so that subsequent rows and cells are ignored.
     */
    public void endTable() {
        // NOTE(review): currentParagraph may still refer to the last cell here; presumably the
        // caller starts a new paragraph afterwards -- confirm.
        table = null;
        row = null;
    }

    /**
     * Adds a row to the current table. Does nothing when no table is open.
     */
    public void addRow() {
        if (table != null) {
            row = table.addRow();
        }
    }

    /**
     * Adds a cell to the current row and makes it the paragraph that receives content.
     *
     * @return the new cell, or {@code null} when no row is open
     */
    public TableCell addCell() {
        TableCell cell = null;
        if (row != null) {
            cell = row.addCell();
            currentParagraph = cell;
        }
        return cell;
    }

    /**
     * @return the {@code parseTables} flag supplied to the constructor
     */
    public boolean isParseTables() {
        return parseTables;
    }

    /**
     * Resolves the specified URI against the current base URI and uses the result as the new
     * base URI.
     *
     * @param uri the (possibly relative) URI to resolve
     * @throws IllegalArgumentException if {@code uri} is not a valid URI
     */
    public void setBaseURI(String uri) {
        baseURI = baseURI.resolve(uri);
    }

    /**
     * @return the text record populated by this serializer
     */
    public DataRecord getDataRecord() {
        return textRecord;
    }

    /**
     * @return the distinct embedded image URLs registered so far, in registration order
     */
    public String[] getEmbeddedImageURIs() {
        return (String[]) embeddedImageURLList.toArray(new String[embeddedImageURLList.size()]);
    }

    /**
     * @return the image filter field; it is never assigned within this class
     */
    public LinkFilter getImageFilter() {
        return imageFilter;
    }

    /**
     * @param includeImages the new value of the include-images flag
     */
    public void setIncludeImages(boolean includeImages) {
        this.includeImages = includeImages;
    }

    /**
     * @return the include-images flag; {@code true} unless changed
     */
    public boolean isIncludeImages() {
        return includeImages;
    }

    /**
     * @return the distinct invalid URLs registered so far, in registration order
     */
    public String[] getInvalidURIs() {
        return (String[]) invalidURLList.toArray(new String[invalidURLList.size()]);
    }

    /**
     * @return the distinct link URLs registered so far, in registration order
     */
    public String[] getLinkURIs() {
        return (String[]) linkURLList.toArray(new String[linkURLList.size()]);
    }

    /**
     * @return the title text collected so far
     */
    public String getTitle() {
        return title.toString();
    }

    /**
     * Handles character data. Text inside {@code pre} is added verbatim, text inside
     * {@code script}, {@code style} or {@code option} is dropped, and all other text has its
     * whitespace collapsed before being appended to the title or, inside {@code body}, to the
     * current paragraph.
     */
    public void characters(char[] ch, int start, int length)
        throws SAXException {
        if (element == null) {
            // No element is open, so there is no context to decide where the text belongs.
            return;
        }
        if (element.isSelfOrAncestor("pre")) {
            addPreformattedCharacters(ch, start, length);
        } else if (element.isSelfOrAncestor("script") ||
                element.isSelfOrAncestor("style") ||
                element.isSelfOrAncestor("option")) {
            // Content of these elements is never rendered.
        } else {
            String text = collapseWhitespace(ch, start, length);
            if (text.length() > 0) {
                if (element.isSelfOrAncestor("title")) {
                    title.append(text);
                } else if (element.isSelfOrAncestor("body")) {
                    currentParagraph.addText(text);
                }
            }
        }
    }

    /**
     * Adds preformatted text to the current paragraph, turning each newline into a line break
     * (inside a link) or a new paragraph without spacing (outside a link).
     */
    private void addPreformattedCharacters(char[] ch, int start, int length) {
        int end = start + length;
        for (int i = start; i < end; i++) {
            char c = ch[i];
            if (c == '\n') {
                if (!currentParagraph.containsSignificantContent()) {
                    // Give the otherwise empty line some content before it is terminated.
                    currentParagraph.addPreformattedText(NBSP);
                }
                if (inLink) {
                    currentParagraph.addNewline();
                } else {
                    addParagraph(0, true);
                }
            } else {
                currentParagraph.addPreformattedText(String.valueOf(c));
            }
        }
        if (length == 0) {
            addParagraph(0, true);
        }
    }

    /**
     * Returns the specified characters with every run of whitespace reduced to a single space
     * and NUL characters removed. No space is added directly after a space character that is
     * already present (including non-breaking spaces).
     */
    private String collapseWhitespace(char[] ch, int start, int length) {
        StringBuffer sb = new StringBuffer();
        int end = start + length;
        for (int i = start; i < end; i++) {
            char c = ch[i];
            if (Character.isWhitespace(c)) {
                if ((sb.length() == 0) ||
                        !Character.isSpaceChar(sb.charAt(sb.length() - 1))) {
                    sb.append(" ");
                }
            } else if (c > 0) {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Copies the collected title to the text record.
     */
    public void endDocument() throws SAXException {
        textRecord.setTitle(title.toString());
    }

    /**
     * Closes the current element: makes its parent current, notifies the tag handler and
     * re-applies the parent's cascaded style.
     */
    public void endElement(String uri, String localName, String qName)
        throws SAXException {
        if (element == null) {
            // Unbalanced end tag; there is nothing to close.
            return;
        }
        // Tag names are ASCII; a fixed locale keeps e.g. "TITLE" -> "title" under every default locale.
        qName = qName.toLowerCase(Locale.ENGLISH);
        TagHandler handler = TagHandlerFactory.getHandler(qName);
        StyledElement currentElem = element;
        element = (StyledElement) element.getParent();
        if (handler != null) {
            handler.end(this, currentElem);
        }
        if (element != null) {
            // applyStyle ignores a null cascaded style.
            applyStyle(element.getCascadedStyle());
        }
    }

    /**
     * Treats ignorable whitespace like regular character data.
     */
    public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
        characters(ch, start, length);
    }

    /**
     * Resolves a link against the current base URI and passes the result through the URI
     * rewriter.
     *
     * @param uri the (possibly relative) link
     * @return the rewritten absolute URI
     * @throws IllegalArgumentException if {@code uri} is not a valid URI
     */
    public String resolveLink(String uri) {
        uri = this.baseURI.resolve(uri).toString();
        return uriRewriter.rewrite(uri);
    }

    /**
     * @param value the value to scale
     * @return {@code value} doubled when the text record is for hires, {@code value} otherwise
     */
    public int scaledValue(int value) {
        return value * (textRecord.isForHires() ? 2 : 1);
    }

    /**
     * Opens a new element as a child of the current one, notifies its tag handler, translates
     * an {@code align} attribute into a style for alignable handlers, registers an anchor for
     * an {@code id} attribute and applies the element's cascaded style.
     */
    public void startElement(String uri, String localName, String qName,
        Attributes attributes) throws SAXException {
        // Tag names are ASCII; a fixed locale keeps e.g. "TITLE" -> "title" under every default locale.
        localName = localName.toLowerCase(Locale.ENGLISH);
        qName = qName.toLowerCase(Locale.ENGLISH);
        StyledElement newElement = new StyledElement(uri, localName, qName,
                attributes);
        if (element != null) {
            element.addChild(newElement);
        }
        element = newElement;

        TagHandler handler = TagHandlerFactory.getHandler(qName);
        if (handler != null) {
            handler.start(this, element);
            if (handler instanceof Alignable) {
                String align = attributes.getValue("align");
                if (align != null) {
                    Style style = element.getStyle();
                    if (style == null) {
                        style = new Style();
                        element.setStyle(style);
                    }
                    if (align.equalsIgnoreCase("left")) {
                        style.setAlignment(Style.ALIGN_LEFT);
                    } else if (align.equalsIgnoreCase("center") ||
                            align.equalsIgnoreCase("middle")) {
                        style.setAlignment(Style.ALIGN_CENTER);
                    } else if (align.equalsIgnoreCase("right")) {
                        style.setAlignment(Style.ALIGN_RIGHT);
                    } else if (align.equalsIgnoreCase("justify")) {
                        style.setAlignment(Style.ALIGN_JUSTIFY);
                    }
                }
            }
        }

        String id = attributes.getValue("id");
        if (id != null) {
            currentParagraph.addAnchor(id);
        }
        if (element.getStyle() != null) {
            applyStyle(element.getCascadedStyle());
        }
    }

    /**
     * Converts a color specification to a color, adjusted by the configured brightness.
     *
     * @param s the color specification understood by {@code ColorUtil.getColor}
     * @return the color, or {@code null} when {@code ColorUtil.getColor} returns {@code null}
     */
    Color getColor(String s) {
        Color color = ColorUtil.getColor(s);
        if ((color != null) && (brightness != 0)) {
            color = ColorUtil.changeBrightness(color, brightness);
        }
        return color;
    }

    /**
     * @return the paragraph (or table cell) that currently receives content
     */
    Paragraph getCurrentParagraph() {
        return currentParagraph;
    }

    /**
     * @param inLink whether content is currently being added inside a link
     */
    void setInLink(boolean inLink) {
        this.inLink = inLink;
    }

    /**
     * @return whether content is currently being added inside a link
     */
    boolean isInLink() {
        return inLink;
    }

    /**
     * @return the list stack owned by this serializer
     */
    ListStack getListStack() {
        return listStack;
    }

    /**
     * @return the text record populated by this serializer
     */
    TextRecord getTextRecord() {
        return textRecord;
    }

    /**
     * Registers an embedded image URL unless it has been registered before.
     */
    void addEmbeddedImageURL(String url) {
        if (!embeddedImageURLList.contains(url)) {
            embeddedImageURLList.add(url);
        }
    }

    /**
     * Registers an invalid URL unless it has been registered before.
     */
    void addInvalidURL(String url) {
        if (!invalidURLList.contains(url)) {
            invalidURLList.add(url);
        }
    }

    /**
     * Registers a link URL unless it has been registered before.
     */
    void addLinkURL(String url) {
        if (!linkURLList.contains(url)) {
            linkURLList.add(url);
        }
    }

    /**
     * Starts a new paragraph when appropriate and re-applies the current element's style.
     *
     * <p>Outside a link, a new paragraph is added to the text record when {@code force} is set
     * or the current paragraph has significant content. Otherwise a current paragraph without
     * significant content is reused: unless it only contains a list item it is cleared and its
     * spacing is raised to {@code spacing} if that is larger. In every case the tracked style
     * state is reset (color tracking excepted) and the current element's cascaded style is
     * applied again.</p>
     *
     * @param spacing the spacing for the paragraph
     * @param force whether to start a new paragraph even if the current one is empty
     * @return the paragraph that receives content from now on
     */
    Paragraph addParagraph(int spacing, boolean force) {
        if (!inLink &&
                (force || currentParagraph.containsSignificantContent())) {
            currentParagraph = textRecord.addParagraph(spacing);
        } else if (!currentParagraph.containsSignificantContent()) {
            if (!currentParagraph.containsListItemOnly()) {
                currentParagraph.clear();
                if (spacing > currentParagraph.getSpacing()) {
                    currentParagraph.setSpacing(spacing);
                }
            }
        }

        // Forget which styles were written so applyStyle emits them again for this paragraph.
        // NOTE(review): colorApplied/currentColor are deliberately left as they were -- confirm.
        clearFontFlags();
        italicApplied = false;
        strikeThroughApplied = false;
        underlineApplied = false;
        marginApplied = false;
        currentAlignment = 0;
        currentHeading = 0;

        if (element != null) {
            applyStyle(element.getCascadedStyle());
            if (element.isSelfOrAncestor("pre")) {
                currentParagraph.addFontFixedWidth();
            }
        }
        return currentParagraph;
    }

    /**
     * Equivalent to {@code addParagraph(Paragraph.DEFAULT_SPACING, false)}.
     */
    Paragraph addParagraph() {
        return addParagraph(Paragraph.DEFAULT_SPACING, false);
    }

    /**
     * Equivalent to {@code addParagraph(spacing, false)}.
     */
    Paragraph addParagraph(int spacing) {
        return addParagraph(spacing, false);
    }

    /**
     * Equivalent to {@code addParagraph(Paragraph.DEFAULT_SPACING, force)}.
     */
    Paragraph addParagraph(boolean force) {
        return addParagraph(Paragraph.DEFAULT_SPACING, force);
    }

    /**
     * Records a bookmark with the specified name and URI.
     */
    void addBookmark(String name, String uri) {
        bookmarks.add(new Bookmark(name, uri));
    }

    /**
     * @return the bookmarks recorded so far, in the order they were added
     */
    public Bookmark[] getBookmarks() {
        return (Bookmark[]) bookmarks.toArray(new Bookmark[bookmarks.size()]);
    }

    /**
     * Writes the differences between the specified style and the style state tracked by this
     * serializer to the current paragraph, and updates the tracked state.
     *
     * @param style the style to apply; {@code null} is ignored
     */
    void applyStyle(Style style) {
        if (style == null) {
            // An element without a cascaded style leaves the current state untouched.
            return;
        }

        int marginRight = style.getMarginRight();
        int marginLeft = style.getMarginLeft();
        if (((marginLeft > 0) || (marginRight > 0)) && !marginApplied) {
            // TODO: optimize maximum indentation
            if (marginLeft > 100) {
                marginLeft = 100;
            }
            if (marginRight > 100) {
                marginRight = 100;
            }
            currentParagraph.addMargin(marginLeft, marginRight);
            marginApplied = true;
        }

        if (currentAlignment != style.getAlignment()) {
            switch (style.getAlignment()) {
            case Style.ALIGN_LEFT:
                currentParagraph.addAlignLeft();
                break;
            case Style.ALIGN_CENTER:
                currentParagraph.addAlignCenter();
                break;
            case Style.ALIGN_RIGHT:
                currentParagraph.addAlignRight();
                break;
            case Style.ALIGN_JUSTIFY:
                currentParagraph.addAlignJustify();
                break;
            }
        }
        currentAlignment = style.getAlignment();

        if (style.isItalic()) {
            if (!italicApplied) {
                currentParagraph.addItalicStart();
                italicApplied = true;
            }
        } else if (italicApplied) {
            currentParagraph.addItalicEnd();
            italicApplied = false;
        }

        if (style.isUnderline()) {
            if (!underlineApplied) {
                currentParagraph.addUnderlineStart();
                underlineApplied = true;
            }
        } else if (underlineApplied) {
            currentParagraph.addUnderlineEnd();
            underlineApplied = false;
        }

        if (style.isStrikethrough()) {
            if (!strikeThroughApplied) {
                currentParagraph.addStrikethroughStart();
                strikeThroughApplied = true;
            }
        } else if (strikeThroughApplied) {
            currentParagraph.addStrikethroughEnd();
            strikeThroughApplied = false;
        }

        if (style.getColor() != null) {
            if (!colorApplied || !style.getColor().equals(currentColor)) {
                currentParagraph.addTextColor(style.getColor());
                colorApplied = true;
            }
        } else if (colorApplied) {
            // The style has no color of its own: switch back to black.
            currentParagraph.addTextColor(Color.BLACK);
            colorApplied = false;
        }
        currentColor = style.getColor();

        if (style.getHeading() != currentHeading) {
            switch (style.getHeading()) {
            case 0:
                if (style.useRegularFont()) {
                    currentParagraph.addFontRegular();
                }
                break;
            case 1:
                currentParagraph.addFontH1();
                break;
            case 2:
                currentParagraph.addFontH2();
                break;
            case 3:
                currentParagraph.addFontH3();
                break;
            case 4:
                currentParagraph.addFontH4();
                break;
            case 5:
                currentParagraph.addFontH5();
                break;
            case 6:
                currentParagraph.addFontH6();
                break;
            }
            currentHeading = style.getHeading();
            if (currentHeading > 0) {
                // Heading styles take precedence over regular text, bold and fixed width fonts. Skip the rest.
                clearFontFlags();
                return;
            }
        }

        if (style.useRegularFont()) {
            if (boldApplied || fixedWidthApplied || subscriptApplied ||
                    superscriptApplied) {
                currentParagraph.addFontRegular();
                clearFontFlags();
            }
            // Font is regular, we don't need to check other styles.
            return;
        }

        // Each font written below replaces the previous one, so only the flag of the font
        // written last stays set.
        if (style.isBold() && !boldApplied) {
            currentParagraph.addFontBold();
            clearFontFlags();
            boldApplied = true;
        }
        if (style.isFixedWidth() && !fixedWidthApplied) {
            currentParagraph.addFontFixedWidth();
            clearFontFlags();
            fixedWidthApplied = true;
        }
        if (style.isSubscript() && !subscriptApplied) {
            currentParagraph.addFontSubscript();
            clearFontFlags();
            subscriptApplied = true;
        }
        if (style.isSuperscript() && !superscriptApplied) {
            currentParagraph.addFontSuperscript();
            clearFontFlags();
            superscriptApplied = true;
        }
    }

    /**
     * Marks the bold, fixed width, subscript and superscript fonts as not applied.
     */
    private void clearFontFlags() {
        boldApplied = false;
        fixedWidthApplied = false;
        subscriptApplied = false;
        superscriptApplied = false;
    }

    /**
     * Marks the margin as already applied, so that {@link #applyStyle(Style)} does not add
     * another margin until the next call to {@code addParagraph}.
     */
    void marginApplied() {
        marginApplied = true;
    }
}