Package net.sf.jpluck.plucker.parsing.html

Source Code of net.sf.jpluck.plucker.parsing.html.HTMLSerializer

package net.sf.jpluck.plucker.parsing.html;

import java.awt.Color;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import net.sf.jpluck.jxl.URIRewriter;
import net.sf.jpluck.plucker.Bookmark;
import net.sf.jpluck.plucker.DataRecord;
import net.sf.jpluck.plucker.Document;
import net.sf.jpluck.plucker.Paragraph;
import net.sf.jpluck.plucker.TableCell;
import net.sf.jpluck.plucker.TableRecord;
import net.sf.jpluck.plucker.TableRow;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.plucker.parsing.DataRecordSerializer;
import net.sf.jpluck.spider.LinkFilter;
import net.sf.jpluck.util.ColorUtil;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;


/**
* A {@link net.sf.jpluck.plucker.parsing.DataRecordSerializer} implementation for creating {@link net.sf.jpluck.plucker.TextRecord}s based on HTML. This serializer
* populates a TextRecord based on the HTML events it receives.
*/
public class HTMLSerializer extends DefaultHandler
    implements DataRecordSerializer {
    private static int tableCount = 0;
    private Color currentColor = Color.BLACK;
    private LinkFilter imageFilter;
    private List embeddedImageURLList = new ArrayList();
    private List invalidURLList = new ArrayList();
    private List linkURLList = new ArrayList();
    private ListStack listStack = new ListStack();
    private Paragraph currentParagraph;
    private StringBuffer title = new StringBuffer();
    private StyledElement element;
    private TextRecord textRecord;
    private URI baseURI;
    private URIRewriter uriRewriter;
    private boolean boldApplied;
    private boolean colorApplied;
    private boolean fixedWidthApplied;
    private boolean inLink;
    private boolean includeImages = true;
    private boolean italicApplied;
    private boolean marginApplied;
    private boolean strikeThroughApplied;
    private boolean subscriptApplied;
    private boolean superscriptApplied;
    private boolean underlineApplied;
    private int brightness;
    private int currentAlignment;
    private int currentHeading;
    private boolean parseTables;
    private Document document;
    private TableRecord table;
    private TableRow row;
    private List bookmarks = new ArrayList();

    public HTMLSerializer(TextRecord textRecord, URIRewriter uriRewriter,
                          int brightness, boolean parseTables, Document document) {
        this.textRecord = textRecord;
        this.uriRewriter = uriRewriter;
        this.brightness = brightness;
        this.parseTables = parseTables;
        this.document = document;
        baseURI = URI.create(textRecord.getURI());
        currentParagraph = textRecord.addParagraph(Paragraph.DEFAULT_SPACING);
    }

    public TableRecord addTable(int border) {
        addParagraph();
        String uri = "table" + tableCount++;
        currentParagraph.addTable(uri);
        table = new TableRecord(uri, border);
        document.addRecord(table);
        return table;
    }

    public void endTable() {
        table = null;
        row = null;
    }

    public void addRow() {
        if (table != null) {
            row = table.addRow();
        }
    }

    public TableCell addCell() {
        TableCell cell = null;
        if (row != null) {
            cell=row.addCell();
            currentParagraph=cell;
        }
        return cell;
    }

    public boolean isParseTables() {
        return parseTables;
    }

    public void setBaseURI(String uri) {
        baseURI = baseURI.resolve(uri);
    }

    public DataRecord getDataRecord() {
        return textRecord;
    }

    public String[] getEmbeddedImageURIs() {
        return (String[]) embeddedImageURLList.toArray(new String[embeddedImageURLList.size()]);
    }

    public LinkFilter getImageFilter() {
        return imageFilter;
    }

    public void setIncludeImages(boolean includeImages) {
        this.includeImages = includeImages;
    }

    public boolean isIncludeImages() {
        return includeImages;
    }

    public String[] getInvalidURIs() {
        return (String[]) invalidURLList.toArray(new String[invalidURLList.size()]);
    }

    public String[] getLinkURIs() {
        return (String[]) linkURLList.toArray(new String[linkURLList.size()]);
    }

    public String getTitle() {
        return title.toString();
    }

    public void characters(char[] ch, int start, int length)
                    throws SAXException {
        if (element.isSelfOrAncestor("pre")) {
            // Text in a preformatted section
            for (int i = start; i < (start + length); i++) {
                char c = ch[i];

                if (c == '\n') {
                    if (!currentParagraph.containsSignificantContent()) {
                        currentParagraph.addPreformattedText(String.valueOf((char) 160));
                    }

                    if (inLink) {
                        currentParagraph.addNewline();
                    } else {
                        addParagraph(0, true);
                    }
                } else {
                    currentParagraph.addPreformattedText(String.valueOf(c));
                }
            }

            if (length == 0) {
                addParagraph(0, true);
            }
        } else if (element.isSelfOrAncestor("script") ||
                       element.isSelfOrAncestor("style") ||
                       element.isSelfOrAncestor("option")) {
            // Do nothing
        } else {
            // Normal text
            StringBuffer sb = new StringBuffer();

            for (int i = start; i < (start + length); i++) {
                char c = ch[i];

                if (Character.isWhitespace(c)) {
                    if (sb.length() > 0) {
                        if (!Character.isSpaceChar(sb.charAt(sb.length() - 1))) {
                            sb.append(" ");
                        }
                    } else {
                        sb.append(" ");
                    }
                } else if (c > 0) {
                    sb.append(c);
                }
            }

            if (sb.length() > 0) {
                String text = sb.toString();

                if (element.isSelfOrAncestor("title")) {
                    title.append(sb.toString());
                } else if (element.isSelfOrAncestor("body")) {
                    currentParagraph.addText(text);
                }
            }
        }
    }

    public void endDocument() throws SAXException {
        textRecord.setTitle(title.toString());
    }

    public void endElement(String uri, String localName, String qName)
                    throws SAXException {
        //localName = localName.toLowerCase();
        qName = qName.toLowerCase();

        TagHandler handler = TagHandlerFactory.getHandler(qName);
        StyledElement currentElem = element;
        element = (StyledElement) element.getParent();

        if (handler != null) {
            handler.end(this, currentElem);
        }

        if (element != null) {
            if (element.getCascadedStyle() != null) {
                applyStyle(element.getCascadedStyle());
            }
        }
    }

    public void ignorableWhitespace(char[] ch, int start, int length)
                             throws SAXException {
        characters(ch, start, length);
    }

    public String resolveLink(String uri) {
        uri = this.baseURI.resolve(uri).toString();
        return uriRewriter.rewrite(uri);
    }

    public int scaledValue(int value) {
        return value * (textRecord.isForHires() ? 2 : 1);
    }

    public void startElement(String uri, String localName, String qName,
                             Attributes attributes) throws SAXException {
        localName = localName.toLowerCase();
        qName = qName.toLowerCase();

        StyledElement newElement = new StyledElement(uri, localName, qName,
                                                     attributes);

        if (element != null) {
            element.addChild(newElement);
        }

        element = newElement;

        TagHandler handler = TagHandlerFactory.getHandler(qName);

        if (handler != null) {
            handler.start(this, element);

            if (handler instanceof Alignable) {
                String align = attributes.getValue("align");

                if (align != null) {
                    Style style = element.getStyle();

                    if (style == null) {
                        style = new Style();
                        element.setStyle(style);
                    }

                    if (align.equalsIgnoreCase("left")) {
                        style.setAlignment(Style.ALIGN_LEFT);
                    } else if (align.equalsIgnoreCase("center") ||
                                   align.equalsIgnoreCase("middle")) {
                        style.setAlignment(Style.ALIGN_CENTER);
                    } else if (align.equalsIgnoreCase("right")) {
                        style.setAlignment(Style.ALIGN_RIGHT);
                    } else if (align.equalsIgnoreCase("justify")) {
                        style.setAlignment(Style.ALIGN_JUSTIFY);
                    }
                }
            }
        }

        if (attributes.getValue("id") != null) {
            currentParagraph.addAnchor(attributes.getValue("id"));
        }

        if (element.getStyle() != null) {
            applyStyle(element.getCascadedStyle());
        }
    }

    Color getColor(String s) {
        Color color = ColorUtil.getColor(s);

        if ((color != null) && (brightness != 0)) {
            color = ColorUtil.changeBrightness(color, brightness);
        }

        return color;
    }

    Paragraph getCurrentParagraph() {
        return currentParagraph;
    }

    void setInLink(boolean inLink) {
        this.inLink = inLink;
    }

    boolean isInLink() {
        return inLink;
    }

    ListStack getListStack() {
        return listStack;
    }

    TextRecord getTextRecord() {
        return textRecord;
    }

    void addEmbeddedImageURL(String url) {
        if (!embeddedImageURLList.contains(url)) {
            embeddedImageURLList.add(url);
        }
    }

    void addInvalidURL(String url) {
        if (!invalidURLList.contains(url)) {
            invalidURLList.add(url);
        }
    }

    void addLinkURL(String url) {
        if (!linkURLList.contains(url)) {
            linkURLList.add(url);
        }
    }

    Paragraph addParagraph(int spacing, boolean force) {
        if (!inLink &&
                (force || currentParagraph.containsSignificantContent())) {
            currentParagraph = textRecord.addParagraph(spacing);
        } else if (!currentParagraph.containsSignificantContent()) {
            if (!currentParagraph.containsListItemOnly()) {
                currentParagraph.clear();

                if (spacing > currentParagraph.getSpacing()) {
                    currentParagraph.setSpacing(spacing);
                }
            }
        }

        boldApplied = false;
        italicApplied = false;
        fixedWidthApplied = false;
        strikeThroughApplied = false;
        underlineApplied = false;
        subscriptApplied = false;
        superscriptApplied = false;
        marginApplied = false;
        currentAlignment = 0;
        currentHeading = 0;

        if (element != null) {
            applyStyle(element.getCascadedStyle());

            if (element.isSelfOrAncestor("pre")) {
                currentParagraph.addFontFixedWidth();
            }
        }

        return currentParagraph;
    }

    Paragraph addParagraph() {
        return addParagraph(Paragraph.DEFAULT_SPACING, false);
    }

    Paragraph addParagraph(int spacing) {
        return addParagraph(spacing, false);
    }

    Paragraph addParagraph(boolean force) {
        return addParagraph(Paragraph.DEFAULT_SPACING, force);
    }
   
    void addBookmark(String name, String uri) {
      bookmarks.add(new Bookmark(name, uri));
    }
   
    public Bookmark[] getBookmarks() {
      return (Bookmark[])bookmarks.toArray(new Bookmark[bookmarks.size()]);
    }

    void applyStyle(Style style) {
        int marginRight = style.getMarginRight();
        int marginLeft = style.getMarginLeft();

        if (((marginLeft > 0) || (marginRight > 0)) && !marginApplied) {
            // TODO: optimize maximum indentation
            if (marginLeft > 100) {
                marginLeft = 100;
            }

            if (marginRight > 100) {
                marginRight = 100;
            }

            currentParagraph.addMargin(marginLeft, marginRight);
            marginApplied = true;
        }

        if (currentAlignment != style.getAlignment()) {
            switch (style.getAlignment()) {
            case Style.ALIGN_LEFT:
                currentParagraph.addAlignLeft();
                break;
            case Style.ALIGN_CENTER:
                currentParagraph.addAlignCenter();
                break;
            case Style.ALIGN_RIGHT:
                currentParagraph.addAlignRight();
                break;
            case Style.ALIGN_JUSTIFY:
                currentParagraph.addAlignJustify();
                break;
            }
        }

        currentAlignment = style.getAlignment();

        if (style.isItalic()) {
            if (!italicApplied) {
                currentParagraph.addItalicStart();
                italicApplied = true;
            }
        } else if (italicApplied) {
            currentParagraph.addItalicEnd();
            italicApplied = false;
        }

        if (style.isUnderline()) {
            if (!underlineApplied) {
                currentParagraph.addUnderlineStart();
                underlineApplied = true;
            }
        } else if (underlineApplied) {
            currentParagraph.addUnderlineEnd();
            underlineApplied = false;
        }

        if (style.isStrikethrough()) {
            if (!strikeThroughApplied) {
                currentParagraph.addStrikethroughStart();
                strikeThroughApplied = true;
            }
        } else if (strikeThroughApplied) {
            currentParagraph.addStrikethroughEnd();
            strikeThroughApplied = false;
        }

        if (style.getColor() != null) {
            if (!colorApplied || !style.getColor().equals(currentColor)) {
                currentParagraph.addTextColor(style.getColor());
                colorApplied = true;
            }
        } else {
            if (colorApplied) {
                currentParagraph.addTextColor(Color.BLACK);
                colorApplied = false;
            }
        }

        currentColor = style.getColor();

        if (style.getHeading() != currentHeading) {
            switch (style.getHeading()) {
            case 0:

                if (style.useRegularFont()) {
                    currentParagraph.addFontRegular();
                }

                break;
            case 1:
                currentParagraph.addFontH1();
                break;
            case 2:
                currentParagraph.addFontH2();
                break;
            case 3:
                currentParagraph.addFontH3();
                break;
            case 4:
                currentParagraph.addFontH4();
                break;
            case 5:
                currentParagraph.addFontH5();
                break;
            case 6:
                currentParagraph.addFontH6();
                break;
            }

            currentHeading = style.getHeading();

            if (currentHeading > 0) {
                // Heading styles take precendence over regular text, bold and fixed width fonts. Skip the rest.
                boldApplied = false;
                fixedWidthApplied = false;
                subscriptApplied = false;
                superscriptApplied = false;
                return;
            }
        }

        //if (style.isRegular()) {
        if (style.useRegularFont()) {
            if (boldApplied || fixedWidthApplied || subscriptApplied ||
                    superscriptApplied) {
                currentParagraph.addFontRegular();
                boldApplied = false;
                fixedWidthApplied = false;
                subscriptApplied = false;
                superscriptApplied = false;
            }

            // Font is regular, we don't need to check other styles.
            return;
        }

        if (style.isBold() && !boldApplied) {
            currentParagraph.addFontBold();
            boldApplied = true;
            fixedWidthApplied = false;
            subscriptApplied = false;
            superscriptApplied = false;
        }

        if (style.isFixedWidth() && !fixedWidthApplied) {
            currentParagraph.addFontFixedWidth();
            fixedWidthApplied = true;
            boldApplied = false;
            subscriptApplied = false;
            superscriptApplied = false;
        }

        if (style.isSubscript() && !subscriptApplied) {
            currentParagraph.addFontSubscript();
            subscriptApplied = true;
            superscriptApplied = false;
            fixedWidthApplied = false;
            boldApplied = false;
        }

        if (style.isSuperscript() && !superscriptApplied) {
            currentParagraph.addFontSuperscript();
            superscriptApplied = true;
            subscriptApplied = false;
            fixedWidthApplied = false;
            boldApplied = false;
        }
    }

    void marginApplied() {
        marginApplied = true;
    }
}
TOP

Related Classes of net.sf.jpluck.plucker.parsing.html.HTMLSerializer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.