Package

Source Code of IncludeCrawler$Context

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;

import javax.imageio.ImageIO;

import com.quiotix.html.parser.HtmlDocument;
import com.quiotix.html.parser.HtmlDocument.Attribute;

class IncludeCrawler extends HTMLSpider {

    public static final String INITIAL_STATUS    = "initial";
    public static final String VALIDATED_STATUS  = "validated";
    public static final String AGREED_STATUS     = "agreed";
    public static final String FINAL_STATUS      = "final";
    public static final String INVALID_STATUS    = "invalid";
    public static final String STATUSES_DISABLED = null;
    public static       String DEFAULT_DOC_STATUS = INITIAL_STATUS;

    private class Context {
        File outputFile;
        File imageDirectory;
        String imageSubdir;
        PrintWriter out;
        Map anchorMap;
        ParagraphNumberer numberer;
        Vector prefixMap;
    }

    Context context;
    String docID;
    String skipThroughTag = null;
    int headingIncrement = 0;
    int imageNumber = 0;
    private boolean root;
    boolean printComments = true;
    boolean inHeading = false;
    private String docStatus = DEFAULT_DOC_STATUS;

    static int undefID = 0;


    public IncludeCrawler(File outputFile, String imageSubdir,
                          Map anchorMap, URL u,
                          ParagraphNumberer numberer) throws IOException
    {
        context = new Context();
        context.outputFile = outputFile;
        context.imageSubdir = imageSubdir;
        if (imageSubdir == null || "none".equalsIgnoreCase(imageSubdir))
            context.imageDirectory = null;
        else {
            context.imageDirectory =
                new File(outputFile.getParentFile(), imageSubdir);
            context.imageDirectory.mkdirs();
        }
        context.anchorMap = anchorMap;
        context.out = new PrintWriter(new FileWriter(outputFile));
        context.numberer = numberer;
        context.prefixMap = buildPrefixMap(outputFile.toURL());
        root = true;
        openURL(u);
    }
    public void finish() {
        context.out.flush();

        if (root)
            context.out.close();
    }

    protected IncludeCrawler(IncludeCrawler that, HtmlDocument.Tag t) {
        super(that);

        this.context = that.context;
        root = false;

        // look at the Tag and decide whether to skip until we find the
        // <body> tag, or the </h1> tag.
        skipThroughTag = "BODY";

        // should the document status be inherited from the parent document,
        // or should we always default it unless it has been specified?
        // docStatus = that.docStatus;
        docStatus = DEFAULT_DOC_STATUS;
        headingIncrement = that.headingIncrement;

        Attribute demote = getAttribute(t, "demoteTo");
        if (demote != null) try {
            headingIncrement += Integer.parseInt(deQuote(demote.value)) - 1;
        } catch (NumberFormatException nfe) {
        }
    }

    public void openURL(URL u) {
        String anchorName = getAnchorName(u);
        if (anchorName != null)
                                // strip "_top" from the end.
            docID = anchorName.substring(0, anchorName.length()-4);
        else
            docID = "undefined" + undefID++;

        super.openURL(u);
    }

    protected HTMLSpider getRecursiveInstance(HtmlDocument.Tag t) {
        skipThroughTag = "/A";
        endDocumentDivision();
        return new IncludeCrawler(this, t);
    }

    protected boolean print(Object o) {
        if (skipThroughTag == null) {
            context.out.print(o);
            return true;
        } else {
            return false;
        }
    }

    private void checkForSkipTag(String tagname) {
        if (skipThroughTag != null &&
            skipThroughTag.equalsIgnoreCase(tagname))
            skipThroughTag = null;
    }

    public void visit(HtmlDocument.Tag t) {
        String tagName = t.tagName;

        if (isHeading(tagName))
            visitHeading(t);

        else if (tagName.equalsIgnoreCase("A"))
            visitAnchor(t);

        else if (tagName.equalsIgnoreCase("IMG"))
            visitImg(t);

        else if (tagName.equalsIgnoreCase("SPAN"))
            visitSpan(t);

        else if (tagName.equalsIgnoreCase("META"))
            visitMeta(t);

        super.visit(t);

        if (!print(t))
            checkForSkipTag(tagName);
        else if (context.numberer != null && isHeading(tagName))
            printParagraphNumber(t);

        // if this was the <body> tag, print out the "top" anchor for
        // this document.
        if (tagName.equalsIgnoreCase("BODY")) {
            String anchorName = getAnchorName(documentURL);
            printAnchor(anchorName);
            startDocumentDivision();
        }
    }

    public void visit(HtmlDocument.EndTag t) {

        if (root && t.tagName.equalsIgnoreCase("HEAD"))
            printStyleSheet();

        // if this is a heading, increment it.
        // don't increment if we're currently skipping tags, since
        // we might be skipping until the </h1> tag.
        if (isHeading(t.tagName)) {
            inHeading = false;
            if (context.numberer != null)
                context.numberer.headingTextDone();
            if (skipThroughTag == null)
                t.tagName = incrHeading(t.tagName);
        }

        // if we just saw the </body> tag, start skipping forever, or
        // until we see another <body> tag (which would be malformed
        // html)
        if (!root && t.tagName.equalsIgnoreCase("BODY")) {
            skipThroughTag = "BODY";
            endDocumentDivision();
        }

        if (!print(t)) checkForSkipTag("/" + t.tagName);
    }

    public void visitHeading(HtmlDocument.Tag t) {
        t.tagName = incrHeading(t.tagName);
        inHeading = true;
    }

    public void visitHref(HtmlDocument.Tag t, Attribute href) {

        super.visitHref(t, href);

        URL url = resolveURL(href);
        String mapped = getAnchorName(url);
        if (mapped != null)
            setAttribute(href, "#" + mapped);
        else {
            mapped = rewriteRelativeURL(url);
            setAttribute(href, mapped);
        }

        // if following an included document link closed our document
        // status division, restart it.
        if (!inDocumentDivision) startDocumentDivision();
    }

    public void visitAnchor(HtmlDocument.Tag t) {
        Attribute name = getAttribute(t, "NAME");
        if (name != null) {
            URL anchorUrl = resolveHash(name);
            String anchorName = getAnchorName(anchorUrl);
            if (anchorName != null) {
                setAttribute(name, anchorName);
                if (//inHeading &&
                    context.numberer != null)
                    context.numberer.addUserAnchor(anchorName);
            }
        }
    }

    public void visitSpan(HtmlDocument.Tag t) {
        CrossReferencer.getCrossReference(t, docID + "_");
    }

    public void visitImg(HtmlDocument.Tag t) {
        Attribute src = getAttribute(t, "SRC");
        if (src != null) try {

            URL imageURL = resolveURL(src);
            if (imageURL == null) return;

            if (context.imageDirectory == null)
                rewriteImageURL(src, imageURL);
            else
                copyImage(src, imageURL);
           
            // Adding the "width" and "height" attributes to the image tag, but
            // only if they are not already set.
            boolean widthSet = getAttribute(t, "width") != null;
            boolean heightSet = getAttribute(t, "height") != null;
           
            if (!widthSet || !heightSet) {
                BufferedImage image = ImageIO.read(imageURL.openStream());
               
                if (image != null)  {
                    addImgSizeAttributes(t, widthSet, heightSet, image);
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }
   
    private void addImgSizeAttributes(HtmlDocument.Tag t,
            boolean widthSet, boolean heightSet,
            BufferedImage image) {
        int width = image.getWidth();
        int height = image.getHeight();
       
        if (!widthSet) {
            t.attributeList.addAttribute(
                new Attribute("width", "\"" + width + "\""));
        }
       
        if (!heightSet) {
            t.attributeList.addAttribute(
            new Attribute("height", "\"" + height + "\""));
        }
    }

    public void rewriteImageURL(Attribute src, URL imageURL)
        throws IOException
    {
        String mapped = rewriteRelativeURL(imageURL);
        setAttribute(src, mapped);
    }

    public void copyImage(Attribute src, URL imageURL) throws IOException {

        // check to ensure that the image file exists and we can
        // get to it.
        URLConnection conn = imageURL.openConnection();
        conn.connect();
        InputStream in = conn.getInputStream();

        // get the name of the original file.
        String originalFilename = imageURL.getFile();
        int queryPos = originalFilename.indexOf('?');
        if (queryPos != -1)
            originalFilename = originalFilename.substring(0, queryPos);

        // determine the filename extension of the image file.
        String extension;
        int extStart = originalFilename.lastIndexOf('.');
        if (extStart != -1)
            extension = originalFilename.substring(extStart);
        else
            extension = "";

        // construct the new filename for the image, and open an
        // output stream to it.
        String outputName = docID + "_" + imageNumber++ + extension;
        File outFile = new File(context.imageDirectory, outputName);
        FileOutputStream fos = new FileOutputStream(outFile);

        // copy the image from the original location to the new
        // location
        DocSpider.copyFile(in, fos);
        fos.close();

        // rewrite the SRC attribute to point to the new file.
        String newSrc = context.imageSubdir + "/" + outputName;
        setAttribute(src, newSrc);

    }
    public void visitMeta(HtmlDocument.Tag t) {
        if (DEFAULT_DOC_STATUS == STATUSES_DISABLED) return;
        Attribute a = getAttribute(t, "NAME");
        if (a == null) return;
        if ("Generator".equalsIgnoreCase(deQuote(a.value))) {
            a = getAttribute(t, "content");
            if (a != null && a.value.indexOf("Microsoft") != -1)
                printComments = false;
        } else if ("DocStatus".equalsIgnoreCase(deQuote(a.value))) {
            a = getAttribute(t, "content");
            if (a != null) docStatus = deQuote(a.value);
        }
    }

    public boolean isHeading(String tagName) {
        return (tagName.length() == 2 &&
                "hH".indexOf(tagName.charAt(0)) != -1 &&
                "123456789".indexOf(tagName.charAt(1)) != -1);
    }
    public String incrHeading(String tagName) {
        if (headingIncrement == 0) return tagName;

        char headingNum = tagName.charAt(1);
        headingNum += (char) headingIncrement;
        return tagName.substring(0, 1) + headingNum;
    }


    public void printParagraphNumber(HtmlDocument.Tag t) {
        if (getAttribute(t, "NO_NUMBER") != null) return;

        int headingNum = t.tagName.charAt(1) - '0';
        String paraNum = context.numberer.getNextNumber(headingNum);
        context.numberer.setUserStatus(docStatus);
        print(paraNum);
        print(" ");
    }

    public void printAnchor(String anchorName) {
        // always print anchors, even if skipping is on.
        context.out.print("<a name=\"" + anchorName + "\"></a>");
    }

    public String getAnchorName(URL u) {
        return (String) context.anchorMap.get(normalizeURL(u));
    }


    public void visit(HtmlDocument.Comment c)    {
        if (printComments)
            print(c);
    }
    public void visit(HtmlDocument.Annotation a) { print(a);    }

    public void visit(HtmlDocument.Newline n) {
        if (inHeading && context.numberer != null)
            context.numberer.appendHeadingText(" ");
        if (print("")) context.out.println();
    }

    public void visit(HtmlDocument.Text t) {
        // if we're in a heading, and paragraph numbering is on,
        if (inHeading && context.numberer != null) {
            if (context.numberer.isParagraphNumber(t.toString()))
                // this looks like a paragraph number,
                return;         // then don't print anything.
            else
                context.numberer.appendHeadingText(t.toString());
        } else {
            int pos = t.toString().indexOf(BEGIN_APPENDIX_TAG);
            if (pos != -1) {
                if (context.numberer != null)
                    context.numberer.startInitialAlpha();
                String text = t.toString();
                text = (text.substring(0, pos) +
                        text.substring(pos+BEGIN_APPENDIX_TAG.length()));
                t.text = text;
            }
        }
        if (t.toString().indexOf(CrossReferencer.TABLE_OF_CONTENTS_TAG)!= -1) {
            endDocumentDivision();
            print(t);
            startDocumentDivision();
        } else {
            print(t);
        }
    }

    boolean inDocumentDivision = false;
    protected void startDocumentDivision() {
        if (inDocumentDivision) endDocumentDivision();
        if (docStatus == STATUSES_DISABLED)
            context.out.println("<div>");
        else
            context.out.println("<div class=\"" + docStatus.toUpperCase() + "_STATUS\">");
        inDocumentDivision = true;
    }
    protected void endDocumentDivision() {
        if (!inDocumentDivision) return;
        context.out.println("</div>");
        inDocumentDivision = false;
    }

    protected void printStyleSheet() {
        if (DEFAULT_DOC_STATUS == STATUSES_DISABLED) return;
        context.out.println("<style>");
        context.out.println(".INITIAL_STATUS { border-left: .25in solid #CCCCFF; padding-left: .1in }");
        context.out.println(".VALIDATED_STATUS { border-left: .25in solid #aaffff; padding-left: .1in }");
        context.out.println(".AGREED_STATUS { border-left: .25in solid #CCFF99; padding-left: .1in }");
        context.out.println(".FINAL_STATUS { border-left: .25in solid #ffffff; padding-left: .1in }");
        context.out.println(".INVALID_STATUS { border-left: .25in solid #CC9900; padding-left: .1in }");
        context.out.println(".INITIAL_TOC_STATUS { background: #CCCCFF }");
        context.out.println(".VALIDATED_TOC_STATUS { background: #aaffff }");
        context.out.println(".AGREED_TOC_STATUS { background: #CCFF99 }");
        context.out.println(".FINAL_TOC_STATUS { background: #ffffff }");
        context.out.println(".INVALID_TOC_STATUS { background: #CC9900 }");
        context.out.println("</STYLE>");
    }

    private static final String BEGIN_APPENDIX_TAG = "BEGIN_APPENDIX";

    private String chopURL(String u) {
        // chop the final slash if it exists.
        if (u.endsWith("/"))
            u = u.substring(0, u.length()-1);

        // find the last slash in the string.
        int slashPos = u.lastIndexOf('/');
        if (slashPos == -1)
            return null;
        else
            return u.substring(0, slashPos+1);
    }

    public Vector buildPrefixMap(URL u) {
        System.err.println("buildPrefixMap("+u+")");
        Vector result = new Vector();
        try {
            URL base = new URL(u, "/");
            String url = u.toString();
            String baseURL = base.toString();
            baseURL = baseURL.substring(0, baseURL.length()-1);
            url = url.substring(baseURL.length());
            url = chopURL(url);
            String newPrefix = "";
            do {
                PrefixMap m = new PrefixMap();
                m.oldPrefix = normalizeURL(baseURL + url);
                m.newPrefix = newPrefix;
                result.add(m);

                url = chopURL(url);
                newPrefix = newPrefix + "../";
            } while (url != null);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    private class PrefixMap {
        public String oldPrefix;
        public String newPrefix;
    }

    public String rewriteRelativeURL(URL u) {
        String url = u.toString();
        String norm = normalizeURL(url);
        Iterator i = context.prefixMap.iterator();
        PrefixMap m;
        while (i.hasNext()) {
            m = (PrefixMap) i.next();
            if (norm.startsWith(m.oldPrefix))
                return m.newPrefix + url.substring(m.oldPrefix.length());
        }

        if (url.startsWith("file:"))
            System.err.println("Can't rewrite URL as a relative URL: " + url);

        return url;
    }

}
TOP

Related Classes of IncludeCrawler$Context

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.