import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;
import javax.imageio.ImageIO;
import com.quiotix.html.parser.HtmlDocument;
import com.quiotix.html.parser.HtmlDocument.Attribute;
class IncludeCrawler extends HTMLSpider {
// Document status names. startDocumentDivision() upper-cases the current
// status and appends "_STATUS" to form the CSS class of the enclosing <div>;
// printStyleSheet() emits style rules for exactly these five names.
public static final String INITIAL_STATUS = "initial";
public static final String VALIDATED_STATUS = "validated";
public static final String AGREED_STATUS = "agreed";
public static final String FINAL_STATUS = "final";
public static final String INVALID_STATUS = "invalid";
// Sentinel (null): when DEFAULT_DOC_STATUS equals this value, <meta>
// processing and the status style sheet are skipped.
public static final String STATUSES_DISABLED = null;
// Status assigned to every crawler until a "DocStatus" <meta> tag replaces it.
// Not final — presumably reassigned by external configuration code; verify.
public static String DEFAULT_DOC_STATUS = INITIAL_STATUS;
// Output state that is created once by the public constructor and then
// shared (same instance) with every crawler built by the copy constructor.
private class Context {
    // --- where the merged document is written ---
    // File receiving the merged HTML.
    File outputFile;
    // Writer opened on outputFile.
    PrintWriter out;

    // --- image handling ---
    // Image directory name, relative to the output file's directory.
    String imageSubdir;
    // Directory images are copied into; null means images are not copied.
    File imageDirectory;

    // --- link rewriting ---
    // Looked up with a normalized URL; yields the anchor name as a String.
    Map anchorMap;
    // PrefixMap entries produced by buildPrefixMap().
    Vector prefixMap;

    // --- numbering ---
    // Paragraph numberer; null when numbering is off.
    ParagraphNumberer numberer;
}
// Output state shared with the crawlers of included documents.
Context context;
// Identifier of the current document, set by openURL(); used as a prefix
// for copied image file names and for cross references.
String docID;
// When non-null, print() suppresses output until checkForSkipTag() is
// called with a matching tag name.
String skipThroughTag = null;
// Amount added to the level digit of every heading tag.
int headingIncrement = 0;
// Sequence number used to name the next copied image.
int imageNumber = 0;
// True only for the crawler created by the public constructor.
private boolean root;
// False once a "Generator" <meta> tag mentioning "Microsoft" has been seen;
// comments are then no longer printed.
boolean printComments = true;
// True between a heading start tag and the matching end tag.
boolean inHeading = false;
// Status of the current document; may be replaced by a "DocStatus" <meta> tag.
private String docStatus = DEFAULT_DOC_STATUS;
// Counter used to build "undefinedN" document ids for URLs that have no
// entry in the anchor map.
static int undefID = 0;
public IncludeCrawler(File outputFile, String imageSubdir,
Map anchorMap, URL u,
ParagraphNumberer numberer) throws IOException
{
context = new Context();
context.outputFile = outputFile;
context.imageSubdir = imageSubdir;
if (imageSubdir == null || "none".equalsIgnoreCase(imageSubdir))
context.imageDirectory = null;
else {
context.imageDirectory =
new File(outputFile.getParentFile(), imageSubdir);
context.imageDirectory.mkdirs();
}
context.anchorMap = anchorMap;
context.out = new PrintWriter(new FileWriter(outputFile));
context.numberer = numberer;
context.prefixMap = buildPrefixMap(outputFile.toURL());
root = true;
openURL(u);
}
// Flushes the shared writer; the root crawler additionally closes it.
public void finish() {
    PrintWriter writer = context.out;
    writer.flush();
    if (root) {
        writer.close();
    }
}
// Creates a crawler for an included document. It shares the parent's output
// context, starts out skipping everything up to the <body> tag, and inherits
// the parent's heading increment, optionally deepened by the "demoteTo"
// attribute of the including tag.
//
// that - the crawler that encountered the include link
// t    - the tag that triggered the include
protected IncludeCrawler(IncludeCrawler that, HtmlDocument.Tag t) {
    super(that);
    this.context = that.context;
    root = false;

    // Output is suppressed until the included document's <body> tag.
    skipThroughTag = "BODY";

    // The status is deliberately not inherited from the parent document;
    // it always starts from the default unless a <meta> tag specifies it.
    docStatus = DEFAULT_DOC_STATUS;

    headingIncrement = that.headingIncrement;
    Attribute demote = getAttribute(t, "demoteTo");
    if (demote != null) {
        try {
            // demoteTo="N" turns this document's <h1> into <hN>.
            headingIncrement += Integer.parseInt(deQuote(demote.value)) - 1;
        } catch (NumberFormatException nfe) {
            // Still best-effort (the attribute is ignored), but no longer
            // silent, so a typo in the source document can be found.
            System.err.println("Ignoring non-numeric demoteTo value: "
                               + demote.value);
        }
    }
}
// Derives the document id for u and then lets the superclass open it.
//
// The id is the document's anchor name without its trailing "_top"; URLs
// that have no anchor get a generated "undefinedN" id.
public void openURL(URL u) {
    final String topSuffix = "_top";
    String anchorName = getAnchorName(u);
    if (anchorName == null) {
        docID = "undefined" + undefID++;
    } else if (anchorName.endsWith(topSuffix)) {
        docID = anchorName.substring(
            0, anchorName.length() - topSuffix.length());
    } else {
        // Previously four characters were removed unconditionally, which
        // threw on names shorter than four characters and mangled names
        // without the suffix. Such a name is now used unchanged.
        docID = anchorName;
    }
    super.openURL(u);
}
// Supplies the crawler used for an included document. Before handing it
// out, this crawler closes its open status division and suppresses its own
// output through the closing </a> of the include link.
protected HTMLSpider getRecursiveInstance(HtmlDocument.Tag t) {
    endDocumentDivision();
    skipThroughTag = "/A";
    IncludeCrawler child = new IncludeCrawler(this, t);
    return child;
}
// Writes o to the output unless output is currently being skipped.
// Returns true if o was written, false if it was suppressed.
protected boolean print(Object o) {
    if (skipThroughTag != null) {
        return false;
    }
    context.out.print(o);
    return true;
}
// Ends skipping when tagname is the tag being waited for (case-insensitive
// comparison). End tags are passed with a leading "/".
private void checkForSkipTag(String tagname) {
    if (skipThroughTag == null) {
        return;
    }
    if (skipThroughTag.equalsIgnoreCase(tagname)) {
        skipThroughTag = null;
    }
}
// Handles a start tag: runs the tag-specific hook, lets the superclass
// process the tag, prints it (or checks whether it ends skipping) and, for
// <body>, emits the document's top anchor and opens the status division.
public void visit(HtmlDocument.Tag t) {
    // Captured up front: visitHeading() rewrites t.tagName, while the checks
    // below must see the name as it appeared in the source document.
    final String name = t.tagName;

    if (isHeading(name)) {
        visitHeading(t);
    } else if ("A".equalsIgnoreCase(name)) {
        visitAnchor(t);
    } else if ("IMG".equalsIgnoreCase(name)) {
        visitImg(t);
    } else if ("SPAN".equalsIgnoreCase(name)) {
        visitSpan(t);
    } else if ("META".equalsIgnoreCase(name)) {
        visitMeta(t);
    }

    super.visit(t);

    boolean emitted = print(t);
    if (!emitted) {
        checkForSkipTag(name);
    } else if (context.numberer != null && isHeading(name)) {
        printParagraphNumber(t);
    }

    // The <body> tag is followed by the "top" anchor of this document.
    if ("BODY".equalsIgnoreCase(name)) {
        printAnchor(getAnchorName(documentURL));
        startDocumentDivision();
    }
}
// Handles an end tag: emits the style sheet before the root document's
// </head>, finishes heading bookkeeping, starts skipping after an included
// document's </body>, and finally prints the tag or checks whether it ends
// skipping.
public void visit(HtmlDocument.EndTag t) {
    if (root && t.tagName.equalsIgnoreCase("HEAD")) {
        printStyleSheet();
    }

    if (isHeading(t.tagName)) {
        inHeading = false;
        ParagraphNumberer numberer = context.numberer;
        if (numberer != null) {
            numberer.headingTextDone();
        }
        // The level is left alone while skipping, because the tag being
        // waited for might be this very end tag.
        if (skipThroughTag == null) {
            t.tagName = incrHeading(t.tagName);
        }
    }

    // After </body> of an included document, skip forever, or until
    // another <body> tag shows up (which would be malformed HTML).
    boolean closesIncludedBody =
        !root && t.tagName.equalsIgnoreCase("BODY");
    if (closesIncludedBody) {
        skipThroughTag = "BODY";
        endDocumentDivision();
    }

    boolean emitted = print(t);
    if (!emitted) {
        checkForSkipTag("/" + t.tagName);
    }
}
// Marks the start of a heading and applies the heading increment to the
// tag's name.
public void visitHeading(HtmlDocument.Tag t) {
    inHeading = true;
    String shifted = incrHeading(t.tagName);
    t.tagName = shifted;
}
// Rewrites a link target. A URL that has an entry in the anchor map becomes
// an in-page "#anchor" reference; any other URL is rewritten relative to
// the output file.
//
// t    - the tag carrying the link
// href - the link attribute to rewrite in place
public void visitHref(HtmlDocument.Tag t, Attribute href) {
    super.visitHref(t, href);

    URL url = resolveURL(href);
    // resolveURL() can return null (visitImg() checks for it as well). An
    // unresolvable link is left untouched instead of being passed on to
    // rewriteRelativeURL(), which dereferences its argument.
    if (url != null) {
        String mapped = getAnchorName(url);
        if (mapped != null) {
            setAttribute(href, "#" + mapped);
        } else {
            setAttribute(href, rewriteRelativeURL(url));
        }
    }

    // If following an included-document link closed our status division,
    // restart it.
    if (!inDocumentDivision) {
        startDocumentDivision();
    }
}
// Renames a named anchor to its entry in the anchor map, if it has one,
// and reports the new name to the paragraph numberer.
public void visitAnchor(HtmlDocument.Tag t) {
    Attribute nameAttr = getAttribute(t, "NAME");
    if (nameAttr == null) {
        return;
    }

    String mappedName = getAnchorName(resolveHash(nameAttr));
    if (mappedName == null) {
        return;
    }

    setAttribute(nameAttr, mappedName);
    // The original also carried a disabled "inHeading &&" condition here;
    // anchors are reported whether or not they sit inside a heading.
    if (context.numberer != null) {
        context.numberer.addUserAnchor(mappedName);
    }
}
// Passes a <span> tag to the cross referencer together with this
// document's id prefix (docID followed by an underscore).
public void visitSpan(HtmlDocument.Tag t) {
    String idPrefix = docID + "_";
    CrossReferencer.getCrossReference(t, idPrefix);
}
// Processes an <img> tag: either rewrites its SRC relative to the output
// file or copies the image into the image directory, then adds any missing
// "width"/"height" attribute from the image's actual dimensions.
// I/O failures are reported on stderr and otherwise ignored.
public void visitImg(HtmlDocument.Tag t) {
    Attribute src = getAttribute(t, "SRC");
    if (src == null) {
        return;
    }
    try {
        URL imageURL = resolveURL(src);
        if (imageURL == null) {
            return;
        }

        if (context.imageDirectory == null) {
            rewriteImageURL(src, imageURL);
        } else {
            copyImage(src, imageURL);
        }

        // Add "width" and "height", but only those not already set.
        boolean widthSet = getAttribute(t, "width") != null;
        boolean heightSet = getAttribute(t, "height") != null;
        if (widthSet && heightSet) {
            return;
        }

        // ImageIO.read(InputStream) does not close the stream it is given,
        // so it is closed here; previously it was leaked for every image.
        BufferedImage image;
        InputStream imageStream = imageURL.openStream();
        try {
            image = ImageIO.read(imageStream);
        } finally {
            imageStream.close();
        }

        // read() returns null when no registered reader understands the data.
        if (image != null) {
            addImgSizeAttributes(t, widthSet, heightSet, image);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
// Appends quoted "width" and/or "height" attributes taken from image to
// the tag, for whichever of the two is not yet present.
private void addImgSizeAttributes(HtmlDocument.Tag t,
                                  boolean widthSet, boolean heightSet,
                                  BufferedImage image) {
    if (!widthSet) {
        String quotedWidth = "\"" + image.getWidth() + "\"";
        t.attributeList.addAttribute(new Attribute("width", quotedWidth));
    }
    if (!heightSet) {
        String quotedHeight = "\"" + image.getHeight() + "\"";
        t.attributeList.addAttribute(new Attribute("height", quotedHeight));
    }
}
// Points src at imageURL, expressed relative to the output file where that
// is possible (see rewriteRelativeURL).
public void rewriteImageURL(Attribute src, URL imageURL)
    throws IOException
{
    setAttribute(src, rewriteRelativeURL(imageURL));
}
// Copies the image at imageURL into the image directory under the name
// "<docID>_<n><extension>" and points src at the copy.
//
// src      - the SRC attribute to rewrite
// imageURL - location of the original image
//
// Throws IOException if the image cannot be read or the copy cannot be
// written.
public void copyImage(Attribute src, URL imageURL) throws IOException {
    // Opening the stream first ensures the image exists and is reachable
    // before anything is created on disk.
    URLConnection conn = imageURL.openConnection();
    conn.connect();
    InputStream in = conn.getInputStream();
    try {
        // Name of the original file, without any query string.
        String originalFilename = imageURL.getFile();
        int queryPos = originalFilename.indexOf('?');
        if (queryPos != -1) {
            originalFilename = originalFilename.substring(0, queryPos);
        }

        // Filename extension. Only a dot in the last path segment counts:
        // for "/a.b/image" the old code produced the "extension" ".b/image",
        // which put a path separator into the output file name.
        String extension = "";
        int extStart = originalFilename.lastIndexOf('.');
        if (extStart > originalFilename.lastIndexOf('/')) {
            extension = originalFilename.substring(extStart);
        }

        String outputName = docID + "_" + imageNumber++ + extension;
        File outFile = new File(context.imageDirectory, outputName);

        // The output stream is closed even if the copy fails part-way.
        FileOutputStream fos = new FileOutputStream(outFile);
        try {
            DocSpider.copyFile(in, fos);
        } finally {
            fos.close();
        }

        // Rewrite the SRC attribute to point to the new file.
        setAttribute(src, context.imageSubdir + "/" + outputName);
    } finally {
        // The input stream used to be left open. A failure while closing it
        // is ignored: by then the copy has either completed or already
        // failed with its own exception.
        try {
            in.close();
        } catch (IOException ignored) {
        }
    }
}
// Interprets two kinds of <meta> tag:
//   NAME="Generator" - a content value mentioning "Microsoft" turns comment
//                      printing off;
//   NAME="DocStatus" - the content value becomes this document's status.
// Nothing is done when statuses are disabled.
// NOTE(review): that early exit also skips the Generator check, which does
// not look status-related — confirm this is intended.
public void visitMeta(HtmlDocument.Tag t) {
    if (DEFAULT_DOC_STATUS == STATUSES_DISABLED) {
        return;
    }
    Attribute nameAttr = getAttribute(t, "NAME");
    if (nameAttr == null) {
        return;
    }

    String metaName = deQuote(nameAttr.value);
    boolean isGenerator = "Generator".equalsIgnoreCase(metaName);
    boolean isDocStatus =
        !isGenerator && "DocStatus".equalsIgnoreCase(metaName);
    if (!isGenerator && !isDocStatus) {
        return;
    }

    Attribute content = getAttribute(t, "content");
    if (content == null) {
        return;
    }
    if (isGenerator) {
        if (content.value.indexOf("Microsoft") != -1) {
            printComments = false;
        }
    } else {
        docStatus = deQuote(content.value);
    }
}
// Returns true when tagName is exactly two characters long: the letter
// 'h' (either case) followed by a digit from 1 to 9.
public boolean isHeading(String tagName) {
    if (tagName.length() != 2) {
        return false;
    }
    char letter = tagName.charAt(0);
    char level = tagName.charAt(1);
    boolean startsWithH = (letter == 'h' || letter == 'H');
    return startsWithH && level >= '1' && level <= '9';
}
// Applies headingIncrement to a heading tag name, e.g. "h2" with an
// increment of 1 becomes "h3". The letter keeps its original case.
//
// The resulting level is clamped to 1..9, the range isHeading() accepts.
// The previous character arithmetic could leave the digit range ("h9" + 1
// gave "h:", "h1" - 1 gave "h0"), producing a tag that is no longer a
// heading. Names that are not headings are returned unchanged.
public String incrHeading(String tagName) {
    if (headingIncrement == 0 || !isHeading(tagName)) {
        return tagName;
    }
    int level = (tagName.charAt(1) - '0') + headingIncrement;
    if (level < 1) {
        level = 1;
    } else if (level > 9) {
        level = 9;
    }
    return tagName.substring(0, 1) + level;
}
// Prints the next paragraph number for heading tag t, followed by a space,
// unless the tag carries a NO_NUMBER attribute. The numberer is also told
// the current document status.
public void printParagraphNumber(HtmlDocument.Tag t) {
    boolean numberingSuppressed = getAttribute(t, "NO_NUMBER") != null;
    if (numberingSuppressed) {
        return;
    }
    // The heading level is the digit in the (already incremented) tag name.
    int level = t.tagName.charAt(1) - '0';
    ParagraphNumberer numberer = context.numberer;
    String number = numberer.getNextNumber(level);
    numberer.setUserStatus(docStatus);
    print(number);
    print(" ");
}
// Writes an empty named anchor for anchorName. The anchor is written even
// while output is being skipped, so it bypasses print().
//
// A null name is ignored: getAnchorName() yields null for URLs missing from
// the anchor map (openURL() handles that case), and printing it used to
// produce a bogus <a name="null"></a>.
public void printAnchor(String anchorName) {
    if (anchorName == null) {
        return;
    }
    context.out.print("<a name=\"" + anchorName + "\"></a>");
}
// Looks up the anchor name stored in the anchor map under the normalized
// form of u. Returns whatever the map holds for that key (null if absent).
public String getAnchorName(URL u) {
    Object anchor = context.anchorMap.get(normalizeURL(u));
    return (String) anchor;
}
// Prints a comment, unless comment printing has been switched off.
public void visit(HtmlDocument.Comment c) {
    if (!printComments) {
        return;
    }
    print(c);
}
// Prints an annotation (subject to skipping, like any other output).
public void visit(HtmlDocument.Annotation a) {
    print(a);
}
// Handles a line break: inside a heading it contributes a single space to
// the collected heading text; in the output it becomes a newline unless
// output is being skipped.
public void visit(HtmlDocument.Newline n) {
    ParagraphNumberer numberer = context.numberer;
    if (inHeading && numberer != null) {
        numberer.appendHeadingText(" ");
    }
    // print("") writes nothing visible; its result tells whether output is
    // currently enabled.
    boolean outputEnabled = print("");
    if (outputEnabled) {
        context.out.println();
    }
}
// Handles a run of text.
//  - Inside a heading with numbering on: text that looks like a paragraph
//    number is dropped; any other text is added to the heading text.
//  - Otherwise: a BEGIN_APPENDIX marker is removed from the text and
//    switches the numberer (if any) to its initial-alpha mode.
// The text is then printed; text containing the table-of-contents tag is
// printed outside the document status division.
public void visit(HtmlDocument.Text t) {
    boolean collectingHeading = inHeading && context.numberer != null;
    if (collectingHeading) {
        String headingText = t.toString();
        if (context.numberer.isParagraphNumber(headingText)) {
            // An existing paragraph number is not printed at all.
            return;
        }
        context.numberer.appendHeadingText(headingText);
    } else {
        String body = t.toString();
        int markerAt = body.indexOf(BEGIN_APPENDIX_TAG);
        if (markerAt != -1) {
            if (context.numberer != null) {
                context.numberer.startInitialAlpha();
            }
            String beforeMarker = body.substring(0, markerAt);
            String afterMarker =
                body.substring(markerAt + BEGIN_APPENDIX_TAG.length());
            t.text = beforeMarker + afterMarker;
        }
    }

    // Evaluated after the marker removal above, on the text's current value.
    boolean holdsTableOfContents =
        t.toString().indexOf(CrossReferencer.TABLE_OF_CONTENTS_TAG) != -1;
    if (holdsTableOfContents) {
        endDocumentDivision();
    }
    print(t);
    if (holdsTableOfContents) {
        startDocumentDivision();
    }
}
// True while a status <div> opened by startDocumentDivision() has not yet
// been closed by endDocumentDivision().
boolean inDocumentDivision = false;
// Opens the <div> that wraps this document's content, closing a division
// that is still open first. Unless statuses are disabled, the div's class
// is the upper-cased document status followed by "_STATUS".
// The div is written directly, so it appears even while skipping.
protected void startDocumentDivision() {
    if (inDocumentDivision) {
        endDocumentDivision();
    }
    if (docStatus == STATUSES_DISABLED) {
        context.out.println("<div>");
    } else {
        // A fixed locale keeps the class name equal to the names used in
        // the style sheet. With the default locale, "initial" would turn
        // into a dotted-capital-I spelling under a Turkish locale and the
        // style rule would never match.
        String cssClass =
            docStatus.toUpperCase(java.util.Locale.ENGLISH) + "_STATUS";
        context.out.println("<div class=\"" + cssClass + "\">");
    }
    inDocumentDivision = true;
}
// Closes the document status <div> if one is open; otherwise does nothing.
protected void endDocumentDivision() {
    if (inDocumentDivision) {
        context.out.println("</div>");
        inDocumentDivision = false;
    }
}
// Writes the style block that colours the status divisions and the
// matching table-of-contents entries. Nothing is written when statuses
// are disabled.
protected void printStyleSheet() {
    if (DEFAULT_DOC_STATUS == STATUSES_DISABLED) {
        return;
    }
    // One output line per entry, in order.
    String[] styleLines = {
        "<style>",
        ".INITIAL_STATUS { border-left: .25in solid #CCCCFF; padding-left: .1in }",
        ".VALIDATED_STATUS { border-left: .25in solid #aaffff; padding-left: .1in }",
        ".AGREED_STATUS { border-left: .25in solid #CCFF99; padding-left: .1in }",
        ".FINAL_STATUS { border-left: .25in solid #ffffff; padding-left: .1in }",
        ".INVALID_STATUS { border-left: .25in solid #CC9900; padding-left: .1in }",
        ".INITIAL_TOC_STATUS { background: #CCCCFF }",
        ".VALIDATED_TOC_STATUS { background: #aaffff }",
        ".AGREED_TOC_STATUS { background: #CCFF99 }",
        ".FINAL_TOC_STATUS { background: #ffffff }",
        ".INVALID_TOC_STATUS { background: #CC9900 }",
        "</STYLE>"
    };
    for (int line = 0; line < styleLines.length; line++) {
        context.out.println(styleLines[line]);
    }
}
// Marker searched for in document text by visit(Text). When found, it is
// removed from the text and the numberer's startInitialAlpha() is called.
private static final String BEGIN_APPENDIX_TAG = "BEGIN_APPENDIX";
// Removes the last path segment of u: a trailing slash is dropped first,
// then everything after the last remaining slash is cut off (the slash
// itself is kept). Returns null when no slash remains.
private String chopURL(String u) {
    String trimmed = u.endsWith("/") ? u.substring(0, u.length() - 1) : u;
    int lastSlash = trimmed.lastIndexOf('/');
    return (lastSlash == -1) ? null : trimmed.substring(0, lastSlash + 1);
}
// Builds the list of PrefixMap entries used by rewriteRelativeURL().
// Starting with the directory of u and moving up one level per entry, each
// entry pairs the normalized URL of that directory with the relative path
// ("", "../", "../../", ...) that leads to it from u's directory.
// Any exception is printed and the entries collected so far are returned.
public Vector buildPrefixMap(URL u) {
    System.err.println("buildPrefixMap("+u+")");
    Vector prefixes = new Vector();
    try {
        // URL of the root ("/") of u, without its trailing slash.
        String rootURL = new URL(u, "/").toString();
        rootURL = rootURL.substring(0, rootURL.length() - 1);

        // Path of the directory containing u.
        String path = chopURL(u.toString().substring(rootURL.length()));
        String relative = "";
        while (true) {
            PrefixMap entry = new PrefixMap();
            entry.oldPrefix = normalizeURL(rootURL + path);
            entry.newPrefix = relative;
            prefixes.add(entry);

            path = chopURL(path);
            relative = relative + "../";
            if (path == null) {
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return prefixes;
}
// One entry of the prefix table: a URL beginning with oldPrefix can be
// written relative to the output file by putting newPrefix in its place.
private class PrefixMap {
    // Normalized URL of a directory at or above the output file.
    public String oldPrefix;

    // Relative path ("", "../", "../../", ...) that replaces oldPrefix.
    public String newPrefix;
}
// Expresses u relative to the output file using the first prefix table
// entry whose old prefix matches the normalized form of u. If no entry
// matches, the URL is returned unchanged, with a warning on stderr for
// "file:" URLs.
// NOTE(review): the match is made on the normalized URL, but the tail is
// cut from the un-normalized string using the normalized prefix's length.
// That is only right if normalizeURL() preserves length — confirm.
public String rewriteRelativeURL(URL u) {
    String original = u.toString();
    String normalized = normalizeURL(original);

    Vector prefixes = context.prefixMap;
    for (int idx = 0; idx < prefixes.size(); idx++) {
        PrefixMap candidate = (PrefixMap) prefixes.get(idx);
        if (normalized.startsWith(candidate.oldPrefix)) {
            String tail = original.substring(candidate.oldPrefix.length());
            return candidate.newPrefix + tail;
        }
    }

    if (original.startsWith("file:")) {
        System.err.println("Can't rewrite URL as a relative URL: " + original);
    }
    return original;
}
}