import java.util.*;
import java.io.*;
import java.net.*;
import com.quiotix.html.parser.*;
import com.quiotix.html.parser.HtmlDocument.Attribute;
import com.quiotix.html.parser.HtmlDocument.AttributeList;
/**
 * Copies an HTML file to an output file while rewriting two things:
 * <ul>
 *   <li>the first {@link #TABLE_OF_CONTENTS_TAG} marker found in a text node
 *       is replaced by whatever {@code numberer.printTOC(out)} writes;</li>
 *   <li>the content of a {@code SPAN} whose {@code STYLE} attribute carries a
 *       {@code mso-field-code:"REF ..."} cross reference with the {@code \r}
 *       switch is replaced by the paragraph number returned by
 *       {@code numberer.getNumberForRef}.</li>
 * </ul>
 * All the work is started from the constructor, which parses the input and
 * feeds the elements to this visitor.
 */
class CrossReferencer extends HtmlVisitor {

    /** Marker text that is replaced by the generated table of contents. */
    public static final String TABLE_OF_CONTENTS_TAG = "TABLE_OF_CONTENTS";

    /** Text that introduces a cross reference inside a STYLE attribute value. */
    private static final String CROSS_REF_STARTER = "mso-field-code:\"REF ";

    /** Characters that may appear in a cross-reference name. */
    private static final String WORD_CHARS =
        "abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

    InputStream in;
    PrintWriter out;
    ParagraphNumberer numberer;

    /**
     * Name of the tag whose end tag terminates the current "skip" mode, or
     * {@code null} when output is not being suppressed.
     */
    String skipToEndTag = null;

    /**
     * Number of start tags named like {@link #skipToEndTag} that were seen
     * (and are still open) since skipping began. Needed so that the end tag
     * of a nested element does not end the skip prematurely.
     */
    private int skipDepth = 0;

    /**
     * Opens both files, parses the input and streams it through this visitor.
     * Any exception is reported with {@code printStackTrace()} and not
     * rethrown; in that case the streams that were already opened are closed
     * so that a failed run does not leak file handles.
     *
     * @param inputFile  HTML file to read
     * @param outputFile file that receives the rewritten HTML
     * @param numberer   source of paragraph numbers and of the table of
     *                   contents; when {@code null} no rewriting is done
     */
    public CrossReferencer(File inputFile, File outputFile,
                           ParagraphNumberer numberer) {
        try {
            this.in = new FileInputStream(inputFile);
            this.out = new PrintWriter(new FileWriter(outputFile));
            this.numberer = numberer;
            HtmlParser parser = new HtmlParser(in);
            parser.streamAccept(this);
        } catch (Exception e) {
            e.printStackTrace();
            // Closing twice is harmless, so this is safe even if finish()
            // has already run.
            closeStreams();
        }
    }

    /**
     * Writes {@code o.toString()} to the output unless output is currently
     * suppressed because the content of a replaced span is being skipped.
     */
    public void print(Object o) {
        if (skipToEndTag == null) {
            out.print(o.toString());
        }
    }

    /**
     * Copies a text node; if it contains {@link #TABLE_OF_CONTENTS_TAG}, the
     * first occurrence is dropped and, when a numberer is present, the table
     * of contents is printed in its place.
     */
    public void visit(HtmlDocument.Text t) {
        String text = t.toString();
        int pos = text.indexOf(TABLE_OF_CONTENTS_TAG);
        if (pos == -1) {
            print(t);
            return;
        }
        print(text.substring(0, pos));
        if (numberer != null) {
            numberer.printTOC(out);
        }
        print(text.substring(pos + TABLE_OF_CONTENTS_TAG.length()));
    }

    /**
     * Copies a start tag, giving SPAN tags a chance to be replaced by a
     * paragraph number first. While content is being skipped, start tags are
     * not processed at all; they only update the nesting depth.
     */
    public void visit(HtmlDocument.Tag t) {
        if (skipToEndTag != null) {
            // A nested element with the same name will produce its own end
            // tag, which must not be mistaken for the end of the skipped
            // element. NOTE(review): a self-closing tag of that name would
            // be counted without a matching end tag - confirm the input
            // never contains one inside a cross-reference span.
            if (t.tagName.equalsIgnoreCase(skipToEndTag)) {
                skipDepth++;
            }
            return;
        }
        if (t.tagName.equalsIgnoreCase("SPAN")) {
            visitSpan(t);
        }
        // If visitSpan replaced the span it has already written the tag and
        // switched skipping on, so this print is then a no-op.
        print(t);
    }

    /**
     * Copies an end tag and leaves skip mode once the end tag matching the
     * element that started the skip has been seen.
     */
    public void visit(HtmlDocument.EndTag t) {
        print(t);
        if (skipToEndTag != null && t.tagName.equalsIgnoreCase(skipToEndTag)) {
            if (skipDepth > 0) {
                skipDepth--;            // closes a nested element
            } else {
                skipToEndTag = null;    // closes the skipped element itself
            }
        }
    }

    public void visit(HtmlDocument.Comment c) { print(c); }

    /** Newlines are always written, even while other content is skipped. */
    public void visit(HtmlDocument.Newline n) { out.println(); }

    public void visit(HtmlDocument.Annotation a) { print(a); }

    /**
     * Flushes and closes the output, then closes the input. Safe to call
     * when the constructor failed before the streams were opened.
     */
    public void finish() {
        closeStreams();
    }

    /**
     * Replaces a cross-reference span: writes the tag itself, the paragraph
     * number and a closing {@code </span>}, then suppresses all output up to
     * the span's own end tag. Does nothing when there is no numberer or the
     * tag is not a paragraph-number cross reference; reports to
     * {@code System.err} when the reference is unknown to the numberer.
     */
    public void visitSpan(HtmlDocument.Tag t) {
        if (numberer == null) return;
        String crossRef = getCrossReference(t, null);
        if (crossRef == null) return;
        String parNum = numberer.getNumberForRef(crossRef, true);
        if (parNum == null) {
            System.err.println("Couldn't find reference " + crossRef);
            return;
        }
        out.print(t);
        out.print(parNum);
        out.print("</span>");
        skipToEndTag = "span";
        skipDepth = 0;
    }

    /**
     * Extracts the cross-reference name from a tag's STYLE attribute.
     *
     * @param t            tag to inspect
     * @param insertPrefix if not {@code null}, this text is inserted into the
     *                     attribute value directly in front of the reference
     *                     name (the attribute is modified in place) and the
     *                     name is then read from the modified value
     * @return the run of word characters following the
     *         {@code mso-field-code:"REF } marker, or {@code null} when the
     *         tag has no STYLE value, the value lacks the {@code \r} switch,
     *         or the marker is absent
     */
    public static String getCrossReference(HtmlDocument.Tag t,
                                           String insertPrefix) {
        Attribute style = HTMLSpider.getAttribute(t, "STYLE");
        if (style == null) return null;
        String s = style.value;
        // A STYLE attribute without a value cannot hold a field code.
        if (s == null) return null;
        // only handle cross references to paragraph numbers for now
        if (s.indexOf("\\r") == -1) return null;
        int beginPos = s.indexOf(CROSS_REF_STARTER);
        if (beginPos == -1) return null;
        beginPos += CROSS_REF_STARTER.length();
        if (insertPrefix != null) {
            s = s.substring(0, beginPos) + insertPrefix + s.substring(beginPos);
            style.value = s;
        }
        int endPos = endOfWord(s, beginPos);
        return s.substring(beginPos, endPos);
    }

    /**
     * Returns the index of the first character at or after {@code wordStart}
     * that is not in {@link #WORD_CHARS}, or {@code s.length()} if there is
     * none.
     */
    private static int endOfWord(String s, int wordStart) {
        for (int i = wordStart; i < s.length(); i++) {
            if (WORD_CHARS.indexOf(s.charAt(i)) == -1) {
                return i;
            }
        }
        return s.length();
    }

    /** Closes whichever of the two streams have been opened. */
    private void closeStreams() {
        if (out != null) {
            out.flush();
            out.close();
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
    }
}