public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
final XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler,metadata);
DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {
// One bit per element depth: startElement sets the bit at the current
// depth when the element is a text node; characters() reads the top bit.
private final BitSet textNodeStack = new BitSet();
// Depth of the current element; also marks the top of textNodeStack.
private int nodeDepth = 0;
// Incremented by startElement for every element that needs complete
// filtering; events are only forwarded while this counter is 0.
private int completelyFiltered = 0;
// XHTML heading tag names pushed when a text:h element starts and popped
// again when the matching text:h element ends.
private Stack<String> headingStack = new Stack<String>();
/**
 * Forwards character data to the parent handler, but only when no
 * completely filtered element is open and the innermost open element
 * is flagged as a text node in {@code textNodeStack}.
 *
 * @param ch     buffer holding the characters
 * @param start  offset of the first character in {@code ch}
 * @param length number of characters to forward
 * @throws SAXException if the parent handler fails
 */
@Override
public void characters(char[] ch, int start, int length)
        throws SAXException {
    // Everything below a completely filtered element is dropped.
    if (completelyFiltered != 0) {
        return;
    }
    // No element is open, so there is no stack entry to consult.
    if (nodeDepth <= 0) {
        return;
    }
    // The bit at nodeDepth - 1 is the top of the text node stack.
    final boolean insideTextNode = textNodeStack.get(nodeDepth - 1);
    if (insideTextNode) {
        super.characters(ch, start, length);
    }
}
/**
 * Tells whether an element has to be filtered out completely, i.e.
 * together with all of its sub-elements.
 *
 * @param namespaceURI namespace URI of the element
 * @param localName    local name of the element
 * @return {@code true} for text-namespace elements whose name ends with
 *         "-template" or "-style" and for the table-namespace element
 *         "covered-table-cell"; {@code false} otherwise
 */
private boolean needsCompleteFiltering(
        String namespaceURI, String localName) {
    if (TEXT_NS.equals(namespaceURI)) {
        final boolean isTemplate = localName.endsWith("-template");
        return isTemplate || localName.endsWith("-style");
    }
    // Outside the text namespace only covered table cells are filtered.
    return TABLE_NS.equals(namespaceURI)
            && "covered-table-cell".equals(localName);
}
/**
 * Maps the heading level of a text:h element to an XHTML heading tag
 * name ("h1" to "h6").
 *
 * The level is read from the "outline-level" attribute in the text
 * namespace and clamped to the range 1..6. A missing level yields "h1".
 * A level that is not a valid integer (for example empty, non-numeric or
 * out of the int range) is treated like a missing level, so a malformed
 * document cannot make a NumberFormatException escape from the SAX
 * callback.
 *
 * @param atts attributes of the heading element
 * @return the XHTML heading tag name, one of "h1" to "h6"
 */
private String getXHTMLHeaderTagName(Attributes atts) {
    String depthStr = atts.getValue(TEXT_NS, "outline-level");
    if (depthStr == null) {
        return "h1";
    }

    int depth;
    try {
        // Attribute values come straight from the document, so tolerate
        // surrounding whitespace before parsing.
        depth = Integer.parseInt(depthStr.trim());
    } catch (NumberFormatException ignored) {
        // Unparsable level: fall back to the same default as a missing one.
        return "h1";
    }

    if (depth >= 6) {
        return "h6";
    } else if (depth <= 1) {
        return "h1";
    } else {
        return "h" + depth;
    }
}
/**
 * Decides whether an element counts as a text node.
 *
 * @param namespaceURI namespace URI of the element
 * @param localName    local name of the element
 * @return {@code true} for every element of the text namespace and for
 *         the "title" and "desc" elements of the SVG namespace;
 *         {@code false} otherwise
 */
private boolean isTextNode(String namespaceURI, String localName) {
    return TEXT_NS.equals(namespaceURI)
            || (SVG_NS.equals(namespaceURI)
                    && ("title".equals(localName)
                            || "desc".equals(localName)));
}
/**
 * Records the type of the element being opened, updates the filter
 * counter and, unless filtering is active, forwards the start event.
 *
 * text:h elements bypass the element mapping of the parent handler:
 * their XHTML heading tag is started directly on the xhtml handler and
 * the tag name is pushed onto headingStack.
 *
 * @param namespaceURI namespace URI of the element
 * @param localName    local name of the element
 * @param qName        qualified name of the element
 * @param atts         attributes of the element
 * @throws SAXException if a downstream handler fails
 */
@Override
public void startElement(
        String namespaceURI, String localName, String qName,
        Attributes atts) throws SAXException {
    // Keep track of the current node type: if the element is a text
    // node, the bit at the current depth is set in textNodeStack.
    // characters() checks the top bit to decide whether to print.
    // nodeDepth holds the depth of the current node and also marks the
    // top of the stack.
    assert nodeDepth >= 0;
    final int depthIndex = nodeDepth++;
    textNodeStack.set(depthIndex, isTextNode(namespaceURI, localName));

    // Some tags have *all* of their content filtered.
    assert completelyFiltered >= 0;
    if (needsCompleteFiltering(namespaceURI, localName)) {
        completelyFiltered++;
    }

    // While filtering is active nothing is passed on.
    if (completelyFiltered != 0) {
        return;
    }

    final boolean isHeading =
            TEXT_NS.equals(namespaceURI) && "h".equals(localName);
    if (!isHeading) {
        super.startElement(namespaceURI, localName, qName, atts);
        return;
    }

    // text:h is passed directly to the xhtml handler; remember the tag
    // name on headingStack.
    final String headingTag = getXHTMLHeaderTagName(atts);
    headingStack.push(headingTag);
    xhtml.startElement(headingTag);
}
@Override
public void endElement(
String namespaceURI, String localName, String qName)
throws SAXException {
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
// to xhtml handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
xhtml.endElement(headingStack.pop());
} else {
super.endElement(namespaceURI,localName,qName);
}
// special handling of tabulators