package org.codehaus.staxmate.samples;
import java.io.*;
import javax.xml.stream.*;
import org.codehaus.staxmate.SMInputFactory;
import org.codehaus.staxmate.in.SMEvent;
import org.codehaus.staxmate.in.SMInputCursor;
/**
* Simple demonstration of using StaxMate on top of StAX, to simplify
* nested XML parsing: implements a converter from well-formed HTML
* to a Wiki-line textual output format.
*<p>
* General rules for output Wiki-like markup are:
* <ul>
* <li>Blocks (~= paragraphs) are separated by one or more empty lines
* (two or more consequtive linefeeds)
* </li>
* <li>There are 4 inline markups; bolding, italics, underline and
* hyperlink; these are marked by (respectively), ***text***,
* **text**, __text__, [[url | desc ]].
* </li>
* <li>Lists are marked lines that start with '*' (unordered) or '#'
* chars (ordered), followed by one or more spaces and list contents;
* nested lists are marked by indentation of 2 spaces per nesting level.
* Only inline markup is allowed inside list items, in addition to
* sub-lists.
* </li>
* <li>Non-nested tables are marked by pipe ('|') character starting a
* line; each text row represents a table row, and cells are separated
* by pipe chars as well. Cell or row spans are not supported, nor
* nested tables; inline markup is allowed inside cells
* </li>
* </ul>
*
* @author Tatu Saloranta
*/
public final class HTMLConverter
{
private HTMLConverter() { }
private void convert(String filename)
throws IOException, XMLStreamException
{
XMLInputFactory f = XMLInputFactory.newInstance();
// Let's configure factory 'optimally'...
f.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
f.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
// just so it won't try to load DTD in if there's DOCTYPE
f.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
f.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
InputStream in = new java.io.FileInputStream(filename);
XMLStreamReader sr = f.createXMLStreamReader(in);
SMInputCursor it = SMInputFactory.rootElementCursor(sr);
/* Need to store some information about preceding siblings,
* so let's enable tracking.
*/
it.setElementTracking(SMInputCursor.Tracking.VISIBLE_SIBLINGS);
Writer out = new PrintWriter(System.out);
try {
processHTML(it, out);
} finally {
try {
out.flush();
} catch (Throwable t) { }
sr.close();
try {
in.close();
} catch (Throwable t) { }
}
}
private void processHTML(SMInputCursor it, Writer out)
throws IOException, XMLStreamException
{
it.getNext(); // has to be of type element now...
String origName = it.getLocalName();
String name = origName.toLowerCase();
/* It _should_ be HTML... but let's also allow lone 'body'
* as well, for additional robustness
*/
if (name.equals("body")) {
processBody(it, out);
} else if (!name.equals("html")) {
throw new XMLStreamException("Non-HTML document? Root element '"
+origName+"'; excepted <HTML> or <html>");
}
SMInputCursor mainIt = it.childElementCursor();
while (mainIt.getNext() != null) {
origName = mainIt.getLocalName();
name = origName.toLowerCase();
// Should be 'head' or 'body'
if (name.equals("head")) {
processHead(mainIt, out);
} else if (name.equals("body")) {
processBody(mainIt, out);
} else {
throw new XMLStreamException("Non-HTML document? Unexpected element '"
+origName+"'; under <HTML>.");
}
}
}
/**
* Simple handler for HEAD section of a html document. Only looks for
* title element (for now); returns as soon as that's gotten.
*/
private void processHead(SMInputCursor parentIt, Writer out)
throws IOException, XMLStreamException
{
SMInputCursor headIt = parentIt.childElementCursor();
while (headIt.getNext() != null) {
if (headIt.getLocalName().toLowerCase().equals("title")) {
// Could capitalize it too...
out.write("== ");
String str = headIt.collectDescendantText(true);
// Let's remove linefeeds if there was any
addSingleLine(out, str);
out.write(" ==\n\n");
// Ok, that's it, we don't care about other stuff
break;
}
}
}
/**
* Simple handler for BODY section of a html document.
* Has special handling for some elements (paragraphs, lists,
* tables, links).
*/
private void processBody(SMInputCursor parentIt, Writer out)
throws IOException, XMLStreamException
{
/* We need both elements and text content (but not comments etc);
* further, due to loose nesting of HTML, let's just do flat
* iteration in general, as we can still do sub-scoping for
* specific elements (tables etc)
*/
SMInputCursor bodyIt = parentIt.descendantMixedCursor();
StringBuffer text = null; // for collected 'loose' text
SMEvent evt;
while ((evt = bodyIt.getNext()) != null) {
// Let's weed out end elements right away...
if (evt == SMEvent.END_ELEMENT) {
continue;
}
// And straight text as well:
String inline;
if (evt == SMEvent.START_ELEMENT) {
String tag = bodyIt.getLocalName().toLowerCase();
if (processBlockElement(bodyIt, out, tag, text)) {
// true -> was succesfully handled
text = null;
continue;
}
/* Ok; not a block we recognized... but maybe a well-known
* inline element?
*/
inline = checkInlineMarkup(bodyIt, tag);
} else {
inline = bodyIt.getText();
}
if (inline != null) {
if (text == null) {
text = new StringBuffer(inline);
} else {
text.append(inline);
}
}
} // while (...)
if (text != null) {
addPara(out, text);
text = null;
}
}
/**
* Method that is used to figure out type and handling of a node,
* at block level scope (but not from inside tables and lists)
*/
private boolean processBlockElement(SMInputCursor it, Writer out, String tag,
StringBuffer text)
throws IOException, XMLStreamException
{
// We'll only get START_ELEMENT events here
if (tag.charAt(0) == 'h' && tag.length() == 2) {
char c = tag.charAt(1);
// heading?
if (c >= '1' && c <= '5') {
if (text != null) {
addPara(out, text);
}
processHeading(it, out, (c - '1'));
return true;
}
}
/* Handling of paragraphs depends on whether it's a main level
* thing or not
*/
if (tag.equals("p") || tag.equals("blockquote")) {
// (no special handling for blockquote currently)
addPara(out, text);
/* Let's recursively call the main loop, and then add an
* empty line after it.
*/
processBody(it, out);
out.write("\n\n");
return true;
}
if (tag.equals("pre")) {
addPara(out, text);
// Can't have any markup in there...
String str = it.collectDescendantText(true);
if (str.length() > 0) {
addPara(out, str);
}
return true;
}
if (tag.equals("ul") || tag.equals("o")) {
addPara(out, text);
processList(it, out, (tag.charAt(0) == 'u') ? '*' : '#', 0);
return true;
}
if (tag.equals("table")) {
addPara(out, text);
processTable(it, out, false);
return true;
}
// Not a recognized (or handlable) block element
return false;
}
private void processHeading(SMInputCursor it, Writer out, int depth)
throws IOException, XMLStreamException
{
depth += 2;
if (depth > 5) {
depth = 5;
}
String prefix = "=====".substring(0, depth);
out.write(prefix);
out.write(' ');
it.processDescendantText(out, true);
out.write(' ');
out.write(prefix);
out.write("\n\n");
}
private void processList(SMInputCursor it, Writer out, char type, int depth)
throws IOException, XMLStreamException
{
/* Let's assume child elements have to be 'li' elements or
* sublists ('ul', 'ol'); and ignore everything else.
*/
SMInputCursor listIt = it.childElementCursor();
// We'll only get START_ELEMENTs here except for EOF:
while (listIt.getNext() != null) {
String tag = listIt.getLocalName().toLowerCase();
if (tag.equals("li")) {
processListItem(listIt, out, type, depth);
} else if (tag.equals("ul")) {
processList(listIt, out, '*', depth+1);
} else if (tag.equals("ol")) {
processList(listIt, out, '#', depth+1);
} else {
/* could add warnings, or append content to previous item,
* or create a list heading... whatever
*/
}
}
// And finally, trailing empty line, but only for main-level lists
if (depth == 0) {
out.write('\n');
}
}
private void processListItem(SMInputCursor it, Writer out, char listType, int depth)
throws IOException, XMLStreamException
{
// Ok, list item marker:
for (int i = 0; i < depth; ++i) {
out.write(" "); // 2 space indentation
}
out.write(listType);
out.write(' ');
/* List item contents are more varied; text, inline markup; maybe
* even sublists.
*/
SMInputCursor itemIt = it.childMixedCursor();
SMEvent evt;
while ((evt = itemIt.getNext()) != null) {
if (evt == SMEvent.START_ELEMENT) {
String tag = itemIt.getLocalName().toLowerCase();
// only care about sub-lists:
if (tag.equals("ul") || tag.equals("ol")) {
out.write('\n'); // to finish off the current line
processList(itemIt, out, (tag.charAt(0) == 'u') ? '*' : '#',
depth+1);
/* Also, let's also ignore whatever came after the sublist,
* for this item, if anything; most likely just whitespace.
* Problem otherwise is how to handle "leftovers"; can't
* add them to this item any more, would need to start
* a new item or something.
*/
return;
} else { // can also process inline markup
String str = checkInlineMarkup(itemIt, tag);
if (str != null) {
addSingleLine(out, str);
continue;
}
}
// Otherwise, let's just collect and output text:
addSingleLine(out, itemIt.collectDescendantText(true));
} else {
addSingleLine(out, itemIt.getText());
}
}
out.write('\n');
}
private void processTable(SMInputCursor it, Writer out, boolean header)
throws IOException, XMLStreamException
{
/* Let's assume child elements have to be 'tr', or one of grouping
* elements ('thead', 'tfoot' or 'tbody'), and ignore everything else.
*/
SMInputCursor tableIt = it.childElementCursor();
// We'll only get START_ELEMENTs here except for EOF:
while (tableIt.getNext() != null) {
String tag = tableIt.getLocalName().toLowerCase();
if (tag.equals("thead") || tag.equals("tfoot")
|| tag.equals("tbody")) {
/* Let's just recursively call this method, should be
* safe?
*/
processTable(tableIt, out, header || tag.equals("thead"));
} else if (tag.equals("tr")) {
processTableRow(tableIt, out, header);
}
// and ignore others....
}
// Let's add empty line as paragraph separator...
out.write("\n");
}
private void processTableRow(SMInputCursor it, Writer out, boolean headerRow)
throws IOException, XMLStreamException
{
// Let's assume only 'tr' elements are encountered...
SMInputCursor rowIt = it.childElementCursor();
out.write("|");
// We'll only get START_ELEMENTs here except for EOF:
while (rowIt.getNext() != null) {
String tag = rowIt.getLocalName().toLowerCase();
if (tag.equals("td")) {
processTableCell(rowIt, out, headerRow);
} else if (tag.equals("th")) {
processTableCell(rowIt, out, true);
} else {
continue;
}
out.write("|");
}
// Let's add lf, to separate rows...
out.write("\n");
}
private void processTableCell(SMInputCursor it, Writer out, boolean headerCell)
throws IOException, XMLStreamException
{
/* Cells can have varied content, though... generally we only care
* about text and inline markup, though.
*/
SMInputCursor cellIt = it.childMixedCursor();
SMEvent evt;
while ((evt = cellIt.getNext()) != null) {
if (evt == SMEvent.START_ELEMENT) {
String tag = cellIt.getLocalName().toLowerCase();
// No sub-tables or lists allowed... just inline markup
String str = checkInlineMarkup(cellIt, tag);
if (str != null) {
addSingleLine(out, str);
continue;
}
// Otherwise, let's just collect and output text:
addSingleLine(out, cellIt.collectDescendantText(true));
} else { // just plain text
addSingleLine(out, cellIt.getText());
}
}
}
private String checkInlineMarkup(SMInputCursor it, String tag)
throws IOException, XMLStreamException
{
if (tag.equals("a")) {
String url = it.getAttrValue(null, "href");
String str = it.collectDescendantText(true);
return "[["+url+" | "+str+" ]]";
}
if (tag.equals("b")) {
String str = it.collectDescendantText(true);
return "'''"+str+"'''";
}
if (tag.equals("i")) {
String str = it.collectDescendantText(true);
return "'''"+str+"'''";
}
if (tag.equals("u")) {
String str = it.collectDescendantText(true);
return "___"+str+"___";
}
if (tag.equals("hr")) {
return "\n-----\n";
}
if (tag.equals("br")) {
// Hmmh. This won't work too well...
return "\n";
}
// Nope, inline markup not recognized (or no effect can be applied)
return null;
}
/**
* Method called to output "unwrapped" text (either not contained in
* any element, or in unrecognized one). Let's just output it as
* is, but add paragraph separator after the text.
*/
private void addPara(Writer out, StringBuffer textBuf)
throws IOException
{
addPara(out, textBuf.toString());
}
private void addPara(Writer out, String text)
throws IOException
{
/* Let's remove all linefeeds from the start, and from the end,
* to make sure we won't have excessive empty lines...
*/
int len = text.length();
int i = 0;
while (i < len) {
char c = text.charAt(i);
if (c != '\r' && c != '\n') {
break;
}
++i;
}
if (i > 0) {
text = text.substring(i);
}
i = len = text.length()-1;
while (i >= 0) {
char c = text.charAt(i);
if (c != '\r' && c != '\n') {
break;
}
--i;
}
if (i < len) {
text = text.substring(0, i+1);
}
// Also, let's see if there's any non-space stuff left?
if (text.trim().length() > 0) {
out.write(text);
out.write("\n\n");
}
}
/**
* Simple (although not very efficient) method that'll replace linefeeds
* with single space chars and output results
*/
private void addSingleLine(Writer out, String text)
throws IOException
{
// Need to replace linefeeds, that's all
BufferedReader br = new BufferedReader(new StringReader(text));
String line;
boolean first = true;
while ((line = br.readLine()) != null) {
if (first) {
first = false;
} else {
out.write(' ');
}
out.write(line);
}
}
public static void main(String[] args)
throws Exception
{
if (args.length != 1) {
System.err.println("Usage: java "+SMInputFactory.class+" [input file]");
System.exit(1);
}
new HTMLConverter().convert(args[0]);
}
}