/**
* Copyright 2007 Gerard Toonstra
*
* Licensed under the terms of the Apache Software License v2
*
* This file is part of the XSS Protect library
*/
package com.blogspot.radialmind.html;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import org.antlr.runtime.ANTLRReaderStream;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.tree.CommonTreeNodeStream;
import org.antlr.runtime.tree.Tree;
/**
* Parses an HTML fragment or document and calls a breakout instance
* that is provided in the constructor, which can filter or modify the
* tags, values and attributes parsed.
*
* This class uses grammar (lexer and parser) generated by ANTLR, which
* calls public static functions of this class when new parse events occur.
*/
public class HTMLParser {
// Since the class is static, the threadlocals are used
// to store process-specific information.
// It's better to call instance-specific methods, but need to find
// out how to do that with ANTLR.
private static ThreadLocal topNode;
private static ThreadLocal currentNode;
private static ThreadLocal attrNode;
/**
* The only method that should be called to initiate the process
* @param is The input stream from where to get the data
* @param os The output stream to write the processed fragment/document to
* @param htmlFilter An interface called during the processing of the document. Can be used to modify elements
* @param convertIntoValidXML Converts the output into valid XML for XSL processing for example
*/
public static void process( Reader reader, Writer writer, IHTMLFilter htmlFilter, boolean convertIntoValidXML ) throws HandlingException {
try {
// Open a char stream input for the document
ANTLRStringStream input = new ANTLRReaderStream( reader );
// Start lexing the input
htmlLexerLexer lex = new htmlLexerLexer(input);
// Tokenstream for the parser.
CommonTokenStream tokens = new CommonTokenStream(lex);
htmlParserParser parser = new htmlParserParser(tokens);
htmlParserParser.document_return root = parser.document();
// Set up the tree parser
CommonTreeNodeStream nodes = new CommonTreeNodeStream((Tree)root.getTree());
htmlTreeParser walker = new htmlTreeParser(nodes);
// Initialize data structures
topNode = new ThreadLocal();
currentNode = new ThreadLocal();
attrNode = new ThreadLocal();
// Walk in the entire document using the tree parser.
walker.document();
// Get the top node
TagNode top = (TagNode)topNode.get();
// Write the clean document out.
top.writeAll( writer, htmlFilter, convertIntoValidXML, false );
} catch ( IOException ioe ) {
throw new HandlingException( "Could not parse document" );
} catch ( RecognitionException re ) {
throw new HandlingException( "Could not parse document" );
}
}
/**
* Notifies the opening of a new tag
*
* @param tagName The name of the tag
* @throws IOException
*/
static void openTag( String tagName ) throws IOException {
TagNode node = (TagNode)topNode.get();
if ( node == null ) {
node = new TagNode( tagName.toLowerCase() );
topNode.set( node );
currentNode.set( node );
} else {
TagNode curNode = (TagNode)currentNode.get();
node = new TagNode( tagName.toLowerCase() );
curNode.addNode( node );
if ( node.mayContainOtherTags() ) {
currentNode.set( node );
}
}
attrNode.set( node );
}
/**
* Adds an attribute
* @param attributeName The name of the attribute added to the tag
* @param value The value of the attribute
* @throws IOException Adding this can throw an exception
*/
static void addAttribute( String attributeName, String value ) throws IOException {
TagNode curNode = (TagNode)attrNode.get();
value = value.trim();
if ( value.length() > 1 ) {
if (( value.startsWith( "=\"" ) ) ||
( value.startsWith( "='" ) ) ||
( value.startsWith( "=`" ) ) )
{
value = value.substring( 2 );
value = value.substring( 0, value.length() - 1 );
} else {
value = value.substring( 1 );
}
}
curNode.addAttribute( attributeName.toLowerCase(), value );
}
/**
* Finish the addition of attributes
* @throws IOException This can throw an IOException
*/
static void finishAttributes() throws IOException {
Node node = (Node)attrNode.get();
attrNode.set( node.getPrevNode() );
}
/**
* This method adds a text to the tag.
* @param text The text to add to the tag
* @throws IOException This method can throw an IOException
*/
static void addText( String text ) throws IOException {
if ( text == null ) {
return;
}
if ( text.trim().equals( "" ) ) {
return;
}
TagNode curNode = (TagNode)currentNode.get();
if ( curNode.getName().equals( "body" )) {
TagNode p = new TagNode( "p" );
p.addNode( new TextNode( "p", text ));
curNode.addNode( p );
} else {
curNode.addNode( new TextNode( curNode.getName(), text ) );
}
}
/**
* Closes the tag
* @param tagName The tag name
* @throws IOException This method can throw an IOException
*/
static void closeTag( String tagName ) throws IOException {
TagNode curNode = (TagNode)currentNode.get();
TagNode tempNode = new TagNode( tagName );
if ( tempNode.mayContainOtherTags() ) {
while ( ! curNode.getName().equals( tagName.toLowerCase() )) {
curNode = (TagNode)curNode.getPrevNode();
}
currentNode.set( curNode.getPrevNode() );
}
}
}