/*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.parsers;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import javax.mail.internet.MimeMessage;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.ParserDelegator;
import org.jasen.core.StandardParserData;
import org.jasen.error.JasenException;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
/**
 * Parses the HTML part of an email for two main purposes:
 * <ul>
 * <li>To extract the plain text components of the message for tokenizing</li>
 * <li>To inspect the html for anomalies like HTML concealment and mail bugs</li>
 * </ul>
 * <p>
 * The parser callbacks record their progress in instance fields, so a single
 * instance must not be used for more than one parse at the same time. The
 * per-parse fields are reset at the beginning of every {@link #parse(Reader)}.
 * </p>
 */
public class StandardHTMLParser extends HTMLEditorKit.ParserCallback implements org.jasen.interfaces.HTMLParser {

    /** The reader most recently created by one of the public extractText methods. */
    protected Reader reader = null;

    /** The writer that receives the extracted text in stream output mode. */
    protected Writer writer = null;

    /** The encoding used when writing to the output stream. */
    protected String encoding = "ISO8859_1";

    /** The first I/O failure seen by a callback during the current parse, if any. */
    protected Throwable exception = null;

    /** Collects the extracted text in String output mode. */
    protected StringBuffer buffer;

    /** True when text is collected in {@link #buffer}, false when it is written to {@link #writer}. */
    protected boolean strOutput = false;

    /** Set by handleComment and cleared by the next handled text. */
    protected boolean comment = false;

    /** True until the first text of the current parse has been handled. */
    protected boolean start = true;

    /** Set to true when the next text element should be ignored. */
    protected boolean ignoreNext = false;

    /** Text within these tags is ignored. */
    protected static final Tag[] IGNORED_TAGS = new Tag[]{Tag.SCRIPT, Tag.STYLE, Tag.TITLE, Tag.HEAD, Tag.META};

    /** Tags that always force a space before the text that follows them. */
    private static final Tag[] SPACING_TAGS = new Tag[]{
        HTML.Tag.P, HTML.Tag.TD, HTML.Tag.TR, HTML.Tag.TITLE, HTML.Tag.LI, HTML.Tag.BR,
        HTML.Tag.H1, HTML.Tag.H2, HTML.Tag.H3, HTML.Tag.H4, HTML.Tag.H5, HTML.Tag.H6,
        HTML.Tag.IMG, HTML.Tag.OBJECT, HTML.Tag.HR, HTML.Tag.UL};

    /** True when a single space must be emitted before the next piece of text. */
    protected boolean spaceBefore = false;

    /** Not used by this class. */
    protected boolean spaceAfter = false;

    /** While true, handleText discards everything it is given. */
    protected boolean quit = false;

    /** Not set by this class; see the field name for its intended use. */
    protected String debugIgnoreReason = null;

    // We need to record a simple tag occurrence so we can
    // trick the parser into recording whitespace between tags
    /** Number of tags and comments seen since the last handled text. */
    protected int tagCount = 0;

    /** The (approximate) end position of the last tag that was encountered. */
    protected int lastPosition = 0;

    /** True only while handleSimpleTag is delegating to handleStartTag. */
    private boolean simpleTag = false;

    public StandardHTMLParser() {
        super();
    }

    /**
     * Extracts the plain text components of the html given by the
     * input stream and writes this plain text to the given output stream.
     * <p>
     * The input is decoded with the platform default charset; the output is
     * written using the encoding set via {@link #setEncoding(String)}.
     * </p>
     * @param in The input stream from which the html is read
     * @param out The output stream to which the plain text is written
     * @throws JasenException If reading, parsing or writing fails
     */
    public void extractText(InputStream in, OutputStream out) throws JasenException {
        // A previous String extraction leaves strOutput set; without this the
        // text would be collected in the buffer and never reach the stream.
        strOutput = false;

        JasenException failure = null;

        try {
            reader = new InputStreamReader(in);
            writer = new OutputStreamWriter(out, encoding);
            parse(reader);

            // Write failures inside the callbacks cannot be thrown through the
            // parser, so they are recorded and reported here.
            if (exception != null) {
                failure = new JasenException(exception);
            }
        }
        catch (IOException ex) {
            failure = new JasenException(ex);
        }
        finally {
            try {
                out.flush();
            }
            catch (IOException ex) {
                // Only report the flush failure when nothing else went wrong,
                // so that the original cause is never masked.
                if (failure == null) {
                    failure = new JasenException(ex);
                }
            }
        }

        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Sets the encoding to use on the output stream (optional)
     * @param encoding The name of the encoding
     */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * Extracts plain text from the html given by the input stream and returns it as a String.
     * The input is decoded with the platform default charset.
     * @param in The input stream from which the html is read
     * @return A String containing the plain text of the html
     * @throws JasenException If reading or parsing fails
     */
    public String extractText(InputStream in) throws JasenException {
        reader = new InputStreamReader(in);
        return extractText(reader);
    }

    /**
     * Extracts plain text from the given html String and returns it as a String
     * @param html The String containing the html (must not be null)
     * @return The String containing the plain text
     * @throws JasenException If parsing fails
     */
    public String extractText(String html) throws JasenException {
        reader = new StringReader(html);
        return extractText(reader);
    }

    /**
     * Runs a parse in String output mode and returns the collected text.
     * @param source The reader supplying the html
     * @return The collected text, or null if no buffer was created
     * @throws JasenException If reading or parsing fails
     */
    private String extractText(Reader source) throws JasenException {
        strOutput = true;

        try {
            parse(source);
        }
        catch (IOException ex) {
            throw new JasenException(ex);
        }

        return (buffer != null) ? buffer.toString() : null;
    }

    /**
     * Decides whether a space must precede the next piece of text.
     * <p>
     * A space is required when at least two tags have been counted since the
     * last text and the recorded end of the previous tag lies before the start
     * of this one, or when the tag is one of the block/spacer tags.
     * </p>
     * @param t The tag currently being handled
     * @param pos The position reported by the parser for the tag
     * @return true if a space should be emitted before the next text
     */
    protected boolean isSpaceRequired(Tag t, int pos) {
        if (tagCount > 1 && lastPosition < pos) {
            return true;
        }

        for (int i = 0; i < SPACING_TAGS.length; i++) {
            if (t.equals(SPACING_TAGS[i])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Parses the html supplied by the reader, driving the callbacks of this class.
     * All per-parse state is reset first so that nothing leaks in from an
     * earlier parse on the same instance.
     * @param in The reader supplying the html
     * @throws IOException If the html cannot be read
     */
    protected void parse(Reader in) throws IOException {
        resetParseState();

        if (strOutput) {
            buffer = new StringBuffer();
        }

        ParserDelegator delegator = new ParserDelegator();
        delegator.parse(in, this, true);
    }

    /**
     * Restores the fields that the callbacks mutate to their initial values.
     */
    private void resetParseState() {
        exception = null;
        comment = false;
        start = true;
        ignoreNext = false;
        spaceBefore = false;
        tagCount = 0;
        lastPosition = 0;
        simpleTag = false;
    }

    /**
     * Counts the comment as a tag and records its position; the comment text itself is discarded.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleComment(char[], int)
     */
    public void handleComment(char[] text, int pos) {
        tagCount++;
        lastPosition = pos;
        comment = true;
    }

    /**
     * Records the end tag and ends any ignored region.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
     */
    public void handleEndTag(Tag t, int pos) {
        // The check deliberately runs before the tag is counted
        if (!spaceBefore && isSpaceRequired(t, pos)) {
            spaceBefore = true;
        }

        // We are in a tag
        tagCount++;

        // Approximate END position of the tag: name length plus 2 for <>.
        // The '/' of an end tag is not included in this estimate.
        lastPosition = pos + t.toString().length() + 2;

        // Any end tag terminates the ignored region
        ignoreNext = false;
    }

    /**
     * Dispatches a simple tag to the end tag or start tag handling, depending
     * on whether the parser flagged it as an end tag.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
     */
    public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
        // Test to see if we are at an end tag
        Object end = a.getAttribute(HTML.Attribute.ENDTAG);

        if (end instanceof String && "true".equalsIgnoreCase((String) end)) {
            handleEndTag(t, pos);
        }
        else {
            // A simple tag has no end tag that would switch ignoring off again,
            // so it must not start an ignored region (see handleStartTag).
            simpleTag = true;
            try {
                handleStartTag(t, a, pos);
            }
            finally {
                simpleTag = false;
            }
        }
    }

    /**
     * Records the start tag and begins ignoring text if the tag is one of {@link #IGNORED_TAGS}.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
     */
    public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
        // We are in a tag
        tagCount++;

        // If we have a new line or spacer tag, add a space
        if (!spaceBefore && isSpaceRequired(t, pos)) {
            spaceBefore = true;
        }

        // Approximate END position of the tag: name length plus 2 for <>.
        // Attributes are not included in this estimate.
        lastPosition = pos + t.toString().length() + 2;

        // Text inside script, style etc. is ignored until the next end tag
        if (!ignoreNext && !simpleTag && isIgnoredTag(t)) {
            ignoreNext = true;
        }
    }

    /**
     * Tests whether the given tag is listed in {@link #IGNORED_TAGS}.
     */
    private static boolean isIgnoredTag(Tag t) {
        for (int i = 0; i < IGNORED_TAGS.length; i++) {
            if (t.equals(IGNORED_TAGS[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * Emits the text to the buffer or the writer (depending on the output
     * mode), preceded by a single space when one is pending. Text is dropped
     * while it is being ignored, and nothing at all happens once quit is set.
     * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleText(char[], int)
     */
    public void handleText(char[] text, int pos) {
        if (quit) {
            return;
        }

        // We are not in a tag
        tagCount = 0;
        start = false;

        if (strOutput) {
            if (!ignoreNext) {
                if (spaceBefore) {
                    buffer.append(' ');
                    spaceBefore = false;
                }
                buffer.append(text);
            }
        }
        else {
            // We must be output stream
            try {
                if (!ignoreNext) {
                    if (spaceBefore) {
                        writer.write(" ");
                        spaceBefore = false;
                    }
                    writer.write(text);
                }

                // If we don't flush the writer we don't get any data!
                writer.flush();
            }
            catch (IOException ex) {
                // Keep the first failure; it is reported once the parse returns
                if (exception == null) {
                    exception = ex;
                }
            }
        }

        comment = false;
    }

    /**
     * Extracts the text of the html and plain text parts of the message,
     * stores both in a new StandardParserData and then tokenizes the message.
     * The plain text part is passed through the same text extraction as the html part.
     * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
     */
    public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
        String rawHtml = message.getHtmlPart();
        String rawText = message.getTextPart();

        StandardParserData parserData = new StandardParserData();

        // A quit flag left over from an earlier message must not suppress this one
        quit = false;

        if (rawHtml != null) {
            String htmlText = extractText(rawHtml);
            parserData.setHtmlAsText(htmlText);

            // The text part is extracted even if the html extraction quit early
            quit = false;
        }

        if (rawText != null) {
            String text = extractText(rawText);
            parserData.setTextParsed(text);
        }

        // Now, tokenize the html and text parts of the message
        parserData.setMessageTokens(tokenizer.tokenize(mm, message, parserData));

        return parserData;
    }
}