// Source Code of org.jasen.core.parsers.StandardHTMLParser

/*
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.parsers;


import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;


import javax.mail.internet.MimeMessage;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.ParserDelegator;


import org.jasen.core.StandardParserData;
import org.jasen.error.JasenException;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;




/**
 * <p>
 * Parses the HTML part of an email for two main purposes.
 * <ul>
 * <li>To extract the plain text components of the message for tokenizing</li>
 * <li>To inspect the html for anomalies like HTML concealment and mail bugs</li>
 * </ul>
 * </p>
 */
public class StandardHTMLParser extends HTMLEditorKit.ParserCallback implements org.jasen.interfaces.HTMLParser {


  protected Reader reader = null;
  protected Writer writer = null;


  protected String encoding = "ISO8859_1";
  protected Throwable exception = null;


  protected StringBuffer buffer;
  protected boolean strOutput = false;


  protected boolean comment = false;
  protected boolean start = true;
  protected boolean ignoreNext = false; // set to true when the next text element should be ignored


  // Text within these tags is ignored
  protected static final Tag[] IGNORED_TAGS = new Tag[]{Tag.SCRIPT, Tag.STYLE, Tag.TITLE, Tag.HEAD, Tag.META};


  protected boolean spaceBefore = false;
  protected boolean spaceAfter = false;


  protected boolean quit = false;


  protected String debugIgnoreReason = null;


  // We need to record a simple tag occurrence so we can
  // trick the parser into recording whitespace between tags
  protected int tagCount = 0;
  protected int lastPosition = 0; // The last position a tag was encountered


  public StandardHTMLParser() {
      super();
  }


  /**
   * Extracts the plain text components of the html given by the 
   * input stream and writes this plain text to the given output stream
   * @param in The input stream from which the html is read
   * @param out The ouput stream to which the plain text is written
   * @throws JasenException
   */
  public void extractText(InputStream in, OutputStream out) throws JasenException {


    try {


      reader = new InputStreamReader(in);
      writer = new OutputStreamWriter(out, encoding);


      parse(reader);


      if (exception != null) {
        throw new JasenException(exception);
      }


    }
    catch (IOException ex) {
      throw new JasenException(ex);
    }
    finally {
      try {
        out.flush();
      }
      catch (IOException ex) {
        throw new JasenException(ex);
      }
    }
  }


  /**
   * Sets the encoding to use on the output stream (optional)
   * @param encoding
   */
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }




  /**
   * Extracts plain text from the html given by the input stream and returns it as a String
   * @param in The input stream from which the html is read
   * @return A String containing the plain text of the html
   * @throws JasenException
   */
  public String extractText(InputStream in) throws JasenException {
    reader = new InputStreamReader(in);
    return extractText(reader);
  }


  /**
   * Extracts plain text from the given html String and returns it as a String
   * @param html The String containing the html
   * @return The String containing the plain text
   * @throws JasenException
   */
  public String extractText(String html) throws JasenException {
    reader = new StringReader(html);
    return extractText(reader);
  }


  
  private String extractText(Reader reader) throws JasenException {


    strOutput = true;
    String text = null;


    try {
      parse(reader);
    }
    catch (IOException ex) {
      throw new JasenException(ex);
    }


    if(buffer != null) {
      // Replace new lines
      text = buffer.toString();
      //text = text.replaceAll("\\r", "");
      //text = text.replaceAll("\\n", "");
    }


    return text;


  }


  protected boolean isSpaceRequired(Tag t, int pos) {


    boolean spaceRequired = false;


    // A space is required if we hit an end tag followed by
    // a start tag without encountering text
    // OR, if we hit two simple tags in the same way
    if(tagCount > 1) {
      // We hit two sequential tags
      // If the end position of the last tag is more than 1 char away from the start
      // position of the current tag, we need a space
      if(lastPosition < pos) {
        spaceRequired = true;
      }
    }


    if(!spaceRequired) {
      spaceRequired = (t.equals(HTML.Tag.P) ||
      t.equals(HTML.Tag.TD) ||
      t.equals(HTML.Tag.TR) ||
      t.equals(HTML.Tag.TITLE) ||
      t.equals(HTML.Tag.LI) ||
      t.equals(HTML.Tag.BR) ||
      t.equals(HTML.Tag.H1) ||
      t.equals(HTML.Tag.H2) ||
      t.equals(HTML.Tag.H3) ||
      t.equals(HTML.Tag.H4) ||
      t.equals(HTML.Tag.H5) ||
      t.equals(HTML.Tag.H6) ||
      t.equals(HTML.Tag.IMG) ||
      t.equals(HTML.Tag.OBJECT) ||
      t.equals(HTML.Tag.HR) ||
      t.equals(HTML.Tag.UL));
    }




    return spaceRequired;
  }


  protected void parse(Reader in) throws IOException {


    if (strOutput) {
      buffer = new StringBuffer();
    }


    ParserDelegator delegator = new ParserDelegator();
    delegator.parse(in, this, true);
  }




  /*
   * (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleComment(char[], int)
   */
  public void handleComment(char[] text, int pos) {
    // Do nothing
    tagCount++;
    lastPosition = pos;
    comment = true;
  }




  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
   */
  public void handleEndTag(Tag t, int pos) {
    if(!spaceBefore && isSpaceRequired(t, pos)) {
      spaceBefore = true;
    }


    // We are in a tag
    tagCount++;


    // Update the last position
    // We want the END position of the tag (plus 2 for <>)
    lastPosition = pos + t.toString().length() + 2;


    // reset
    ignoreNext = false;
  }


  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
   */
  public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
    // Test to see if we are at an end tag
    String end = (String)a.getAttribute(HTML.Attribute.ENDTAG);


    if("true".equalsIgnoreCase(end)) {
      handleEndTag(t, pos);
    }
    else
    {
      handleStartTag(t, a, pos);
    }
  }




  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
   */
  public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {


    // We are in a tag
    tagCount++;


    // If we have a new line or spacer tag, add a space
    if(!spaceBefore && isSpaceRequired(t, pos)) {
      spaceBefore = true;
    }


    // Update the last position
    // We want the END position of the tag (plus 2 for <>)
    lastPosition = pos + t.toString().length() + 2;


    // If we have hit a script or style tag.. ignore next
    // Now check for ignored tags
    if(!ignoreNext) {


        for (int i = 0; i < IGNORED_TAGS.length; i++)
            {
                if(t.equals(IGNORED_TAGS[i])) {
                    ignoreNext = true;
                    break;
                }
            }
    }


  }


  /*
   * (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleText(char[], int)
   */
  public void handleText(char[] text, int pos) {


    if(!quit) {
       // We are not in a tag
       tagCount = 0;


       // It seems the parser does not recognise text prior to the commencement
       // of HTML code.  The following flag caters for this
       if(start) {
         start = false;
       }


       if (strOutput) {


         if(!ignoreNext) {
           if(spaceBefore) {
             buffer.append(' ');
             spaceBefore = false;
           }
           buffer.append(text);
         }
        // else
        //{
           // Append a space to cater for the ignored text
           //buffer.append(' ');
           //System.out.println("Ignoring: " + new String(text) + ": " + debugIgnoreReason);
        // }
       }
       else {
         // We must be output stream
         try {


           if(!ignoreNext) {
             if(spaceBefore) {
               writer.write(" ");
               spaceBefore = false;
             }
             writer.write(text);
           }
          // else
           //{
             // Append a space to cater for the ignored text
             //writer.write(" ");
             //System.out.println("Ignoring: " + new String(text) + ": " + debugIgnoreReason);
          // }


           writer.flush(); // If we don't flush the writer we don't get any data!
         }
         catch (IOException ex) {
           exception = ex;
         }
       }


      comment = false;


    }
  }


  /*
   * (non-Javadoc)
   * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
   */
    public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
        String rawHtml = message.getHtmlPart();
        String rawText = message.getTextPart();
        StandardParserData parserData = new StandardParserData();


        if(rawHtml != null) {
            String htmlText = extractText(rawHtml);
            parserData.setHtmlAsText(htmlText);
            quit = false;
        }


        if(rawText != null) {
            String text = extractText(rawText);
            parserData.setTextParsed(text);
        }


        // Now, tokenize the html and text parts of the message
        parserData.setMessageTokens(tokenizer.tokenize(mm, message, parserData));


        return parserData;
    }
}
// Source Code of org.jasen.core.parsers.StandardHTMLParser

// Related Classes of org.jasen.core.parsers.StandardHTMLParser