// Package org.jasen.core.parsers
//
// Source Code of org.jasen.core.parsers.StandardHTMLParser

/*
* Copyright (c) 2004, 2005  jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*   1. Redistributions of source code must retain the above copyright notice,
*      this list of conditions and the following disclaimer.
*
*   2. Redistributions in binary form must reproduce the above copyright
*      notice, this list of conditions and the following disclaimer in
*      the documentation and/or other materials provided with the distribution.
*
*   3. The names of the authors may not be used to endorse or promote products
*      derived from this software without specific prior written permission.
*
*   4. Any modification or additions to the software must be contributed back
*      to the project.
*
*   5. Any investigation or reverse engineering of source code or binary to
*      enable emails to bypass the filters, and hence inflict spam and or viruses
*      onto users who use or do not use jASEN could subject the perpetrator to
*      criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.parsers;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;

import javax.mail.internet.MimeMessage;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.ParserDelegator;

import org.jasen.core.StandardParserData;
import org.jasen.error.JasenException;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;


/**
* <p>
* Parses the HTML part of an email for two main purposes.
* <ul>
* <li>To extract the plain text components of the message for tokenizing</li>
* <li>To inspect the html for anomalies like HTML concealment and mail bugs</li>
* </ul>
* </p>
*/
public class StandardHTMLParser extends HTMLEditorKit.ParserCallback implements org.jasen.interfaces.HTMLParser {

  protected Reader reader = null;
  protected Writer writer = null;

  protected String encoding = "ISO8859_1";
  protected Throwable exception = null;

  protected StringBuffer buffer;
  protected boolean strOutput = false;

  protected boolean comment = false;
  protected boolean start = true;
  protected boolean ignoreNext = false; // set to true when the next text element should be ignored

  // Text within these tags is ignored
  protected static final Tag[] IGNORED_TAGS = new Tag[]{Tag.SCRIPT, Tag.STYLE, Tag.TITLE, Tag.HEAD, Tag.META};

  protected boolean spaceBefore = false;
  protected boolean spaceAfter = false;

  protected boolean quit = false;

  protected String debugIgnoreReason = null;

  // We need to record a simple tag occurrence so we can
  // trick the parser into recording whitespace between tags
  protected int tagCount = 0;
  protected int lastPosition = 0; // The last position a tag was encountered

  public StandardHTMLParser() {
      super();
  }

  /**
   * Extracts the plain text components of the html given by the
   * input stream and writes this plain text to the given output stream
   * @param in The input stream from which the html is read
   * @param out The ouput stream to which the plain text is written
   * @throws JasenException
   */
  public void extractText(InputStream in, OutputStream out) throws JasenException {

    try {

      reader = new InputStreamReader(in);
      writer = new OutputStreamWriter(out, encoding);

      parse(reader);

      if (exception != null) {
        throw new JasenException(exception);
      }

    }
    catch (IOException ex) {
      throw new JasenException(ex);
    }
    finally {
      try {
        out.flush();
      }
      catch (IOException ex) {
        throw new JasenException(ex);
      }
    }
  }

  /**
   * Sets the encoding to use on the output stream (optional)
   * @param encoding
   */
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }


  /**
   * Extracts plain text from the html given by the input stream and returns it as a String
   * @param in The input stream from which the html is read
   * @return A String containing the plain text of the html
   * @throws JasenException
   */
  public String extractText(InputStream in) throws JasenException {
    reader = new InputStreamReader(in);
    return extractText(reader);
  }

  /**
   * Extracts plain text from the given html String and returns it as a String
   * @param html The String containing the html
   * @return The String containing the plain text
   * @throws JasenException
   */
  public String extractText(String html) throws JasenException {
    reader = new StringReader(html);
    return extractText(reader);
  }

 
  private String extractText(Reader reader) throws JasenException {

    strOutput = true;
    String text = null;

    try {
      parse(reader);
    }
    catch (IOException ex) {
      throw new JasenException(ex);
    }

    if(buffer != null) {
      // Replace new lines
      text = buffer.toString();
      //text = text.replaceAll("\\r", "");
      //text = text.replaceAll("\\n", "");
    }

    return text;

  }

  protected boolean isSpaceRequired(Tag t, int pos) {

    boolean spaceRequired = false;

    // A space is required if we hit an end tag followed by
    // a start tag without encountering text
    // OR, if we hit two simple tags in the same way
    if(tagCount > 1) {
      // We hit two sequential tags
      // If the end position of the last tag is more than 1 char away from the start
      // position of the current tag, we need a space
      if(lastPosition < pos) {
        spaceRequired = true;
      }
    }

    if(!spaceRequired) {
      spaceRequired = (t.equals(HTML.Tag.P) ||
      t.equals(HTML.Tag.TD) ||
      t.equals(HTML.Tag.TR) ||
      t.equals(HTML.Tag.TITLE) ||
      t.equals(HTML.Tag.LI) ||
      t.equals(HTML.Tag.BR) ||
      t.equals(HTML.Tag.H1) ||
      t.equals(HTML.Tag.H2) ||
      t.equals(HTML.Tag.H3) ||
      t.equals(HTML.Tag.H4) ||
      t.equals(HTML.Tag.H5) ||
      t.equals(HTML.Tag.H6) ||
      t.equals(HTML.Tag.IMG) ||
      t.equals(HTML.Tag.OBJECT) ||
      t.equals(HTML.Tag.HR) ||
      t.equals(HTML.Tag.UL));
    }


    return spaceRequired;
  }

  protected void parse(Reader in) throws IOException {

    if (strOutput) {
      buffer = new StringBuffer();
    }

    ParserDelegator delegator = new ParserDelegator();
    delegator.parse(in, this, true);
  }


  /*
   * (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleComment(char[], int)
   */
  public void handleComment(char[] text, int pos) {
    // Do nothing
    tagCount++;
    lastPosition = pos;
    comment = true;
  }


  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
   */
  public void handleEndTag(Tag t, int pos) {
    if(!spaceBefore && isSpaceRequired(t, pos)) {
      spaceBefore = true;
    }

    // We are in a tag
    tagCount++;

    // Update the last position
    // We want the END position of the tag (plus 2 for <>)
    lastPosition = pos + t.toString().length() + 2;

    // reset
    ignoreNext = false;
  }

  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleSimpleTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
   */
  public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
    // Test to see if we are at an end tag
    String end = (String)a.getAttribute(HTML.Attribute.ENDTAG);

    if("true".equalsIgnoreCase(end)) {
      handleEndTag(t, pos);
    }
    else
    {
      handleStartTag(t, a, pos);
    }
  }


  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
   */
  public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {

    // We are in a tag
    tagCount++;

    // If we have a new line or spacer tag, add a space
    if(!spaceBefore && isSpaceRequired(t, pos)) {
      spaceBefore = true;
    }

    // Update the last position
    // We want the END position of the tag (plus 2 for <>)
    lastPosition = pos + t.toString().length() + 2;

    // If we have hit a script or style tag.. ignore next
    // Now check for ignored tags
    if(!ignoreNext) {

        for (int i = 0; i < IGNORED_TAGS.length; i++)
            {
                if(t.equals(IGNORED_TAGS[i])) {
                    ignoreNext = true;
                    break;
                }
            }
    }

  }

  /*
   * (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleText(char[], int)
   */
  public void handleText(char[] text, int pos) {

    if(!quit) {
       // We are not in a tag
       tagCount = 0;

       // It seems the parser does not recognise text prior to the commencement
       // of HTML code.  The following flag caters for this
       if(start) {
         start = false;
       }

       if (strOutput) {

         if(!ignoreNext) {
           if(spaceBefore) {
             buffer.append(' ');
             spaceBefore = false;
           }
           buffer.append(text);
         }
        // else
        //{
           // Append a space to cater for the ignored text
           //buffer.append(' ');
           //System.out.println("Ignoring: " + new String(text) + ": " + debugIgnoreReason);
        // }
       }
       else {
         // We must be output stream
         try {

           if(!ignoreNext) {
             if(spaceBefore) {
               writer.write(" ");
               spaceBefore = false;
             }
             writer.write(text);
           }
          // else
           //{
             // Append a space to cater for the ignored text
             //writer.write(" ");
             //System.out.println("Ignoring: " + new String(text) + ": " + debugIgnoreReason);
          // }

           writer.flush(); // If we don't flush the writer we don't get any data!
         }
         catch (IOException ex) {
           exception = ex;
         }
       }

      comment = false;

    }
  }

  /*
   * (non-Javadoc)
   * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
   */
    public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
        String rawHtml = message.getHtmlPart();
        String rawText = message.getTextPart();
        StandardParserData parserData = new StandardParserData();

        if(rawHtml != null) {
            String htmlText = extractText(rawHtml);
            parserData.setHtmlAsText(htmlText);
            quit = false;
        }

        if(rawText != null) {
            String text = extractText(rawText);
            parserData.setTextParsed(text);
        }

        // Now, tokenize the html and text parts of the message
        parserData.setMessageTokens(tokenizer.tokenize(mm, message, parserData));

        return parserData;
    }
}
// TOP
//
// Related Classes of org.jasen.core.parsers.StandardHTMLParser
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.