Package org.jasen.core.parsers

Source Code of org.jasen.core.parsers.SpamHTMLParser

/*
* Copyright (c) 2004, 2005  jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*   1. Redistributions of source code must retain the above copyright notice,
*      this list of conditions and the following disclaimer.
*
*   2. Redistributions in binary form must reproduce the above copyright
*      notice, this list of conditions and the following disclaimer in
*      the documentation and/or other materials provided with the distribution.
*
*   3. The names of the authors may not be used to endorse or promote products
*      derived from this software without specific prior written permission.
*
*   4. Any modification or additions to the software must be contributed back
*      to the project.
*
*   5. Any investigation or reverse engineering of source code or binary to
*      enable emails to bypass the filters, and hence inflict spam and or viruses
*      onto users who use or do not use jASEN could subject the perpetrator to
*      criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.parsers;

import java.util.Arrays;
import java.util.List;
import java.util.Stack;
import java.util.Vector;

import javax.mail.internet.MimeMessage;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;

import org.jasen.core.StandardParserData;
import org.jasen.core.engine.JasenEngineConfiguration;
import org.jasen.core.parsers.handlers.ImageTagHandler;
import org.jasen.core.parsers.handlers.SrcCgiTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandlerResult;
import org.jasen.error.JasenException;
import org.jasen.interfaces.HTMLTagHandler;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
import org.jasen.util.WebUtils;

/**
* <p>Extracts plain text elements from an HTML document.</p>
* <p>This implementation is specific to parsing the text out of spam emails</p>
* @author Jason Polites
*/
public class SpamHTMLParser extends StandardHTMLParser {

    /**
     * The default numerical background color (white)
     */
  public static final int DEFAULT_BGCOLOR = 765;
 
  /**
   * The default numerical foreground color (black)
   */
  public static final int DEFAULT_COLOR = 0;

  /**
   * String (hex) value for the default background color (white)
   */
  public static final String DEFAULT_STR_BGCOLOR = "FFFFFF";
 
  /**
   * String (hex) value for the default foreground color (black)
   */
  public static final String DEFAULT_STR_COLOR = "000000";

  /**
   * The contrast threshold below which content is deemed concealed
   * @deprecated Use getContrastThreshold
   */
  public static final float COLOR_THRESHOLD = 0.075f;
 
  /**
   * The font size threshold below which content is deemed concealed
   * @deprecated Use getMicroFontSize
   */
  public static final int FONTSIZE_THRESHOLD = 1;
 
  /**
   * The size (in pixels) below which an element is considered concealed
   * @deprecated Use getMicroElementSize
   */
  public static final int ELEMENT_THRESHOLD = 5; // pixel width / height
 
  /**
   * @deprecated Not used
   */
  public static final double TOKEN_RECOGNITION_THRESHOLD = 0.1d;

  /**
   * The CSS name for background colors (background-color)
   */
  public static final String BGCOLOR_NAME = "backgound-color";
 
  /**
   * The CSS name for foreground colors (color)
   */
  public static final String COLOR_NAME = "color";

  /**
   * @deprecated Not used
   */
  public static final String URL_REGEX = "";

  private int currentBGColor = DEFAULT_BGCOLOR; // 255 x 3
  private int currentTextColor = DEFAULT_COLOR;

  private Stack activeColorStack;
  private Stack activeBGColorStack;
  private Stack activeColorTagStack;
  private Stack activeBGColorTagStack;

  private int inertColorTagCount = 0;
  private int inertBGColorTagCount = 0;
 
 
  float contrastThreshold = 0.075f;
  int microFontSize = 1;
  int microElementSize = 5;

  private String[] currentStyleAttributes = null;

  private int concealedHtmlCount = 0;
  private int srcCgiCount = 0;
  private int imageCount = 0;
  private int srcPortCount = 0;
  private int falseAnchorCount = 0;

  private List urlPorts;

  // Stores the value of an href from an anchor tag
  private String currentAnchorUrl = null;

  // Holds the value of a BASE tag if one is found
  private String urlBase = null;

  private ImageTagHandler imageHandler = null;
  private SrcCgiTagHandler cgiHandler = null;
  private URLPortTagHandler portHandler = null;

  public SpamHTMLParser() {
      super();

      imageHandler = new ImageTagHandler();
      cgiHandler = new SrcCgiTagHandler();
      portHandler = new URLPortTagHandler();
     
      // Set the default config
    contrastThreshold = JasenEngineConfiguration.getInstance().getParserContrastThreshold();
    microFontSize = JasenEngineConfiguration.getInstance().getParserMicroFontSize();
    microElementSize = JasenEngineConfiguration.getInstance().getParserMicroElementSize();     
     
  }

  // These color names MUST be in natural sort order (they are looked up with a binary search),
  // and MUST also be in the same order as the corresponding color values below
  public static String[] HTML_COLOR_NAMES =
    {
      "aliceblue",
      "antiquewhite",
      "aqua",
      "aquamarine",
      "azure",
      "beige",
      "bisque",
      "black",
      "blanchedalmond",
      "blue",
      "blueviolet",
      "brown",
      "burlywood",
      "cadetblue",
      "chartreuse",
      "chocolate",
      "coral",
      "cornflowerblue",
      "cornsilk",
      "crimson",
      "cyan",
      "darkblue",
      "darkcyan",
      "darkgoldenrod",
      "darkgray",
      "darkgreen",
      "darkkhaki",
      "darkmagenta",
      "darkolivegreen",
      "darkorange",
      "darkorchid",
      "darkred",
      "darksalmon",
      "darkseagreen",
      "darkslateblue",
      "darkslategray",
      "darkturquoise",
      "darkviolet",
      "deeppink",
      "deepskyblue",
      "dimgray",
      "dodgerblue",
      "firebrick",
      "floralwhite",
      "forestgreen",
      "fuchsia",
      "gainsboro",
      "ghostwhite",
      "gold",
      "goldenrod",
      "gray",
      "green",
      "greenyellow",
      "honeydew",
      "hotpink",
      "indianred",
      "indigo",
      "ivory",
      "khaki",
      "lavender",
      "lavenderblush",
      "lawngreen",
      "lemonchiffon",
      "lightblue",
      "lightcoral",
      "lightcyan",
      "lightgoldenrodyellow",
      "lightgreen",
      "lightgrey",
      "lightpink",
      "lightsalmon",
      "lightseagreen",
      "lightskyblue",
      "lightslategray",
      "lightsteelblue",
      "lightyellow",
      "lime",
      "limegreen",
      "linen",
      "magenta",
      "maroon",
      "mediumaquamarine",
      "mediumblue",
      "mediumorchid",
      "mediumpurple",
      "mediumseagreen",
      "mediumslateblue",
      "mediumspringgreen",
      "mediumturquoise",
      "mediumvioletred",
      "midnightblue",
      "mintcream",
      "mistyrose",
      "moccasin",
      "navajowhite",
      "navy",
      "navyblue",
      "oldlace",
      "olive",
      "olivedrab",
      "orange",
      "orangered",
      "orchid",
      "palegoldenrod",
      "palegreen",
      "paleturquoise",
      "palevioletred",
      "papayawhip",
      "peachpuff",
      "peru",
      "pink",
      "plum",
      "powderblue",
      "purple",
      "red",
      "rosybrown",
      "royalblue",
      "saddlebrown",
      "salmon",
      "sandybrown",
      "seagreen",
      "seashell",
      "sienna",
      "silver",
      "skyblue",
      "slateblue",
      "slategray",
      "snow",
      "springgreen",
      "steelblue",
      "tan",
      "teal",
      "thistle",
      "tomato",
      "turquoise",
      "violet",
      "wheat",
      "white",
      "whitesmoke",
      "yellow",
      "yellowgreen" };

  // These are the hex values corresponding to the named values above
  public static String[] HTML_COLOR_VALUES =
    {
      "F0F8FF",
      "FAEBD7",
      "00FFFF",
      "7FFFD4",
      "F0FFFF",
      "F5F5DC",
      "FFE4C4",
      "000000",
      "FFEBCD",
      "0000FF",
      "8A2BE2",
      "A52A2A",
      "DEB887",
      "5F9EA0",
      "7FFF00",
      "D2691E",
      "FF7F50",
      "6495ED",
      "FFF8DC",
      "DC143C",
      "00FFFF",
      "00008B",
      "008B8B",
      "B8860B",
      "A9A9A9",
      "006400",
      "BDB76B",
      "8B008B",
      "556B2F",
      "FF8C00",
      "9932CC",
      "8B0000",
      "E9967A",
      "8FBC8F",
      "483D8B",
      "2F4F4F",
      "00CED1",
      "9400D3",
      "FF1493",
      "00BFFF",
      "696969",
      "1E90FF",
      "B22222",
      "FFFAF0",
      "228B22",
      "FF00FF",
      "DCDCDC",
      "F8F8FF",
      "FFD700",
      "DAA520",
      "7F7F7F",
      "008000",
      "ADFF2F",
      "F0FFF0",
      "FF69B4",
      "CD5C5C",
      "4B0082",
      "FFFFF0",
      "F0E68C",
      "E6E6FA",
      "FFF0F5",
      "7CFC00",
      "FFFACD",
      "ADD8E6",
      "F08080",
      "E0FFFF",
      "FAFAD2",
      "90EE90",
      "D3D3D3",
      "FFB6C1",
      "FFA07A",
      "20B2AA",
      "87CEFA",
      "778899",
      "B0C4DE",
      "FFFFE0",
      "00FF00",
      "32CD32",
      "FAF0E6",
      "FF00FF",
      "800000",
      "66CDAA",
      "0000CD",
      "BA55D3",
      "9370DB",
      "3CB371",
      "7B68EE",
      "00FA9A",
      "48D1CC",
      "C71585",
      "191970",
      "F5FFFA",
      "FFE4E1",
      "FFE4B5",
      "FFDEAD",
      "000080",
      "9FAFDF",
      "FDF5E6",
      "808000",
      "6B8E23",
      "FFA500",
      "FF4500",
      "DA70D6",
      "EEE8AA",
      "98FB98",
      "AFEEEE",
      "DB7093",
      "FFEFD5",
      "FFDAB9",
      "CD853F",
      "FFC0CB",
      "DDA0DD",
      "B0E0E6",
      "800080",
      "FF0000",
      "BC8F8F",
      "4169E1",
      "8B4513",
      "FA8072",
      "F4A460",
      "2E8B57",
      "FFF5EE",
      "A0522D",
      "C0C0C0",
      "87CEEB",
      "6A5ACD",
      "708090",
      "FFFAFA",
      "00FF7F",
      "4682B4",
      "D2B48C",
      "008080",
      "D8BFD8",
      "FF6347",
      "40E0D0",
      "EE82EE",
      "F5DEB3",
      "FFFFFF",
      "F5F5F5",
      "FFFF00",
      "9ACD32" };

  private static String[] INVALID_FONT_SIZES = { "x-small", "xx-small" };

  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
   */
  public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {

    if (!quit) {

      // First, we need to find the default body text color
      String color = null;

      int defaultColor = DEFAULT_COLOR;

      if (t.equals(HTML.Tag.BODY)) {
        color = getColor(a, HTML.Attribute.TEXT, COLOR_NAME);

        if (color != null) {
          // Set this as the default text color
          defaultColor = getIntColor(color);
        }
      } else {
        color = getColor(a, HTML.Attribute.COLOR, COLOR_NAME);
      }

      // We need to determine the current BGColor or Text Color attributes
      String bgcolor = getColor(a, HTML.Attribute.BGCOLOR, BGCOLOR_NAME);

      int iBGColor = DEFAULT_BGCOLOR;
      int iTextColor = defaultColor;

      if (bgcolor != null) {
        iBGColor = getIntColor(bgcolor);

        // Set the current BG Color
        currentBGColor = iBGColor;

        // Add the color to the stack
        if (activeBGColorStack == null) {
          activeBGColorStack = new Stack();
        }

        activeBGColorStack.push(String.valueOf(iBGColor));

        // Add the tag to the active tag stack
        if (activeBGColorTagStack == null) {
          activeBGColorTagStack = new Stack();
        }

        activeBGColorTagStack.push(t);
      }

      if (color != null) {
        iTextColor = getIntColor(color);

        // Set the current BG Color
        currentTextColor = iTextColor;

        // Add the color to the stack
        if (activeColorStack == null) {
          activeColorStack = new Stack();
        }

        activeColorStack.push(String.valueOf(iTextColor));

        // Add the tag to the active tag stack
        if (activeColorTagStack == null) {
          activeColorTagStack = new Stack();
        }

        activeColorTagStack.push(t);
      }

      if (bgcolor == null && color == null) {

        Tag current = null;

        // neither were found, we need determine if we should increment our inert counter
        if (activeBGColorTagStack != null && activeBGColorTagStack.size() > 0) {

          current = (Tag) activeBGColorTagStack.peek();

          if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
            inertBGColorTagCount++;
          }
        }

        if (activeColorTagStack != null && activeColorTagStack.size() > 0) {

          current = (Tag) activeColorTagStack.peek();

          if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
            inertColorTagCount++;
          }
        }
      }

      // Now, we need to determine if we should ignore the next text element
      if (!ignoreNext) {
        ignoreNext = (calculateColorThreshold() <= contrastThreshold);

        if(ignoreNext) {
            concealedHtmlCount++;
        }
        else
        {

            // Now test for font size in a style tag

          String fontSize = getStyleAttributeValue(a, "font-size");

          if(fontSize == null) {
            if(t.equals(HTML.Tag.FONT)) {
              fontSize = (String)a.getAttribute(HTML.Attribute.SIZE);
            }
          }

          if (fontSize != null) {
            fontSize = fontSize.replaceAll("px", "");
            fontSize = fontSize.replaceAll("pt", "");
            fontSize = fontSize.trim();

            try {
              int iFontSize = (int) Float.parseFloat(fontSize);

              ignoreNext = (iFontSize <= microFontSize);

              if(iFontSize <= 0) {
                  concealedHtmlCount++;
              }

            } catch (NumberFormatException e) {
              // We weren't able to treat the size as a number, it may be a valid CSS string
              if (Arrays.binarySearch(INVALID_FONT_SIZES, fontSize) > -1) {
                  //concealedHtmlCount++;
                ignoreNext = true;
              }
            }
          }
        }
      }

      // Now test element size
      if(!ignoreNext) {
          ignoreNext = ignoreElement(t, a);

          if(ignoreNext) {
              concealedHtmlCount++;
          }
      }

      // reset
      currentStyleAttributes = null;

      // Now check for cgi urls and images
      if(cgiHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
          srcCgiCount++;
      }

      if(imageHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
          imageCount++;
      }

      URLPortTagHandlerResult result = new URLPortTagHandlerResult();

      if(portHandler.handleTag(t, a, result) == HTMLTagHandler.MATCH) {
          srcPortCount++;

          // Add the port to the list
          if(urlPorts == null) {
              urlPorts = new Vector(5);
          }

          urlPorts.add(result.getPort());
      }


      // Check for an anchor tag to get the href
      if(t.equals(Tag.A)) {
          currentAnchorUrl = (String)a.getAttribute(Attribute.HREF);

                if(urlBase != null) {
                    // The anchor has a base..
                    currentAnchorUrl = urlBase + currentAnchorUrl;
                }
      }
      else
      {
          currentAnchorUrl = null;
      }

      // Check for BASE href
      if(t.equals(Tag.BASE)) {
          urlBase = (String)a.getAttribute(Attribute.HREF);

          if(urlBase != null && !urlBase.endsWith("/")) {
              urlBase += "/";
          }
      }


      // We MUST call the super class
      super.handleStartTag(t, a, pos);
    }
  }



    public void handleText(char[] text, int pos) {
        // If we are in an anchor tag, check the text against the url...
        if(currentAnchorUrl != null) {

            // Parse the text looking for a url...
            String strText = new String(text);

            if(strText != null) {
                strText = strText.trim().toLowerCase();
                if(strText.startsWith("www") || WebUtils.isUrl(strText)) {
                    // We have a direct url reference, check against the recorded value
                    if(!strText.equalsIgnoreCase(currentAnchorUrl)) {
                        // The URL text does not match the ACTUAL url
                        // This could be a deception
                        falseAnchorCount++;
                    }
                }
            }
        }

        // Now pass control to the super class
        super.handleText (text, pos);
    }
  /**
   * Attempts to find a color/bgcolor attribute from a tag
   * @param a
   * @param htmlTagAttribute
   * @param styleTagAttribute
   * @return
   */
  private String getColor(AttributeSet a, HTML.Attribute htmlTagAttribute, String styleTagAttribute) {
    String color = null;

    // style tags will override HTML tags
    color = getStyleAttributeValue(a, styleTagAttribute);

    if (color == null) {
      // We didn't find a style color, look for an HTML one
      color = (String) a.getAttribute(htmlTagAttribute);
    }

    if (color != null) {
      // Ensure we have removed hashes
      color = color.replaceAll("#", "");
    }

    // There seems to be a situation where the parser
    // cannot determine the color, and so gives it a
    // "DEFAULT" value.  Why it doesn't just give it null
    // I don't know!
    if(color != null && color.equalsIgnoreCase("DEFAULT")) {
        if(styleTagAttribute.equals(BGCOLOR_NAME)) {
            color = DEFAULT_STR_BGCOLOR;
        }
        else {
            color = DEFAULT_STR_COLOR;
        }
    }

    return color;
  }

  private String[] getCurrentStyleAttributes(AttributeSet a) {
    String attValue = (String) a.getAttribute(HTML.Attribute.STYLE);
    String[] styleAttributes = null;

    if (attValue != null && attValue.trim().length() > 0) {
      attValue = attValue.toLowerCase();
      styleAttributes = attValue.split(";");
    }

    return styleAttributes;
  }

  private String getStyleAttributeValue(AttributeSet a, String styleKey) {

    String value = null;
    String attribute = null;

    if (currentStyleAttributes == null) {
      currentStyleAttributes = getCurrentStyleAttributes(a);
    }

    if (currentStyleAttributes != null) {
      for (int i = 0; i < currentStyleAttributes.length; i++) {

        attribute = currentStyleAttributes[i].trim().toLowerCase();

        if (attribute.indexOf(styleKey) == 0) {
          // We have found our attribute, get the value
          value = currentStyleAttributes[i].substring(currentStyleAttributes[i].indexOf(":") + 1, currentStyleAttributes[i].length());

          if(value != null) value = value.trim();

          break;
        }
      }
    }
    return value;
  }



  private int getIntColor(String strColor) {

    int color = 0;

    strColor = strColor.replaceAll("#", "").trim();

    int strLength = strColor.length();

    String strPart;

    // See if it's a named color first
    int index = Arrays.binarySearch(HTML_COLOR_NAMES, strColor.toLowerCase());

    if (index > -1) {
      return getIntColor(HTML_COLOR_VALUES[index]);
    } else {
      // Analyze each character
      char[] chars = strColor.toCharArray();
      char chr;
      String rgb = "";
      boolean add = false;

      for (int i = 0; i < chars.length; i++) {
        // The color value should be between 0 and F (hex)
        chr = chars[i];

        if (chr < 0x0030 || (chr > 0x0039 && chr < 0x0041) || (chr > 0x0046 && chr < 0x0061) || (chr > 0x0066)) {
          chr = '0';
        }

        rgb += chr;

        if (add) {
          try {
            color += Integer.parseInt(rgb, 16);
            rgb = "";
          } catch (NumberFormatException e) {
            // Ignore the exception here
            e.printStackTrace();
          }
        }
        add = !add;
      }
    }

    return color;
  }

  private float calculateColorThreshold() {

    float threshhold = 0.0f;

    if (currentTextColor > currentBGColor) {
      threshhold = (((float) currentTextColor - (float) currentBGColor) / (float) currentTextColor);
    } else if (currentBGColor > currentTextColor) {
      threshhold = (((float) currentBGColor - (float) currentTextColor) / (float) currentBGColor);
    }

    return threshhold;
  }

  /**
   * Returns true if the text within this element should be ignored based on the element size
   * @param tag
   * @return
   */
  private boolean ignoreElement(HTML.Tag tag, AttributeSet a) {

      boolean ignoreElem = false;

      String strWidth = getStyleAttributeValue(a, "width");
      String strHeight = getStyleAttributeValue(a, "height");

      if(strWidth == null) {
          strWidth = (String)a.getAttribute(Attribute.WIDTH);
      }

      if(strHeight == null) {
          strHeight = (String)a.getAttribute(Attribute.HEIGHT);
      }


      try
        {
        if(strHeight != null) {
            strHeight = strHeight.replaceAll("px", "");
            ignoreElem = (Integer.parseInt(strHeight) <= microElementSize);
        }
        if(!ignoreElem && strWidth != null) {
            strWidth = strWidth.replaceAll("px", "");
            ignoreElem = (Integer.parseInt(strWidth) <= microElementSize);
        }
        }
        catch (NumberFormatException ignore){}

    return ignoreElem;
  }

  /* (non-Javadoc)
   * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
   */
  public void handleEndTag(Tag t, int pos) {

    // If the end tag is a /html, we want to ignore everything else

    if (t.equals(HTML.Tag.HTML)) {
      quit = true;
    }

    if (!quit) {
      // If the current tag equals the last tag on either our color stack or
      // our bgcolor stack, we may need to pop
      Tag current = null;

      if (activeBGColorTagStack != null && activeBGColorTagStack.size() > 0) {
        current = (Tag) activeBGColorTagStack.peek();

        if (current != null && current.toString().equalsIgnoreCase(t.toString())) {

          if (inertBGColorTagCount > 0) {
            inertBGColorTagCount--;
          } else {
            // We have to remove the current color from the stack
            activeBGColorTagStack.pop();
            activeBGColorStack.pop();

            if (activeBGColorTagStack.size() > 0) {
              currentBGColor = Integer.parseInt((String) activeBGColorStack.peek());
            } else {
              currentBGColor = DEFAULT_BGCOLOR;
            }
          }
        }
      } else {
        // Set to default
        currentBGColor = DEFAULT_BGCOLOR;
      }

      if (activeColorTagStack != null && activeColorTagStack.size() > 0) {
        current = (Tag) activeColorTagStack.peek();

        if (current != null && current.toString().equalsIgnoreCase(t.toString())) {

          if (inertColorTagCount > 0) {
            inertColorTagCount--;
          } else {
            // We have to remove the current color from the stack
            activeColorTagStack.pop();
            activeColorStack.pop();

            if (activeColorTagStack.size() > 0) {
              currentTextColor = Integer.parseInt((String) activeColorStack.peek());
            } else {
              currentTextColor = DEFAULT_COLOR;
            }
          }
        }
      } else {
        // Set to default
        currentTextColor = DEFAULT_COLOR;
      }

      //Clear the href
      currentAnchorUrl = null;

      super.handleEndTag(t, pos);
    }
  }

  /**
   * Gets the number of times concealed html was found
   * @return An integer representing the number of times a concealment was discovered
   */
    public int getConcealedHtmlCount() {
        return concealedHtmlCount;
    }

    /**
     * Gets the number of times images were found
     * @return The number of images in the document
     */
    public int getImageCount() {
        return imageCount;
    }

    /**
     * Gets the number of times the source attribute of a tag referenced a remote CGI script
     * @return
     */
    public int getSrcCgiCount() {
        return srcCgiCount;
    }


    /**
     * Gets the list of url ports found in tags with a src attribute
     * @return
     */
    public int getSrcPortCount() {
        return srcPortCount;
    }
   
    /**
     * Gets the list of url ports found in anchor tags in the message html part
     * @return
     */
    public List getUrlPorts() {
        return urlPorts;
    }


    /**
     * Gets the number if occurrences of "false" anchor tags.
     * <p>
     * These exist where an anchor tag displays a url as the text component,
     * <br/>
     * but this url does not match the actual url of the href.
     * </p>
     * @return The number of times a false anchor reference was discovered
     */
    public int getFalseAnchorCount() {
        return falseAnchorCount;
    }
   
   
    /**
     * Gets the threshold for contrast between foreground and background content elements.
     * <br/>
     * In HTML emails, and particularly spam, content is often obscured via the use of low
     * contrast colors or tones between background and foreground elements.  For example,
     * the text of the email may be white, and the background white indicating a contrast of 0
     * @return A value between 0.0 and 1.0 such that 0.0 indicates no contrast, and 1.0 indicates
     * complete contrast (eg white on black)
     */
    public float getContrastThreshold() {
        return contrastThreshold;
    }
   
    /**
     * Sets the contrast threshold at or below which content is deemed concealed.
     * The value is stored as given; no range check is applied here.
     * @see SpamHTMLParser#getContrastThreshold()
     * @param contrastThreshold A value between 0.0 and 1.0
     */
    public void setContrastThreshold(float contrastThreshold) {
        this.contrastThreshold = contrastThreshold;
    }
   
    /**
     * Gets the size (in pixels) of the minimum allowable element dimension (usually height).
     * <br/>
     * Content found inside elements smaller than this size is deemed concealed
     * @return The size in pixels of the smallest allowable element dimension
     */
    public int getMicroElementSize() {
        return microElementSize;
    }
   
    /**
     * Sets the element dimension (in pixels) at or below which an element is deemed concealed.
     * The value is stored as given; no range check is applied here.
     * @param microElementSize The size in pixels.  It is recommended that this be less than 10.
     * Default is 5.
     */
    public void setMicroElementSize(int microElementSize) {
        this.microElementSize = microElementSize;
    }
   
    /**
     * Gets the size (in points) of the minimum allowable font size.
     * <br/>
     * Content found inside font tags with smaller point size than this size is deemed concealed
     * @return The size in points of the smallest allowable font.  Default is 1
     */
    public int getMicroFontSize() {
        return microFontSize;
    }
   
    /**
     * Sets the font size at or below which text is ignored as concealed.
     * The value is stored as given; no range check is applied here.
     * @param microFontSize A size in points.  Default is 1
     */
    public void setMicroFontSize(int microFontSize) {
        this.microFontSize = microFontSize;
    }
   
    /*
     * (non-Javadoc)
     * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
     */
    public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
        StandardParserData parserData = (StandardParserData)super.parse(mm, message, tokenizer);
        parserData.setConcealedHtmlCount(getConcealedHtmlCount());
        parserData.setImageCount(getImageCount());
        parserData.setSrcCgiCount(getSrcCgiCount());
        parserData.setSrcPortCount(getSrcPortCount());
        parserData.setPorts(getUrlPorts());
        parserData.setFalseAnchorCount(getFalseAnchorCount());
        return parserData;
    }


}
TOP

Related Classes of org.jasen.core.parsers.SpamHTMLParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.