/*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.parsers;
import java.util.Arrays;
import java.util.List;
import java.util.Stack;
import java.util.Vector;
import javax.mail.internet.MimeMessage;
import javax.swing.text.AttributeSet;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import org.jasen.core.StandardParserData;
import org.jasen.core.engine.JasenEngineConfiguration;
import org.jasen.core.parsers.handlers.ImageTagHandler;
import org.jasen.core.parsers.handlers.SrcCgiTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandler;
import org.jasen.core.parsers.handlers.URLPortTagHandlerResult;
import org.jasen.error.JasenException;
import org.jasen.interfaces.HTMLTagHandler;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
import org.jasen.util.WebUtils;
/**
* <p>Extracts plain text elements from an HTML document.</p>
* <p>This implementation is specific to parsing the text out of spam emails</p>
* @author Jason Polites
*/
public class SpamHTMLParser extends StandardHTMLParser {
/**
* The default numerical background color (white).
* Colors are represented in this class as the sum of their three RGB components,
* so white is 255 x 3 = 765.
*/
public static final int DEFAULT_BGCOLOR = 765;
/**
* The default numerical foreground color (black), i.e. an RGB component sum of 0
*/
public static final int DEFAULT_COLOR = 0;
/**
* String (hex) value for the default background color (white)
*/
public static final String DEFAULT_STR_BGCOLOR = "FFFFFF";
/**
* String (hex) value for the default foreground color (black)
*/
public static final String DEFAULT_STR_COLOR = "000000";
/**
* The contrast threshold at or below which content is deemed concealed
* @deprecated Use getContrastThreshold
*/
public static final float COLOR_THRESHOLD = 0.075f;
/**
* The font size threshold at or below which content is deemed concealed
* @deprecated Use getMicroFontSize
*/
public static final int FONTSIZE_THRESHOLD = 1;
/**
* The size (in pixels) at or below which an element is considered concealed
* @deprecated Use getMicroElementSize
*/
public static final int ELEMENT_THRESHOLD = 5; // pixel width / height
/**
* @deprecated Not used
*/
public static final double TOKEN_RECOGNITION_THRESHOLD = 0.1d;
/**
* The CSS property name for background colors (background-color).
* <p>
* This is the key looked up in a tag's inline style, so it must match the CSS
* property name exactly. The previous value was misspelled ("backgound-color"),
* which meant inline background colors were never detected.
* </p>
*/
public static final String BGCOLOR_NAME = "background-color";
/**
* The CSS property name for foreground (text) colors (color)
*/
public static final String COLOR_NAME = "color";
/**
* @deprecated Not used
*/
public static final String URL_REGEX = "";
// Background color currently in effect, as an RGB component sum (white = 255 x 3)
private int currentBGColor = DEFAULT_BGCOLOR; // 255 x 3
// Text color currently in effect, as an RGB component sum (black = 0)
private int currentTextColor = DEFAULT_COLOR;
// Colors set by the tags that are still open. Each entry is the int color stored as a String.
// The stacks are created lazily the first time a color is encountered.
private Stack activeColorStack;
private Stack activeBGColorStack;
// The tags that set the colors above; kept in step with the color stacks
private Stack activeColorTagStack;
private Stack activeBGColorTagStack;
// Number of open tags that carry no color but have the same name as the tag on top of the
// corresponding tag stack. Their end tags must not pop the color stack.
private int inertColorTagCount = 0;
private int inertBGColorTagCount = 0;
// Concealment thresholds. These initial values are replaced in the constructor by the
// values from JasenEngineConfiguration.
float contrastThreshold = 0.075f;
int microFontSize = 1;
int microElementSize = 5;
// Cache of the inline style declarations (split on ';') of the tag being processed.
// Reset to null once handleStartTag has finished with the tag.
private String[] currentStyleAttributes = null;
// Counters reported through the getters and copied into the parser data by parse()
private int concealedHtmlCount = 0;
private int srcCgiCount = 0;
private int imageCount = 0;
private int srcPortCount = 0;
private int falseAnchorCount = 0;
// Ports reported by the URLPortTagHandler; stays null until the first match
private List urlPorts;
// Stores the value of an href from an anchor tag
private String currentAnchorUrl = null;
// Holds the value of a BASE tag if one is found
private String urlBase = null;
// Tag handlers, instantiated in the constructor
private ImageTagHandler imageHandler = null;
private SrcCgiTagHandler cgiHandler = null;
private URLPortTagHandler portHandler = null;
public SpamHTMLParser() {
super();
imageHandler = new ImageTagHandler();
cgiHandler = new SrcCgiTagHandler();
portHandler = new URLPortTagHandler();
// Set the default config
contrastThreshold = JasenEngineConfiguration.getInstance().getParserContrastThreshold();
microFontSize = JasenEngineConfiguration.getInstance().getParserMicroFontSize();
microElementSize = JasenEngineConfiguration.getInstance().getParserMicroElementSize();
}
// These color names MUST be in natural sort order (they are looked up with
// Arrays.binarySearch), and MUST also be in the same order as the corresponding
// hex values in HTML_COLOR_VALUES: the index of a name is the index of its value.
public static String[] HTML_COLOR_NAMES =
{
"aliceblue",
"antiquewhite",
"aqua",
"aquamarine",
"azure",
"beige",
"bisque",
"black",
"blanchedalmond",
"blue",
"blueviolet",
"brown",
"burlywood",
"cadetblue",
"chartreuse",
"chocolate",
"coral",
"cornflowerblue",
"cornsilk",
"crimson",
"cyan",
"darkblue",
"darkcyan",
"darkgoldenrod",
"darkgray",
"darkgreen",
"darkkhaki",
"darkmagenta",
"darkolivegreen",
"darkorange",
"darkorchid",
"darkred",
"darksalmon",
"darkseagreen",
"darkslateblue",
"darkslategray",
"darkturquoise",
"darkviolet",
"deeppink",
"deepskyblue",
"dimgray",
"dodgerblue",
"firebrick",
"floralwhite",
"forestgreen",
"fuchsia",
"gainsboro",
"ghostwhite",
"gold",
"goldenrod",
"gray",
"green",
"greenyellow",
"honeydew",
"hotpink",
"indianred",
"indigo",
"ivory",
"khaki",
"lavender",
"lavenderblush",
"lawngreen",
"lemonchiffon",
"lightblue",
"lightcoral",
"lightcyan",
"lightgoldenrodyellow",
"lightgreen",
"lightgrey",
"lightpink",
"lightsalmon",
"lightseagreen",
"lightskyblue",
"lightslategray",
"lightsteelblue",
"lightyellow",
"lime",
"limegreen",
"linen",
"magenta",
"maroon",
"mediumaquamarine",
"mediumblue",
"mediumorchid",
"mediumpurple",
"mediumseagreen",
"mediumslateblue",
"mediumspringgreen",
"mediumturquoise",
"mediumvioletred",
"midnightblue",
"mintcream",
"mistyrose",
"moccasin",
"navajowhite",
"navy",
"navyblue",
"oldlace",
"olive",
"olivedrab",
"orange",
"orangered",
"orchid",
"palegoldenrod",
"palegreen",
"paleturquoise",
"palevioletred",
"papayawhip",
"peachpuff",
"peru",
"pink",
"plum",
"powderblue",
"purple",
"red",
"rosybrown",
"royalblue",
"saddlebrown",
"salmon",
"sandybrown",
"seagreen",
"seashell",
"sienna",
"silver",
"skyblue",
"slateblue",
"slategray",
"snow",
"springgreen",
"steelblue",
"tan",
"teal",
"thistle",
"tomato",
"turquoise",
"violet",
"wheat",
"white",
"whitesmoke",
"yellow",
"yellowgreen" };
// These are the hex (RRGGBB) values corresponding to the named values above.
// The entry at index i is the value of HTML_COLOR_NAMES[i], so the two arrays
// must be kept the same length and in the same order.
// NOTE(review): "gray" is 7F7F7F here whereas CSS defines gray as 808080 - confirm
// whether the difference is intentional.
public static String[] HTML_COLOR_VALUES =
{
"F0F8FF",
"FAEBD7",
"00FFFF",
"7FFFD4",
"F0FFFF",
"F5F5DC",
"FFE4C4",
"000000",
"FFEBCD",
"0000FF",
"8A2BE2",
"A52A2A",
"DEB887",
"5F9EA0",
"7FFF00",
"D2691E",
"FF7F50",
"6495ED",
"FFF8DC",
"DC143C",
"00FFFF",
"00008B",
"008B8B",
"B8860B",
"A9A9A9",
"006400",
"BDB76B",
"8B008B",
"556B2F",
"FF8C00",
"9932CC",
"8B0000",
"E9967A",
"8FBC8F",
"483D8B",
"2F4F4F",
"00CED1",
"9400D3",
"FF1493",
"00BFFF",
"696969",
"1E90FF",
"B22222",
"FFFAF0",
"228B22",
"FF00FF",
"DCDCDC",
"F8F8FF",
"FFD700",
"DAA520",
"7F7F7F",
"008000",
"ADFF2F",
"F0FFF0",
"FF69B4",
"CD5C5C",
"4B0082",
"FFFFF0",
"F0E68C",
"E6E6FA",
"FFF0F5",
"7CFC00",
"FFFACD",
"ADD8E6",
"F08080",
"E0FFFF",
"FAFAD2",
"90EE90",
"D3D3D3",
"FFB6C1",
"FFA07A",
"20B2AA",
"87CEFA",
"778899",
"B0C4DE",
"FFFFE0",
"00FF00",
"32CD32",
"FAF0E6",
"FF00FF",
"800000",
"66CDAA",
"0000CD",
"BA55D3",
"9370DB",
"3CB371",
"7B68EE",
"00FA9A",
"48D1CC",
"C71585",
"191970",
"F5FFFA",
"FFE4E1",
"FFE4B5",
"FFDEAD",
"000080",
"9FAFDF",
"FDF5E6",
"808000",
"6B8E23",
"FFA500",
"FF4500",
"DA70D6",
"EEE8AA",
"98FB98",
"AFEEEE",
"DB7093",
"FFEFD5",
"FFDAB9",
"CD853F",
"FFC0CB",
"DDA0DD",
"B0E0E6",
"800080",
"FF0000",
"BC8F8F",
"4169E1",
"8B4513",
"FA8072",
"F4A460",
"2E8B57",
"FFF5EE",
"A0522D",
"C0C0C0",
"87CEEB",
"6A5ACD",
"708090",
"FFFAFA",
"00FF7F",
"4682B4",
"D2B48C",
"008080",
"D8BFD8",
"FF6347",
"40E0D0",
"EE82EE",
"F5DEB3",
"FFFFFF",
"F5F5F5",
"FFFF00",
"9ACD32" };
// CSS font-size keywords that cause the following text to be ignored.
// Looked up with Arrays.binarySearch, so the entries MUST stay in natural sort order.
private static String[] INVALID_FONT_SIZES = { "x-small", "xx-small" };
/**
 * Inspects an opening tag for the concealment and deception signals this parser counts,
 * then passes the tag on to the superclass.
 * <p>
 * In order, the method:
 * </p>
 * <ol>
 * <li>tracks the text and background colors set by the tag (inline style first, then the
 * HTML attribute) on the active color stacks;</li>
 * <li>flags the next text element to be ignored when the current color contrast, the font
 * size or the element size is at or below the configured thresholds;</li>
 * <li>runs the CGI, image and URL-port tag handlers and updates their counters;</li>
 * <li>records the href of an anchor tag (resolved against any BASE href seen so far) so
 * that handleText can compare it with the anchor text;</li>
 * <li>records the href of a BASE tag.</li>
 * </ol>
 * <p>
 * Nothing is done once the closing html tag has been seen (the quit flag is set).
 * </p>
 *
 * @param t the tag being opened
 * @param a the attributes of the tag
 * @param pos the position reported by the HTML parser; passed through to the superclass
 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleStartTag(javax.swing.text.html.HTML.Tag, javax.swing.text.MutableAttributeSet, int)
 */
public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
    if (quit) {
        return;
    }

    // The BODY tag declares its text color with the "text" attribute, every other tag
    // uses "color". An inline style overrides either (see getColor).
    String color;
    if (t.equals(HTML.Tag.BODY)) {
        color = getColor(a, HTML.Attribute.TEXT, COLOR_NAME);
    } else {
        color = getColor(a, HTML.Attribute.COLOR, COLOR_NAME);
    }
    String bgcolor = getColor(a, HTML.Attribute.BGCOLOR, BGCOLOR_NAME);

    if (bgcolor != null) {
        // This tag sets the background color: make it current and remember
        // both the color and the tag that set it
        currentBGColor = getIntColor(bgcolor);
        if (activeBGColorStack == null) {
            activeBGColorStack = new Stack();
        }
        activeBGColorStack.push(String.valueOf(currentBGColor));
        if (activeBGColorTagStack == null) {
            activeBGColorTagStack = new Stack();
        }
        activeBGColorTagStack.push(t);
    }

    if (color != null) {
        // This tag sets the text color: make it current and remember
        // both the color and the tag that set it
        currentTextColor = getIntColor(color);
        if (activeColorStack == null) {
            activeColorStack = new Stack();
        }
        activeColorStack.push(String.valueOf(currentTextColor));
        if (activeColorTagStack == null) {
            activeColorTagStack = new Stack();
        }
        activeColorTagStack.push(t);
    }

    if (bgcolor == null && color == null) {
        // A colorless tag with the same name as the tag on top of a tag stack is "inert":
        // its end tag must not pop the color set by the outer tag, so count it here and
        // let handleEndTag discount it.
        if (activeBGColorTagStack != null && activeBGColorTagStack.size() > 0) {
            Tag current = (Tag) activeBGColorTagStack.peek();
            if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
                inertBGColorTagCount++;
            }
        }
        if (activeColorTagStack != null && activeColorTagStack.size() > 0) {
            Tag current = (Tag) activeColorTagStack.peek();
            if (current != null && current.toString().equalsIgnoreCase(t.toString())) {
                inertColorTagCount++;
            }
        }
    }

    // Decide whether the next text element should be ignored.
    // NOTE(review): ignoreNext is inherited; presumably the superclass consumes and
    // resets it when it handles text - confirm against StandardHTMLParser.
    if (!ignoreNext) {
        ignoreNext = (calculateColorThreshold() <= contrastThreshold);
        if (ignoreNext) {
            concealedHtmlCount++;
        } else {
            // Contrast is fine; test the font size (inline style first, then <font size>)
            String fontSize = getStyleAttributeValue(a, "font-size");
            if (fontSize == null && t.equals(HTML.Tag.FONT)) {
                fontSize = (String) a.getAttribute(HTML.Attribute.SIZE);
            }
            if (fontSize != null) {
                fontSize = fontSize.replaceAll("px", "");
                fontSize = fontSize.replaceAll("pt", "");
                fontSize = fontSize.trim();
                try {
                    int iFontSize = (int) Float.parseFloat(fontSize);
                    ignoreNext = (iFontSize <= microFontSize);
                    // Only a zero (or negative) size is counted as concealment;
                    // a merely tiny font just has its text ignored
                    if (iFontSize <= 0) {
                        concealedHtmlCount++;
                    }
                } catch (NumberFormatException e) {
                    // Not a number: it may be a CSS size keyword
                    if (Arrays.binarySearch(INVALID_FONT_SIZES, fontSize) > -1) {
                        ignoreNext = true;
                    }
                }
            }
        }
    }

    // Now test the element size
    if (!ignoreNext) {
        ignoreNext = ignoreElement(t, a);
        if (ignoreNext) {
            concealedHtmlCount++;
        }
    }

    // The cached style declarations belong to this tag only
    currentStyleAttributes = null;

    // Check for cgi urls, images and explicit url ports
    if (cgiHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
        srcCgiCount++;
    }
    if (imageHandler.handleTag(t, a, null) == HTMLTagHandler.MATCH) {
        imageCount++;
    }
    URLPortTagHandlerResult result = new URLPortTagHandlerResult();
    if (portHandler.handleTag(t, a, result) == HTMLTagHandler.MATCH) {
        srcPortCount++;
        if (urlPorts == null) {
            urlPorts = new Vector(5);
        }
        urlPorts.add(result.getPort());
    }

    // Remember the target of an anchor so handleText can compare it with the anchor text.
    // Any other start tag clears it.
    if (t.equals(Tag.A)) {
        currentAnchorUrl = resolveAnchorHref((String) a.getAttribute(Attribute.HREF));
    } else {
        currentAnchorUrl = null;
    }

    // Check for BASE href
    if (t.equals(Tag.BASE)) {
        urlBase = (String) a.getAttribute(Attribute.HREF);
        if (urlBase != null && !urlBase.endsWith("/")) {
            urlBase += "/";
        }
    }

    // We MUST call the super class
    super.handleStartTag(t, a, pos);
}

/**
 * Applies the document's BASE href to an anchor href, but only when the href is relative.
 * An anchor without an href yields null, and an href that already carries a scheme
 * (or is protocol-relative, "//host/...") is returned unchanged; prefixing those with
 * the base would produce a URL that can never match the anchor text.
 *
 * @param href the raw href attribute value, may be null
 * @return the href to compare anchor text against, or null if the anchor has no href
 */
private String resolveAnchorHref(String href) {
    if (href == null) {
        return null;
    }
    String trimmed = href.trim();
    if (urlBase == null || trimmed.startsWith("//") || hasUrlScheme(trimmed)) {
        return href;
    }
    return urlBase + href;
}

/**
 * Tells whether a URL starts with a scheme, i.e. a letter followed by letters, digits,
 * '+', '-' or '.' and then a colon (for example "http:", "mailto:").
 *
 * @param url the URL to test, not null
 * @return true if the URL begins with a scheme
 */
private static boolean hasUrlScheme(String url) {
    int colon = url.indexOf(':');
    if (colon <= 0) {
        return false;
    }
    for (int i = 0; i < colon; i++) {
        char c = url.charAt(i);
        boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        boolean other = (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
        if (!letter && !(i > 0 && other)) {
            return false;
        }
    }
    return true;
}
/**
 * Counts a "false anchor" when text inside an anchor looks like a URL but differs
 * from the anchor's recorded href, then hands the text to the superclass.
 *
 * @param text the text content reported by the HTML parser
 * @param pos the position reported by the HTML parser; passed through to the superclass
 */
public void handleText(char[] text, int pos) {
    if (currentAnchorUrl != null) {
        // We are directly inside an anchor: normalise the visible text
        String shown = new String(text).trim().toLowerCase();

        // Only text that itself reads as a URL can contradict the href
        boolean readsAsUrl = shown.startsWith("www") || WebUtils.isUrl(shown);
        if (readsAsUrl && !shown.equalsIgnoreCase(currentAnchorUrl)) {
            // The displayed URL is not where the link goes; possible deception
            falseAnchorCount++;
        }
    }

    // Now pass control to the super class
    super.handleText(text, pos);
}
/**
 * Looks up a color on a tag, preferring the inline style over the HTML attribute.
 * <p>
 * Hash characters are stripped from the result. The parser sometimes reports the
 * literal value "DEFAULT" instead of no value at all; this is translated to the
 * default background or foreground hex color, depending on which was asked for.
 * </p>
 * @param a the attributes of the tag
 * @param htmlTagAttribute the HTML attribute to fall back on (e.g. bgcolor, color, text)
 * @param styleTagAttribute the CSS property name to look for in the inline style
 * @return the color as found (name or hex digits without '#'), or null if the tag has none
 */
private String getColor(AttributeSet a, HTML.Attribute htmlTagAttribute, String styleTagAttribute) {
    // style declarations override HTML attributes
    String found = getStyleAttributeValue(a, styleTagAttribute);
    if (found == null) {
        found = (String) a.getAttribute(htmlTagAttribute);
    }
    if (found == null) {
        return null;
    }

    found = found.replaceAll("#", "");
    if (!found.equalsIgnoreCase("DEFAULT")) {
        return found;
    }

    // The parser could not determine the color; substitute the matching default
    return styleTagAttribute.equals(BGCOLOR_NAME) ? DEFAULT_STR_BGCOLOR : DEFAULT_STR_COLOR;
}
/**
 * Splits the tag's inline style attribute into its individual declarations.
 * <p>
 * The style is lower-cased before splitting so that property names can be compared
 * directly. A fixed locale is used for this: with the default locale, a Turkish
 * environment would turn "WIDTH" into a string that is not "width".
 * </p>
 * @param a the attributes of the tag
 * @return the lower-cased declarations split on ';', or null if the tag has no
 * (or only a blank) style attribute
 */
private String[] getCurrentStyleAttributes(AttributeSet a) {
    String attValue = (String) a.getAttribute(HTML.Attribute.STYLE);
    if (attValue == null || attValue.trim().length() == 0) {
        return null;
    }
    return attValue.toLowerCase(java.util.Locale.ENGLISH).split(";");
}
/**
 * Returns the value of one CSS property from the tag's inline style.
 * <p>
 * The split style declarations are cached in currentStyleAttributes on first use, so
 * repeated lookups on the same tag do not re-parse the style.
 * </p>
 * <p>
 * The property name must match exactly: "font-size" does not match a
 * "font-size-adjust" declaration. Declarations without a colon are skipped.
 * </p>
 * @param a the attributes of the tag
 * @param styleKey the lower-case CSS property name to look for
 * @return the trimmed value of the first matching declaration, or null if there is none
 */
private String getStyleAttributeValue(AttributeSet a, String styleKey) {
    if (currentStyleAttributes == null) {
        currentStyleAttributes = getCurrentStyleAttributes(a);
    }
    if (currentStyleAttributes == null) {
        return null;
    }
    for (int i = 0; i < currentStyleAttributes.length; i++) {
        String declaration = currentStyleAttributes[i];
        int colon = declaration.indexOf(':');
        if (colon < 0) {
            // Malformed declaration with no value; nothing to return for it
            continue;
        }
        String property = declaration.substring(0, colon).trim().toLowerCase(java.util.Locale.ENGLISH);
        if (property.equals(styleKey)) {
            return declaration.substring(colon + 1).trim();
        }
    }
    return null;
}
/**
 * Converts a color name or hex string into this class's numeric color: the sum of
 * the two-digit hex components (so "000000" is 0 and "FFFFFF" is 765).
 * <p>
 * Named colors are resolved through HTML_COLOR_NAMES / HTML_COLOR_VALUES using a
 * locale-independent lower-casing. The three-digit CSS shorthand ("FFF") is expanded
 * to its six-digit form first. Any character that is not a hex digit counts as '0',
 * and a trailing unpaired digit is ignored.
 * </p>
 * @param strColor the color name or hex digits, with or without '#'; not null
 * @return the sum of the color's hex components
 */
private int getIntColor(String strColor) {
    String hex = strColor.replaceAll("#", "").trim();

    // See if it's a named color first
    int index = Arrays.binarySearch(HTML_COLOR_NAMES, hex.toLowerCase(java.util.Locale.ENGLISH));
    if (index > -1) {
        hex = HTML_COLOR_VALUES[index];
    } else if (hex.length() == 3) {
        // CSS shorthand: each digit stands for a doubled digit (RGB -> RRGGBB)
        StringBuffer expanded = new StringBuffer(6);
        for (int i = 0; i < 3; i++) {
            expanded.append(hex.charAt(i)).append(hex.charAt(i));
        }
        hex = expanded.toString();
    }

    // Sum the components, two hex digits at a time
    int color = 0;
    for (int i = 0; i + 1 < hex.length(); i += 2) {
        color += (hexDigitValue(hex.charAt(i)) << 4) + hexDigitValue(hex.charAt(i + 1));
    }
    return color;
}

/**
 * Returns the value of an ASCII hex digit, or 0 for any other character.
 *
 * @param chr the character to convert
 * @return 0-15 for '0'-'9', 'A'-'F' and 'a'-'f'; 0 otherwise
 */
private static int hexDigitValue(char chr) {
    if (chr >= '0' && chr <= '9') {
        return chr - '0';
    }
    if (chr >= 'A' && chr <= 'F') {
        return chr - 'A' + 10;
    }
    if (chr >= 'a' && chr <= 'f') {
        return chr - 'a' + 10;
    }
    return 0;
}
/**
 * Computes the relative contrast between the current text and background colors:
 * the difference between the two divided by the larger of them.
 *
 * @return 0.0 when both colors are equal, otherwise (larger - smaller) / larger
 */
private float calculateColorThreshold() {
    int larger = Math.max(currentTextColor, currentBGColor);
    int smaller = Math.min(currentTextColor, currentBGColor);
    if (larger == smaller) {
        // Identical colors: no contrast at all (also avoids dividing by zero)
        return 0.0f;
    }
    return ((float) larger - (float) smaller) / (float) larger;
}
/**
 * Returns true if the text within this element should be ignored based on the element size.
 * <p>
 * Height and width are read from the inline style first and from the HTML attributes
 * otherwise. The element is ignored when either dimension is a whole number of pixels
 * at or below the micro element size. The two dimensions are tested independently, so
 * a height that is not a plain pixel value (e.g. "100%") does not prevent the width
 * from being checked.
 * </p>
 * @param tag the tag being opened (not used by the check)
 * @param a the attributes of the tag
 * @return true if the height or the width marks the element as too small to be visible
 */
private boolean ignoreElement(HTML.Tag tag, AttributeSet a) {
    String strWidth = getStyleAttributeValue(a, "width");
    String strHeight = getStyleAttributeValue(a, "height");
    if (strWidth == null) {
        strWidth = (String) a.getAttribute(Attribute.WIDTH);
    }
    if (strHeight == null) {
        strHeight = (String) a.getAttribute(Attribute.HEIGHT);
    }
    return isMicroDimension(strHeight) || isMicroDimension(strWidth);
}

/**
 * Tells whether a width/height value is a pixel count at or below the micro element size.
 *
 * @param dimension the raw dimension value, may be null
 * @return true only if the value parses as an integer (after dropping "px" and
 * surrounding whitespace) that is no larger than microElementSize
 */
private boolean isMicroDimension(String dimension) {
    if (dimension == null) {
        return false;
    }
    try {
        return Integer.parseInt(dimension.replaceAll("px", "").trim()) <= microElementSize;
    } catch (NumberFormatException ignored) {
        // Percentages, keywords and the like cannot be compared with a pixel threshold
        return false;
    }
}
/**
 * Restores the text and background colors that were in effect before the tag being
 * closed, clears the recorded anchor href and passes the tag on to the superclass.
 * <p>
 * A closing html tag sets the quit flag; from then on end tags are ignored.
 * </p>
 *
 * @param t the tag being closed
 * @param pos the position reported by the HTML parser; passed through to the superclass
 * @see javax.swing.text.html.HTMLEditorKit.ParserCallback#handleEndTag(javax.swing.text.html.HTML.Tag, int)
 */
public void handleEndTag(Tag t, int pos) {
    // If the end tag is a /html, we want to ignore everything else
    if (t.equals(HTML.Tag.HTML)) {
        quit = true;
    }
    if (quit) {
        return;
    }

    // Background color: with no colored tag open we are back at the default
    if (activeBGColorTagStack == null || activeBGColorTagStack.size() == 0) {
        currentBGColor = DEFAULT_BGCOLOR;
    } else {
        Tag owner = (Tag) activeBGColorTagStack.peek();
        if (owner != null && owner.toString().equalsIgnoreCase(t.toString())) {
            if (inertBGColorTagCount > 0) {
                // This closes a nested colorless tag of the same name, not the owner
                inertBGColorTagCount--;
            } else {
                // The owner itself is closing: drop its color and fall back
                activeBGColorTagStack.pop();
                activeBGColorStack.pop();
                currentBGColor = (activeBGColorTagStack.size() > 0)
                        ? Integer.parseInt((String) activeBGColorStack.peek())
                        : DEFAULT_BGCOLOR;
            }
        }
    }

    // Text color: same procedure on the text color stacks
    if (activeColorTagStack == null || activeColorTagStack.size() == 0) {
        currentTextColor = DEFAULT_COLOR;
    } else {
        Tag owner = (Tag) activeColorTagStack.peek();
        if (owner != null && owner.toString().equalsIgnoreCase(t.toString())) {
            if (inertColorTagCount > 0) {
                inertColorTagCount--;
            } else {
                activeColorTagStack.pop();
                activeColorStack.pop();
                currentTextColor = (activeColorTagStack.size() > 0)
                        ? Integer.parseInt((String) activeColorStack.peek())
                        : DEFAULT_COLOR;
            }
        }
    }

    // Clear the href
    currentAnchorUrl = null;

    super.handleEndTag(t, pos);
}
/**
 * Reports how often concealed html was detected while parsing.
 * @return The number of concealments counted so far
 */
public int getConcealedHtmlCount() {
    return this.concealedHtmlCount;
}
/**
 * Reports how many tags the image tag handler matched.
 * @return The number of images counted in the document
 */
public int getImageCount() {
    return this.imageCount;
}
/**
 * Reports how many times the source attribute of a tag referenced a remote CGI script.
 * @return The number of tags matched by the CGI tag handler
 */
public int getSrcCgiCount() {
    return this.srcCgiCount;
}
/**
 * Reports how many tags were matched by the URL port tag handler.
 * The ports themselves are available from getUrlPorts.
 * @return The number of matches counted (a count, not the list of ports)
 */
public int getSrcPortCount() {
    return this.srcPortCount;
}
/**
 * Returns the ports reported by the URL port tag handler while parsing.
 * @return The list of ports, or null if the handler never matched a tag
 */
public List getUrlPorts() {
    return this.urlPorts;
}
/**
 * Reports the number of "false" anchor tags encountered.
 * <p>
 * An anchor is false when its text is itself a url,
 * <br/>
 * yet that url differs from the one in the anchor's href.
 * </p>
 * @return The number of false anchor references counted so far
 */
public int getFalseAnchorCount() {
    return this.falseAnchorCount;
}
/**
 * Returns the contrast threshold between foreground and background content elements.
 * <br/>
 * HTML emails, spam in particular, often hide content by giving foreground and
 * background nearly the same color or tone. White text on a white background, for
 * example, has a contrast of 0. Content whose contrast is at or below this threshold
 * is treated as concealed.
 * @return A value between 0.0 and 1.0, where 0.0 means no contrast and 1.0 means
 * complete contrast (eg white on black)
 */
public float getContrastThreshold() {
    return this.contrastThreshold;
}
/**
 * Changes the contrast threshold between foreground and background content elements.
 * @see SpamHTMLParser#getContrastThreshold()
 * @param contrastThreshold The new threshold, a value between 0.0 and 1.0
 */
public void setContrastThreshold(float contrastThreshold) {
    this.contrastThreshold = contrastThreshold;
}
/**
 * Returns the micro element size: the pixel dimension (width or height) at or below
 * which an element is too small for its content to be visible.
 * <br/>
 * Content found inside such elements is deemed concealed.
 * @return The threshold dimension in pixels
 */
public int getMicroElementSize() {
    return this.microElementSize;
}
/**
 * Changes the micro element size, the pixel dimension at or below which an element's
 * content is deemed concealed.
 * @param microElementSize The size in pixels. It is recommended that this be less than 10.
 * The field default is 5.
 */
public void setMicroElementSize(int microElementSize) {
    this.microElementSize = microElementSize;
}
/**
 * Returns the micro font size: the font size at or below which text is ignored.
 * <br/>
 * Text that follows a tag declaring a font size no larger than this is skipped.
 * @return The threshold font size. The field default is 1
 */
public int getMicroFontSize() {
    return this.microFontSize;
}
/**
 * Changes the micro font size, the font size at or below which text is ignored.
 * @param microFontSize The threshold font size. The field default is 1
 */
public void setMicroFontSize(int microFontSize) {
    this.microFontSize = microFontSize;
}
/**
 * Runs the superclass parse and then copies this parser's counters
 * (concealed html, images, CGI sources, ports, false anchors) into the result.
 *
 * @see org.jasen.interfaces.HTMLParser#parse(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.MimeMessageTokenizer)
 */
public ParserData parse(MimeMessage mm, JasenMessage message, MimeMessageTokenizer tokenizer) throws JasenException {
    // The superclass result is cast to StandardParserData so the counters can be set
    final StandardParserData enriched = (StandardParserData) super.parse(mm, message, tokenizer);

    enriched.setConcealedHtmlCount(getConcealedHtmlCount());
    enriched.setImageCount(getImageCount());
    enriched.setSrcCgiCount(getSrcCgiCount());
    enriched.setSrcPortCount(getSrcPortCount());
    enriched.setPorts(getUrlPorts());
    enriched.setFalseAnchorCount(getFalseAnchorCount());

    return enriched;
}
}