/*
* @(#)EmailTokenizer.java 5/11/2004
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.token;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.List;
import javax.mail.Header;
import javax.mail.MessagingException;
import javax.mail.internet.MimeMessage;
import org.jasen.core.linguistics.LinguisticAnalyzer;
import org.jasen.core.parsers.StandardMimeMessageParser;
import org.jasen.core.parsers.URLParser;
import org.jasen.error.JasenException;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
import org.jasen.interfaces.TokenErrorRecorder;
import org.jasen.util.MimeUtils;
import com.sun.mail.smtp.SMTPMessage;
/**
*
* <P>
* Converts the subject, text and html parts of a MimeMessage into discrete String "tokens".
* </P>
* <p>
* Each token represents either a word, or a specialized representation of certain key information.
* </p>
* <p>
* For example:
* </p>
* <p>
* Often the subject line in a message is all that is required to identify it as spam. This can be a very
* good source of information because it will almost always be free from obfuscation (notwithstanding the use of non-ascii characters).
* Hence, tokens found in the subject are annotated with the word "Subject" and delimited with a question mark.
* </p>
* <p>
* For example:
* </p>
* The subject line "Buy viagra!" would be tokenized as:
* <br/><br/>
* Subject?Buy<br/>
* Subject?viagra!
* <br/><br/>
* @author Jason Polites
*/
public class EmailTokenizer implements MimeMessageTokenizer
{
    /** Underlying tokenizer which splits raw text into discrete word tokens. */
    private SpamTokenizer tokenizer;

    /**
     * When true, only headers in INCLUDED_HEADERS are tokenized.
     * When false, all headers are tokenized except those in IGNORED_HEADERS (and "x-*" headers).
     */
    private boolean ignoreHeaders = false;

    // This can optionally limit the number of tokens extracted
    protected int tokenLimit = 20;

    // Number of successive linguistic failures tolerated before tokenization is aborted
    protected int linguisticLimit = 3;

    /**
     * This is just a rare character used to identify mail header tokens. It
     * looks like two pipes ||
     */
    public static final char HEADER_TOKEN_DELIMITER = 0x01C1;

    // If we are ignoring headers, ignore these
    /**
     * @deprecated This should be done in config
     */
    public static String[] IGNORED_HEADERS = { "thread-index", "date", "content-class", "content-type", "received", "mime-version" };

    // If we are including headers (ignoring everything else) include these
    /**
     * @deprecated This should be done in config
     */
    public static String[] INCLUDED_HEADERS = { "subject", "from", "to", "cc", "bcc", "return-path" };

    static
    {
        // Both arrays are looked up via Arrays.binarySearch, which requires sorted input
        Arrays.sort (IGNORED_HEADERS);
        Arrays.sort (INCLUDED_HEADERS);
    }

    /**
     * Creates a tokenizer using the default token and linguistic limits.
     *
     * @throws IOException If the underlying SpamTokenizer fails to initialize
     */
    public EmailTokenizer() throws IOException {
        tokenizer = new SpamTokenizer ();
        // Use the setter for consistency with setTokenLimit()
        tokenizer.setMaxTokens (tokenLimit);
        tokenizer.linguisticLimit = linguisticLimit;
    }

    /**
     * Tokenizes the message body, the mail headers and any URLs embedded in the
     * raw html/text parts, returning all tokens as a single array.
     * <p>
     * Tokens derived from headers listed in INCLUDED_HEADERS are annotated as
     * DELIM + header-name + DELIM + token (see HEADER_TOKEN_DELIMITER) so that, for
     * example, a word in the subject is distinct from the same word in the body.
     * </p>
     *
     * @param mail The original mime message (source of the headers)
     * @param realHtml The raw (unparsed) html part, or null
     * @param realText The raw (unparsed) text part, or null
     * @param text The parsed text part, or null (unused here; kept for interface compatibility)
     * @param html The html part rendered as text, or null (unused here; kept for interface compatibility)
     * @param tokenizee The body text actually chosen for tokenization
     * @param recorder Receives tokenization errors
     * @return The combined token array, or null if nothing could be tokenized
     * @throws JasenException Wrapping any IOException or MessagingException encountered
     */
    protected String[] tokenize(MimeMessage mail, String realHtml, String realText, String text, String html, String tokenizee, TokenErrorRecorder recorder) throws JasenException {
        try
        {
            // Tokenize the chosen body part (the html-as-text or plain text part)
            String[] tokens = tokenizer.tokenize (tokenizee, recorder);

            // Fold in tokens extracted from the mail headers
            Header[] headers = MimeUtils.getAllHeaders(mail);

            for (int h = 0; h < headers.length; h++)
            {
                String headerName = headers[h].getName ().toLowerCase();
                String[] headerTokens = null;

                if (ignoreHeaders)
                {
                    // Only the explicitly included headers are tokenized
                    if (includeHeader (headerName))
                    {
                        headerTokens = tokenizer.tokenize (headers[h].getValue (), recorder);
                    }
                }
                else if (!ignoreHeader (headerName))
                {
                    // NOTE: use a local for the header value rather than clobbering the
                    // "text" parameter (the original did, corrupting a later null check)
                    String headerValue = headers[h].getValue ();

                    if (headerName.equals ("message-id"))
                    {
                        // The first part of the message id has no meaning, so split the
                        // value on the @ discarding preceding characters
                        int atIndex = headerValue.indexOf ('@');
                        if (atIndex > -1)
                        {
                            headerValue = headerValue.substring (atIndex);
                        }
                        headerTokens = tokenizer.tokenize (headerValue, true, recorder);
                    }
                    else if (headerName.equals ("received"))
                    {
                        // NOTE(review): "received" is in IGNORED_HEADERS by default, so this
                        // branch is only reachable if that (deprecated, public) array is
                        // modified externally. Retained for that reason.
                        headerTokens = tokenizer.tokenize (headerValue, true, recorder);
                    }
                    else
                    {
                        // Otherwise tokenize the value as usual
                        headerTokens = tokenizer.tokenize (headerValue, recorder);
                    }
                }

                if (headerTokens != null)
                {
                    // Prepend the header identifier to each token for the key headers
                    if (Arrays.binarySearch (INCLUDED_HEADERS, headerName) > -1)
                    {
                        for (int i = 0; i < headerTokens.length; i++)
                        {
                            headerTokens[i] = HEADER_TOKEN_DELIMITER + headers[h].getName () + HEADER_TOKEN_DELIMITER + headerTokens[i];
                        }
                    }
                    tokens = concat (tokens, headerTokens);
                }
            }

            // Finally, add any URLs found in the raw parts as tokens.
            // BUG FIX: the original tested the (clobbered) "text" parameter here instead
            // of realText, making the shortcut depend on the last header value processed.
            if (realHtml == null && realText == null)
            {
                return tokens;
            }

            List urlList = null;

            if (realHtml != null)
            {
                URLParser ext = new URLParser ();
                ext.parse (realHtml);
                urlList = ext.getUrls ();
            }

            if (realText != null)
            {
                URLParser ext = new URLParser ();
                ext.parse (realText);

                if (ext.getUrls () != null)
                {
                    if (urlList == null)
                    {
                        urlList = ext.getUrls ();
                    }
                    else
                    {
                        urlList.addAll (ext.getUrls ());
                    }
                }
            }

            if (urlList != null)
            {
                String[] urls = (String[]) urlList.toArray (new String[urlList.size ()]);
                tokens = concat (tokens, urls);
            }

            return tokens;
        }
        catch (IOException e)
        {
            throw new JasenException(e);
        }
        catch (MessagingException e)
        {
            throw new JasenException(e);
        }
    }

    /**
     * Concatenates two token arrays, tolerating null on either side.
     *
     * @param first The first array (may be null)
     * @param second The second array (may be null)
     * @return A new array holding first followed by second, or whichever argument
     *         is non-null, or null if both are null
     */
    private static String[] concat(String[] first, String[] second) {
        if (first == null)
        {
            return second;
        }
        if (second == null)
        {
            return first;
        }
        // Use the native array copy method for greater speed
        String[] all = new String[first.length + second.length];
        System.arraycopy (first, 0, all, 0, first.length);
        System.arraycopy (second, 0, all, first.length, second.length);
        return all;
    }

    /*
     * (non-Javadoc)
     * @see org.jasen.interfaces.MimeMessageTokenizer#tokenize(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.ParserData)
     */
    public String[] tokenize(MimeMessage mail, JasenMessage message, ParserData data) throws JasenException {
        String realHtml = message.getHtmlPart();
        String realText = message.getTextPart();
        String html = data.getHtmlAsText();
        String text = data.getTextParsed();

        // Prefer the html part (rendered as text); fall back to the plain text part
        String tokenizee = null;

        if (html != null && html.trim().length () > 0)
        {
            tokenizee = html;
        }
        else if (text != null && text.trim().length () > 0)
        {
            tokenizee = text;
        }
        return tokenize(mail, realHtml, realText, text, html, tokenizee, data.getTokenErrorRecorder());
    }

    /**
     * We won't use the Collection.contains method because we search the
     * pre-sorted IGNORED_HEADERS array. All "x-*" extension headers are
     * also ignored.
     *
     * @param header The header name (expected already lower-cased by the caller)
     * @return True if the header should not be tokenized
     */
    private boolean ignoreHeader(String header) {
        return (Arrays.binarySearch (IGNORED_HEADERS, header) > -1 || header.startsWith("x"));
    }

    /**
     * Tests whether the header is in the INCLUDED_HEADERS set.
     *
     * @param header The header name (expected already lower-cased by the caller)
     * @return True if the header should be tokenized even when ignoreHeaders is set
     */
    private boolean includeHeader(String header) {
        return (Arrays.binarySearch (INCLUDED_HEADERS, header) > -1);
    }

    /**
     * Gets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * <P>
     * The tokenizer uses the LinguisticAnalyzer to determine if each token is a real word. After
     * linguisticLimit tokens have successively failed, tokenization is aborted.
     * </P>
     * @return Returns the linguisticLimit.
     */
    public int getLinguisticLimit() {
        return linguisticLimit;
    }

    /**
     * Sets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * @param linguisticLimit The linguisticLimit to set.
     * @see EmailTokenizer#getLinguisticLimit()
     */
    public void setLinguisticLimit(int linguisticLimit) {
        this.linguisticLimit = linguisticLimit;
        // BUG FIX: propagate to the underlying tokenizer (previously only the field
        // was updated, so calling this setter after construction had no effect),
        // consistent with setTokenLimit()
        if (tokenizer != null)
        {
            tokenizer.linguisticLimit = linguisticLimit;
        }
    }

    /**
     * Tells us if we are ignoring the list of IGNORED_HEADERS when tokenizing
     * @return True if the tokenizer is ignoring headers in the IGNORED_HEADERS set
     * @see EmailTokenizer#IGNORED_HEADERS
     */
    public boolean isIgnoreHeaders() {
        return ignoreHeaders;
    }

    /**
     * Flags the tokenizer to ignore list of IGNORED_HEADERS when tokenizing
     * @param b True to tokenize only the INCLUDED_HEADERS
     */
    public void setIgnoreHeaders(boolean b) {
        ignoreHeaders = b;
    }

    /**
     * Gets the maximum number of tokens extracted before tokenization is aborted
     * @return The maximum number of tokens that will be returned
     */
    public int getTokenLimit() {
        return tokenLimit;
    }

    /**
     * Sets the maximum number of tokens extracted before tokenization is aborted
     * @param i The new token limit
     */
    public void setTokenLimit(int i) {
        tokenLimit = i;
        if (tokenizer != null)
        {
            tokenizer.setMaxTokens (i);
        }
    }

    /**
     * Internal test harness only. DO NOT USE
     * @param args Ignored
     */
    public static void main(String[] args) {
        try
        {
            File mailDir = new File ("D:\\Projects\\Synetek\\Service\\EveryMail\\core\\poll");
            File[] files = mailDir.listFiles ();

            // BUG FIX: listFiles() returns null for a missing/non-directory path
            if (files == null)
            {
                System.out.println ("No files found in " + mailDir);
                return;
            }

            File output = new File ("c:/output.txt");
            if (output.exists ())
            {
                output.delete ();
            }

            // The writer owns (and will close) the underlying stream
            PrintWriter writer = new PrintWriter (new FileOutputStream (output));

            try
            {
                for (int i = 0; i < files.length; i++)
                {
                    if (files[i].isFile ())
                    {
                        FileInputStream fin = null;

                        try
                        {
                            writer.println ("*************************************************");
                            writer.println ("File " + (i + 1) + ": " + files[i].getName ());
                            writer.println ("*************************************************");

                            fin = new FileInputStream (files[i]);
                            SMTPMessage mail = new SMTPMessage (null, fin);

                            StandardMimeMessageParser parser = new StandardMimeMessageParser ();
                            JasenMessage jm = parser.parse (mail);
                            writer.println ("HTML: " + jm.getHtmlPart ());

                            EmailTokenizer et = new EmailTokenizer ();
                            et.setIgnoreHeaders (true);

                            String[] tokens = et.tokenize (mail, jm, null);

                            if (tokens != null)
                            {
                                /*
                                 * Tokens are listed in the order they appear in the mail;
                                 * the linguistic threshold logic relies on "most" invalid
                                 * tokens occurring at the end of the message.
                                 */
                                for (int j = 0; j < tokens.length; j++)
                                {
                                    writer.println ("TOKEN: [" + tokens[j] + "]");
                                    LinguisticAnalyzer.getInstance ().getWordScore (tokens[j]);
                                }
                            }
                        }
                        catch (Exception e)
                        {
                            // Best-effort harness: report and continue with the next file
                            e.printStackTrace ();
                        }
                        finally
                        {
                            // BUG FIX: the original never closed the per-file input stream
                            if (fin != null)
                            {
                                try
                                {
                                    fin.close ();
                                }
                                catch (IOException ignored)
                                {
                                    // best effort on close
                                }
                            }
                        }
                    }
                    System.out.println ("Processed " + (i + 1) + "/" + files.length);
                }
            }
            finally
            {
                writer.flush ();
                writer.close ();
            }
        }
        catch (Exception ex)
        {
            ex.printStackTrace ();
        }
    }
}