/*
* @(#)EmailTokenizer.java 5/11/2004
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.token;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.List;
import javax.mail.Header;
import javax.mail.MessagingException;
import javax.mail.internet.MimeMessage;
import org.jasen.core.linguistics.LinguisticAnalyzer;
import org.jasen.core.parsers.StandardMimeMessageParser;
import org.jasen.core.parsers.URLParser;
import org.jasen.error.JasenException;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
import org.jasen.interfaces.TokenErrorRecorder;
import org.jasen.util.MimeUtils;
import com.sun.mail.smtp.SMTPMessage;
/**
*
* <P>
* Converts the subject, text and html parts of a MimeMessage into discrete String "tokens".
* </P>
* <p>
* Each token represents either a word, or a specialized representation of certain key information.
* </p>
* <p>
* For example:
* </p>
* <p>
* Often the subject line in a message is all that is required to identify it as spam. This can be a very
* good source of information because it will almost always be free from obfuscation (notwithstanding the use of non-ascii characters).
* Hence, tokens found in the subject are annotated with the word "Subject" and delimited with a question mark.
* </p>
* <p>
* For example:
* </p>
* The subject line "Buy viagra!" would be tokenized as:
* <br/><br/>
* Subject?Buy<br/>
* Subject?viagra!
* <br/><br/>
* @author Jason Polites
*/
public class EmailTokenizer implements MimeMessageTokenizer
{
    /** Underlying tokenizer which splits raw text into discrete word tokens. */
    private SpamTokenizer tokenizer;

    /**
     * When true, only headers in INCLUDED_HEADERS are tokenized.
     * When false, all headers are tokenized except those in IGNORED_HEADERS (and "x-*" headers).
     */
    private boolean ignoreHeaders = false;

    // This can optionally limit the number of tokens extracted
    protected int tokenLimit = 20;

    // Number of successive linguistic failures tolerated before tokenization is aborted
    protected int linguisticLimit = 3;

    /**
     * This is just a rare character used to identify mail header tokens. It
     * looks like two pipes ||
     */
    public static final char HEADER_TOKEN_DELIMITER = 0x01C1;

    // If we are ignoring headers, ignore these
    /**
     * @deprecated This should be done in config
     */
    public static String[] IGNORED_HEADERS = { "thread-index", "date", "content-class", "content-type", "received", "mime-version" };

    // If we are including headers (ignoring everything else) include these
    /**
     * @deprecated This should be done in config
     */
    public static String[] INCLUDED_HEADERS = { "subject", "from", "to", "cc", "bcc", "return-path" };

    static
    {
        // Both arrays are looked up via Arrays.binarySearch, which requires sorted input
        Arrays.sort (IGNORED_HEADERS);
        Arrays.sort (INCLUDED_HEADERS);
    }

    /**
     * Creates a tokenizer using the default token and linguistic limits.
     *
     * @throws IOException If the underlying SpamTokenizer fails to initialize
     */
    public EmailTokenizer() throws IOException {
        tokenizer = new SpamTokenizer ();
        // Use the setter for consistency with setTokenLimit()
        tokenizer.setMaxTokens (tokenLimit);
        tokenizer.linguisticLimit = linguisticLimit;
    }

    /**
     * Tokenizes the message body, the mail headers and any URLs embedded in the
     * raw html/text parts, returning all tokens as a single array.
     * <p>
     * Tokens derived from headers listed in INCLUDED_HEADERS are annotated as
     * DELIM + header-name + DELIM + token (see HEADER_TOKEN_DELIMITER) so that, for
     * example, a word in the subject is distinct from the same word in the body.
     * </p>
     *
     * @param mail The original mime message (source of the headers)
     * @param realHtml The raw (unparsed) html part, or null
     * @param realText The raw (unparsed) text part, or null
     * @param text The parsed text part, or null (unused here; kept for interface compatibility)
     * @param html The html part rendered as text, or null (unused here; kept for interface compatibility)
     * @param tokenizee The body text actually chosen for tokenization
     * @param recorder Receives tokenization errors
     * @return The combined token array, or null if nothing could be tokenized
     * @throws JasenException Wrapping any IOException or MessagingException encountered
     */
    protected String[] tokenize(MimeMessage mail, String realHtml, String realText, String text, String html, String tokenizee, TokenErrorRecorder recorder) throws JasenException {
        try
        {
            // Tokenize the chosen body part (the html-as-text or plain text part)
            String[] tokens = tokenizer.tokenize (tokenizee, recorder);

            // Fold in tokens extracted from the mail headers
            Header[] headers = MimeUtils.getAllHeaders(mail);

            for (int h = 0; h < headers.length; h++)
            {
                String headerName = headers[h].getName ().toLowerCase();
                String[] headerTokens = null;

                if (ignoreHeaders)
                {
                    // Only the explicitly included headers are tokenized
                    if (includeHeader (headerName))
                    {
                        headerTokens = tokenizer.tokenize (headers[h].getValue (), recorder);
                    }
                }
                else if (!ignoreHeader (headerName))
                {
                    // NOTE: use a local for the header value rather than clobbering the
                    // "text" parameter (the original did, corrupting a later null check)
                    String headerValue = headers[h].getValue ();

                    if (headerName.equals ("message-id"))
                    {
                        // The first part of the message id has no meaning, so split the
                        // value on the @ discarding preceding characters
                        int atIndex = headerValue.indexOf ('@');
                        if (atIndex > -1)
                        {
                            headerValue = headerValue.substring (atIndex);
                        }
                        headerTokens = tokenizer.tokenize (headerValue, true, recorder);
                    }
                    else if (headerName.equals ("received"))
                    {
                        // NOTE(review): "received" is in IGNORED_HEADERS by default, so this
                        // branch is only reachable if that (deprecated, public) array is
                        // modified externally. Retained for that reason.
                        headerTokens = tokenizer.tokenize (headerValue, true, recorder);
                    }
                    else
                    {
                        // Otherwise tokenize the value as usual
                        headerTokens = tokenizer.tokenize (headerValue, recorder);
                    }
                }

                if (headerTokens != null)
                {
                    // Prepend the header identifier to each token for the key headers
                    if (Arrays.binarySearch (INCLUDED_HEADERS, headerName) > -1)
                    {
                        for (int i = 0; i < headerTokens.length; i++)
                        {
                            headerTokens[i] = HEADER_TOKEN_DELIMITER + headers[h].getName () + HEADER_TOKEN_DELIMITER + headerTokens[i];
                        }
                    }
                    tokens = concat (tokens, headerTokens);
                }
            }

            // Finally, add any URLs found in the raw parts as tokens.
            // BUG FIX: the original tested the (clobbered) "text" parameter here instead
            // of realText, making the shortcut depend on the last header value processed.
            if (realHtml == null && realText == null)
            {
                return tokens;
            }

            List urlList = null;

            if (realHtml != null)
            {
                URLParser ext = new URLParser ();
                ext.parse (realHtml);
                urlList = ext.getUrls ();
            }

            if (realText != null)
            {
                URLParser ext = new URLParser ();
                ext.parse (realText);

                if (ext.getUrls () != null)
                {
                    if (urlList == null)
                    {
                        urlList = ext.getUrls ();
                    }
                    else
                    {
                        urlList.addAll (ext.getUrls ());
                    }
                }
            }

            if (urlList != null)
            {
                String[] urls = (String[]) urlList.toArray (new String[urlList.size ()]);
                tokens = concat (tokens, urls);
            }

            return tokens;
        }
        catch (IOException e)
        {
            throw new JasenException(e);
        }
        catch (MessagingException e)
        {
            throw new JasenException(e);
        }
    }

    /**
     * Concatenates two token arrays, tolerating null on either side.
     *
     * @param first The first array (may be null)
     * @param second The second array (may be null)
     * @return A new array holding first followed by second, or whichever argument
     *         is non-null, or null if both are null
     */
    private static String[] concat(String[] first, String[] second) {
        if (first == null)
        {
            return second;
        }
        if (second == null)
        {
            return first;
        }
        // Use the native array copy method for greater speed
        String[] all = new String[first.length + second.length];
        System.arraycopy (first, 0, all, 0, first.length);
        System.arraycopy (second, 0, all, first.length, second.length);
        return all;
    }

    /*
     * (non-Javadoc)
     * @see org.jasen.interfaces.MimeMessageTokenizer#tokenize(javax.mail.internet.MimeMessage, org.jasen.interfaces.JasenMessage, org.jasen.interfaces.ParserData)
     */
    public String[] tokenize(MimeMessage mail, JasenMessage message, ParserData data) throws JasenException {
        String realHtml = message.getHtmlPart();
        String realText = message.getTextPart();
        String html = data.getHtmlAsText();
        String text = data.getTextParsed();

        // Prefer the html part (rendered as text); fall back to the plain text part
        String tokenizee = null;

        if (html != null && html.trim().length () > 0)
        {
            tokenizee = html;
        }
        else if (text != null && text.trim().length () > 0)
        {
            tokenizee = text;
        }
        return tokenize(mail, realHtml, realText, text, html, tokenizee, data.getTokenErrorRecorder());
    }

    /**
     * We won't use the Collection.contains method because we search the
     * pre-sorted IGNORED_HEADERS array. All "x-*" extension headers are
     * also ignored.
     *
     * @param header The header name (expected already lower-cased by the caller)
     * @return True if the header should not be tokenized
     */
    private boolean ignoreHeader(String header) {
        return (Arrays.binarySearch (IGNORED_HEADERS, header) > -1 || header.startsWith("x"));
    }

    /**
     * Tests whether the header is in the INCLUDED_HEADERS set.
     *
     * @param header The header name (expected already lower-cased by the caller)
     * @return True if the header should be tokenized even when ignoreHeaders is set
     */
    private boolean includeHeader(String header) {
        return (Arrays.binarySearch (INCLUDED_HEADERS, header) > -1);
    }

    /**
     * Gets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * <P>
     * The tokenizer uses the LinguisticAnalyzer to determine if each token is a real word. After
     * linguisticLimit tokens have successively failed, tokenization is aborted.
     * </P>
     * @return Returns the linguisticLimit.
     */
    public int getLinguisticLimit() {
        return linguisticLimit;
    }

    /**
     * Sets the maximum number of linguistic errors tolerated before tokenization is aborted.
     * @param linguisticLimit The linguisticLimit to set.
     * @see EmailTokenizer#getLinguisticLimit()
     */
    public void setLinguisticLimit(int linguisticLimit) {
        this.linguisticLimit = linguisticLimit;
        // BUG FIX: propagate to the underlying tokenizer (previously only the field
        // was updated, so calling this setter after construction had no effect),
        // consistent with setTokenLimit()
        if (tokenizer != null)
        {
            tokenizer.linguisticLimit = linguisticLimit;
        }
    }

    /**
     * Tells us if we are ignoring the list of IGNORED_HEADERS when tokenizing
     * @return True if the tokenizer is ignoring headers in the IGNORED_HEADERS set
     * @see EmailTokenizer#IGNORED_HEADERS
     */
    public boolean isIgnoreHeaders() {
        return ignoreHeaders;
    }

    /**
     * Flags the tokenizer to ignore list of IGNORED_HEADERS when tokenizing
     * @param b True to tokenize only the INCLUDED_HEADERS
     */
    public void setIgnoreHeaders(boolean b) {
        ignoreHeaders = b;
    }

    /**
     * Gets the maximum number of tokens extracted before tokenization is aborted
     * @return The maximum number of tokens that will be returned
     */
    public int getTokenLimit() {
        return tokenLimit;
    }

    /**
     * Sets the maximum number of tokens extracted before tokenization is aborted
     * @param i The new token limit
     */
    public void setTokenLimit(int i) {
        tokenLimit = i;
        if (tokenizer != null)
        {
            tokenizer.setMaxTokens (i);
        }
    }

    /**
     * Internal test harness only. DO NOT USE
     * @param args Ignored
     */
    public static void main(String[] args) {
        try
        {
            File mailDir = new File ("D:\\Projects\\Synetek\\Service\\EveryMail\\core\\poll");
            File[] files = mailDir.listFiles ();

            // BUG FIX: listFiles() returns null for a missing/non-directory path
            if (files == null)
            {
                System.out.println ("No files found in " + mailDir);
                return;
            }

            File output = new File ("c:/output.txt");
            if (output.exists ())
            {
                output.delete ();
            }

            // The writer owns (and will close) the underlying stream
            PrintWriter writer = new PrintWriter (new FileOutputStream (output));

            try
            {
                for (int i = 0; i < files.length; i++)
                {
                    if (files[i].isFile ())
                    {
                        FileInputStream fin = null;

                        try
                        {
                            writer.println ("*************************************************");
                            writer.println ("File " + (i + 1) + ": " + files[i].getName ());
                            writer.println ("*************************************************");

                            fin = new FileInputStream (files[i]);
                            SMTPMessage mail = new SMTPMessage (null, fin);

                            StandardMimeMessageParser parser = new StandardMimeMessageParser ();
                            JasenMessage jm = parser.parse (mail);
                            writer.println ("HTML: " + jm.getHtmlPart ());

                            EmailTokenizer et = new EmailTokenizer ();
                            et.setIgnoreHeaders (true);

                            String[] tokens = et.tokenize (mail, jm, null);

                            if (tokens != null)
                            {
                                /*
                                 * Tokens are listed in the order they appear in the mail;
                                 * the linguistic threshold logic relies on "most" invalid
                                 * tokens occurring at the end of the message.
                                 */
                                for (int j = 0; j < tokens.length; j++)
                                {
                                    writer.println ("TOKEN: [" + tokens[j] + "]");
                                    LinguisticAnalyzer.getInstance ().getWordScore (tokens[j]);
                                }
                            }
                        }
                        catch (Exception e)
                        {
                            // Best-effort harness: report and continue with the next file
                            e.printStackTrace ();
                        }
                        finally
                        {
                            // BUG FIX: the original never closed the per-file input stream
                            if (fin != null)
                            {
                                try
                                {
                                    fin.close ();
                                }
                                catch (IOException ignored)
                                {
                                    // best effort on close
                                }
                            }
                        }
                    }
                    System.out.println ("Processed " + (i + 1) + "/" + files.length);
                }
            }
            finally
            {
                writer.flush ();
                writer.close ();
            }
        }
        catch (Exception ex)
        {
            ex.printStackTrace ();
        }
    }
}