/*
* @(#)JasenTrainer.java 3/11/2004
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.engine;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import javax.mail.MessagingException;
import javax.mail.internet.MimeMessage;
import org.jasen.core.token.EmailTokenizer;
import org.jasen.error.EmptyErrorHandler;
import org.jasen.error.ErrorHandlerBroker;
import org.jasen.error.JasenException;
import org.jasen.interfaces.HTMLParser;
import org.jasen.interfaces.JasenMapStore;
import org.jasen.interfaces.JasenMessage;
import org.jasen.interfaces.MimeMessageParser;
import org.jasen.interfaces.MimeMessageTokenizer;
import org.jasen.interfaces.ParserData;
/**
 * <p>
 * Trains the Jasen Engine and builds the JasenMap from a ham corpus and a spam corpus.
 * </p>
 * <p>
 * Training (and/or re-training) should be done regularly as new types of spam messages begin to appear.
 * </p>
 * <p>
 * Typical usage: set the corpus and store paths, call {@link #init()}, then call {@link #train()}.
 * </p>
 * @author Jason Polites
 */
public class JasenTrainer
{
    /** Name of the classpath resource read by {@link #init()}. */
    private static final String CONFIG_RESOURCE = "JasenTrainer.properties";

    private String spamCorpusPath;
    private String hamCorpusPath;
    private String storePath;
    private MimeMessageParser mimeParser;
    private MimeMessageTokenizer tokenizer;
    private Class htmlParserClass;
    private JasenMapStore store;
    private JasenMap map;
    private boolean load = false;

    /** Number of files that failed to parse/learn during the current training run. */
    private int errors = 0;

    /**
     * Creates an unconfigured trainer. Set the corpus/store paths and call
     * {@link #init()} before calling {@link #train()}.
     */
    public JasenTrainer() {
        super();
    }

    /**
     * <p>
     * Simple file filter which just ensures the File objects listed are files and not folders
     * </p>
     */
    class TrainerFileFilter implements FileFilter {
        public boolean accept(File pathname) {
            return pathname.isFile();
        }
    }

    /**
     * Initialises the trainer from the <code>JasenTrainer.properties</code> classpath resource.
     * <p>
     * Instantiates the tokenizer, MIME parser and map store named in the properties,
     * records the HTML parser class, applies the token limits, installs an
     * {@link EmptyErrorHandler} on the {@link ErrorHandlerBroker}, and finally either loads
     * an existing map from the store path (when the load option is set) or creates a new one.
     * </p>
     * @throws JasenException if the resource is missing, a required property is absent or
     * invalid, a configured class cannot be instantiated, or the map cannot be loaded
     */
    public void init() throws JasenException {
        InputStream in = null;
        try {
            in = getClass().getClassLoader().getResourceAsStream(CONFIG_RESOURCE);
            if (in == null) {
                // Fail with a meaningful cause instead of an NPE from Properties.load(null)
                throw new IOException(CONFIG_RESOURCE + " was not found on the classpath");
            }

            Properties props = new Properties();
            props.load(in);

            tokenizer = (MimeMessageTokenizer) Class.forName(requireProperty(props, "tokenizer")).newInstance();
            mimeParser = (MimeMessageParser) Class.forName(requireProperty(props, "mimeParser")).newInstance();
            htmlParserClass = Class.forName(requireProperty(props, "htmlParser"));
            store = (JasenMapStore) Class.forName(requireProperty(props, "store")).newInstance();

            int maxTokens = Integer.parseInt(requireProperty(props, "max-tokens"));
            int linguisticFailures = Integer.parseInt(requireProperty(props, "linguistic-failures"));

            tokenizer.setTokenLimit(maxTokens);

            // Set an empty error handler
            ErrorHandlerBroker.getInstance().setErrorHandler(new EmptyErrorHandler());

            if (tokenizer instanceof EmailTokenizer) {
                ((EmailTokenizer) tokenizer).setLinguisticLimit(linguisticFailures);
            }

            if (load) {
                map = store.load(storePath);
            }
            else {
                map = new JasenMap();
            }
        }
        catch (Exception e) {
            throw new JasenException(e);
        }
        finally {
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignore) {
                    // Nothing useful can be done if closing the config stream fails
                }
            }
        }
    }

    /**
     * Trains the engine to produce the JasenMap.
     * <p>
     * Every regular file directly inside the spam corpus folder and the ham corpus folder is
     * parsed and its distinct tokens are added to the map. The number of successfully learned
     * messages of each type is recorded on the map, which is then saved via the store.
     * Progress is written to <code>System.out</code>.
     * </p>
     * @throws JasenException if the trainer has not been initialised, a corpus path is not a
     * readable directory, or the map cannot be saved
     * @see JasenMap
     */
    public void train() throws JasenException {
        errors = 0;
        try {
            if (map == null || store == null || mimeParser == null || htmlParserClass == null) {
                // Without these every single file would fail individually; fail fast instead
                throw new IllegalStateException("JasenTrainer.init() must be called before train()");
            }

            System.out.println("Jasen engine training commenced at " + new Date());
            System.out.println("---------------------------------------------------------------");
            System.out.println("Spam corpus: " + spamCorpusPath);
            System.out.println("Ham corpus: " + hamCorpusPath);
            System.out.println();

            // Resolve both corpora up front so a bad path is reported before any work is done
            File[] spamFiles = listCorpusFiles(spamCorpusPath, "Spam");
            File[] hamFiles = listCorpusFiles(hamCorpusPath, "Ham");

            // NOTE(review): the observation counts are overwritten with this run's totals even
            // when an existing map was loaded -- confirm this is intended for append-mode training.

            // Train spam
            int observationsS = train(spamFiles, JasenMap.SPAM);
            map.setSpamObservations(observationsS);

            // Train ham
            int observationsH = train(hamFiles, JasenMap.HAM);
            map.setHamObservations(observationsH);

            System.out.println("Saving map...");
            store.save(map, storePath);

            System.out.println("Training complete with " + errors + " errors");
            System.out.println("Total mails scanned: " + (observationsS + observationsH));
        }
        catch (Exception e) {
            throw new JasenException(e);
        }
    }

    /**
     * Returns the value of a mandatory configuration property, trimmed of surrounding whitespace.
     * @param props the loaded trainer properties
     * @param key the property name
     * @return the non-empty property value
     * @throws IllegalArgumentException if the property is missing or blank
     */
    private static String requireProperty(Properties props, String key) {
        String value = props.getProperty(key);
        if (value == null || value.trim().length() == 0) {
            throw new IllegalArgumentException(
                    "Missing required property '" + key + "' in " + CONFIG_RESOURCE);
        }
        return value.trim();
    }

    /**
     * Lists the regular files directly inside a corpus folder.
     * @param path the corpus folder path
     * @param label human readable corpus name used in error messages
     * @return the files in the folder (possibly empty, never null)
     * @throws IOException if the path is not set or cannot be listed as a directory
     */
    private File[] listCorpusFiles(String path, String label) throws IOException {
        if (path == null) {
            throw new IOException(label + " corpus path has not been set");
        }
        // File.listFiles returns null (not an empty array) when the path is not a directory
        // or an I/O error occurs
        File[] files = new File(path).listFiles(new TrainerFileFilter());
        if (files == null) {
            throw new IOException(label + " corpus path is not a readable directory: " + path);
        }
        return files;
    }

    /**
     * Parses each file as a MIME message and learns its tokens as the given type.
     * Failures on individual files are counted in {@link #errors}, passed to the
     * registered error handler, and do not stop the run.
     * @param files the corpus files to scan
     * @param type the map token type, e.g. JasenMap.SPAM or JasenMap.HAM
     * @return the number of messages that were successfully learned
     */
    private int train(File[] files, int type) {
        int learned = 0;
        // Next progress step (in tenths) still to be reported; 100% is printed after the loop
        int nextTenth = 1;

        System.out.println("Scanning " + files.length + " files");

        for (int i = 0; i < files.length; i++) {
            try {
                // A fresh HTML parser is created for every message
                HTMLParser htmlParser = (HTMLParser) htmlParserClass.newInstance();
                MimeMessage mm = getMimeMessage(files[i]);
                JasenMessage message = mimeParser.parse(mm);
                ParserData data = htmlParser.parse(mm, message, tokenizer);

                if (learn(data, type)) {
                    learned++;
                }
            }
            catch (Exception e) {
                errors++;
                ErrorHandlerBroker.getInstance().getErrorHandler().handleException(e);
            }

            // Report every 10% boundary crossed by the files processed so far. Done outside
            // the try so a failing file still advances the progress display; long arithmetic
            // avoids overflow on very large corpora.
            while (nextTenth < 10 && (long) (i + 1) * 10 >= (long) files.length * nextTenth) {
                System.out.print((nextTenth * 10) + "% ");
                nextTenth++;
            }
        }

        // Terminate the progress line so following output starts on a new line
        System.out.println("100%");

        return learned;
    }

    /**
     * Adds the distinct tokens of one parsed message to the map.
     * @param data the parser output for the message
     * @param type the map token type, e.g. JasenMap.SPAM or JasenMap.HAM
     * @return true if the message had a token array (even an empty one), false if it was null
     */
    private boolean learn(ParserData data, int type) {
        String[] tokens = data.getMessageTokens();

        if (tokens == null) {
            return false;
        }

        // We need to keep a log of the tokens we add so we don't add them twice
        // This is an arguable point, however technically the probability
        // calculations are only valid if we record the number of emails
        // containing the word, not the number of words found in total
        Set seen = new HashSet();

        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i].trim();
            // Set.add returns false when the token was already recorded for this message
            if (seen.add(token)) {
                map.addToken(token, type);
            }
        }

        return true;
    }

    /**
     * Reads a file from disk as a MimeMessage.
     * The message is constructed from the stream before the stream is closed.
     * @param file the message file
     * @return the parsed message
     * @throws IOException if the file cannot be opened
     * @throws MessagingException if the message cannot be constructed from the stream
     */
    private MimeMessage getMimeMessage(File file) throws IOException, MessagingException {
        FileInputStream fin = null;
        try {
            fin = new FileInputStream(file);
            return new MimeMessage(null, fin);
        }
        finally {
            if (fin != null) {
                try {
                    fin.close();
                }
                catch (IOException ignore) {
                    // A failure to close must not mask the result or a parse error
                }
            }
        }
    }

    /**
     * Gets the local path to the folder containing the HAM corpus.
     * @return Either an absolute or classpath-relative path to the folder as a String.
     */
    public String getHamCorpusPath() {
        return hamCorpusPath;
    }

    /**
     * Sets the local path to the folder containing the HAM corpus.
     * @param hamCorpusPath Either an absolute or classpath-relative path to the folder as a String.
     */
    public void setHamCorpusPath(String hamCorpusPath) {
        this.hamCorpusPath = hamCorpusPath;
    }

    /**
     * Gets the JasenMap object produced as a result of a training run
     * @return Returns the map.
     * @see JasenMap
     */
    public JasenMap getMap() {
        return map;
    }

    /**
     * Sets the map object to be used in training.
     * @param map The map to set.
     */
    protected void setMap(JasenMap map) {
        this.map = map;
    }

    /**
     * Gets the MIME parser to be used during training.
     * @return Returns the mimeParser.
     */
    public MimeMessageParser getMimeParser() {
        return mimeParser;
    }

    /**
     * Sets the MIME parser to be used during training.
     * @param mimeParser The mimeParser to set.
     */
    public void setMimeParser(MimeMessageParser mimeParser) {
        this.mimeParser = mimeParser;
    }

    /**
     * Gets the local path to the folder containing the SPAM corpus.
     * @return Either an absolute or classpath-relative path to the folder as a String.
     */
    public String getSpamCorpusPath() {
        return spamCorpusPath;
    }

    /**
     * Sets the local path to the folder containing the SPAM corpus.
     * @param spamCorpusPath Either an absolute or classpath-relative path to the folder as a String.
     */
    public void setSpamCorpusPath(String spamCorpusPath) {
        this.spamCorpusPath = spamCorpusPath;
    }

    /**
     * Gets the store into which the map produced by the training run will be stored.
     * @return Returns the store.
     */
    public JasenMapStore getStore() {
        return store;
    }

    /**
     * Sets the store into which the map produced by the training run will be stored.
     * @param store The store to set.
     */
    public void setStore(JasenMapStore store) {
        this.store = store;
    }

    /**
     * Gets the tokenizer that will be used during training.
     * @return Returns the tokenizer.
     */
    public MimeMessageTokenizer getTokenizer() {
        return tokenizer;
    }

    /**
     * Sets the tokenizer that will be used during training.
     * @param tokenizer The tokenizer to set.
     */
    public void setTokenizer(MimeMessageTokenizer tokenizer) {
        this.tokenizer = tokenizer;
    }

    /**
     * Returns the value of the load option
     * @return True if the trainer was instructed to load a previously created map. False otherwise
     */
    public boolean isLoad() {
        return load;
    }

    /**
     * Sets the load value for the trainer.
     * @param load If true, the trainer will append data to an existing map. Otherwise a new map will be created
     */
    public void setLoad(boolean load) {
        this.load = load;
    }

    /**
     * Gets the path to the local file system into which the final JasenMapStore will be saved.
     * <p>
     * This is only relevant to the DiskMapStore class, however the trainer assumes this anyway.
     * </p>
     * @return Either an absolute or classpath-relative path to the file as a String.
     */
    public String getStorePath() {
        return storePath;
    }

    /**
     * Sets the path to the local file system into which the final JasenMapStore will be saved.
     * <p>
     * This is only relevant to the DiskMapStore class, however the trainer assumes this anyway.
     * </p>
     * @param storePath Either an absolute or classpath-relative path to the file as a String.
     */
    public void setStorePath(String storePath) {
        this.storePath = storePath;
    }
}