package it.unimi.dsi.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2010 Paolo Boldi and Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.NullReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.Date;
import javax.mail.Address;
import javax.mail.Folder;
import javax.mail.Message;
import javax.mail.MessagingException;
import javax.mail.Session;
import javax.mail.Store;
import javax.mail.URLName;
import javax.mail.internet.AddressException;
import javax.mail.internet.MailDateFormat;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
/** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} corresponding to
 * a Javamail {@link javax.mail.Store}.
 *
 * <p>This class is very simple: for instance, it will not correctly understand
 * multipart MIME messages, which will be seen as having no content. You are invited
 * to extend it.
 *
 * <p>This implementation is an example of a document collection that does not use a
 * factory: more precisely, there is an internal class that acts as a wired factory. This
 * structure is made necessary by the fact that Javamail provides no means to parse messages
 * starting from an {@link java.io.InputStream}, which makes a separate implementation
 * of {@link it.unimi.dsi.mg4j.document.DocumentFactory#getDocument(InputStream,Reference2ObjectMap)}
 * impossible.
 *
 * <p>Note that to be able to use this class you must configure Javamail properly:
 * this involves setting up a <samp>javamail.properties</samp> file describing the
 * providers you want to use for the various access schemes. GNU Javamail, for instance, contains
 * providers for files, IMAP, POP, etc.
 */
public class JavamailDocumentCollection extends AbstractDocumentCollection implements Serializable {
	private final static Logger LOGGER = Util.getLogger( JavamailDocumentCollection.class );
	/** A special date (actually, 1 January 1970) representing no date. */
	public final static Date NO_DATE = new Date( 0 );
	private static final long serialVersionUID = 2L;
	/** Our only session. */
	private final static Session SESSION = Session.getDefaultInstance( new java.util.Properties() );
	/** The number of messages, as reported by the folder when this collection was created. */
	private final int numberOfMessages;
	/** The factory to be used by this collection. */
	private final JavamailDocumentFactory factory;
	/** The URL for the store. */
	private final String storeUrl;
	/** The folder name. */
	private final String folderName;
	/** The javamail store we are reading (not serialised; rebuilt by {@link #readResolve()}). */
	private final transient Store store;
	/** The javamail folder we are reading (not serialised; rebuilt by {@link #readResolve()}). */
	private final transient Folder folder;

	/** Builds a document collection corresponding to a given store URL and folder name.
	 *
	 * <p>The store is connected and the folder is opened read-only. Should anything fail
	 * after the store has been connected, the store (and the folder, if it was opened)
	 * is released before the exception is propagated.
	 *
	 * <p><strong>Beware.</strong> This class is not suited for large mbox files!
	 *
	 * @param storeUrl the javamail URL of the store.
	 * @param folderName the folder name.
	 * @param factory the factory that will be used to create documents.
	 * @throws MessagingException if the store cannot be connected, or the folder cannot be opened or counted.
	 */
	protected JavamailDocumentCollection( final String storeUrl, final String folderName, final JavamailDocumentFactory factory ) throws MessagingException {
		this.storeUrl = storeUrl;
		this.folderName = folderName;
		this.factory = factory;

		final Store connectedStore = SESSION.getStore( new URLName( storeUrl ) );
		connectedStore.connect();

		Folder openedFolder = null;
		final int messageCount;
		boolean ready = false;
		try {
			openedFolder = connectedStore.getDefaultFolder().getFolder( folderName );
			openedFolder.open( Folder.READ_ONLY );
			messageCount = openedFolder.getMessageCount();
			ready = true;
		}
		finally {
			// Do not leave a connected store behind if initialisation did not complete.
			if ( ! ready ) releaseQuietly( openedFolder, connectedStore );
		}

		this.store = connectedStore;
		this.folder = openedFolder;
		this.numberOfMessages = messageCount;
	}

	/** Builds a document collection using a factory with no additional metadata.
	 *
	 * @param storeUrl the javamail URL of the store.
	 * @param folderName the folder name.
	 * @throws MessagingException if the store or the folder cannot be accessed.
	 */
	public JavamailDocumentCollection( final String storeUrl, final String folderName ) throws MessagingException {
		this( storeUrl, folderName, new JavamailDocumentFactory() );
	}

	/** Builds a document collection using a factory configured by the given properties.
	 *
	 * @param storeUrl the javamail URL of the store.
	 * @param folderName the folder name.
	 * @param properties the properties passed to the internal factory.
	 * @throws MessagingException if the store or the folder cannot be accessed.
	 * @throws ConfigurationException if the factory rejects the properties.
	 */
	public JavamailDocumentCollection( final String storeUrl, final String folderName, final Properties properties ) throws MessagingException, ConfigurationException {
		this( storeUrl, folderName, new JavamailDocumentFactory( properties ) );
	}

	/** Builds a document collection using a factory configured by the given property specifications.
	 *
	 * @param storeUrl the javamail URL of the store.
	 * @param folderName the folder name.
	 * @param property the property specifications passed to the internal factory.
	 * @throws MessagingException if the store or the folder cannot be accessed.
	 * @throws ConfigurationException if the factory rejects the specifications.
	 */
	public JavamailDocumentCollection( final String storeUrl, final String folderName, final String[] property ) throws MessagingException, ConfigurationException {
		this( storeUrl, folderName, new JavamailDocumentFactory( property ) );
	}

	/** Builds a document collection using a factory with the given default metadata.
	 *
	 * @param storeUrl the javamail URL of the store.
	 * @param folderName the folder name.
	 * @param defaultMetadata the default metadata passed to the internal factory.
	 * @throws MessagingException if the store or the folder cannot be accessed.
	 */
	public JavamailDocumentCollection( final String storeUrl, final String folderName, final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws MessagingException {
		this( storeUrl, folderName, new JavamailDocumentFactory( defaultMetadata ) );
	}

	/** Wraps a messaging exception into an I/O exception, keeping the original as cause.
	 *
	 * <p>The message of the returned exception is the string form of the argument.
	 *
	 * @param e the exception to be wrapped.
	 * @return an {@link IOException} whose cause is <code>e</code>.
	 */
	private static IOException ioException( final MessagingException e ) {
		final IOException wrapper = new IOException( e.toString() );
		wrapper.initCause( e );
		return wrapper;
	}

	/** Closes a folder (if open) and a store, logging rather than propagating any failure.
	 *
	 * <p>This method is used only on the failure path of the constructor, where the
	 * exception that caused the failure must not be masked.
	 *
	 * @param folder a folder, or <code>null</code>.
	 * @param store a store.
	 */
	private static void releaseQuietly( final Folder folder, final Store store ) {
		try {
			if ( folder != null && folder.isOpen() ) folder.close( false );
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close folder after a failed initialisation", e );
		}
		try {
			store.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close store after a failed initialisation", e );
		}
	}

	/** Returns a new collection on the same store and folder, using a copy of the factory.
	 *
	 * @return a new collection; a {@link MessagingException} raised while building it
	 * is rethrown wrapped in a {@link RuntimeException}.
	 */
	public JavamailDocumentCollection copy() {
		try {
			return new JavamailDocumentCollection( storeUrl, folderName, factory.copy() );
		}
		catch ( MessagingException e ) {
			throw new RuntimeException( e );
		}
	}

	/** The wired factory of this collection: it describes the fields, but it cannot build
	 * documents from a stream. */
	private final static class JavamailDocumentFactory extends PropertyBasedDocumentFactory {
		private static final long serialVersionUID = 1L;
		/** The field names (each also corresponds to a header, except for the 0-th). */
		private static final String[] FIELD_NAME = { "body", "subject", "from", "to", "date", "cc", "bcc", "content-type" };
		/** The field types. */
		private static final FieldType[] FIELD_TYPE = { FieldType.TEXT, FieldType.TEXT, FieldType.TEXT, FieldType.TEXT, FieldType.DATE, FieldType.TEXT, FieldType.TEXT, FieldType.TEXT };
		/** The map from field names to field indices (-1 for unknown names). */
		private static final Object2IntOpenHashMap<String> FIELD2INDEX;

		static {
			FIELD2INDEX = new Object2IntOpenHashMap<String>( FIELD_NAME.length, .5f );
			FIELD2INDEX.defaultReturnValue( -1 );
			for( int i = 0; i < FIELD_NAME.length; i++ ) FIELD2INDEX.put( FIELD_NAME[ i ], i );
		}

		/** The word reader used for all documents. */
		private final WordReader wordReader = new FastBufferedReader();

		/** Handles the {@link MetadataKeys#ENCODING} key, resolving the value with
		 * {@link Charset#forName(String)} and storing the string form of the resulting charset;
		 * every other key is delegated to the superclass. */
		protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {
			if ( sameKey( MetadataKeys.ENCODING, key) ) {
				metadata.put( MetadataKeys.ENCODING, Charset.forName( ensureJustOne( key, values ) ).toString() );
				return true;
			}
			return super.parseProperty( key, values, metadata );
		}

		public JavamailDocumentFactory() {}

		public JavamailDocumentFactory( final Properties properties ) throws ConfigurationException {
			super( properties );
		}

		public JavamailDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) {
			super( defaultMetadata );
		}

		public JavamailDocumentFactory( final String[] property ) throws ConfigurationException {
			super( property );
		}

		/** Returns a new factory built on the same default metadata. */
		public JavamailDocumentFactory copy() {
			return new JavamailDocumentFactory( defaultMetadata );
		}

		public int numberOfFields() {
			return FIELD_NAME.length;
		}

		public String fieldName( final int field ) {
			ensureFieldIndex( field );
			return FIELD_NAME[ field ];
		}

		public FieldType fieldType( final int field ) {
			ensureFieldIndex( field );
			return FIELD_TYPE[ field ];
		}

		public int fieldIndex( final String fieldName ) {
			return FIELD2INDEX.getInt( fieldName );
		}

		/** Always throws an {@link UnsupportedOperationException}: documents are built
		 * directly by the enclosing collection. */
		public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {
			throw new UnsupportedOperationException();
		}
	}

	public DocumentFactory factory() {
		return factory;
	}

	public int size() {
		return numberOfMessages;
	}

	/** Closes this collection, the folder and the store.
	 *
	 * <p>The store is closed even if closing the folder fails.
	 *
	 * @throws IOException if Javamail reports an error; the {@link MessagingException} is the cause.
	 */
	public void close() throws IOException {
		super.close();
		try {
			try {
				folder.close( false );
			}
			finally {
				store.close();
			}
		}
		catch( MessagingException e ) {
			throw ioException( e );
		}
	}

	/** Replaces a deserialised instance (whose transient store and folder are missing)
	 * with a freshly connected collection on the same store URL, folder name and factory. */
	private Object readResolve() throws MessagingException, IOException {
		super.close(); // To avoid spurious warnings about unclosed collected objects.
		return new JavamailDocumentCollection( storeUrl, folderName, factory );
	}

	/** Returns the document wrapping the message of given index.
	 *
	 * @param index a document index (starting from 0).
	 * @return a document whose fields are the body and some headers of the message.
	 * @throws IOException if Javamail cannot provide the message; the {@link MessagingException} is the cause.
	 */
	public Document document( final int index ) throws IOException {
		ensureDocumentIndex( index );
		final Message message;
		try {
			// Javamail numbers messages from 1, whereas document indices start from 0.
			message = folder.getMessage( index + 1 );
		}
		catch ( MessagingException e ) {
			throw ioException( e );
		}

		return new AbstractDocument() {
			/** Returns the subject of the message, or the default title of the factory if there is no subject. */
			public CharSequence title() {
				final String subject;
				try {
					subject = message.getSubject();
				}
				catch ( MessagingException e ) {
					throw new RuntimeException( e );
				}
				if ( subject != null ) return subject;
				return (CharSequence)factory.resolve( MetadataKeys.TITLE, factory.defaultMetadata );
			}

			/** Returns the URL name of the folder followed by <samp>#</samp> and the (1-based) message number. */
			public CharSequence uri() {
				try {
					return folder.getURLName() + "#" + message.getMessageNumber();
				}
				catch ( MessagingException e ) {
					throw new RuntimeException( e );
				}
			}

			/** Returns a reader over the given addresses, separated by a comma and a space.
			 *
			 * @param address an array of addresses, or <code>null</code>.
			 * @return a reader over the joined addresses, or a {@link NullReader} if <code>address</code> is <code>null</code>.
			 */
			private Reader joinAddresses( final Address[] address ) {
				if ( address == null ) return NullReader.getInstance();
				final MutableString s = new MutableString();
				for( int i = 0; i < address.length; i++ ) {
					if ( i > 0 ) s.append( ", " );
					s.append( address[ i ] );
				}
				return new FastBufferedReader( s );
			}

			/** Returns the content of a field: a {@link Reader} for text fields, a {@link Date} for the date field.
			 *
			 * <p>Missing headers yield an empty reader (or {@link JavamailDocumentCollection#NO_DATE});
			 * address parsing errors are logged and yield an empty reader.
			 *
			 * @param field a field index.
			 * @throws IOException if Javamail reports any other error; the {@link MessagingException} is the cause.
			 */
			public Object content( final int field ) throws IOException {
				factory.ensureFieldIndex( field );
				try {
					switch ( field ) {
					case 0: { // body
						// TODO: analyze multipart messages
						Object content = null;
						try {
							content = message.getContent();
						}
						catch( Exception e ) {
							// Deliberately best-effort: an undecodable body must not stop indexing.
							LOGGER.warn( "Message " + message.getMessageNumber() + " cannot be decoded; content will be empty", e );
						}
						// Anything that is not a plain string (e.g., a multipart) is seen as empty.
						if ( content instanceof String ) return new StringReader( (String)content );
						return NullReader.getInstance();
					}
					case 1: { // subject
						final String subject = message.getSubject();
						return subject == null ? NullReader.getInstance() : new StringReader( subject );
					}
					case 2: // from
						return joinAddresses( message.getFrom() );
					case 3: // to
						return joinAddresses( message.getRecipients( Message.RecipientType.TO ) );
					case 4: { // date
						final String[] date = message.getHeader( "date" );
						if ( date == null || date.length == 0 ) return NO_DATE;
						final MailDateFormat mailDateFormat = new MailDateFormat();
						try {
							return mailDateFormat.parse( date[ 0 ] );
						}
						catch ( ParseException e ) {
							LOGGER.warn( "Error parsing date " + date[ 0 ] );
							return NO_DATE;
						}
					}
					case 5: // cc
						return joinAddresses( message.getRecipients( Message.RecipientType.CC ) );
					case 6: // bcc
						return joinAddresses( message.getRecipients( Message.RecipientType.BCC ) );
					case 7: { // content-type
						// Javamail may return null when the content type cannot be determined.
						final String contentType = message.getContentType();
						return contentType == null ? NullReader.getInstance() : new StringReader( contentType );
					}
					}
				}
				catch ( MessagingException e ) {
					// A simple error
					if ( e instanceof AddressException ) {
						LOGGER.warn( "Error while parsing address", e );
						return NullReader.getInstance();
					}
					throw ioException( e );
				}
				// Unreachable as long as the switch covers every index accepted by ensureFieldIndex().
				throw new IllegalStateException();
			}

			/** Returns the word reader of the factory, which is shared by all fields. */
			public WordReader wordReader( final int field ) {
				factory.ensureFieldIndex( field );
				return factory.wordReader;
			}
		};
	}

	/** Returns synthetic metadata (title and URI) for the document of given index.
	 *
	 * <p>Both values are built from the 0-based index, without accessing the message.
	 *
	 * @param index a document index.
	 * @return a map containing {@link MetadataKeys#TITLE} and {@link MetadataKeys#URI}.
	 */
	public Reference2ObjectMap<Enum<?>,Object> metadata( final int index ) {
		ensureDocumentIndex( index );
		final Reference2ObjectArrayMap<Enum<?>,Object> metadata = new Reference2ObjectArrayMap<Enum<?>,Object>( 2 );
		metadata.put( MetadataKeys.TITLE, "Message #" + index );
		// NOTE(review): this concatenates the string form of the Folder object, not folderName — confirm this is intended.
		metadata.put( MetadataKeys.URI, storeUrl + folder + "#" + index );
		return metadata;
	}

	/** Returns the input stream Javamail provides for the message of given index.
	 *
	 * @param index a document index (starting from 0).
	 * @throws IOException if Javamail reports an error; the {@link MessagingException} is the cause.
	 */
	public InputStream stream( final int index ) throws IOException {
		ensureDocumentIndex( index );
		try {
			// Javamail numbers messages from 1, whereas document indices start from 0.
			return folder.getMessage( index + 1 ).getInputStream();
		}
		catch ( MessagingException e ) {
			throw ioException( e );
		}
	}

	/** Builds a collection from the command-line arguments (optional properties, output
	 * filename, store URL, folder) and serialises it to the given filename. */
	public static void main( final String[] arg ) throws IOException, JSAPException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, InstantiationException, MessagingException, ConfigurationException {
		SimpleJSAP jsap = new SimpleJSAP( JavamailDocumentCollection.class.getName(), "Saves a serialised mbox collection based on a given mbox file.",
			new Parameter[] {
				new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),
				new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the serialised collection." ),
				new UnflaggedOption( "storeUrl", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The javamail store." ),
				new UnflaggedOption( "folder", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The folder to be read." )
			}
		);

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		BinIO.storeObject( new JavamailDocumentCollection( jsapResult.getString( "storeUrl" ), jsapResult.getString( "folder" ), jsapResult.getStringArray( "property" ) ), jsapResult.getString( "collection" ) );
	}
}