// Source Code of net.sf.regain.crawler.preparator.MessagePreparator

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004-2010  Til Schneider, Thomas Tesche
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Til Schneider, info@murfman.de, Thomas Tesche, regain@thtesche.com
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2008-10-25 18:35:21 +0200 (Sat, 25 Oct 2008) $
 *   $Author: thtesche $
 * $Revision: 349 $
 */
package net.sf.regain.crawler.preparator;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import javax.mail.Address;
import javax.mail.BodyPart;
import javax.mail.MessagingException;
import javax.mail.Multipart;
import javax.mail.Part;
import javax.mail.Session;
import javax.mail.internet.AddressException;
import javax.mail.internet.MimeMessage;
import javax.mail.util.SharedByteArrayInputStream;


import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;
import net.sf.regain.crawler.preparator.util.StripEntities;


import org.apache.log4j.Logger;


/**
 * This class prepares messages (MIME, RFC 822), specifically spoof email messages.
 * <p>
 * The document contains the message text and the file names of the attachments.
 *
 * @author Thomas Tesche, www.thtesche.com, Kevin Black (KJB)
 * @see MessagePreparator
 */
public class MessagePreparator extends AbstractPreparator {


  /** The logger for this class */
  private static Logger mLog = Logger.getLogger(MessagePreparator.class);
  /** Regex Compilation to match URLs in body. */
  private static Pattern mURLPattern = Pattern.compile("(?im)((?:http|https|ftp|mailto):[^\\s\"'<>]*)");


  /**
   * Creates a new instance of MessagePreparator.
   *
   * @throws RegainException If creating of the preparator failed.
   */
  public MessagePreparator() throws RegainException {
    super("message/rfc822");
  }


  /**
   * Prepares the document for indexing.
   *
   * @param rawDocument The document to prepare.
   * @throws RegainException If the preparation fails.
   */
  public void prepare(RawDocument rawDocument) throws RegainException {


    Properties mailProperties = System.getProperties();
    Session session = Session.getInstance(mailProperties, null);
    SharedByteArrayInputStream mimeInput = new SharedByteArrayInputStream(rawDocument.getContent());


    Collection<String> textParts = new ArrayList<String>();
    Collection<String> attachments = new ArrayList<String>();
    SimpleDateFormat simpleFormat = new SimpleDateFormat("dd.MM.yyyy HH:mm:ss");


    StringBuffer resultText = new StringBuffer();
    StringBuffer keyText = new StringBuffer();


    try {
      MimeMessage message = new MimeMessage(session, mimeInput);


      resultText.append("Subject: " + message.getSubject()).append("\n");
      if (message.getSentDate() != null) {
        resultText.append("Sent: " + simpleFormat.format(message.getSentDate())).append("\n");
      }


      if (message.getReceivedDate() != null) {
        resultText.append("Received: " + simpleFormat.format(message.getReceivedDate())).append("\n");
      }
      Address[] recipientsArray = null;
      try {
        recipientsArray = message.getAllRecipients();
      } catch (AddressException ae) {
        // KJB: an issue with getAllRecipients if To: contains a semi-colon rather than comma.
        recipientsArray = fixAddress(ae, message, "To");
      }
      if (recipientsArray != null && recipientsArray.length > 0) {
        String recipients = "";
        for (int i = 0; i < recipientsArray.length; i++) {
          recipients += recipientsArray[i].toString() + ", ";
          keyText.append(stripNoneWordChars(recipientsArray[i].toString()) + " ");
        }
        recipients = recipients.substring(0, recipients.length() - 2);
        resultText.append("Recipient(s): " + recipients).append("\n");
      }


      Address[] repliesArray = null;
      try {
        repliesArray = message.getReplyTo();
      } catch (AddressException ae) {
        repliesArray = fixAddress(ae, message, "Reply-to");
      }
      if (repliesArray != null && repliesArray.length > 0) {
        String replies = "";
        for (int i = 0; i < repliesArray.length; i++) {
          replies += repliesArray[i].toString() + ", ";
          keyText.append(stripNoneWordChars(repliesArray[i].toString()) + " ");
        }
        replies = replies.substring(0, replies.length() - 2);
        resultText.append("Reply to: " + replies).append("\n");
      }


      Address[] senderArray = null;
      try {
        senderArray = message.getFrom();
      } catch (AddressException ae) {
        senderArray = fixAddress(ae, message, "From");
      }
      if (senderArray != null && senderArray.length > 0) {
        String sender = "";
        for (int i = 0; i < senderArray.length; i++) {
          sender += senderArray[i].toString() + ", ";
          keyText.append(stripNoneWordChars(senderArray[i].toString()) + " ");
        }
        sender = sender.substring(0, sender.length() - 2);
        setTitle(message.getSubject() + " from " + sender);
        resultText.append("Sender: " + sender).append("\n");
      }


      resultText.append("Header key words: " + keyText.toString()).append("\n");
      resultText.append("-------------------------------------------------------------").append("\n");


      // multipart or not multipart
      if (message.getContent() instanceof Multipart) {
        //contentType = "multipart";
        Multipart mp = (Multipart) message.getContent();


        for (int i = 0; i < mp.getCount(); i++) {
          BodyPart bp = mp.getBodyPart(i);
          String disposition = bp.getDisposition();


          if ((disposition != null) && ((disposition.equals(Part.ATTACHMENT)))) {
            attachments.add("attachment: " + bp.getFileName());
            if (bp.isMimeType("text/*")) {
              textParts.add((String) bp.getContent());
              mLog.debug("added txt from attachment: " + bp.getFileName() + " : " + bp.getContentType());
            }


          } else if (disposition == null || disposition.equals(Part.INLINE)) {


            // Plain Text oder HTML
            if (bp.isMimeType("text/*")) {
              textParts.add((String) bp.getContent());


            } else if (bp.isMimeType("multipart/*")) {
              // another bodypart container
              // we process only depth = 1 and not deeper and only text.
              Multipart mpInner = (Multipart) bp.getContent();


              for (int k = 0; k < mpInner.getCount(); k++) {
                BodyPart bpInner = mpInner.getBodyPart(k);


                if (bpInner != null && (bpInner.getDisposition() == null ||
                        bpInner.getDisposition().equals(Part.INLINE))) {


                  if (bpInner.isMimeType("text/*")) {
                    textParts.add((String) bpInner.getContent());
                  }


                } // end of bodypart which are not attachments
              } // end of iterate over all inner bodyparts
            } // MultipartContainer in a MultipartContainer
            else if (bp.isMimeType("message/*")) {
              Object bpContent = bp.getContent();
              if (bpContent instanceof MimeMessage) {


                // Added by KJB.
                // Message container.
                MimeMessage mmInner = (MimeMessage) bp.getContent();
                // tack on the headers.
                for (Enumeration<String> e = mmInner.getAllHeaderLines(); e.hasMoreElements();) {
                  textParts.add(e.nextElement());
                }


                Object mmContent = mmInner.getContent();
                if (mmContent instanceof String) {
                  textParts.add((String) mmContent);
                } else if (mmContent instanceof InputStream) {
                  textParts.add(RegainToolkit.readStringFromStream((InputStream) mmContent));
                } else {
                  mLog.error("Message Content is of unknown Class: " + mmContent.getClass().getName());
                }
              } else if (bpContent instanceof InputStream) {
                textParts.add(RegainToolkit.readStringFromStream((InputStream) bpContent));
              } else {
                mLog.error("Body Part Inner Message Content is of unknown Class: " + bpContent.getClass().getName());
              }


            } // Message container


          } // consider all none attachment parts
          // outmost container


        } // iterate over all bodyparts (a bodypart could be multipart container for itself)


      } else {
        // This is a plain text mail.
        Object content = message.getContent();
        if (content instanceof String) {
          textParts.add((String) content);
        } else if (content instanceof InputStream) {
          textParts.add(RegainToolkit.readStringFromStream((InputStream) content));
        } else {
          mLog.error("Message content is of unknown class: " + content.getClass().getName());
        }
      }


    } catch (MessagingException ex) {
      mLog.error("Could not instantiate mime message for parsing.", ex);


    } catch (IOException ex) {
      mLog.error("Could not parse mime message for parsing.", ex);
    }


    if (textParts.size() > 0) {
      // Iterate over all text parts of the mail and aggregate the content
      Iterator<String> iter = textParts.iterator();
      while (iter.hasNext()) {
        String current = (String) iter.next();
        resultText.append(StripEntities.stripHTMLTags(((String) current)) + " ").append("\n");
        Collection<String> urls = this.extractURLs(current);
        if (urls != null && !urls.isEmpty()) {
          Iterator<String> uIter = urls.iterator();
          while (uIter.hasNext()) {
            resultText.append(uIter.next()).append("\n");
          }
        }
      }
    }


    if (attachments.size() > 0) {
      Iterator<String> iter = attachments.iterator();
      // Note, attachments are simply the name of the attachment file.
      while (iter.hasNext()) {
        resultText.append(StripEntities.stripHTMLTags(((String) iter.next()))).append("\n");
      }
    }


    setCleanedContent(resultText.toString());


  }


  /** Occasionally see Addresses that have semi-colons rather than commas, which
   * cause "Illegal semicolon, not in group" AddressException.
   * This helper function attempts to change semi-colons to commas and return the
   * Address Array.
   *
   * @param ae          Address Exception object
   * @param message     MIME Message object
   * @param headerName  Name of header, e.g. To, From, Reply-To
   * @return if available, array of Addresses; otherwise, null
   */
  private Address[] fixAddress(AddressException ae, MimeMessage message, String headerName) {
    Address[] addresses = null;
    // "Illegal semicolon, not in group in string".?
    try {
      String[] toArray = message.getHeader(headerName);
      CharSequence cs = new String("Illegal semicolon, not in group");
      if (toArray != null && ae.getMessage().contains(cs)) {
        // replace semi-colon with comma, and try to build a From Address[]
        toArray[0] = toArray[0].replace(';', ',');
        message.setHeader(headerName, toArray[0]);
        try {
          addresses = message.getAllRecipients();
        } catch (Exception e) {
          addresses = null;
        }
      }
    } catch (MessagingException me) {
      addresses = null;
      mLog.error("Could not parse Header: " + headerName);
    }
    return addresses;
  }


  /** Extract URLs from text source.
   * Tried org.htmlparser functions here (eg. http://notetodogself.blogspot.com/2007/11/extract-links-using-htmlparser.html)
   * but that technique missed quite a few URLs.
   *  Using the RegEx technique.
   *
   * @param text input string of text or HTML
   * @return Collection of strings matching https?|ftp|mailto
   */
  private Collection<String> extractURLs(String text) {
    if (text == null || text.isEmpty()) {
      return null;
    }


    Collection<String> result = new HashSet<String>();


    Matcher matcher = mURLPattern.matcher(text);
    while (matcher.find()) {
      String url = matcher.group(1);
      result.add(url);
    }


    return result;
  }


  /**
   * Removes unwanted chars from a given string.
   * @param uncleanString
   * @return
   */
  private String stripNoneWordChars(String uncleanString) {
    return uncleanString.replace(".", " ").replace(":", " ").replace("@", " ").replace("-", " ").replace("_", " ").replace("<", " ").replace(">", " ");


  }


  /**
   * Get the content of an InputStream as String.
   *
   * @param stream the InputStream
   * @return the convertet content as String
   * @throws IOException
   */
  public static String inputStreamAsString(InputStream stream)
          throws IOException {
    BufferedReader br = new BufferedReader(new InputStreamReader(stream));
    StringBuilder sb = new StringBuilder();
    String line = null;


    while ((line = br.readLine()) != null) {
      sb.append(line + "\n");
    }


    br.close();
    return sb.toString();
  }
}
// Source Code of net.sf.regain.crawler.preparator.MessagePreparator

// Related Classes of net.sf.regain.crawler.preparator.MessagePreparator