// Package org.apache.solr.handler.dataimport
//
// Source code of org.apache.solr.handler.dataimport.MailEntityProcessor

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.dataimport;

import com.sun.mail.imap.IMAPMessage;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.utils.ParseUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.mail.*;
import javax.mail.internet.AddressException;
import javax.mail.internet.ContentType;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeMessage;
import javax.mail.search.AndTerm;
import javax.mail.search.ComparisonTerm;
import javax.mail.search.ReceivedDateTerm;
import javax.mail.search.SearchTerm;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
* An {@link EntityProcessor} instance which can index emails along with their attachments from POP3 or IMAP sources. Refer to
* <a href="http://wiki.apache.org/solr/DataImportHandler">http://wiki.apache.org/solr/DataImportHandler</a> for more
* details. <b>This API is experimental and subject to change</b>
*
* @version $Id: MailEntityProcessor.java 1065312 2011-01-30 16:08:25Z rmuir $
* @since solr 1.4
*/
public class MailEntityProcessor extends EntityProcessorBase {

  public static interface CustomFilter {
    public SearchTerm getCustomSearch(Folder folder);
  }

  @Override
  public void init(Context context) {
    super.init(context);
    // set attributes using  XXX getXXXFromContext(attribute, defualtValue);
    // applies variable resolver and return default if value is not found or null
    // REQUIRED : connection and folder info
    user = getStringFromContext("user", null);
    password = getStringFromContext("password", null);
    host = getStringFromContext("host", null);
    protocol = getStringFromContext("protocol", null);
    folderNames = getStringFromContext("folders", null);
    // validate
    if (host == null || protocol == null || user == null || password == null
            || folderNames == null)
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
              "'user|password|protocol|host|folders' are required attributes");

    //OPTIONAL : have defaults and are optional
    recurse = getBoolFromContext("recurse", true);
    String excludes = getStringFromContext("exclude", "");
    if (excludes != null && !excludes.trim().equals("")) {
      exclude = Arrays.asList(excludes.split(","));
    }
    String includes = getStringFromContext("include", "");
    if (includes != null && !includes.trim().equals("")) {
      include = Arrays.asList(includes.split(","));
    }
    batchSize = getIntFromContext("batchSize", 20);
    customFilter = getStringFromContext("customFilter", "");
    String s = getStringFromContext("fetchMailsSince", "");
    if (s != null)
      try {
        fetchMailsSince = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(s);
      } catch (ParseException e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Invalid value for fetchMailSince: " + s, e);
      }

    fetchSize = getIntFromContext("fetchSize", 32 * 1024);
    cTimeout = getIntFromContext("connectTimeout", 30 * 1000);
    rTimeout = getIntFromContext("readTimeout", 60 * 1000);
    processAttachment = getBoolFromContext("processAttachement", true);

    logConfig();
  }

  @Override
  public Map<String, Object> nextRow() {
    Message mail;
    Map<String, Object> row = null;
    do {
      // try till there is a valid document or folders get exhausted.
      // when mail == NULL, it means end of processing
      mail = getNextMail();
      if (mail != null)
        row = getDocumentFromMail(mail);
    } while (row == null && mail != null);   
    return row;
  }

  private Message getNextMail() {
    if (!connected) {
      if (!connectToMailBox())
        return null;
      connected = true;
    }
    if (folderIter == null) {
      createFilters();
      folderIter = new FolderIterator(mailbox);
    }
    // get next message from the folder
    // if folder is exhausted get next folder
    // loop till a valid mail or all folders exhausted.
    while (msgIter == null || !msgIter.hasNext()) {
      Folder next = folderIter.hasNext() ? folderIter.next() : null;
      if (next == null) {
        return null;
      }
      msgIter = new MessageIterator(next, batchSize);
    }
    return msgIter.next();
  }

  private Map<String, Object> getDocumentFromMail(Message mail) {
    Map<String, Object> row = new HashMap<String, Object>();
    try {
      addPartToDocument(mail, row, true);
      return row;
    } catch (Exception e) {
      return null;
    }
  }

  public void addPartToDocument(Part part, Map<String, Object> row, boolean outerMost) throws Exception {
    if (part instanceof Message) {
      addEnvelopToDocument(part, row);
    }

    String ct = part.getContentType();
    ContentType ctype = new ContentType(ct);
    if (part.isMimeType("multipart/*")) {
      Multipart mp = (Multipart) part.getContent();
      int count = mp.getCount();
      if (part.isMimeType("multipart/alternative"))
        count = 1;
      for (int i = 0; i < count; i++)
        addPartToDocument(mp.getBodyPart(i), row, false);
    } else if (part.isMimeType("message/rfc822")) {
      addPartToDocument((Part) part.getContent(), row, false);
    } else {
      String disp = part.getDisposition();
      if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)))        return;
      InputStream is = part.getInputStream();
      String fileName = part.getFileName();
      String content = ParseUtils.getStringContent(is, TikaConfig.getDefaultConfig(), ctype.getBaseType().toLowerCase(Locale.ENGLISH));
      if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) {
        if (row.get(ATTACHMENT) == null)
          row.put(ATTACHMENT, new ArrayList<String>());
        List<String> contents = (List<String>) row.get(ATTACHMENT);
        contents.add(content);
        row.put(ATTACHMENT, contents);
        if (row.get(ATTACHMENT_NAMES) == null)
          row.put(ATTACHMENT_NAMES, new ArrayList<String>());
        List<String> names = (List<String>) row.get(ATTACHMENT_NAMES);
        names.add(fileName);
        row.put(ATTACHMENT_NAMES, names);
      } else {
        if (row.get(CONTENT) == null)
          row.put(CONTENT, new ArrayList<String>());
        List<String> contents = (List<String>) row.get(CONTENT);
        contents.add(content);
        row.put(CONTENT, contents);
      }
    }
  }

  private void addEnvelopToDocument(Part part, Map<String, Object> row) throws MessagingException {
    MimeMessage mail = (MimeMessage) part;
    Address[] adresses;
    if ((adresses = mail.getFrom()) != null && adresses.length > 0)
      row.put(FROM, adresses[0].toString());

    List<String> to = new ArrayList<String>();
    if ((adresses = mail.getRecipients(Message.RecipientType.TO)) != null)
      addAddressToList(adresses, to);
    if ((adresses = mail.getRecipients(Message.RecipientType.CC)) != null)
      addAddressToList(adresses, to);
    if ((adresses = mail.getRecipients(Message.RecipientType.BCC)) != null)
      addAddressToList(adresses, to);
    if (to.size() > 0)
      row.put(TO_CC_BCC, to);

    row.put(MESSAGE_ID, mail.getMessageID());
    row.put(SUBJECT, mail.getSubject());

    Date d = mail.getSentDate();
    if (d != null) {
      row.put(SENT_DATE, d);
    }

    List<String> flags = new ArrayList<String>();
    for (Flags.Flag flag : mail.getFlags().getSystemFlags()) {
      if (flag == Flags.Flag.ANSWERED)
        flags.add(FLAG_ANSWERED);
      else if (flag == Flags.Flag.DELETED)
        flags.add(FLAG_DELETED);
      else if (flag == Flags.Flag.DRAFT)
        flags.add(FLAG_DRAFT);
      else if (flag == Flags.Flag.FLAGGED)
        flags.add(FLAG_FLAGGED);
      else if (flag == Flags.Flag.RECENT)
        flags.add(FLAG_RECENT);
      else if (flag == Flags.Flag.SEEN)
        flags.add(FLAG_SEEN);
    }
    flags.addAll(Arrays.asList(mail.getFlags().getUserFlags()));
    row.put(FLAGS, flags);

    String[] hdrs = mail.getHeader("X-Mailer");
    if (hdrs != null)
      row.put(XMAILER, hdrs[0]);
  }


  private void addAddressToList(Address[] adresses, List<String> to) throws AddressException {
    for (Address address : adresses) {
      to.add(address.toString());
      InternetAddress ia = (InternetAddress) address;
      if (ia.isGroup()) {
        InternetAddress[] group = ia.getGroup(false);
        for (InternetAddress member : group)
          to.add(member.toString());
      }
    }
  }

  private boolean connectToMailBox() {
    try {
      Properties props = new Properties();
      props.setProperty("mail.store.protocol", protocol);
      props.setProperty("mail.imap.fetchsize", "" + fetchSize);
      props.setProperty("mail.imap.timeout", "" + rTimeout);
      props.setProperty("mail.imap.connectiontimeout", "" + cTimeout);
      Session session = Session.getDefaultInstance(props, null);
      mailbox = session.getStore(protocol);
      mailbox.connect(host, user, password);
      LOG.info("Connected to mailbox");
      return true;
    } catch (MessagingException e) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
              "Connection failed", e);
    }
  }

  private void createFilters() {
    if (fetchMailsSince != null) {
      filters.add(new MailsSinceLastCheckFilter(fetchMailsSince));
    }
    if (customFilter != null && !customFilter.equals("")) {
      try {
        Class cf = Class.forName(customFilter);
        Object obj = cf.newInstance();
        if (obj instanceof CustomFilter) {
          filters.add((CustomFilter) obj);
        }
      } catch (Exception e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
                "Custom filter could not be created", e);
      }
    }
  }

  private void logConfig() {
    if (!LOG.isInfoEnabled()) return;
    StringBuffer config = new StringBuffer();
    config.append("user : ").append(user).append(System.getProperty("line.separator"));
    config.append("pwd : ").append(password).append(System.getProperty("line.separator"));
    config.append("protocol : ").append(protocol).append(System.getProperty("line.separator"));
    config.append("host : ").append(host).append(System.getProperty("line.separator"));
    config.append("folders : ").append(folderNames).append(System.getProperty("line.separator"));
    config.append("recurse : ").append(recurse).append(System.getProperty("line.separator"));
    config.append("exclude : ").append(exclude.toString()).append(System.getProperty("line.separator"));
    config.append("include : ").append(include.toString()).append(System.getProperty("line.separator"));
    config.append("batchSize : ").append(batchSize).append(System.getProperty("line.separator"));
    config.append("fetchSize : ").append(fetchSize).append(System.getProperty("line.separator"));
    config.append("read timeout : ").append(rTimeout).append(System.getProperty("line.separator"));
    config.append("conection timeout : ").append(cTimeout).append(System.getProperty("line.separator"));
    config.append("custom filter : ").append(customFilter).append(System.getProperty("line.separator"));
    config.append("fetch mail since : ").append(fetchMailsSince).append(System.getProperty("line.separator"));
    LOG.info(config.toString());
  }

  class FolderIterator implements Iterator<Folder> {
    private Store mailbox;
    private List<String> topLevelFolders;
    private List<Folder> folders = null;
    private Folder lastFolder = null;

    public FolderIterator(Store mailBox) {
      this.mailbox = mailBox;
      folders = new ArrayList<Folder>();
      getTopLevelFolders(mailBox);
    }

    public boolean hasNext() {
      return !folders.isEmpty();
    }

    public Folder next() {
      try {
        boolean hasMessages = false;
        Folder next;
        do {
          if (lastFolder != null) {
            lastFolder.close(false);
            lastFolder = null;
          }
          if (folders.isEmpty()) {
            mailbox.close();
            return null;
          }
          next = folders.remove(0);
          if (next != null) {
            String fullName = next.getFullName();
            if (!excludeFolder(fullName)) {
              hasMessages = (next.getType() & Folder.HOLDS_MESSAGES) != 0;
              next.open(Folder.READ_ONLY);
              lastFolder = next;
              LOG.info("Opened folder : " + fullName);
            }
            if (recurse && ((next.getType() & Folder.HOLDS_FOLDERS) != 0)) {
              Folder[] children = next.list();
              LOG.info("Added its children to list  : ");
              for (int i = children.length - 1; i >= 0; i--) {
                folders.add(0, children[i]);
                LOG.info("child name : " + children[i].getFullName());
              }
              if (children.length == 0)
                LOG.info("NO children : ");
            }
          }
        }
        while (!hasMessages);
        return next;
      } catch (MessagingException e) {
        //throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
        //        "Folder open failed", e);
      }
      return null;
    }

    public void remove() {
      throw new UnsupportedOperationException("Its read only mode...");
    }

    private void getTopLevelFolders(Store mailBox) {
      if (folderNames != null)
        topLevelFolders = Arrays.asList(folderNames.split(","));
      for (int i = 0; topLevelFolders != null && i < topLevelFolders.size(); i++) {
        try {
          folders.add(mailbox.getFolder(topLevelFolders.get(i)));
        } catch (MessagingException e) {
          // skip bad ones unless its the last one and still no good folder
          if (folders.size() == 0 && i == topLevelFolders.size() - 1)
            throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
                    "Folder retreival failed");
        }
      }
      if (topLevelFolders == null || topLevelFolders.size() == 0) {
        try {
          folders.add(mailBox.getDefaultFolder());
        } catch (MessagingException e) {
          throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
                  "Folder retreival failed");
        }
      }
    }

    private boolean excludeFolder(String name) {
      for (String s : exclude) {
        if (name.matches(s))
          return true;
      }
      for (String s : include) {
        if (name.matches(s))
          return false;
      }
      return include.size() > 0;
    }
  }

  class MessageIterator implements Iterator<Message> {
    private Folder folder;
    private Message[] messagesInCurBatch;
    private int current = 0;
    private int currentBatch = 0;
    private int batchSize = 0;
    private int totalInFolder = 0;
    private boolean doBatching = true;

    public MessageIterator(Folder folder, int batchSize) {
      try {
        this.folder = folder;
        this.batchSize = batchSize;
        SearchTerm st = getSearchTerm();
        if (st != null) {
          doBatching = false;
          messagesInCurBatch = folder.search(st);
          totalInFolder = messagesInCurBatch.length;
          folder.fetch(messagesInCurBatch, fp);
          current = 0;
          LOG.info("Total messages : " + totalInFolder);
          LOG.info("Search criteria applied. Batching disabled");
        } else {
          totalInFolder = folder.getMessageCount();
          LOG.info("Total messages : " + totalInFolder);
          getNextBatch(batchSize, folder);
        }
      } catch (MessagingException e) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
                "Message retreival failed", e);
      }
    }

    private void getNextBatch(int batchSize, Folder folder) throws MessagingException {
      // after each batch invalidate cache
      if (messagesInCurBatch != null) {
        for (Message m : messagesInCurBatch) {
          if (m instanceof IMAPMessage)
            ((IMAPMessage) m).invalidateHeaders();
        }
      }
      int lastMsg = (currentBatch + 1) * batchSize;
      lastMsg = lastMsg > totalInFolder ? totalInFolder : lastMsg;
      messagesInCurBatch = folder.getMessages(currentBatch * batchSize + 1, lastMsg);
      folder.fetch(messagesInCurBatch, fp);
      current = 0;
      currentBatch++;
      LOG.info("Current Batch  : " + currentBatch);
      LOG.info("Messages in this batch  : " + messagesInCurBatch.length);
    }

    public boolean hasNext() {
      boolean hasMore = current < messagesInCurBatch.length;
      if (!hasMore && doBatching
              && currentBatch * batchSize < totalInFolder) {
        // try next batch
        try {
          getNextBatch(batchSize, folder);
          hasMore = current < messagesInCurBatch.length;
        } catch (MessagingException e) {
          throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
                  "Message retreival failed", e);
        }
      }
      return hasMore;
    }

    public Message next() {
      return hasNext() ? messagesInCurBatch[current++] : null;
    }

    public void remove() {
      throw new UnsupportedOperationException("Its read only mode...");
    }

    private SearchTerm getSearchTerm() {
      if (filters.size() == 0)
        return null;
      if (filters.size() == 1)
        return filters.get(0).getCustomSearch(folder);
      SearchTerm last = filters.get(0).getCustomSearch(folder);
      for (int i = 1; i < filters.size(); i++) {
        CustomFilter filter = filters.get(i);
        SearchTerm st = filter.getCustomSearch(folder);
        if (st != null) {
          last = new AndTerm(last, st);
        }
      }
      return last;
    }
  }

  class MailsSinceLastCheckFilter implements CustomFilter {

    private Date since;

    public MailsSinceLastCheckFilter(Date date) {
      since = date;
    }

    public SearchTerm getCustomSearch(Folder folder) {
      return new ReceivedDateTerm(ComparisonTerm.GE, since);
    }
  }

  // user settings stored in member variables; all of them are assigned in init()
  // connection settings (required attributes)
  private String user;
  private String password;
  private String host;
  private String protocol;

  // folder selection: comma separated starting folders, plus exclude/include
  // patterns that are matched as regular expressions against full folder names
  private String folderNames;
  private List<String> exclude = new ArrayList<String>();
  private List<String> include = new ArrayList<String>();
  private boolean recurse;

  // batchSize: messages per batch when no filter is applied
  // fetchSize, cTimeout, rTimeout: passed on as mail.imap.* session properties
  private int batchSize;
  private int fetchSize;
  private int cTimeout;
  private int rTimeout;

  // optional message filters: a date and the class name of a CustomFilter
  private Date fetchMailsSince;
  private String customFilter;

  private boolean processAttachment = true;

  // holds the current state
  private Store mailbox;
  private boolean connected = false;
  private FolderIterator folderIter;
  private MessageIterator msgIter;
  private List<CustomFilter> filters = new ArrayList<CustomFilter>();
  // fetch profile shared by all instances; its items are added in the static block below
  private static FetchProfile fp = new FetchProfile();
  // NOTE(review): the logger is created for DataImporter.class rather than this
  // class, so messages appear under that logger name - confirm this is intended
  private static final Logger LOG = LoggerFactory.getLogger(DataImporter.class);

  // diagnostics
  // NOTE(review): rowCount is not read or updated anywhere in the visible code
  private int rowCount = 0;

  // items requested up front for every batch of messages
  static {
    fp.add(FetchProfile.Item.ENVELOPE);
    fp.add(FetchProfile.Item.FLAGS);
    fp.add("X-Mailer");
  }

  // Fields To Index
  // single valued
  private static final String MESSAGE_ID = "messageId";
  private static final String SUBJECT = "subject";
  private static final String FROM = "from";
  private static final String SENT_DATE = "sentDate";
  private static final String XMAILER = "xMailer";
  // multi valued
  private static final String TO_CC_BCC = "allTo";
  private static final String FLAGS = "flags";
  private static final String CONTENT = "content";
  private static final String ATTACHMENT = "attachment";
  private static final String ATTACHMENT_NAMES = "attachmentNames";
  // flag values (entries of the multi valued "flags" field for system flags)
  private static final String FLAG_ANSWERED = "answered";
  private static final String FLAG_DELETED = "deleted";
  private static final String FLAG_DRAFT = "draft";
  private static final String FLAG_FLAGGED = "flagged";
  private static final String FLAG_RECENT = "recent";
  private static final String FLAG_SEEN = "seen";

  private int getIntFromContext(String prop, int ifNull) {
    int v = ifNull;
    try {
      String val = context.getEntityAttribute(prop);
      if (val != null) {
        val = context.replaceTokens(val);
        v = Integer.valueOf(val);
      }
    } catch (NumberFormatException e) {
      //do nothing
    }
    return v;
  }

  private boolean getBoolFromContext(String prop, boolean ifNull) {
    boolean v = ifNull;
    String val = context.getEntityAttribute(prop);
    if (val != null) {
      val = context.replaceTokens(val);
      v = Boolean.valueOf(val);
    }
    return v;
  }

  private String getStringFromContext(String prop, String ifNull) {
    String v = ifNull;
    String val = context.getEntityAttribute(prop);
    if (val != null) {
      val = context.replaceTokens(val);
      v = val;
    }
    return v;
  }
}
// TOP
//
// Related classes of org.apache.solr.handler.dataimport.MailEntityProcessor
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle, Inc. Contact coftware#gmail.com.