Package org.lilyproject.tools.mboximport

Source Code of org.lilyproject.tools.mboximport.MboxImport$ImportMboxFileTask

/*
* Copyright 2010 Outerthought bvba
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.tools.mboximport;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.io.IOUtils;
import org.apache.james.mime4j.codec.Base64InputStream;
import org.apache.james.mime4j.codec.QuotedPrintableInputStream;
import org.apache.james.mime4j.field.AddressListField;
import org.apache.james.mime4j.field.DefaultFieldParser;
import org.apache.james.mime4j.field.FieldName;
import org.apache.james.mime4j.field.MailboxField;
import org.apache.james.mime4j.field.MailboxListField;
import org.apache.james.mime4j.field.ParsedField;
import org.apache.james.mime4j.field.address.Address;
import org.apache.james.mime4j.field.address.AddressList;
import org.apache.james.mime4j.field.address.Mailbox;
import org.apache.james.mime4j.field.address.MailboxList;
import org.apache.james.mime4j.io.EOLConvertingInputStream;
import org.apache.james.mime4j.parser.Field;
import org.apache.james.mime4j.parser.MimeEntityConfig;
import org.apache.james.mime4j.parser.MimeTokenStream;
import org.apache.james.mime4j.util.MimeUtil;
import org.lilyproject.repository.api.Blob;
import org.lilyproject.repository.api.Link;
import org.lilyproject.repository.api.QName;
import org.lilyproject.repository.api.Record;
import org.lilyproject.repository.api.RecordId;
import org.lilyproject.testclientfw.BaseRepositoryTestTool;
import org.lilyproject.tools.import_.cli.JsonImport;
import org.lilyproject.util.Version;
import org.lilyproject.util.io.Closer;

public class MboxImport extends BaseRepositoryTestTool {

    private Option fileOption;

    private Option schemaOption;

    private Map<String, Integer> partsByMediaType = new HashMap<String, Integer>();

    private static final String NS = "org.lilyproject.mail";

    private static final int MAX_LINE_LENGTH = 10000;

    @Override
    protected String getCmdName() {
        return "lily-mbox-import";
    }

    @Override
    protected String getVersion() {
        return Version.readVersion("org.lilyproject", "lily-mbox-import");
    }

    public static void main(String[] args) throws Exception {
        new MboxImport().start(args);
    }

    @Override
    @SuppressWarnings("static-access")
    public List<Option> getOptions() {
        List<Option> options = super.getOptions();

        fileOption = OptionBuilder
                .withArgName("file")
                .hasArg()
                .withDescription("File or directory name")
                .withLongOpt("file")
                .create("f");
        options.add(fileOption);

        schemaOption = OptionBuilder
                .withDescription("Create/update the schema")
                .withLongOpt("schema")
                .create("s");
        options.add(schemaOption);

        return options;
    }

    @Override
    protected int getDefaultWorkers() {
        return 1;
    }

    @Override
    public int run(CommandLine cmd) throws Exception {
        int result = super.run(cmd);
        if (result != 0) {
            return result;
        }

        if (!cmd.hasOption(schemaOption.getOpt()) && !cmd.hasOption(fileOption.getOpt())) {
            printHelp();
            return 1;
        }

        setupLily();

        if (cmd.hasOption(schemaOption.getOpt()) || cmd.hasOption(fileOption.getOpt())) {
            loadSchema();
        }

        if (cmd.hasOption(fileOption.getOpt())) {
            String fileName = cmd.getOptionValue(fileOption.getOpt());
            File file = new File(fileName);

            if (!file.exists()) {
                System.out.println("File does not exist: " + file.getAbsolutePath());
                return 1;
            }

            setupMetrics();

            startExecutor();

            if (file.isDirectory()) {
                File[] files = file.listFiles();
                Arrays.sort(files);
                for (File item : files) {
                    if (!item.isDirectory()) {
                        executor.submit(new ImportMboxFileTask(item));
                    }
                }
            } else {
                executor.submit(new ImportMboxFileTask(file));
            }

            stopExecutor();

            finishMetrics();

            System.out.println();
            System.out.println("Number of created parts per media type:");
            for (Map.Entry<String, Integer> entry : partsByMediaType.entrySet()) {
                System.out.println("  " + entry.getKey() + " : " + entry.getValue());
            }
            System.out.println();
        }

        lilyClient.close();

        return 0;
    }

    private void loadSchema() throws Exception {
        System.out.println("Creating the schema (if necessary)");
        System.out.println();
        InputStream is = getClass().getClassLoader().getResourceAsStream("org/lilyproject/tools/mboximport/mail_schema.json");
        JsonImport.loadSchema(repository, is);
        System.out.println();
    }

    private class ImportMboxFileTask implements Runnable {
        private File file;

        private ImportMboxFileTask(File file) {
            this.file = file;
        }

        @Override
        public void run() {
            try {
                importFile(file);
            } catch (Throwable t) {
                t.printStackTrace();
                metrics.increment("Exceptions", 1);
            }
        }
    }

    private void importFile(File file) throws Exception {
        System.out.println("Processing file " + file.getAbsolutePath());
        InputStream is = null;
        try {
            is = new FileInputStream(file);

            if (file.getName().endsWith(".gz")) {
                is = new GZIPInputStream(is);
            }

            MboxInputStream mboxStream = new MboxInputStream(is, MAX_LINE_LENGTH);

            while (mboxStream.nextMessage()) {
                MimeTokenStream stream = new MyMimeTokenStream();
                stream.parse(mboxStream);
                importMessage(stream);
            }

        } finally {
            Closer.close(is);
        }
        System.out.println();
    }

    public static class MyMimeTokenStream extends MimeTokenStream {
        protected MyMimeTokenStream() {
            super(getConfig());
        }

        private static MimeEntityConfig getConfig() {
            MimeEntityConfig config = new MimeEntityConfig();
            config.setMaxLineLen(MAX_LINE_LENGTH);
            return config;
        }
    }

    private void importMessage(MimeTokenStream stream) throws Exception {
        int multiPartNesting = 0; // note that a multipart can again contain a multipart

        Message message = new Message();

        for (int state = stream.getState();
             state != MimeTokenStream.T_END_OF_STREAM;
             state = stream.next()) {

            switch (state) {
                case MimeTokenStream.T_BODY:
                    String mediaType = stream.getBodyDescriptor().getMimeType() + "; charset=" + stream.getBodyDescriptor().getCharset();

                    // oftwewel: gebruik getDecodedInputStream
                    InputStream bodyDataStream;
                    if (MimeUtil.isQuotedPrintableEncoded(stream.getBodyDescriptor().getTransferEncoding())) {
                        bodyDataStream = new QuotedPrintableInputStream(new EOLConvertingInputStream(stream.getInputStream(), EOLConvertingInputStream.CONVERT_LF));
                    } else if (MimeUtil.isBase64Encoding(stream.getBodyDescriptor().getTransferEncoding())) {
                        bodyDataStream = new Base64InputStream(stream.getInputStream());
                    } else {
                        bodyDataStream = stream.getInputStream();
                    }

                    byte[] data = IOUtils.toByteArray(bodyDataStream);

                    // TODO could fill in filename
                    long startTime = System.nanoTime();
                    Blob blob = new Blob(mediaType, (long)data.length, null);
                    OutputStream os = table.getOutputStream(blob);
                    try {
                        IOUtils.write(data, os);
                    } finally {
                        os.close();
                    }
                    double duration = System.nanoTime() - startTime;
                    metrics.increment("Blob creation", "Blob", duration / 1e6d);

                    Part part = message.addPart(blob);
                    part.baseMediaType = stream.getBodyDescriptor().getMimeType();

                    break;
                case MimeTokenStream.T_FIELD:
                    if (multiPartNesting == 0) {
                        Field field = stream.getField();
                        ParsedField parsedField = new DefaultFieldParser().parse(field.getName(), MimeUtil.unfold(field.getBody()), null);
                        if (parsedField.getParseException() != null) {
                            // TODO print error
                        } else if (parsedField.getName().equals(FieldName.TO)) {
                            message.to = ((AddressListField)parsedField).getAddressList();
                        } else if (parsedField.getName().equals(FieldName.CC)) {
                            message.cc = ((AddressListField)parsedField).getAddressList();
                        } else if (parsedField.getName().equals(FieldName.FROM)) {
                            message.from = ((MailboxListField)parsedField).getMailboxList();
                        } else if (parsedField.getName().equals(FieldName.SENDER)) {
                            message.sender = ((MailboxField)parsedField).getMailbox();
                        } else if (parsedField.getName().equals("List-Id")) {
                            message.listId = parsedField.getBody();
                        } else if (parsedField.getName().equals(FieldName.SUBJECT)) {
                            message.subject = parsedField.getBody();
                        }
                    }
                    break;
                case MimeTokenStream.T_START_MULTIPART:
                    multiPartNesting++;
                    break;
                case MimeTokenStream.T_END_MULTIPART:
                    multiPartNesting--;
            }
        }

        // Now create the records in Lily


        // Since we want to link the messages and parts bidirectionally, and for performance we want to avoid
        // having to update the message, we generate record IDs ourselves.
        // Since for the current usage typically parts are indexed with information dereferenced from messages,
        // we can save additional indexer work (update of dereferenced data) by first creating the messages
        // and then the parts.
        List<RecordId> partRecordIds = new ArrayList<RecordId>(message.parts.size());
        for (Part part : message.parts) {
            partRecordIds.add(idGenerator.newRecordId());
        }

        Record messageRecord = repository.getRecordFactory().newRecord(idGenerator.newRecordId());
        messageRecord.setRecordType(new QName(NS, "Message"));
        if (message.subject != null) {
            messageRecord.setField(new QName(NS, "subject"), message.subject);
        }
        if (message.to != null) {
            messageRecord.setField(new QName(NS, "to"), message.getToAddressesAsStringList());
        }
        if (message.cc != null) {
            messageRecord.setField(new QName(NS, "cc"), message.getCcAddressesAsStringList());
        }
        if (message.from != null) {
            messageRecord.setField(new QName(NS, "from"), message.getFromAddressesAsStringList());
        }
        if (message.sender != null) {
            messageRecord.setField(new QName(NS, "sender"), message.getSenderAddressAsString());
        }
        if (message.listId != null) {
            messageRecord.setField(new QName(NS, "listId"), message.listId);
        }

        if (messageRecord.getFields().size() == 0 || message.parts.size() == 0) {
            // Message has no useful headers, do not create it.
            metrics.increment("Invalid messages", 1);
            return;
        }

        List<Link> partLinks = new ArrayList<Link>(message.parts.size());
        for (RecordId recordId : partRecordIds) {
            partLinks.add(new Link(recordId));
        }
        messageRecord.setField(new QName(NS, "parts"), partLinks);

        long startTime = System.nanoTime();
        messageRecord = table.createOrUpdate(messageRecord);
        double duration = System.nanoTime() - startTime;
        metrics.increment("Message record", "Create", duration / 1e6d);

        for (int i = 0; i < message.parts.size(); i++) {
            Part part = message.parts.get(i);
            Record partRecord = table.newRecord(partRecordIds.get(i));
            partRecord.setRecordType(new QName(NS, "Part"));
            partRecord.setField(new QName(NS, "mediaType"), part.blob.getMediaType());
            partRecord.setField(new QName(NS, "content"), part.blob);
            partRecord.setField(new QName(NS, "message"), new Link(messageRecord.getId()));

            startTime = System.nanoTime();
            partRecord = table.createOrUpdate(partRecord);
            duration = System.nanoTime() - startTime;
            metrics.increment("Part record", "Create", duration / 1e6d);

            part.recordId = partRecord.getId();
            increment(part.baseMediaType);

            if (verbose) {
                System.out.println("Created part record: " + partRecord.getId());
            }
        }

        if (verbose) {
            System.out.println("Created message record " + messageRecord.getId());
        }
    }

    public void increment(String mediaType) {
        Integer count = partsByMediaType.get(mediaType);
        if (count == null) {
            partsByMediaType.put(mediaType, 1);
        } else {
            partsByMediaType.put(mediaType, count + 1);
        }
    }

    private static class Message {
        public String subject;
        public AddressList to;
        public AddressList cc;
        public MailboxList from;
        public Mailbox sender;
        public String listId;

        public List<Part> parts = new ArrayList<Part>();

        public Part addPart(Blob blob) {
            Part part = new Part();
            part.blob = blob;
            parts.add(part);
            return part;
        }

        public List<String> getToAddressesAsStringList() {
            List<String> result = new ArrayList<String>(to.size());
            for (Address address : to) {
                result.add(address.getDisplayString());
            }
            return result;
        }

        public List<String> getCcAddressesAsStringList() {
            List<String> result = new ArrayList<String>(cc.size());
            for (Address address : cc) {
                result.add(address.getDisplayString());
            }
            return result;
        }

        public List<String> getFromAddressesAsStringList() {
            List<String> result = new ArrayList<String>(from.size());
            for (Mailbox mailbox : from) {
                result.add(mailbox.getDisplayString());
            }
            return result;
        }

        public String getSenderAddressAsString() {
            return sender.getDisplayString();
        }
    }

    private static class Part {
        public Blob blob;
        public RecordId recordId;
        /** Media type without parameters. */
        public String baseMediaType;
    }
}
TOP

Related Classes of org.lilyproject.tools.mboximport.MboxImport$ImportMboxFileTask

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.