Package com.cloudera.cdk.morphline.tika

Source Code of com.cloudera.cdk.morphline.tika.DetectMimeTypeBuilder

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.tika;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map.Entry;

import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;

import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineCompilationException;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.MorphlineRuntimeException;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.cloudera.cdk.morphline.base.Fields;
import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import com.typesafe.config.Config;

/**
* Command that auto-detects the MIME type of the first attachment, if no MIME type is defined yet.
*/
public final class DetectMimeTypeBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("detectMimeType");
  }
 
  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    try {
      return new DetectMimeType(this, config, parent, child, context);
    } catch (IOException e) {
      throw new MorphlineCompilationException("Cannot instantiate command", config, e, this);
    } catch (MimeTypeException e) {
      throw new MorphlineCompilationException("Cannot instantiate command", config, e, this);
    }
  }
 
 
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class DetectMimeType extends AbstractCommand {

    private final Detector detector;
    private final boolean preserveExisting;
    private final boolean includeMetaData;
    private final boolean excludeParameters;
   
    public DetectMimeType(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) throws IOException, MimeTypeException {
      super(builder, config, parent, child, context);
      this.preserveExisting = getConfigs().getBoolean(config, "preserveExisting", true);     
      this.includeMetaData = getConfigs().getBoolean(config, "includeMetaData", false);
      this.excludeParameters = getConfigs().getBoolean(config, "excludeParameters", true);
      List<InputStream> inputStreams = new ArrayList();
      try {
        if (getConfigs().getBoolean(config, "includeDefaultMimeTypes", true)) {
          // adapted from Tika MimeTypesFactory.create(String coreFilePath, String extensionFilePath)
          String coreFilePath = "tika-mimetypes.xml";
          String classPrefix = MimeTypesFactory.class.getPackage().getName().replace('.', '/') + "/";
          ClassLoader cl = MimeTypesFactory.class.getClassLoader();      
          URL coreURL = cl.getResource(classPrefix + coreFilePath);
          InputStream in = new BufferedInputStream(coreURL.openStream());
          inputStreams.add(in);
        }
        for (String mimeTypesFile : getConfigs().getStringList(config, "mimeTypesFiles", Collections.EMPTY_LIST)) {
          InputStream in = new BufferedInputStream(new FileInputStream(new File(mimeTypesFile)));
          inputStreams.add(in);
        }
        String mimeTypesString = getConfigs().getString(config, "mimeTypesString", null);
        if (mimeTypesString != null) {
          InputStream in = new ByteArrayInputStream(mimeTypesString.getBytes("UTF-8"));
          inputStreams.add(in);
        }
       
        if (inputStreams.size() > 0) {
          MimeTypes mimeTypes = MimeTypesFactory.create(inputStreams.toArray(new InputStream[inputStreams.size()]));
          ServiceLoader loader = new ServiceLoader();
          this.detector = new DefaultDetector(mimeTypes, loader);
        } else {
          throw new MorphlineCompilationException("Missing specification for MIME type mappings", config);
        }     
      } finally {
        for (InputStream in : inputStreams) {
          Closeables.closeQuietly(in);
        }
      }
      validateArguments();
    }
   
    @Override
    protected boolean doProcess(Record record) {
      if (preserveExisting && record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
        ; // we must preserve the existing MIME type
      } else {
        List attachments = record.get(Fields.ATTACHMENT_BODY);
        if (attachments.size() > 0) {
          Object attachment = attachments.get(0);
          Preconditions.checkNotNull(attachment);
          InputStream stream;
          if (attachment instanceof byte[]) {
            stream = new ByteArrayInputStream((byte[]) attachment);
          } else {
            stream = (InputStream) attachment;
          }
         
          Metadata metadata = new Metadata();
         
          // If you specify the resource name (the filename, roughly) with this
          // parameter, then Tika can use it in guessing the right MIME type
          String resourceName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
          if (resourceName != null) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
          }

          // Provide stream's charset as hint to Tika for better auto detection
          String charset = (String) record.getFirstValue(Fields.ATTACHMENT_CHARSET);
          if (charset != null) {
            metadata.add(Metadata.CONTENT_ENCODING, charset);
          }

          if (includeMetaData) {
            for (Entry<String, Object> entry : record.getFields().entries()) {
              metadata.add(entry.getKey(), entry.getValue().toString());
            }
          }

          String mimeType = getMediaType(stream, metadata, excludeParameters);
          record.replaceValues(Fields.ATTACHMENT_MIME_TYPE, mimeType);
       
      }
      return super.doProcess(record);
    }
   
    /**
     * Detects the content type of the given input event. Returns
     * <code>application/octet-stream</code> if the type of the event can not be
     * detected.
     * <p>
     * It is legal for the event headers or body to be empty. The detector may
     * read bytes from the start of the body stream to help in type detection.
     *
     * @return detected media type, or <code>application/octet-stream</code>
     */
    private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
      MediaType mediaType;
      try {
        mediaType = getDetector().detect(in, metadata);
      } catch (IOException e) {
        throw new MorphlineRuntimeException(e);
      }
      String mediaTypeStr = mediaType.toString();
      if (excludeParameters) {
        int i = mediaTypeStr.indexOf(';');
        if (i >= 0) {
          mediaTypeStr = mediaTypeStr.substring(0, i);
        }
      }
      return mediaTypeStr;
    }

    protected Detector getDetector() {
      return detector;
    }

  }
 
}
TOP

Related Classes of com.cloudera.cdk.morphline.tika.DetectMimeTypeBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
;i[r]=i[r]||function(){ (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); ga('create', 'UA-20639858-1', 'auto'); ga('send', 'pageview');