Package org.kitesdk.morphline.stdio

Source Code of org.kitesdk.morphline.stdio.AbstractParser

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.morphline.stdio;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Metrics;

import com.codahale.metrics.Meter;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;

import org.kitesdk.morphline.shaded.com.google.common.io.Closeables;

import com.typesafe.config.Config;

/**
* Base class for convenient implementation of morphline parsers.
*/
public abstract class AbstractParser extends AbstractCommand {

  private final Meter numRecordsMeter;
  private Set<MediaType> supportedMimeTypes = null;

  public static final String SUPPORTED_MIME_TYPES = "supportedMimeTypes";

  protected AbstractParser(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
    super(builder, config, parent, child, context);     
    List<String> mimeTypes = getConfigs().getStringList(config, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
    for (String mimeType : mimeTypes) {
      addSupportedMimeType(mimeType);
    }
    this.numRecordsMeter = getMeter(Metrics.NUM_RECORDS);
  }

  /** Deprecated; will be removed in the next release */
  @Deprecated
  protected AbstractParser(Config config, Command parent, Command child, MorphlineContext context) {
    super(config, parent, child, context);     
    List<String> mimeTypes = getConfigs().getStringList(config, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
    for (String mimeType : mimeTypes) {
      addSupportedMimeType(mimeType);
    }
    this.numRecordsMeter = getMeter(Metrics.NUM_RECORDS);
  }

  protected void addSupportedMimeType(String mediaType) {
    if (supportedMimeTypes == null) {
      supportedMimeTypes = Sets.newHashSet();
    }
    supportedMimeTypes.add(parseMimeType(mediaType));
  }

  @Override
  protected boolean doProcess(Record record) {
    if (!hasAtLeastOneAttachment(record)) {
      return false;
    }

    // TODO: make field for stream configurable
    String streamMediaType = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE);
    if (!isMimeTypeSupported(streamMediaType, record)) {
      return false;
    }

    InputStream stream = getAttachmentInputStream(record);
    try {
      return doProcess(record, stream);
    } catch (IOException e) {
      throw new MorphlineRuntimeException(e);
    } finally {
      Closeables.closeQuietly(stream);
    }
  }
 
  protected abstract boolean doProcess(Record record, InputStream stream) throws IOException;

  protected void incrementNumRecords() {
    if (isMeasuringMetrics()) {
      numRecordsMeter.mark();
    }
  }
 
  private boolean isMimeTypeSupported(String mediaTypeStr, Record record) {
    if (supportedMimeTypes == null) {
      return true;
    }
    if (!hasAtLeastOneMimeType(record)) {
      return false;
    }
    MediaType mediaType = parseMimeType(mediaTypeStr);
    if (supportedMimeTypes.contains(mediaType)) {
      return true; // fast path
    }
    // wildcard matching
    for (MediaType rangePattern : supportedMimeTypes) {     
      if (isMimeTypeMatch(mediaType, rangePattern)) {
        return true;
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("No supported MIME type found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mediaTypeStr);
    }
    return false;
  }

  private MediaType parseMimeType(String mediaTypeStr) {
    MediaType mediaType = MediaType.parse(mediaTypeStr.trim().toLowerCase(Locale.ROOT));
    return mediaType.getBaseType();
  };
     
  /** Returns true if mediaType falls withing the given range (pattern), false otherwise */
  private boolean isMimeTypeMatch(MediaType mediaType, MediaType rangePattern) {
    String WILDCARD = "*";
    String rangePatternType = rangePattern.getType();
    String rangePatternSubtype = rangePattern.getSubtype();
    return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
        && (rangePatternSubtype.equals(WILDCARD) || rangePatternSubtype.equals(mediaType.getSubtype()));
  }

  protected Charset detectCharset(Record record, Charset charset) {
    if (charset != null) {
      return charset;
    }
    List charsets = record.get(Fields.ATTACHMENT_CHARSET);
    if (charsets.size() == 0) {
      // TODO try autodetection (AutoDetectReader)
      throw new MorphlineRuntimeException("Missing charset for record: " + record);
    }
    String charsetName = (String) charsets.get(0);       
    return Charset.forName(charsetName);
  }

  private boolean hasAtLeastOneAttachment(Record record) {
    if (!record.getFields().containsKey(Fields.ATTACHMENT_BODY)) {
      LOG.debug("Command failed because of missing attachment for record: {}", record);
      return false;
    }
    return true;
  }
 
  private boolean hasAtLeastOneMimeType(Record record) {
    if (!record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
      LOG.debug("Command failed because of missing MIME type for record: {}", record);
      return false;
   
    return true;
  }

  private InputStream getAttachmentInputStream(Record record) {
    Object body = record.getFirstValue(Fields.ATTACHMENT_BODY);
    Preconditions.checkNotNull(body);
    if (body instanceof byte[]) {
      return new ByteArrayInputStream((byte[]) body);
    } else {
      return (InputStream) body;
    }
  }

  public static void removeAttachments(Record outputRecord) {
    outputRecord.removeAll(Fields.ATTACHMENT_BODY);
    outputRecord.removeAll(Fields.ATTACHMENT_MIME_TYPE);
    outputRecord.removeAll(Fields.ATTACHMENT_CHARSET);
    outputRecord.removeAll(Fields.ATTACHMENT_NAME);
  }
 
  int getBufferSize(InputStream stream) {
    if (stream instanceof ByteArrayInputStream) {
      return 1024; // probably a single log line from Flume   
    } else {
      return 8192; // same as default for new BufferedReader()
    }
  }


//public static XMediaType toGuavaMediaType(TMediaType tika) {
//return XMediaType.create(tika.getType(), tika.getSubtype()).withParameters(Multimaps.forMap(tika.getParameters()));
//}
//
//public static List<XMediaType> toGuavaMediaType(Iterable<TMediaType> tikaCollection) {
//List<XMediaType> list = new ArrayList();
//for (TMediaType tika : tikaCollection) {
//  list.add(toGuavaMediaType(tika));
//}
//return list;
//}

}
TOP

Related Classes of org.kitesdk.morphline.stdio.AbstractParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.