/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.morphline.hadoop.rcfile;

import java.io.DataInput;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Validator;
import org.kitesdk.morphline.stdio.AbstractParser;

import com.google.common.io.ByteStreams;
import com.typesafe.config.Config;

/**
* Command for reading Record Columnar (RC) Files. An RCFile can be read in two
* ways:
*
* 1. Row wise - emits one record for every row in the RCFile, with columns
* mapped to the names provided in the column map.
*
* 2. Column wise - for every row split (block) in the RCFile, emits one record
* per column, containing all the row values for that column as a List.
*
* For an RCFile with n rows and m columns, row-wise reading emits n records and
* column-wise reading emits m * (number of row splits) records.
* Decompression is handled automatically.
*
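* Example morphline configuration (a minimal sketch; the column indexes, output
* field names and writable classes shown are illustrative assumptions, not taken
* from this file):
*
* <pre>
* readRCFile {
*   readMode : row
*   includeMetaData : false
*   columns : [
*     { inputField : 0, outputField : "name", writableClass : "org.apache.hadoop.io.Text" }
*     { inputField : 1, outputField : "age",  writableClass : "org.apache.hadoop.io.IntWritable" }
*   ]
* }
* </pre>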
*/
public final class ReadRCFileBuilder implements CommandBuilder {
 
  public static final String OUTPUT_MEDIA_TYPE = "application/java-rc-file-record";
  public static final String RC_FILE_META_DATA = "RCFileMetaData";

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("readRCFile");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new ReadRCFile(this, config, parent, child, context);
  }
 

  // /////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  // /////////////////////////////////////////////////////////////////////////////
  private static final class ReadRCFile extends AbstractParser {
   
    private final List<RCFileColumn> columns = new ArrayList<RCFileColumn>();
    private final RCFileReadMode readMode;
    private final boolean includeMetaData;
    private final Configuration conf = new Configuration();

    private static final String STREAM_PROTOCOL = "stream://";

    public ReadRCFile(CommandBuilder builder, Config config, Command parent,
        Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
      this.includeMetaData = getConfigs().getBoolean(config, "includeMetaData", false);
      this.readMode = new Validator<RCFileReadMode>()
          .validateEnum(
              config,
              getConfigs().getString(config, "readMode",
                  RCFileReadMode.row.name()), RCFileReadMode.class);
     
      for (Config columnConfig : getConfigs().getConfigList(config, "columns")) {
        columns.add(new RCFileColumn(columnConfig, conf));
      }
      validateArguments();
    }

    /**
     * Wraps the record's attachment stream in a {@link SingleStreamFileSystem}
     * so that {@link RCFile.Reader} can consume it like a file, then emits
     * output records according to the configured read mode.
     */
    @Override
    protected boolean doProcess(Record record, InputStream in) throws IOException {
      Path attachmentPath = getAttachmentPath(record);
      SingleStreamFileSystem fs = new SingleStreamFileSystem(in, attachmentPath);
      RCFile.Reader reader = null;
      try {
        reader = new RCFile.Reader(fs, attachmentPath, conf);
        Record template = record.copy();
        removeAttachments(template);
        template.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE);
        if (includeMetaData) {
          SequenceFile.Metadata metadata = reader.getMetadata();
          if (metadata != null) {
            template.put(RC_FILE_META_DATA, metadata);
          }
        }
       
        switch (readMode) {
        case row:
          return readRowWise(reader, template);
        case column:
          return readColumnWise(reader, template);
        default:
          throw new IllegalStateException();
        }
      } catch (IOException e) {
        throw new MorphlineRuntimeException("IOException while processing attachment "
                + attachmentPath.getName(), e);
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }

    private Path getAttachmentPath(Record record) {
      // We have more meaningful RCFile error messages if we have an attachment name
      String attachmentName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
      if (attachmentName == null) {
        attachmentName = "UNKNOWN";
      }
      return new Path(STREAM_PROTOCOL + attachmentName);
    }

    private boolean readRowWise(final RCFile.Reader reader, final Record record)
        throws IOException {
     
      LongWritable rowID = new LongWritable();     

      while (true) {
        boolean next;
        try {
          next = reader.next(rowID);
        } catch (EOFException ex) {
          // We have hit EOF of the stream
          break;
        }

        if (!next) {
          break;
        }

        incrementNumRecords();
        Record outputRecord = record.copy();       
        BytesRefArrayWritable rowBatchBytes = new BytesRefArrayWritable();
        rowBatchBytes.resetValid(columns.size());
        reader.getCurrentRow(rowBatchBytes);

        // Read all the columns configured and set it in the output record
        for (RCFileColumn rcColumn : columns) {
          BytesRefWritable columnBytes = rowBatchBytes.get(rcColumn.getInputField());
          outputRecord.put(rcColumn.getOutputField(), updateColumnValue(rcColumn, columnBytes));
        }
       
        // pass record to next command in chain:
        if (!getChild().process(outputRecord)) {
          return false;
        }
      }
      return true;
    }

    private boolean readColumnWise(RCFile.Reader reader, Record record)
        throws IOException {

      while (true) {
        boolean next;
        try {
          next = reader.nextBlock();
        } catch (EOFException ex) {
          // We have hit EOF of the stream
          break;
        }
        if (!next) {
          break;
        }

        for (RCFileColumn rcColumn : columns) {
          BytesRefArrayWritable rowBatchBytes = reader.getColumn(rcColumn.getInputField(), null);
          List<Writable> rowValues = new ArrayList<Writable>(rowBatchBytes.size());
          incrementNumRecords();
          Record outputRecord = record.copy();
          for (int rowIndex = 0; rowIndex < rowBatchBytes.size(); rowIndex++) {
            BytesRefWritable rowBytes = rowBatchBytes.get(rowIndex);
            rowValues.add(updateColumnValue(rcColumn, rowBytes));
          }
          outputRecord.put(rcColumn.getOutputField(), rowValues);
         
          // pass record to next command in chain:
          if (!getChild().process(outputRecord)) {
            return false;
          }
        }
      }
      return true;
    }

    private Writable updateColumnValue(RCFileColumn column, BytesRefWritable bytesRef) throws IOException {
      Writable newColumnValue = column.newWritable();
      // Small optimization to bypass DataInput read if the column writable is
      // BytesRefWritable
      if (newColumnValue.getClass() == BytesRefWritable.class) {
        newColumnValue = bytesRef;
      } else {
        byte[] currentRowBytes = Arrays.copyOfRange(bytesRef.getData(),
            bytesRef.getStart(), bytesRef.getStart() + bytesRef.getLength());
        DataInput dataInput = ByteStreams.newDataInput(currentRowBytes);
        newColumnValue.readFields(dataInput);
      }
      return newColumnValue;
    }

   
    // /////////////////////////////////////////////////////////////////////////////
    // Nested classes:
    // /////////////////////////////////////////////////////////////////////////////
    private static enum RCFileReadMode {
      column, row
    }

   
    // /////////////////////////////////////////////////////////////////////////////
    // Nested classes:
    // /////////////////////////////////////////////////////////////////////////////
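    /**
     * Describes how a single RCFile column is mapped to a record field. Each
     * entry of the "columns" config list is parsed into one RCFileColumn, for
     * example (the values shown are illustrative assumptions):
     *
     * <pre>
     * { inputField : 2, outputField : "price", writableClass : "org.apache.hadoop.io.DoubleWritable" }
     * </pre>
     */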
    private static final class RCFileColumn {
     
      private final int inputField;
      private final String outputField;
      private final Class<? extends Writable> writableClass;
      private final Configuration conf;

      public RCFileColumn(Config columnConfig, Configuration conf) {
        this.conf = conf;
        Configs configs = new Configs();
        this.inputField = configs.getInt(columnConfig, "inputField");
        if (inputField < 0) {
          throw new MorphlineCompilationException(
              "Invalid column inputField specified: " + inputField, columnConfig);
        }

        this.outputField = configs.getString(columnConfig, "outputField");
        String writableClassString = configs.getString(columnConfig, "writableClass");

        if (writableClassString == null || writableClassString.isEmpty()) {
          throw new MorphlineCompilationException(
              "No writableClass specified for column " + outputField, columnConfig);
        }
        try {
          Class<?> clazz = Class.forName(writableClassString);
          if (!Writable.class.isAssignableFrom(clazz)) {
            throw new MorphlineCompilationException("writableClass provided "
                + writableClassString + " for column " + outputField
                + " does not implement " + Writable.class.getName(), columnConfig);
          }
          this.writableClass = clazz.asSubclass(Writable.class);
        } catch (ClassNotFoundException e) {
          throw new MorphlineCompilationException("Could not load class "
              + writableClassString + " definition", columnConfig, e);
        }
        configs.validateArguments(columnConfig);
      }

      public int getInputField() {
        return inputField;
      }

      public String getOutputField() {
        return outputField;
      }

      public Writable newWritable() {
        return ReflectionUtils.newInstance(writableClass, conf);
      }

    }
  }
}