Package org.apache.drill.exec.store.json

Source Code of org.apache.drill.exec.store.json.JSONRecordReader

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.json;

import static com.fasterxml.jackson.core.JsonToken.END_ARRAY;
import static com.fasterxml.jackson.core.JsonToken.END_OBJECT;
import static com.fasterxml.jackson.core.JsonToken.FIELD_NAME;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;

import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.ExpressionPosition;
import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.expr.holders.NullableBitHolder;
import org.apache.drill.exec.expr.holders.NullableFloat4Holder;
import org.apache.drill.exec.expr.holders.NullableIntHolder;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.schema.DiffSchema;
import org.apache.drill.exec.schema.Field;
import org.apache.drill.exec.schema.NamedField;
import org.apache.drill.exec.schema.ObjectSchema;
import org.apache.drill.exec.schema.RecordSchema;
import org.apache.drill.exec.schema.SchemaIdGenerator;
import org.apache.drill.exec.schema.json.jackson.JacksonHelper;
import org.apache.drill.exec.store.RecordReader;
import org.apache.drill.exec.store.VectorHolder;
import org.apache.drill.exec.vector.AllocationHelper;
import org.apache.drill.exec.vector.NullableBitVector;
import org.apache.drill.exec.vector.NullableFloat4Vector;
import org.apache.drill.exec.vector.NullableIntVector;
import org.apache.drill.exec.vector.NullableVarCharVector;
import org.apache.drill.exec.vector.RepeatedBitVector;
import org.apache.drill.exec.vector.RepeatedFloat4Vector;
import org.apache.drill.exec.vector.RepeatedIntVector;
import org.apache.drill.exec.vector.RepeatedVarCharVector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

public class JSONRecordReader implements RecordReader {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(JSONRecordReader.class);
  private static final int DEFAULT_LENGTH = 256 * 1024; // 256kb
  public static final Charset UTF_8 = Charset.forName("UTF-8");

  private final Map<String, VectorHolder> valueVectorMap;
  private final FileSystem fileSystem;
  private final Path hadoopPath;

  private JsonParser parser;
  private SchemaIdGenerator generator;
  private DiffSchema diffSchema;
  private RecordSchema currentSchema;
  private List<Field> removedFields;
  private OutputMutator outputMutator;
  private BufferAllocator allocator;
  private int batchSize;
  private final FieldReference ref;

  public JSONRecordReader(FragmentContext fragmentContext, String inputPath, FileSystem fileSystem, int batchSize, FieldReference ref) {
    this.hadoopPath = new Path(inputPath);
    this.fileSystem = fileSystem;
    this.allocator = fragmentContext.getAllocator();
    this.batchSize = batchSize;
    valueVectorMap = Maps.newHashMap();
    this.ref = ref;
  }

  public JSONRecordReader(FragmentContext fragmentContext, String inputPath, FileSystem fileSystem, FieldReference ref) {
    this(fragmentContext, inputPath, fileSystem, DEFAULT_LENGTH, ref);
  }

  private JsonParser getParser() {
    return parser;
  }

  @Override
  public void setup(OutputMutator output) throws ExecutionSetupException {
    outputMutator = output;
    currentSchema = new ObjectSchema();
    diffSchema = new DiffSchema();
    removedFields = Lists.newArrayList();

    try {
      JsonFactory factory = new JsonFactory();
      parser = factory.createJsonParser(fileSystem.open(hadoopPath));
      parser.nextToken(); // Read to the first START_OBJECT token
      generator = new SchemaIdGenerator();
    } catch (IOException e) {
      throw new ExecutionSetupException(e);
    }
  }

  @Override
  public int next() {
    if (parser.isClosed() || !parser.hasCurrentToken()) {
      return 0;
    }

    resetBatch();

    int nextRowIndex = 0;

    try {
      while (ReadType.OBJECT.readRecord(this, null, nextRowIndex++, 0)) {
        parser.nextToken(); // Read to START_OBJECT token

        if (!parser.hasCurrentToken()) {
          parser.close();
          break;
        }
      }

      parser.nextToken();

      if (!parser.hasCurrentToken()) {
        parser.close();
      }

      // Garbage collect fields never referenced in this batch
      for (Field field : Iterables.concat(currentSchema.removeUnreadFields(), removedFields)) {
        diffSchema.addRemovedField(field);
        outputMutator.removeField(field.getAsMaterializedField(ref));
      }

      if (diffSchema.isChanged()) {
        outputMutator.setNewSchema();
      }


    } catch (IOException | SchemaChangeException e) {
      logger.error("Error reading next in Json reader", e);
    }

    for (VectorHolder holder : valueVectorMap.values()) {
      holder.populateVectorLength();
    }

    return nextRowIndex;
  }

  private void resetBatch() {
    for (VectorHolder value : valueVectorMap.values()) {
      value.reset();
    }

    currentSchema.resetMarkedFields();
    diffSchema.reset();
    removedFields.clear();
  }

  @Override
  public void cleanup() {
    try {
      parser.close();
    } catch (IOException e) {
      logger.warn("Error closing Json parser", e);
    }
  }


  private RecordSchema getCurrentSchema() {
    return currentSchema;
  }

  private void setCurrentSchema(RecordSchema schema) {
    currentSchema = schema;
  }

  private List<Field> getRemovedFields() {
    return removedFields;
  }

  public BufferAllocator getAllocator() {
    return allocator;
  }

  public static enum ReadType {
    ARRAY(END_ARRAY) {
      @Override
      public Field createField(RecordSchema parentSchema, String prefixFieldName, String fieldName, MajorType fieldType, int index) {
        return new NamedField(parentSchema, prefixFieldName, fieldName, fieldType);
      }

      @Override
      public RecordSchema createSchema() throws IOException {
        return new ObjectSchema();
      }
    },
    OBJECT(END_OBJECT) {
      @Override
      public Field createField(RecordSchema parentSchema,
                               String prefixFieldName,
                               String fieldName,
                               MajorType fieldType,
                               int index) {
        return new NamedField(parentSchema, prefixFieldName, fieldName, fieldType);
      }

      @Override
      public RecordSchema createSchema() throws IOException {
        return new ObjectSchema();
      }
    };

    private final JsonToken endObject;

    ReadType(JsonToken endObject) {
      this.endObject = endObject;
    }

    public JsonToken getEndObject() {
      return endObject;
    }

    @SuppressWarnings("ConstantConditions")
    public boolean readRecord(JSONRecordReader reader,
                              String prefixFieldName,
                              int rowIndex,
                              int groupCount) throws IOException, SchemaChangeException {
      JsonParser parser = reader.getParser();
      JsonToken token = parser.nextToken();
      JsonToken endObject = getEndObject();
      int colIndex = 0;
      boolean isFull = false;
      while (token != endObject) {
        if (token == FIELD_NAME) {
          token = parser.nextToken();
          continue;
        }

        String fieldName = parser.getCurrentName();
        MajorType fieldType = JacksonHelper.getFieldType(token, this == ReadType.ARRAY);
        ReadType readType = null;
        switch (token) {
          case START_ARRAY:
            readType = ReadType.ARRAY;
            groupCount++;
            break;
          case START_OBJECT:
            readType = ReadType.OBJECT;
            groupCount = 0;
            break;
        }

        if (fieldType != null) { // Including nulls
          boolean currentFieldFull = !recordData(
              readType,
              reader,
              fieldType,
              prefixFieldName,
              fieldName,
              rowIndex,
              colIndex,
              groupCount);

          isFull = isFull || currentFieldFull;
        }
        token = parser.nextToken();
        colIndex += 1;
      }
      return !isFull;
    }

    private void removeChildFields(List<Field> removedFields, Field field) {
      RecordSchema schema = field.getAssignedSchema();
      if (schema == null) {
        return;
      }
      for (Field childField : schema.getFields()) {
        removedFields.add(childField);
        if (childField.hasSchema()) {
          removeChildFields(removedFields, childField);
        }
      }
    }

    private boolean recordData(JSONRecordReader.ReadType readType,
                               JSONRecordReader reader,
                               MajorType fieldType,
                               String prefixFieldName,
                               String fieldName,
                               int rowIndex,
                               int colIndex,
                               int groupCount) throws IOException, SchemaChangeException {
      RecordSchema currentSchema = reader.getCurrentSchema();
      Field field = currentSchema.getField(fieldName == null ? prefixFieldName : fieldName, colIndex);
      boolean isFieldFound = field != null;
      List<Field> removedFields = reader.getRemovedFields();
      boolean newFieldLateBound = fieldType.getMinorType().equals(MinorType.LATE);

      if (isFieldFound && !field.getFieldType().equals(fieldType)) {
        boolean existingFieldLateBound = field.getFieldType().getMinorType().equals(MinorType.LATE);

        if (newFieldLateBound && !existingFieldLateBound) {
          fieldType = Types.overrideMinorType(fieldType, field.getFieldType().getMinorType());
        } else if (!newFieldLateBound && existingFieldLateBound) {
          field.setFieldType(Types.overrideMinorType(field.getFieldType(), fieldType.getMinorType()));
        } else if (!newFieldLateBound && !existingFieldLateBound) {
          if (field.hasSchema()) {
            removeChildFields(removedFields, field);
          }
          removedFields.add(field);
          currentSchema.removeField(field, colIndex);

          isFieldFound = false;
        }
      }

      if (!isFieldFound) {
        field = createField(
            currentSchema,
            prefixFieldName,
            fieldName,
            fieldType,
            colIndex
        );

        reader.recordNewField(field);
        currentSchema.addField(field);
      }

      field.setRead(true);

      VectorHolder holder = getOrCreateVectorHolder(reader, field);
      if (readType != null) {
        RecordSchema fieldSchema = field.getAssignedSchema();
        RecordSchema newSchema = readType.createSchema();

        if (readType != ReadType.ARRAY) {
          reader.setCurrentSchema(fieldSchema);
          if (fieldSchema == null) reader.setCurrentSchema(newSchema);
          readType.readRecord(reader, field.getFullFieldName(), rowIndex, groupCount);
        } else {
          readType.readRecord(reader, field.getFullFieldName(), rowIndex, groupCount);
        }

        reader.setCurrentSchema(currentSchema);

      } else if (holder != null && !newFieldLateBound && fieldType.getMinorType() != MinorType.LATE) {
        return addValueToVector(
            rowIndex,
            holder,
            JacksonHelper.getValueFromFieldType(
                reader.getParser(),
                fieldType.getMinorType()
            ),
            fieldType.getMinorType(),
            groupCount
        );
      }

      return true;
    }

    private static <T> boolean addValueToVector(int index, VectorHolder holder, T val, MinorType minorType, int groupCount) {
      switch (minorType) {
        case INT: {
          holder.incAndCheckLength(NullableIntHolder.WIDTH * 8 + 1);
          if (groupCount == 0) {
            if (val != null) {
              NullableIntVector int4 = (NullableIntVector) holder.getValueVector();
              NullableIntVector.Mutator m = int4.getMutator();
              m.set(index, (Integer) val);
            }
          } else {
            if (val == null) {
              throw new UnsupportedOperationException("Nullable repeated int is not supported.");
            }

            RepeatedIntVector repeatedInt4 = (RepeatedIntVector) holder.getValueVector();
            RepeatedIntVector.Mutator m = repeatedInt4.getMutator();
            holder.setGroupCount(index);
            m.add(index, (Integer) val);
          }

          return holder.hasEnoughSpace(NullableIntHolder.WIDTH * 8 + 1);
        }
        case FLOAT4: {
          holder.incAndCheckLength(NullableFloat4Holder.WIDTH * 8 + 1);
          if (groupCount == 0) {
            if (val != null) {
              NullableFloat4Vector float4 = (NullableFloat4Vector) holder.getValueVector();
              NullableFloat4Vector.Mutator m = float4.getMutator();
              m.set(index, (Float) val);
            }
          } else {
            if (val == null) {
              throw new UnsupportedOperationException("Nullable repeated float is not supported.");
            }

            RepeatedFloat4Vector repeatedFloat4 = (RepeatedFloat4Vector) holder.getValueVector();
            RepeatedFloat4Vector.Mutator m = repeatedFloat4.getMutator();
            holder.setGroupCount(index);
            m.add(index, (Float) val);
          }
          return holder.hasEnoughSpace(NullableFloat4Holder.WIDTH * 8 + 1);
        }
        case VARCHAR: {
          if (val == null) {
            return (index + 1) * 4 <= holder.getLength();
          } else {
            byte[] bytes = ((String) val).getBytes(UTF_8);
            int length = bytes.length;
            holder.incAndCheckLength(length);
            if (groupCount == 0) {
              NullableVarCharVector varLen4 = (NullableVarCharVector) holder.getValueVector();
              NullableVarCharVector.Mutator m = varLen4.getMutator();
              m.set(index, bytes);
            } else {
              RepeatedVarCharVector repeatedVarLen4 = (RepeatedVarCharVector) holder.getValueVector();
              RepeatedVarCharVector.Mutator m = repeatedVarLen4.getMutator();
              holder.setGroupCount(index);
              m.add(index, bytes);
            }
            return holder.hasEnoughSpace(length + 4 + 1);
          }
        }
        case BIT: {
          holder.incAndCheckLength(NullableBitHolder.WIDTH + 1);
          if (groupCount == 0) {
            if (val != null) {
              NullableBitVector bit = (NullableBitVector) holder.getValueVector();
              NullableBitVector.Mutator m = bit.getMutator();
              m.set(index, (Boolean) val ? 1 : 0);
            }
          } else {
            if (val == null) {
              throw new UnsupportedOperationException("Nullable repeated boolean is not supported.");
            }

            RepeatedBitVector repeatedBit = (RepeatedBitVector) holder.getValueVector();
            RepeatedBitVector.Mutator m = repeatedBit.getMutator();
            holder.setGroupCount(index);
            m.add(index, (Boolean) val ? 1 : 0);
          }
          return holder.hasEnoughSpace(NullableBitHolder.WIDTH + 1);
        }
        default:
          throw new DrillRuntimeException("Type not supported to add value. Type: " + minorType);
      }
    }

    private VectorHolder getOrCreateVectorHolder(JSONRecordReader reader, Field field) throws SchemaChangeException {
      return reader.getOrCreateVectorHolder(field);
    }

    public abstract RecordSchema createSchema() throws IOException;

    public abstract Field createField(RecordSchema parentSchema,
                                      String prefixFieldName,
                                      String fieldName,
                                      MajorType fieldType,
                                      int index);
  }

  private void recordNewField(Field field) {
    diffSchema.recordNewField(field);
  }

  private VectorHolder getOrCreateVectorHolder(Field field) throws SchemaChangeException {
    String fullFieldName = ref != null ? ref.getPath() + "." + field.getFullFieldName() : field.getFullFieldName();
    VectorHolder holder = valueVectorMap.get(fullFieldName);

    if (holder == null) {
      MajorType type = field.getFieldType();
      MinorType minorType = type.getMinorType();

      if (minorType.equals(MinorType.MAP) || minorType.equals(MinorType.LATE)) {
        return null;
      }

      MaterializedField f = MaterializedField.create(new SchemaPath(fullFieldName, ExpressionPosition.UNKNOWN), type);

      ValueVector v = TypeHelper.getNewVector(f, allocator);
      AllocationHelper.allocate(v, batchSize, 50);
      holder = new VectorHolder(v);
      valueVectorMap.put(fullFieldName, holder);
      outputMutator.addField(v);
      return holder;
    }
    return holder;
  }
}
TOP

Related Classes of org.apache.drill.exec.store.json.JSONRecordReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.