Package parquet.column

Examples of parquet.column.ColumnDescriptor
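A ColumnDescriptor identifies one leaf column of a Parquet schema: the column's path from the root of the schema, its primitive type, the declared type length (used by FIXED_LEN_BYTE_ARRAY), and the maximum repetition and definition levels that encode the column's nesting and nullability. The snippets below show the class being consumed by record readers and writers, and being constructed by the schema layer of parquet-mr.

As a minimal sketch of the accessors the examples rely on (class name and schema string are invented for illustration; the constructor signature appears verbatim in the snippets further down):

    import java.util.List;

    import parquet.column.ColumnDescriptor;
    import parquet.schema.MessageType;
    import parquet.schema.MessageTypeParser;

    public class ColumnDescriptorTour {
      public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message example { required int32 id; optional binary name; }");
        // One ColumnDescriptor per primitive (leaf) column of the schema.
        List<ColumnDescriptor> columns = schema.getColumns();
        for (ColumnDescriptor column : columns) {
          System.out.println(String.join(".", column.getPath())
              + " type=" + column.getType()
              + " maxDef=" + column.getMaxDefinitionLevel()
              + " maxRep=" + column.getMaxRepetitionLevel());
        }
      }
    }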


        // Pair each source column with the target property at the same index;
        // any surplus source columns are mapped to null.
        List<ColumnDescriptor> sources = source.getColumns();
        List<? extends PropertyDescriptor> targets = target.getPropertyDescriptors();
        List<Mapping> mappings = new ArrayList<Mapping>();
        int limit = Math.min(sources.size(), targets.size());
        for (int i = 0; i < limit; i++) {
            ColumnDescriptor s = sources.get(i);
            PropertyDescriptor t = targets.get(i);
            mappings.add(new Mapping(s, t));
        }
        for (int i = limit, n = sources.size(); i < n; i++) {
            mappings.add(new Mapping(sources.get(i), null));
        }


    // From a Parquet record reader: compute the total bit width of the fixed-width
    // columns in the row group and decide how many records fit in a batch.
    totalRecords = footer.getBlocks().get(rowGroupIndex).getRowCount();

    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    allFieldsFixedLength = true;
    ColumnDescriptor column;
    ColumnChunkMetaData columnChunkMetaData;

    // loop to add up the length of the fixed width columns and build the schema
    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      // sum the lengths of all of the fixed length fields
      if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
        // There is no support for the fixed-length binary type in parquet yet; leaving a task here as a reminder
        // TODO - implement this when the feature is added upstream
//        if (column.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
//          bitWidthAllFixedFields += column.getTypeLength() * 8;
//        } else {
//          // fall through to the generic fixed-width accounting below
//        }

        bitWidthAllFixedFields += getTypeLengthInBits(column.getType());
      } else {
        allFieldsFixedLength = false;
      }

    }
    rowGroupOffset = footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();

    if (allFieldsFixedLength) {
      recordsPerBatch = (int) Math.min(batchSize / bitWidthAllFixedFields, footer.getBlocks().get(0).getColumns().get(0).getValueCount());
    }
    try {
      ArrayList<VarLenBinaryReader.VarLengthColumn> varLengthColumns = new ArrayList<>();
      ArrayList<VarLenBinaryReader.NullableVarLengthColumn> nullableVarLengthColumns = new ArrayList<>();
      // initialize all of the column read status objects
      boolean fieldFixedLength = false;
      MaterializedField field;
      for (int i = 0; i < columns.size(); ++i) {
        column = columns.get(i);
        // use the chunk metadata of the row group being read
        columnChunkMetaData = footer.getBlocks().get(rowGroupIndex).getColumns().get(i);
        field = MaterializedField.create(toFieldName(column.getPath()),
            toMajorType(column.getType(), getDataMode(column)));
        fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
        ValueVector v = TypeHelper.getNewVector(field, allocator);
        if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
          createFixedColumnReader(fieldFixedLength, column, columnChunkMetaData, recordsPerBatch, v);
        } else {
          if (column.getMaxDefinitionLevel() == 0) { // column is required
            varLengthColumns.add(new VarLenBinaryReader.VarLengthColumn(this, -1, column, columnChunkMetaData, false, v));
          } else {
            nullableVarLengthColumns.add(new VarLenBinaryReader.NullableVarLengthColumn(this, -1, column, columnChunkMetaData, false, v));
          }

    // A more complete variant of the reader setup above: convert the footer metadata,
    // honor the query's column selection, and handle FIXED_LEN_BYTE_ARRAY and repeated columns.
    columnStatuses = new ArrayList<>();
    totalRecords = footer.getBlocks().get(rowGroupIndex).getRowCount();
    List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
    allFieldsFixedLength = true;
    ColumnDescriptor column;
    ColumnChunkMetaData columnChunkMetaData;
    int columnsToScan = 0;

    MaterializedField field;
    ParquetMetadataConverter metaConverter = new ParquetMetadataConverter();
    FileMetaData fileMetaData;

    // TODO - figure out how to deal with this better once we add nested reading;
    // note: also look at where this map is used below
    // store a map from column name to converted types if they are non-null
    HashMap<String, SchemaElement> schemaElements = new HashMap<>();
    fileMetaData = metaConverter.toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    for (SchemaElement se : fileMetaData.getSchema()) {
      schemaElements.put(se.getName(), se);
    }

    // loop to add up the length of the fixed width columns and build the schema
    for (int i = 0; i < columns.size(); ++i) {
      column = columns.get(i);
      logger.debug("name: " + fileMetaData.getSchema().get(i).name);
      SchemaElement se = schemaElements.get(column.getPath()[0]);
      MajorType mt = ParquetToDrillTypeConverter.toMajorType(column.getType(), se.getType_length(), getDataMode(column), se);
      field = MaterializedField.create(toFieldName(column.getPath()), mt);
      if (!fieldSelected(field)) {
        continue;
      }
      columnsToScan++;
      // sum the lengths of all of the fixed length fields
      if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
        if (column.getMaxRepetitionLevel() > 0) {
          allFieldsFixedLength = false;
        }
        // fixed-length byte arrays carry their width in the schema element rather than the type
        if (column.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
          bitWidthAllFixedFields += se.getType_length() * 8;
        } else {
          bitWidthAllFixedFields += getTypeLengthInBits(column.getType());
        }
      } else {
        allFieldsFixedLength = false;
      }
    }
    rowGroupOffset = footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();

    // none of the columns in the parquet file matched the columns requested by the query
    if (columnsToScan == 0) {
      throw new ExecutionSetupException("Error reading from parquet file. No columns requested were found in the file.");
    }
    if (allFieldsFixedLength) {
      recordsPerBatch = (int) Math.min(Math.min(batchSize / bitWidthAllFixedFields,
          footer.getBlocks().get(0).getColumns().get(0).getValueCount()), 65535);
    }
    else {
      recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
    }

    try {
      ValueVector v;
      ConvertedType convertedType;
      SchemaElement schemaElement;
      ArrayList<VarLengthColumn> varLengthColumns = new ArrayList<>();
      // initialize all of the column read status objects
      boolean fieldFixedLength = false;
      for (int i = 0; i < columns.size(); ++i) {
        column = columns.get(i);
        columnChunkMetaData = footer.getBlocks().get(rowGroupIndex).getColumns().get(i);
        schemaElement = schemaElements.get(column.getPath()[0]);
        convertedType = schemaElement.getConverted_type();
        MajorType type = ParquetToDrillTypeConverter.toMajorType(column.getType(), schemaElement.getType_length(), getDataMode(column), schemaElement);
        field = MaterializedField.create(toFieldName(column.getPath()), type);
        // the field was not requested to be read
        if (!fieldSelected(field)) continue;

        fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
        v = output.addField(field, (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode()));
        if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
          if (column.getMaxRepetitionLevel() > 0) {
            ColumnReader dataReader = ColumnReaderFactory.createFixedColumnReader(this, fieldFixedLength, column, columnChunkMetaData, recordsPerBatch,
                ((RepeatedFixedWidthVector) v).getMutator().getDataVector(), schemaElement);
            varLengthColumns.add(new FixedWidthRepeatedReader(this, dataReader,
                getTypeLengthInBits(column.getType()), -1, column, columnChunkMetaData, false, v, schemaElement));
          }
          else {
            columnStatuses.add(ColumnReaderFactory.createFixedColumnReader(this, fieldFixedLength, column, columnChunkMetaData, recordsPerBatch, v,
                schemaElement));
          }

        // From a test that writes Parquet files column by column: track how many values
        // were already written for the field, then look up its ColumnDescriptor by path
        // and start the column for this row group.
        } else {
          valsWritten = columnValuesWritten.get(fieldInfo.name);
        }

        String[] path1 = {(String) fieldInfo.name};
        ColumnDescriptor c1 = schema.getColumnDescription(path1);

        w.startColumn(c1, props.recordsPerRowGroup, codec);
        int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
        byte[] bytes;
        RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage);

    // Building one ColumnDescriptor per leaf path of a schema.
    List<String[]> paths = this.getPaths(0);
    List<ColumnDescriptor> columns = new ArrayList<ColumnDescriptor>(paths.size());
    for (String[] path : paths) {
      // TODO: optimize this
      PrimitiveType primitiveType = getType(path).asPrimitiveType();
      columns.add(new ColumnDescriptor(
                      path,
                      primitiveType.getPrimitiveTypeName(),
                      primitiveType.getTypeLength(),
                      getMaxRepetitionLevel(path),
                      getMaxDefinitionLevel(path)));
    }
    return columns;

  // Creating the descriptor when the column IO tree is wired up: the repetition and
  // definition levels have just been computed by setLevels.
  @Override
  void setLevels(int r, int d, String[] fieldPath, int[] fieldIndexPath, List<ColumnIO> repetition, List<ColumnIO> path) {
    super.setLevels(r, d, fieldPath, fieldIndexPath, repetition, path);
    PrimitiveType type = getType().asPrimitiveType();
    this.columnDescriptor = new ColumnDescriptor(
        fieldPath,
        type.getPrimitiveTypeName(),
        type.getTypeLength(),
        getRepetitionLevel(),
        getDefinitionLevel());
  }

  // Looking up a single column's descriptor by path: the type comes from the schema
  // leaf and the levels from the path's position in the tree.
  public ColumnDescriptor getColumnDescription(String[] path) {
    int maxRep = getMaxRepetitionLevel(path);
    int maxDef = getMaxDefinitionLevel(path);
    PrimitiveType type = getType(path).asPrimitiveType();
    return new ColumnDescriptor(path, type.getPrimitiveTypeName(),
                                type.getTypeLength(), maxRep, maxDef);
  }
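For reference, a minimal sketch of getColumnDescription in use, showing the level conventions the reader snippets above depend on (maxDefinitionLevel == 0 marks a required column). The class name and schema string are invented for illustration:

    import parquet.column.ColumnDescriptor;
    import parquet.schema.MessageType;
    import parquet.schema.MessageTypeParser;

    public class ColumnLevelsDemo {
      public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message m { required int64 id; optional group user { optional binary email; } }");

        ColumnDescriptor id = schema.getColumnDescription(new String[] {"id"});
        ColumnDescriptor email = schema.getColumnDescription(new String[] {"user", "email"});

        // A required top-level column: nothing above it can be missing.
        System.out.println("id maxDef = " + id.getMaxDefinitionLevel());       // 0
        // Optional field inside an optional group: two levels can be null.
        System.out.println("email maxDef = " + email.getMaxDefinitionLevel()); // 2
        // Neither column is repeated, so the max repetition level is 0.
        System.out.println("email maxRep = " + email.getMaxRepetitionLevel()); // 0
      }
    }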
