}
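// Reset per-read state and gather the column descriptors from the file schema
// so we can determine which of the selected columns are fixed width.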
columnStatuses = new ArrayList<>();
// totalRecords = footer.getBlocks().get(rowGroupIndex).getRowCount();
List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
allFieldsFixedLength = true;
ColumnDescriptor column;
ColumnChunkMetaData columnChunkMetaData;
int columnsToScan = 0;
mockRecordsRead = 0;
MaterializedField field;
// ParquetMetadataConverter metaConverter = new ParquetMetadataConverter();
FileMetaData fileMetaData;
logger.debug("Reading row group({}) with {} records in file {}.", rowGroupIndex, footer.getBlocks().get(rowGroupIndex).getRowCount(),
hadoopPath.toUri().getPath());
totalRecordsRead = 0;
// TODO - figure out how to deal with this better once we add nested reading; note also where this map is used below
// store a map from column name to converted types if they are non-null
HashMap<String, SchemaElement> schemaElements = new HashMap<>();
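// Convert the footer back to Parquet thrift metadata so each column's SchemaElement
// (converted type, fixed type length) can be looked up by its top-level name.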
fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
for (SchemaElement se : fileMetaData.getSchema()) {
schemaElements.put(se.getName(), se);
}
// loop to add up the length of the fixed width columns and build the schema
for (int i = 0; i < columns.size(); ++i) {
column = columns.get(i);
logger.debug("name: " + fileMetaData.getSchema().get(i).name);
SchemaElement se = schemaElements.get(column.getPath()[0]);
MajorType mt = ParquetToDrillTypeConverter.toMajorType(column.getType(), se.getType_length(), getDataMode(column), se);
field = MaterializedField.create(toFieldName(column.getPath()), mt);
if (!fieldSelected(field)) {
continue;
}
columnsToScan++;
// sum the lengths of all of the fixed length fields
if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
if (column.getMaxRepetitionLevel() > 0) {
allFieldsFixedLength = false;
}
if (column.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
bitWidthAllFixedFields += se.getType_length() * 8;
} else {
bitWidthAllFixedFields += getTypeLengthInBits(column.getType());
}
} else {
allFieldsFixedLength = false;
}
}
// rowGroupOffset = footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getFirstDataPageOffset();
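// When every selected column is fixed width, derive the batch size from the configured
// batch budget and the total bit width of one row, capped by the row group's value count
// and the 65535-records-per-batch ceiling; otherwise fall back to the default record count.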
if (columnsToScan != 0 && allFieldsFixedLength) {
recordsPerBatch = (int) Math.min(Math.min(batchSize / bitWidthAllFixedFields,
footer.getBlocks().get(rowGroupIndex).getColumns().get(0).getValueCount()), 65535);
} else {
recordsPerBatch = DEFAULT_RECORDS_TO_READ_IF_NOT_FIXED_WIDTH;
}
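// Second pass over the columns: create a value vector for each selected field and the
// column reader that will populate it.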
try {
ValueVector v;
SchemaElement schemaElement;
ArrayList<VarLengthColumn> varLengthColumns = new ArrayList<>();
// initialize all of the column read status objects
boolean fieldFixedLength;
for (int i = 0; i < columns.size(); ++i) {
column = columns.get(i);
columnChunkMetaData = footer.getBlocks().get(rowGroupIndex).getColumns().get(i);
schemaElement = schemaElements.get(column.getPath()[0]);
MajorType type = ParquetToDrillTypeConverter.toMajorType(column.getType(), schemaElement.getType_length(), getDataMode(column), schemaElement);
field = MaterializedField.create(toFieldName(column.getPath()), type);
// the field was not requested to be read
if (!fieldSelected(field)) continue;
fieldFixedLength = column.getType() != PrimitiveType.PrimitiveTypeName.BINARY;
v = output.addField(field, (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode()));
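// Non-BINARY columns are fixed width. Repeated fixed-width columns are wrapped in a
// FixedWidthRepeatedReader and processed alongside the variable-length columns, while
// flat fixed-width columns get a plain fixed column reader.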
if (column.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
if (column.getMaxRepetitionLevel() > 0) {
ColumnReader dataReader = ColumnReaderFactory.createFixedColumnReader(this, fieldFixedLength,
column, columnChunkMetaData, recordsPerBatch,
((RepeatedFixedWidthVector) v).getMutator().getDataVector(), schemaElement);
varLengthColumns.add(new FixedWidthRepeatedReader(this, dataReader,
getTypeLengthInBits(column.getType()), -1, column, columnChunkMetaData, false, v, schemaElement));
}
else {
columnStatuses.add(ColumnReaderFactory.createFixedColumnReader(this, fieldFixedLength,
column, columnChunkMetaData, recordsPerBatch, v,
schemaElement));