Package parquet.schema

Examples of parquet.schema.MessageType

The snippets below, collected from open-source projects, show how a MessageType is built from Hive column metadata, parsed from its textual form, projected for column pruning, written to files, and merged across footers.

import java.util.List;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

import parquet.schema.MessageType;
import parquet.schema.Type.Repetition;

public class HiveSchemaConverter {

  // Builds a Parquet MessageType named "hive_schema" from parallel lists of Hive
  // column names and types; convertTypes(...) is defined elsewhere in this class.
  public static MessageType convert(final List<String> columnNames, final List<TypeInfo> columnTypes) {
    return new MessageType("hive_schema", convertTypes(columnNames, columnTypes));
  }
View Full Code Here
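
A minimal sketch of invoking the converter above; the column names are hypothetical, and TypeInfoUtils is assumed to come from Hive's serde2 library:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import parquet.schema.MessageType;

public class ConvertExample {
  public static void main(String[] args) {
    // Hypothetical Hive table layout: an int id and a string name.
    final List<String> names = Arrays.asList("id", "name");
    final List<TypeInfo> types = TypeInfoUtils.getTypeInfosFromTypeString("int,string");
    final MessageType schema = HiveSchemaConverter.convert(names, types);
    System.out.println(schema); // prints the schema in Parquet's message syntax
  }
}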



            // Columns missing from the file are added as optional binary
            // placeholders, which allows schema evolution.
            typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
          }
        }
      }
      MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
      contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

      final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

      final List<Type> typeListWanted = new ArrayList<Type>();

      for (final Integer idx : indexColumnsWanted) {
        if (idx < listColumns.size()) {
          String col = listColumns.get(idx);
          if (indexAccess) {
            typeListWanted.add(fileSchema.getFields().get(idx));
          } else {
            col = col.toLowerCase();
            if (lowerCaseFileSchemaColumns.containsKey(col)) {
              typeListWanted.add(tableSchema.getType(lowerCaseFileSchemaColumns.get(col)));
            }
          }
        }
      }
      MessageType requestedSchemaByUser = new MessageType(fileSchema.getName(), typeListWanted);
      return new ReadContext(requestedSchemaByUser, contextMetadata);
    } else {
      contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
      return new ReadContext(fileSchema, contextMetadata);
    }
View Full Code Here
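
The branch above builds a second MessageType containing only the columns a query asks for. The same pruning idea in a standalone sketch, with a hypothetical file schema and column selection:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
import parquet.schema.Type;

public class ProjectionExample {
  public static void main(String[] args) {
    MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message hive_schema { optional binary name; optional int32 age; optional binary city; }");
    // Keep only the columns the query actually reads (hypothetical selection).
    List<String> wanted = Arrays.asList("name", "city");
    List<Type> fields = new ArrayList<Type>();
    for (String col : wanted) {
      if (fileSchema.containsField(col)) {
        fields.add(fileSchema.getType(col));
      }
    }
    MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
    System.out.println(requestedSchema);
  }
}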

    final Map<String, String> metadata = readContext.getReadSupportMetadata();
    if (metadata == null) {
      throw new IllegalStateException("ReadContext not initialized properly. " +
        "Don't know the Hive Schema.");
    }
    final MessageType tableSchema = MessageTypeParser.parseMessageType(metadata.get(HIVE_SCHEMA_KEY));
    return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
  }
View Full Code Here
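
This works because MessageType.toString() emits the textual message format that MessageTypeParser reads back, so a schema can travel through string-valued metadata. A minimal round-trip:

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class RoundTripExample {
  public static void main(String[] args) {
    MessageType original = MessageTypeParser.parseMessageType(
        "message hive_schema { optional int64 id; optional binary payload; }");
    // Serialize to the textual message format, then parse it back.
    String serialized = original.toString();
    MessageType reparsed = MessageTypeParser.parseMessageType(serialized);
    System.out.println(original.equals(reparsed)); // true
  }
}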

  }

  private void testConversion(final String columnNamesStr, final String columnsTypeStr, final String expectedSchema) throws Exception {
    final List<String> columnNames = createHiveColumnsFrom(columnNamesStr);
    final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr);
    final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
    final MessageType expectedMT = MessageTypeParser.parseMessageType(expectedSchema);
    assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + expectedSchema, expectedMT, messageTypeFound);
  }
View Full Code Here
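
A sketch of how this helper might be called from a test in the same class; the columns are hypothetical and the exact expected text depends on the converter version:

testConversion(
    "id,name",
    "int,string",
    "message hive_schema {\n"
        + "  optional int32 id;\n"
        + "  optional binary name;\n"
        + "}\n");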

  public void testMapOriginalType() throws Exception {
    final String hiveColumnTypes = "map<string,string>";
    final String hiveColumnNames = "mapCol";
    final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames);
    final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes);
    final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
    // The resulting MessageType has a single optional field named mapCol
    // whose original type is MAP.
    assertEquals(1, messageTypeFound.getFieldCount());
    parquet.schema.Type topLevel = messageTypeFound.getFields().get(0);
    assertEquals("mapCol", topLevel.getName());
    assertEquals(OriginalType.MAP, topLevel.getOriginalType());
    assertEquals(Repetition.OPTIONAL, topLevel.getRepetition());

    assertEquals(1, topLevel.asGroupType().getFieldCount());
View Full Code Here
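
For reference, a map column like this one is conventionally laid out as a repeated key/value group. A hedged sketch of that layout, parsed and inspected (not necessarily byte-for-byte what HiveSchemaConverter emits):

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class MapSchemaExample {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message hive_schema {\n"
            + "  optional group mapCol (MAP) {\n"
            + "    repeated group map (MAP_KEY_VALUE) {\n"
            + "      required binary key (UTF8);\n"
            + "      optional binary value (UTF8);\n"
            + "    }\n"
            + "  }\n"
            + "}");
    System.out.println(schema.getType("mapCol").getOriginalType()); // MAP
  }
}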

    // Close the message definition. Note that the schema parser requires a
    // semicolon after every field declaration, so the trailing one stays.
    messageSchema += "}";

    MessageType schema = MessageTypeParser.parseMessageType(messageSchema);

    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    HashMap<String, Integer> columnValuesWritten = new HashMap<String, Integer>();
    int valsWritten;
    for (int k = 0; k < props.numberRowGroups; k++) {
      w.startBlock(props.recordsPerRowGroup);
      currentBooleanByte = 0;
      booleanBitCounter.reset();

      for (FieldInfo fieldInfo : props.fields.values()) {

        if (!columnValuesWritten.containsKey(fieldInfo.name)) {
          columnValuesWritten.put((String) fieldInfo.name, 0);
          valsWritten = 0;
        } else {
          valsWritten = columnValuesWritten.get(fieldInfo.name);
        }

        String[] path1 = {(String) fieldInfo.name};
        ColumnDescriptor c1 = schema.getColumnDescription(path1);

        w.startColumn(c1, props.recordsPerRowGroup, codec);
        int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
        byte[] bytes;
        RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(
View Full Code Here
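
The writer resolves each column through the schema's descriptors. That lookup in isolation, against a hypothetical two-column schema:

import parquet.column.ColumnDescriptor;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class ColumnLookupExample {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message test { required int32 integer; required boolean bit; }");
    // Column paths are string arrays; top-level columns have a single path element.
    ColumnDescriptor desc = schema.getColumnDescription(new String[]{"integer"});
    System.out.println(desc.getType());               // INT32
    System.out.println(desc.getMaxDefinitionLevel()); // 0 for a required top-level column
  }
}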

      if (blocksForCurrentSplit.isEmpty()) {
        LOG.debug("HDFS block without row group: " + hdfsBlocks[i]);
      } else {
        long length = 0;
        // Parse the requested schema once per split rather than once per block.
        MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
        for (BlockMetaData block : blocksForCurrentSplit) {
          List<ColumnChunkMetaData> columns = block.getColumns();
          for (ColumnChunkMetaData column : columns) {
            // Count only the bytes of columns the requested schema actually reads.
            if (requested.containsPath(column.getPath().toArray())) {
              length += column.getTotalSize();
            }
          }
        }
        splits.add(new ParquetInputSplit(
View Full Code Here
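
containsPath is what decides whether a column chunk counts toward the split length. A minimal demonstration with a nested schema:

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class ContainsPathExample {
  public static void main(String[] args) {
    MessageType requested = MessageTypeParser.parseMessageType(
        "message m { optional group user { optional binary name; } }");
    // Nested columns are addressed by their full path.
    System.out.println(requested.containsPath(new String[]{"user", "name"})); // true
    System.out.println(requested.containsPath(new String[]{"user", "age"}));  // false
  }
}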

  }

  private static void add(ParquetMetadata footer) {
    // The schema is per-file, so look it up once outside the block loop.
    MessageType schema = footer.getFileMetaData().getSchema();
    for (BlockMetaData blockMetaData : footer.getBlocks()) {
      ++blockCount;
      recordCount += blockMetaData.getRowCount();
      List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
      for (ColumnChunkMetaData columnMetaData : columns) {
        ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
        add(
            desc,
            columnMetaData.getValueCount(),
            columnMetaData.getTotalSize(),
            columnMetaData.getTotalUncompressedSize(),
View Full Code Here
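
A hedged sketch of producing the ParquetMetadata this method consumes; readFooter(Configuration, Path) is the pre-Apache parquet-mr API, and the file path here is hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class FooterExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hypothetical input file.
    Path file = new Path("/tmp/example.parquet");
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println(schema);
  }
}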

   * @return the result of the merge
   */
  static GlobalMetaData mergeInto(
      FileMetaData toMerge,
      GlobalMetaData mergedMetadata) {
    MessageType schema = null;
    Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
    Set<String> createdBy = new HashSet<String>();
    if (mergedMetadata != null) {
      schema = mergedMetadata.getSchema();
      newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
      createdBy.addAll(mergedMetadata.getCreatedBy());
    }
    if ((schema == null && toMerge.getSchema() != null)
        || (schema != null && !schema.equals(toMerge.getSchema()))) {
      schema = mergeInto(toMerge.getSchema(), schema);
    }
    for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
      Set<String> values = newKeyValues.get(entry.getKey());
      if (values == null) {
View Full Code Here
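
The schema half of this merge boils down to combining two MessageTypes, which parquet-mr exposes as MessageType.union. A small sketch:

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class SchemaUnionExample {
  public static void main(String[] args) {
    MessageType a = MessageTypeParser.parseMessageType(
        "message m { optional int32 id; }");
    MessageType b = MessageTypeParser.parseMessageType(
        "message m { optional int32 id; optional binary name; }");
    // union merges the field lists; incompatible redefinitions throw an exception.
    MessageType merged = a.union(b);
    System.out.println(merged); // contains both id and name
  }
}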
