Package co.cask.cdap.internal.io

Source Code of co.cask.cdap.internal.io.Schema

/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package co.cask.cdap.internal.io;

import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableBiMap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.io.CharStreams;
import com.google.gson.stream.JsonWriter;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* This class represents schema of data types.
*/
public final class Schema {

  /**
   * Types known to Schema.
   */
  public enum Type {
    NULL(true),
    BOOLEAN(true),
    INT(true),
    LONG(true),
    FLOAT(true),
    DOUBLE(true),
    BYTES(true),
    STRING(true),

    ENUM(false),

    ARRAY(false),
    MAP(false),
    RECORD(false),
    UNION(false);

    private final boolean simpleType;

    private Type(boolean primitive) {
      this.simpleType = primitive;
    }

    /**
     * @return true if this enum represents a simple schema type.
     */
    public boolean isSimpleType() {
      return simpleType;
    }
  }

  /**
   * Represents a field inside a {@link Type#RECORD} schema.
   */
  public static final class Field {
    private final String name;
    private final Schema schema;

    /**
     * Creates a {@link Field} instance with the given name and {@link Schema}.
     *
     * @param name Name of the field.
     * @param schema Schema of the field.
     * @return A new {@link Field} instance.
     */
    public static Field of(String name, Schema schema) {
      return new Field(name, schema);
    }

    private Field(String name, Schema schema) {
      this.name = name;
      this.schema = schema;
    }

    /**
     * @return Name of the field.
     */
    public String getName() {
      return name;
    }

    /**
     * @return Schema of the field.
     */
    public Schema getSchema() {
      return schema;
    }
  }

  /**
   * Creates a {@link Schema} for the given type. The type given must be a
   * {@link Schema.Type#isSimpleType() Simple Type}.
   *
   * @param type Type of the schema to create.
   * @return A {@link Schema} with the given type.
   */
  public static Schema of(Type type) {
    Preconditions.checkArgument(type.isSimpleType(), "Type %s is not a simple type.", type);
    return new Schema(type, null, null, null, null, null, null, null);
  }

  /**
   * Creates a {@link Schema} of {@link Type#ENUM ENUM} type, with the given enum values.
   * The set of values given should be unique and must contains at least one value.
   * The ordering of values in the enum type schema would be the same as the order being passed in.
   *
   * @param values Enum values.
   * @return A {@link Schema} of {@link Type#ENUM ENUM} type.
   */
  public static Schema enumWith(String...values) {
    return enumWith(ImmutableList.copyOf(values));
  }

  /**
   * Creates a {@link Schema} of {@link Type#ENUM ENUM} type, with the given enum values.
   * The set of values given should be unique and must contains at least one value.
   * The ordering of values in the enum type schema would be the same as the {@link Iterable#iterator()} order.
   *
   * @param values Enum values.
   * @return A {@link Schema} of {@link Type#ENUM ENUM} type.
   */
  public static Schema enumWith(Iterable<String> values) {
    Set<String> uniqueValues = ImmutableSet.copyOf(values);
    Preconditions.checkArgument(uniqueValues.size() > 0, "No enum value provided.");
    Preconditions.checkArgument(Iterables.size(values) == uniqueValues.size(), "Duplicate enum value is not allowed.");
    return new Schema(Type.ENUM, uniqueValues, null, null, null, null, null, null);
  }

  /**
   * Creates a {@link Schema} of {@link Type#ENUM ENUM} type, with values extracted from the given {@link Enum} class.
   * The ordering of values in the enum type schema would be the same as the {@link Enum#ordinal()} order.
   *
   * @param enumClass Enum values.
   * @return A {@link Schema} of {@link Type#ENUM ENUM} type.
   */
  public static Schema enumWith(Class<Enum<?>> enumClass) {
    Enum<?>[] enumConstants = enumClass.getEnumConstants();
    String[] names = new String[enumConstants.length];
    for (int i = 0; i < enumConstants.length; i++) {
      names[i] = enumConstants[i].name();
    }
    return enumWith(names);
  }

  /**
   * Creates an {@link Type#ARRAY ARRAY} {@link Schema} of the given component type.
   * @param componentSchema Schema of the array component.
   * @return A {@link Schema} of {@link Type#ARRAY ARRAY} type.
   */
  public static Schema arrayOf(Schema componentSchema) {
    return new Schema(Type.ARRAY, null, componentSchema, null, null, null, null, null);
  }

  /**
   * Creates a {@link Type#MAP MAP} {@link Schema} of the given key and value types.
   * @param keySchema Schema of the map key.
   * @param valueSchema Schema of the map value
   * @return A {@link Schema} of {@link Type#MAP MAP} type.
   */
  public static Schema mapOf(Schema keySchema, Schema valueSchema) {
    return new Schema(Type.MAP, null, null, keySchema, valueSchema, null, null, null);
  }

  /**
   * Creates a {@link Type#RECORD RECORD} {@link Schema} of the given name. The schema created
   * doesn't carry any record fields, which makes it only useful to be used as a component schema
   * for other schema type, where the actual schema is resolved from the top level container schema.
   *
   * @param name Name of the record.
   * @return A {@link Schema} of {@link Type#RECORD RECORD} type.
   */
  public static Schema recordOf(String name) {
    Preconditions.checkNotNull(name, "Record name cannot be null.");
    return new Schema(Type.RECORD, null, null, null, null, name, null, null);
  }

  /**
   * Creates a {@link Type#RECORD RECORD} {@link Schema} with the given name and {@link Field Fields}.
   * The ordering of the fields inside the record would be the same as the one being passed in.
   *
   * @param name Name of the record
   * @param fields All the fields that the record contains.
   * @return A {@link Schema} of {@link Type#RECORD RECORD} type.
   */
  public static Schema recordOf(String name, Field...fields) {
    return recordOf(name, ImmutableList.copyOf(fields));
  }

  /**
   * Creates a {@link Type#RECORD RECORD} {@link Schema} with the given name and {@link Field Fields}.
   * The ordering of the fields inside the record would be the same as the {@link Iterable#iterator()} order.
   *
   * @param name Name of the record
   * @param fields All the fields that the record contains.
   * @return A {@link Schema} of {@link Type#RECORD RECORD} type.
   */
  public static Schema recordOf(String name, Iterable<Field> fields) {
    Preconditions.checkNotNull(name, "Record name cannot be null.");
    ImmutableMap.Builder<String, Field> fieldMapBuilder = ImmutableMap.builder();
    for (Field field : fields) {
      fieldMapBuilder.put(field.getName(), field);
    }
    Map<String, Field> fieldMap = fieldMapBuilder.build();
    Preconditions.checkArgument(fieldMap.size() > 0, "No record field provided for %s", name);
    return new Schema(Type.RECORD, null, null, null, null, name, fieldMap, null);
  }

  /**
   * Creates a {@link Type#UNION UNION} {@link Schema} which represents a union of all the given schemas.
   * The ordering of the schemas inside the union would be the same as the one being passed in.
   *
   * @param schemas All the {@link Schema Schemas} constitutes the union.
   * @return A {@link Schema} of {@link Type#UNION UNION} type.
   */
  public static Schema unionOf(Schema...schemas) {
    return unionOf(ImmutableList.copyOf(schemas));
  }

  /**
   * Creates a {@link Type#UNION UNION} {@link Schema} which represents a union of all the given schemas.
   * The ordering of the schemas inside the union would be the same as the {@link Iterable#iterator()} order.
   *
   * @param schemas All the {@link Schema Schemas} constitutes the union.
   * @return A {@link Schema} of {@link Type#UNION UNION} type.
   */
  public static Schema unionOf(Iterable<Schema> schemas) {
    List<Schema> schemaList = ImmutableList.copyOf(schemas);
    Preconditions.checkArgument(schemaList.size() > 0, "No union schema provided.");
    return new Schema(Type.UNION, null, null, null, null, null, null, schemaList);
  }

  private final Type type;

  private final BiMap<String, Integer> enumValues;
  private final BiMap<Integer, String> enumIndexes;

  private final Schema componentSchema;

  private final Schema keySchema;
  private final Schema valueSchema;
  private final Map.Entry<Schema, Schema> mapSchema;

  private final String recordName;
  private final Map<String, Field> fieldMap;
  private final List<Field> fields;

  private final List<Schema> unionSchemas;

  private String schemaString;
  private SchemaHash schemaHash;

  private Schema(Type type, Set<String> enumValues, Schema componentSchema, Schema keySchema, Schema valueSchema,
                 String recordName, Map<String, Field> fieldMap, List<Schema> unionSchemas) {
    this.type = type;
    this.enumValues = createIndex(enumValues);
    this.enumIndexes = this.enumValues == null ? null : this.enumValues.inverse();
    this.componentSchema = componentSchema;
    this.keySchema = keySchema;
    this.valueSchema = valueSchema;
    this.mapSchema = (keySchema == null || valueSchema == null) ? null : Maps.immutableEntry(keySchema, valueSchema);
    this.recordName = recordName;
    this.fieldMap = populateRecordFields(fieldMap);
    this.fields = this.fieldMap == null ? null : ImmutableList.copyOf(this.fieldMap.values());
    this.unionSchemas = unionSchemas;
  }

  /**
   * @return The {@link Type} that this schema represents.
   */
  public Type getType() {
    return type;
  }

  /**
   * @return An immutable {@link Set} of enum values or {@code null} if this is not a {@link Type#ENUM ENUM} schema.
   *         The {@link Set#iterator()} order would be the enum values orders.
   */
  public Set<String> getEnumValues() {
    return enumValues.keySet();
  }

  /**
   * @param value The enum value
   * @return The 0-base index of the given value in the enum values or {@code -1} if this is not a
   *         {@link Type#ENUM ENUM} schema.
   */
  public int getEnumIndex(String value) {
    if (enumValues == null) {
      return -1;
    }
    Integer idx = enumValues.get(value);
    return idx == null ? -1 : idx;
  }

  /**
   * @param idx The index in the enum values
   * @return The string represents the enum value, or {@code null} if this is not a {@link Type#ENUM ENUM} schema or
   *         the given index is invalid.
   */
  public String getEnumValue(int idx) {
    if (enumIndexes == null) {
      return null;
    }
    return enumIndexes.get(idx);
  }

  /**
   * @return The schema of the array component or {@code null} if this is not a {@link Type#ARRAY ARRAY} schema.
   */
  public Schema getComponentSchema() {
    return componentSchema;
  }

  /**
   * @return An immutable {@code Map.Entry} if this is a {@code Type#MAP MAP} schema or {@code null} otherwise.
   *         The {@code Map.Entry#getKey()} would returns the key schema, while {@code Map.Entry#getValue()}
   *         would returns the value schema.
   */
  public Map.Entry<Schema, Schema> getMapSchema() {
    return mapSchema;
  }

  /**
   * @return Name of the record if this is a {@link Type#RECORD RECORD} schema or {@code null} otherwise.
   */
  public String getRecordName() {
    return recordName;
  }

  /**
   * @return An immutable {@link List} of record {@link Field Fields} if this is a {@link Type#RECORD RECORD} schema
   *         or {@code null} otherwise.
   */
  public List<Field> getFields() {
    return fields;
  }

  /**
   * Returns the record {@link Field} of the given name.
   *
   * @param name Name of the field
   * @return A {@link Field} or {@code null} if there is no such field in this record
   *         or this is not a {@link Type#RECORD RECORD} schema.
   */
  public Field getField(String name) {
    if (fieldMap == null) {
      return null;
    }
    return fieldMap.get(name);
  }

  /**
   * @return An immutable {@link List} of schemas inside this union
   *         or {@code null} if this is not a {@link Type#UNION UNION} schema.
   */
  public List<Schema> getUnionSchemas() {
    return unionSchemas;
  }

  /**
   * @param idx Index to the union schemas
   * @return A {@link Schema} of the given union index or {@code null} if this is not a {@link Type#UNION UNION}
   *         schema or the given index is invalid.
   */
  public Schema getUnionSchema(int idx) {
    return (unionSchemas == null || idx < 0 || unionSchemas.size() <= idx) ? null : unionSchemas.get(idx);
  }

  @Override
  public String toString() {
    // The follow logic is thread safe, as all the fields buildString() needs are immutable.
    // It's possible that buildString() get triggered multiple times, but they should yield the same result.
    String str = schemaString;
    if (str == null) {
      schemaString = str = buildString();
    }
    return str;
  }

  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (other == null || getClass() != other.getClass()) {
      return false;
    }

    return getSchemaHash().equals(((Schema) other).getSchemaHash());
  }

  @Override
  public int hashCode() {
    return getSchemaHash().hashCode();
  }

  /**
   * @return A MD5 hash of this schema.
   */
  public SchemaHash getSchemaHash() {
    SchemaHash hash = schemaHash;
    if (hash == null) {
      schemaHash = hash = new SchemaHash(this);
    }
    return hash;
  }


  /**
   * Checks if the given target schema is compatible with this schema, meaning datum being written with this
   * schema could be projected correctly into the given target schema.
   *
   * TODO: Add link to document of the target type projection.
   *
   * @param target Schema to check for compatibility to this target
   * @return {@code true} if the schemas are compatible, {@code false} otherwise.
   */
  public boolean isCompatible(Schema target) {
    if (equals(target)) {
      return true;
    }
    Multimap<String, String> recordCompared = HashMultimap.create();
    return checkCompatible(target, recordCompared);
  }

  private boolean checkCompatible(Schema target, Multimap<String, String> recordCompared) {
    if (type.isSimpleType()) {
      if (type == target.getType()) {
        // Same simple type are always compatible
        return true;
      }
      switch (target.getType()) {
        case LONG:
          return type == Type.INT;
        case FLOAT:
          return type == Type.INT || type == Type.LONG;
        case DOUBLE:
          return type == Type.INT || type == Type.LONG || type == Type.FLOAT;
        case STRING:
          return type != Type.NULL && type != Type.BYTES;
        case UNION:
          for (Schema targetSchema : target.unionSchemas) {
            if (checkCompatible(targetSchema, recordCompared)) {
              return true;
            }
          }
      }
      return false;
    }

    if (type == target.type) {
      switch (type) {
        case ENUM:
          return target.getEnumValues().containsAll(getEnumValues());
        case ARRAY:
          // The component schema must be compatible
          return componentSchema.checkCompatible(target.getComponentSchema(), recordCompared);
        case MAP:
          // Both key and value schemas must be compatible
          return keySchema.checkCompatible(target.keySchema, recordCompared)
            && valueSchema.checkCompatible(target.valueSchema, recordCompared);
        case RECORD:
          // For every common field (by name), their schema must be compatible
          if (!recordCompared.containsEntry(recordName, target.recordName)) {
            recordCompared.put(recordName, target.recordName);
            for (Field field : fields) {
              Field targetField = target.getField(field.getName());
              if (targetField == null) {
                continue;
              }
              if (!field.getSchema().checkCompatible(targetField.getSchema(), recordCompared)) {
                return false;
              }
            }
          }
          return true;
        case UNION:
          // Compare each source union to target union
          for (Schema sourceSchema : unionSchemas) {
            for (Schema targetSchema : target.unionSchemas) {
              if (sourceSchema.checkCompatible(targetSchema, recordCompared)) {
                return true;
              }
            }
          }
          return false;
      }
    }

    if (type == Type.UNION || target.type == Type.UNION) {
      List<Schema> unions = type == Type.UNION ? unionSchemas : target.unionSchemas;
      Schema checkSchema = type == Type.UNION ? target : this;
      for (Schema schema : unions) {
        if (schema.checkCompatible(checkSchema, recordCompared)) {
          return true;
        }
      }
    }

    return false;
  }

  /**
   * Creates a map of indexes based on the iteration order of the given set.
   *
   * @param values Set of values to create index on
   * @return A map from the values to indexes in the set iteration order.
   */
  private <V> BiMap<V, Integer> createIndex(Set<V> values) {
    if (values == null) {
      return null;
    }

    ImmutableBiMap.Builder<V, Integer> builder = ImmutableBiMap.builder();
    int idx = 0;
    for (V value : values) {
      builder.put(value, idx++);
    }
    return builder.build();
  }

  /**
   * Resolves all field schemas.
   *
   * @param fields All the fields that need to be resolved.
   * @return A {@link Map} which has all the field schemas resolved.
   * @see #resolveSchema(Schema, java.util.Map)
   */
  private Map<String, Field> populateRecordFields(Map<String, Field> fields) {
    if (fields == null) {
      return null;
    }

    Map<String, Schema> knownRecordSchemas = Maps.newHashMap();
    knownRecordSchemas.put(recordName, this);
    ImmutableMap.Builder<String, Field> builder = ImmutableMap.builder();

    for (Map.Entry<String, Field> fieldEntry : fields.entrySet()) {
      String fieldName = fieldEntry.getKey();
      Field field = fieldEntry.getValue();
      Schema fieldSchema = resolveSchema(field.getSchema(), knownRecordSchemas);

      if (fieldSchema == field.getSchema()) {
        builder.put(fieldName, field);
      } else {
        builder.put(fieldName, Field.of(fieldName, fieldSchema));
      }
    }

    return builder.build();
  }

  /**
   * This method is to recursively resolves all name only record schema in the given schema.
   *
   * @param schema The schema needs to be resolved.
   * @param knownRecordSchemas The mapping of the already resolved record schemas.
   * @return A {@link Schema} that is structurally the same as the input schema, but with all
   *         name only record schemas resolved to full schemas (i.e. with fields sets).
   *         If nothing in the given schema needs to be resolved, the same schema instance would be returned,
   *         otherwise, a new instance would be returned.
   */
  private Schema resolveSchema(final Schema schema, final Map<String, Schema> knownRecordSchemas) {
    switch (schema.getType()) {
      case ARRAY:
        Schema componentSchema = resolveSchema(schema.getComponentSchema(), knownRecordSchemas);
        return (componentSchema == schema.getComponentSchema()) ? schema : Schema.arrayOf(componentSchema);
      case MAP:
        Map.Entry<Schema, Schema> entry = schema.getMapSchema();
        Schema keySchema = resolveSchema(entry.getKey(), knownRecordSchemas);
        Schema valueSchema = resolveSchema(entry.getValue(), knownRecordSchemas);
        return (keySchema == entry.getKey() && valueSchema == entry.getValue()) ?
                schema : Schema.mapOf(keySchema, valueSchema);
      case UNION:
        ImmutableList.Builder<Schema> schemaBuilder = ImmutableList.builder();
        boolean changed = false;
        for (Schema input : schema.getUnionSchemas()) {
          Schema output = resolveSchema(input, knownRecordSchemas);
          if (output != input) {
            changed = true;
          }
          schemaBuilder.add(output);
        }
        return changed ? Schema.unionOf(schemaBuilder.build()) : schema;
      case RECORD:
        if (schema.fields == null) {
          // It is a named record that refers to previously defined record
          Schema knownSchema = knownRecordSchemas.get(schema.recordName);
          Preconditions.checkArgument(knownSchema != null, "Undefined schema %s", schema.recordName);
          return knownSchema;
        } else {
          // It is a concrete schema
          knownRecordSchemas.put(schema.recordName, schema);
          return schema;
        }
    }
    return schema;
  }

  /**
   * Helper method to encode this schema into json string.
   *
   * @return A json string representing this schema.
   */
  private String buildString() {
    if (type.isSimpleType()) {
      return '"' + type.name().toLowerCase() + '"';
    }
    StringBuilder builder = new StringBuilder();
    JsonWriter writer = new JsonWriter(CharStreams.asWriter(builder));
    try {
      new co.cask.cdap.internal.io.SchemaTypeAdapter().write(writer, this);
      writer.close();
      return builder.toString();
    } catch (IOException e) {
      // It should never throw IOException on the StringBuilder Writer, if it does, something very wrong.
      throw Throwables.propagate(e);
    }
  }
}
TOP

Related Classes of co.cask.cdap.internal.io.Schema

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.