/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.avro;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Parser;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.reflect.ReflectData;
import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineCompilationException;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.MorphlineRuntimeException;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.cloudera.cdk.morphline.base.Configs;
import com.cloudera.cdk.morphline.base.Fields;
import com.cloudera.cdk.morphline.stdio.AbstractParser;
import com.google.common.base.Preconditions;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/**
* Command that converts a morphline record to an Avro record.
*
* @since 0.9.0
*/
public final class ToAvroBuilder implements CommandBuilder {
/** Returns the single command name, {@code toAvro}, under which this builder is registered. */
@Override
public Collection<String> getNames() {
  final List<String> commandNames = Collections.singletonList("toAvro");
  return commandNames;
}
/**
 * Creates a new {@code ToAvro} command from the given configuration, wired between the
 * given parent and child commands.
 */
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
  final Command command = new ToAvro(this, config, parent, child, context);
  return command;
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class ToAvro extends AbstractCommand {
/** Maps an Avro field name to the name of the morphline record field that supplies its value. */
private final Map<String, String> mappings = new HashMap<String, String>();
/** Schema parsed from schemaString or schemaFile; null when the schema is read from schemaField. */
private final Schema fixedSchema;
/** Name of the record field that carries the Avro schema; null when a fixed schema is configured. */
private final String schemaField;
// Sentinel returned by the conversion methods to signal failure;
// more efficient than raising & catching exceptions
private static final Object ERROR = new Object();
/**
 * Reads the command configuration: exactly one schema source (schemaFile, schemaString or
 * schemaField) plus the optional "mappings" section.
 *
 * @throws MorphlineCompilationException if zero or more than one schema source is configured,
 *         or if the schema file cannot be read
 */
public ToAvro(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
  super(builder, config, parent, child, context);
  final String schemaFile = getConfigs().getString(config, "schemaFile", null);
  final String schemaString = getConfigs().getString(config, "schemaString", null);
  this.schemaField = getConfigs().getString(config, "schemaField", null);

  // Count how many of the three mutually exclusive schema sources were supplied.
  int configuredSources = 0;
  for (String source : new String[] {schemaFile, schemaString, schemaField}) {
    if (source != null) {
      configuredSources++;
    }
  }
  if (configuredSources == 0) {
    throw new MorphlineCompilationException(
        "Either schemaFile or schemaString or schemaField must be defined", config);
  }
  if (configuredSources > 1) {
    throw new MorphlineCompilationException(
        "Must define only one of schemaFile or schemaString or schemaField at the same time", config);
  }

  // Parse the schema up front when it is fixed; otherwise it is looked up per record.
  Schema parsedSchema = null;
  if (schemaString != null) {
    parsedSchema = new Parser().parse(schemaString);
  } else if (schemaFile != null) {
    try {
      parsedSchema = new Parser().parse(new File(schemaFile));
    } catch (IOException e) {
      throw new MorphlineCompilationException(
          "Cannot parse external Avro schema file: " + schemaFile, config, e);
    }
  }
  this.fixedSchema = parsedSchema;

  final Config mappingsConfig = getConfigs().getConfig(config, "mappings", ConfigFactory.empty());
  for (Map.Entry<String, Object> mapping : new Configs().getEntrySet(mappingsConfig)) {
    mappings.put(mapping.getKey(), mapping.getValue().toString());
  }
  validateArguments();
}
/**
 * Builds an Avro record from the fields of the input record and passes a copy of the input
 * record, carrying the Avro record as its attachment body, to the next command.
 *
 * @return false if any schema field cannot be populated from the input record; otherwise the
 *         result of the downstream command
 */
@Override
protected boolean doProcess(Record inputRecord) {
  final Schema schema;
  if (schemaField == null) {
    schema = fixedSchema;
  } else {
    schema = (Schema) inputRecord.getFirstValue(schemaField);
    Preconditions.checkNotNull(schema);
  }

  final Record outputRecord = inputRecord.copy();
  AbstractParser.removeAttachments(outputRecord);
  final IndexedRecord avroRecord = new GenericData.Record(schema);

  for (Field field : schema.getFields()) {
    // A configured mapping overrides the default of using the Avro field name itself.
    final String mappedName = mappings.get(field.name());
    final String sourceName = (mappedName != null) ? mappedName : field.name();
    final List values = inputRecord.get(sourceName);

    Object converted;
    if (field.schema().getType() == Schema.Type.ARRAY) {
      // array fields consume all values of the morphline field
      converted = toAvro(values, field);
    } else if (values.isEmpty()) {
      try { // this will fail if there is no default value
        converted = ReflectData.get().getDefaultValue(field);
      } catch (AvroRuntimeException e) {
        converted = ERROR;
      }
    } else if (values.size() == 1) {
      converted = toAvro(values.get(0), field);
    } else {
      // more than one value for a non-array field cannot be represented
      converted = ERROR;
    }

    if (converted == ERROR) {
      LOG.debug("Cannot convert item: {} to schema: {}", values, schema);
      return false;
    }
    avroRecord.put(field.pos(), converted);
  }

  outputRecord.put(Fields.ATTACHMENT_BODY, avroRecord);
  // hand the record over to the next command in the chain
  return super.doProcess(outputRecord);
}
/**
 * Tells whether the given schema accepts a null value, i.e. it is the NULL type or a union
 * that (recursively) contains a branch accepting null.
 */
private static boolean nullOk(Schema schema) {
  final Schema.Type type = schema.getType();
  if (type == Schema.Type.NULL) {
    return true;
  }
  if (type != Schema.Type.UNION) {
    return false;
  }
  for (Schema branch : schema.getTypes()) {
    if (nullOk(branch)) {
      return true;
    }
  }
  return false;
}
/**
 * Converts a value for the given Avro field. A null value for a field whose schema does not
 * accept null is replaced by the field's default value; everything else is converted
 * according to the field's schema.
 *
 * @return the converted value, or the ERROR sentinel if a needed default is missing or the
 *         conversion fails
 */
private Object toAvro(Object item, Field field) {
  final boolean needsDefault = (item == null) && !nullOk(field.schema());
  if (!needsDefault) {
    return toAvro(item, field.schema());
  }
  try { // this will fail if there is no default value
    return ReflectData.get().getDefaultValue(field);
  } catch (AvroRuntimeException e) {
    return ERROR;
  }
}
/**
 * Converts a single value into an object that matches the given Avro schema.
 *
 * <p>Failures are reported by returning the {@code ERROR} sentinel instead of throwing. A null
 * item is only acceptable for NULL and UNION schemas; for every other type it yields
 * {@code ERROR} rather than a NullPointerException.
 *
 * <p>ARRAY and MAP conversions build a new list or map and leave the given collection
 * untouched, so a failed or partial conversion never alters the caller's data and
 * unmodifiable collections are supported.
 *
 * @param item the value to convert; may be null
 * @param schema the Avro schema the result must conform to
 * @return the converted value (null for a null item under a NULL schema), or {@code ERROR}
 *         if the item cannot be converted
 * @throws MorphlineRuntimeException if the schema type is not one of the handled Avro types
 */
private Object toAvro(Object item, Schema schema) {
  // RECORD, ENUM, ARRAY, MAP, UNION, FIXED, STRING, BYTES, INT, LONG, FLOAT,
  // DOUBLE, BOOLEAN, NULL
  final Schema.Type type = schema.getType();
  if (item == null && type != Schema.Type.NULL && type != Schema.Type.UNION) {
    // Only NULL (directly or via a union branch) can represent a missing value.
    return ERROR;
  }
  switch (type) {
    case RECORD:
      if (item instanceof Map) {
        Map<?, ?> fieldValues = (Map<?, ?>) item;
        IndexedRecord record = new GenericData.Record(schema);
        for (Field field : schema.getFields()) {
          Object value = fieldValues.get(field.name());
          // the Field overload applies the field's default when the value is missing
          Object result = toAvro(value, field);
          if (result == ERROR) {
            return ERROR;
          }
          record.put(field.pos(), result);
        }
        return record;
      }
      return ERROR;
    case ENUM:
      if (schema.hasEnumSymbol(item.toString())) {
        return item.toString();
      }
      return ERROR;
    case ARRAY:
      if (item instanceof List) {
        List<?> elements = (List<?>) item;
        // Convert into a fresh list: the given list may belong to the input record or be
        // unmodifiable, and must not end up half-converted if a later element fails.
        List<Object> converted = new ArrayList<Object>(elements.size());
        for (Object element : elements) {
          Object result = toAvro(element, schema.getElementType());
          if (result == ERROR) {
            return ERROR;
          }
          converted.add(result);
        }
        return converted;
      }
      return ERROR;
    case MAP:
      if (item instanceof Map) {
        Map<?, ?> entries = (Map<?, ?>) item;
        // Fresh insertion-ordered map for the same reasons as in the ARRAY case.
        Map<Object, Object> converted = new LinkedHashMap<Object, Object>();
        for (Map.Entry<?, ?> entry : entries.entrySet()) {
          if (!(entry.getKey() instanceof CharSequence)) {
            return ERROR; // Avro requires that map keys are CharSequences
          }
          Object result = toAvro(entry.getValue(), schema.getValueType());
          if (result == ERROR) {
            return ERROR;
          }
          converted.put(entry.getKey(), result);
        }
        return converted;
      }
      return ERROR;
    case UNION:
      return toAvroUnion(item, schema);
    case FIXED:
      if (item instanceof byte[]) {
        return new GenericData.Fixed(schema, (byte[]) item);
      }
      return ERROR;
    case STRING:
      return item.toString();
    case BYTES:
      if (item instanceof ByteBuffer) {
        return item;
      }
      if (item instanceof byte[]) {
        return ByteBuffer.wrap((byte[]) item);
      }
      return ERROR;
    case INT:
      if (item instanceof Integer) {
        return item;
      }
      if (item instanceof Number) {
        // NOTE: narrowing conversion; values outside the int range or with a fraction lose information
        return ((Number) item).intValue();
      }
      try {
        return Integer.valueOf(item.toString());
      } catch (NumberFormatException e) {
        return ERROR;
      }
    case LONG:
      if (item instanceof Long) {
        return item;
      }
      if (item instanceof Number) {
        return ((Number) item).longValue();
      }
      try {
        return Long.valueOf(item.toString());
      } catch (NumberFormatException e) {
        return ERROR;
      }
    case FLOAT:
      if (item instanceof Float) {
        return item;
      }
      if (item instanceof Number) {
        return ((Number) item).floatValue();
      }
      try {
        return Float.valueOf(item.toString());
      } catch (NumberFormatException e) {
        return ERROR;
      }
    case DOUBLE:
      if (item instanceof Double) {
        return item;
      }
      if (item instanceof Number) {
        return ((Number) item).doubleValue();
      }
      try {
        return Double.valueOf(item.toString());
      } catch (NumberFormatException e) {
        return ERROR;
      }
    case BOOLEAN:
      if (item instanceof Boolean) {
        return item;
      }
      // only the exact strings "true" and "false" are accepted
      String str = item.toString();
      if ("true".equals(str)) {
        return Boolean.TRUE;
      }
      if ("false".equals(str)) {
        return Boolean.FALSE;
      }
      return ERROR;
    case NULL:
      if (item == null) {
        return null;
      }
      return ERROR;
    default:
      throw new MorphlineRuntimeException("Unknown Avro schema type: " + schema.getType());
  }
}
/**
 * Converts a value against a UNION schema.
 *
 * <p>A branch is first selected directly: for a {@code Map} item the first RECORD or MAP
 * branch in declaration order, otherwise the branch that Avro's {@code resolveUnion} reports.
 * If a branch was selected, its conversion result is returned as is (including ERROR).
 * If none was selected, the branches are tried in declaration order and the first successful
 * conversion wins.
 *
 * @return the converted value, or the ERROR sentinel if no branch accepts the item
 */
private Object toAvroUnion(Object item, Schema schema) {
  assert schema.getType() == Schema.Type.UNION;
  final List<Schema> branches = schema.getTypes();
  int chosen = -1;

  if (!(item instanceof Map)) {
    try {
      // check if there's a perfect fit for a mapping
      chosen = GenericData.get().resolveUnion(schema, item); // TODO: optimize
    } catch (AvroRuntimeException ignored) {
      // no perfect fit; fall back to the first-fit search below
    }
  } else {
    // A map can become either an avro record or an avro map; the ambiguity is
    // resolved by taking whichever of the two is declared first in the union.
    int position = 0;
    for (Schema branch : branches) {
      final Schema.Type branchType = branch.getType();
      if (branchType == Schema.Type.RECORD || branchType == Schema.Type.MAP) {
        chosen = position;
        break;
      }
      position++;
    }
  }

  if (chosen >= 0) { // found perfect fit
    return toAvro(item, branches.get(chosen));
  }

  // find first fit based on specified order
  for (Schema branch : branches) {
    final Object converted = toAvro(item, branch);
    if (converted != ERROR) {
      return converted;
    }
  }
  return ERROR;
}
}
}