/**
* This code is made available under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package com.acme.io;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.Expression;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
/**
 * A loader for data stored using {@link JsonStorage}. This is not a generic
 * JSON loader: it depends on the schema being stored alongside the data,
 * though one could conceivably write a loader that infers the schema from
 * the JSON itself. It has not been thoroughly tested for functionality or
 * performance; it works for simple demonstrations.
*
* Also note that this loader and the associated storage function require a
* version of Pig that has PIG-2112 to work with complex data.
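 *
 * <p>As an illustration (the jar name and input path below are hypothetical),
 * data previously written by the companion {@code JsonStorage} function could
 * be read back in Pig Latin with something like:
 * <pre>
 * register acme-udfs.jar;
 * a = load 'json_output' using com.acme.io.JsonLoader();
 * dump a;
 * </pre>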
*/
public class JsonLoader extends LoadFunc implements LoadMetadata {
protected RecordReader reader = null;
protected ResourceFieldSchema[] fields = null;
protected final Log log = LogFactory.getLog(getClass());
private String udfcSignature = null;
private JsonFactory jsonFactory = null;
private TupleFactory tupleFactory = TupleFactory.getInstance();
private BagFactory bagFactory = BagFactory.getInstance();
/**
* Communicate to the loader the location of the object(s) being loaded.
* The location string passed to the LoadFunc here is the return value of
* {@link LoadFunc#relativeToAbsolutePath(String, Path)}. Implementations
* should use this method to communicate the location (and any other
* information) to its underlying InputFormat through the Job object.
*
     * This method will be called in the backend multiple times.
     * Implementations should bear this in mind and ensure that the repeated
     * calls produce no inconsistent side effects.
*
* @param location Location as returned by
* {@link LoadFunc#relativeToAbsolutePath(String, Path)}
     * @param job the {@link Job} object, which can also be used to store or
     * retrieve earlier stored information from the {@link UDFContext}
* @throws IOException if the location is not valid.
*/
public void setLocation(String location, Job job) throws IOException {
// Tell our input format where we will be reading from
FileInputFormat.setInputPaths(job, location);
}
/**
     * This will be called during planning on the front end. An instance of
     * the InputFormat (rather than the class name) is returned because the
     * load function may need to instantiate the InputFormat itself in order
     * to control how it is constructed.
* @return the InputFormat associated with this loader.
* @throws IOException if there is an exception during InputFormat
* construction
*/
@SuppressWarnings("unchecked")
public InputFormat getInputFormat() throws IOException {
// We will use TextInputFormat, the default Hadoop input format for
// text. It has a LongWritable key that we will ignore, and the value
// is a Text (a string writable) that the JSON data is in.
return new TextInputFormat();
}
/**
* This will be called on the front end during planning and not on the back
* end during execution.
* @return the {@link LoadCaster} associated with this loader.
* Returning null indicates that casts from byte array are not supported
* for this loader.
     * @throws IOException if there is an exception during LoadCaster
     * construction
*/
public LoadCaster getLoadCaster() throws IOException {
// We do not expect to do casting of byte arrays, because we will be
// returning typed data.
return null;
}
/**
* Initializes LoadFunc for reading data. This will be called during
* execution before any calls to getNext. The RecordReader needs to be
* passed here because it has been instantiated for a particular InputSplit.
* @param reader {@link RecordReader} to be used by this instance of
* the LoadFunc
* @param split The input {@link PigSplit} to process
* @throws IOException if there is an exception during initialization
*/
@SuppressWarnings("unchecked")
public void prepareToRead(RecordReader reader, PigSplit split)
throws IOException {
this.reader = reader;
// Get the schema string from the UDFContext object.
UDFContext udfc = UDFContext.getUDFContext();
Properties p =
udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
String strSchema = p.getProperty("pig.jsonloader.schema");
if (strSchema == null) {
throw new IOException("Could not find schema in UDF context");
}
// Parse the schema from the string stored in the properties object.
ResourceSchema schema =
new ResourceSchema(Utils.getSchemaFromString(strSchema));
fields = schema.getFields();
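        // Create a single JsonFactory; getNext will use it to create one
        // parser per input line.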
jsonFactory = new JsonFactory();
}
/**
* Retrieves the next tuple to be processed. Implementations should NOT
* reuse tuple objects (or inner member objects) they return across calls
* and should return a different tuple object in each call.
* @return the next tuple to be processed or null if there are no more
* tuples to be processed.
* @throws IOException if there is an exception while retrieving the next
* tuple
*/
public Tuple getNext() throws IOException {
Text val = null;
try {
// Read the next key value pair from the record reader. If it's
// finished, return null
if (!reader.nextKeyValue()) return null;
// Get the current value. We don't use the key.
val = (Text)reader.getCurrentValue();
} catch (InterruptedException ie) {
throw new IOException(ie);
}
// Create a parser specific for this input line. This may not be the
// most efficient approach.
        // Text.getBytes() returns the backing buffer, which may contain stale
        // bytes beyond getLength(), so bound the stream to the actual data.
        ByteArrayInputStream bais =
            new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
JsonParser p = jsonFactory.createJsonParser(bais);
// Create the tuple we will be returning. We create it with the right
// number of fields, as the Tuple object is optimized for this case.
Tuple t = tupleFactory.newTuple(fields.length);
// Read the start object marker. Throughout this file if the parsing
// isn't what we expect we return a tuple with null fields rather than
// throwing an exception. That way a few mangled lines don't fail the
// job.
if (p.nextToken() != JsonToken.START_OBJECT) {
log.warn("Bad record, could not find start of record " +
val.toString());
return t;
}
// Read each field in the record
for (int i = 0; i < fields.length; i++) {
t.set(i, readField(p, fields[i], i));
}
if (p.nextToken() != JsonToken.END_OBJECT) {
log.warn("Bad record, could not find end of record " +
val.toString());
return t;
}
p.close();
return t;
}
private Object readField(JsonParser p,
ResourceFieldSchema field,
int fieldnum) throws IOException {
// Read the next token
JsonToken tok = p.nextToken();
if (tok == null) {
log.warn("Early termination of record, expected " + fields.length
+ " fields bug found " + fieldnum);
return null;
}
// Check to see if this value was null
if (tok == JsonToken.VALUE_NULL) return null;
// Read based on our expected type
switch (field.getType()) {
case DataType.INTEGER:
            // The token read at the top of this method was the field name;
            // advance to the value before reading it.
p.nextToken();
return p.getValueAsInt();
case DataType.LONG:
p.nextToken();
return p.getValueAsLong();
case DataType.FLOAT:
p.nextToken();
return (float)p.getValueAsDouble();
case DataType.DOUBLE:
p.nextToken();
return p.getValueAsDouble();
case DataType.BYTEARRAY:
p.nextToken();
byte[] b = p.getBinaryValue();
// Use the DBA constructor that copies the bytes so that we own
// the memory
return new DataByteArray(b, 0, b.length);
case DataType.CHARARRAY:
p.nextToken();
return p.getText();
case DataType.MAP:
// Should be a start of the map object
if (p.nextToken() != JsonToken.START_OBJECT) {
log.warn("Bad map field, could not find start of object, field "
+ fieldnum);
return null;
}
            Map<String, String> m = new HashMap<String, String>();
            while (p.nextToken() != JsonToken.END_OBJECT) {
                // The current token is the key; advance to its value before
                // reading the text.
                String k = p.getCurrentName();
                p.nextToken();
                String v = p.getText();
                m.put(k, v);
            }
return m;
case DataType.TUPLE:
if (p.nextToken() != JsonToken.START_OBJECT) {
log.warn("Bad tuple field, could not find start of object, "
+ "field " + fieldnum);
return null;
}
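            // For a tuple field, the field's schema describes the fields
            // contained in the tuple.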
ResourceSchema s = field.getSchema();
ResourceFieldSchema[] fs = s.getFields();
Tuple t = tupleFactory.newTuple(fs.length);
for (int j = 0; j < fs.length; j++) {
t.set(j, readField(p, fs[j], j));
}
if (p.nextToken() != JsonToken.END_OBJECT) {
log.warn("Bad tuple field, could not find end of object, "
+ "field " + fieldnum);
return null;
}
return t;
case DataType.BAG:
if (p.nextToken() != JsonToken.START_ARRAY) {
log.warn("Bad bag field, could not find start of array, "
+ "field " + fieldnum);
return null;
}
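            // A bag's schema has a single tuple field; that tuple's schema in
            // turn describes the fields of each tuple in the bag.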
s = field.getSchema();
fs = s.getFields();
// Drill down the next level to the tuple's schema.
s = fs[0].getSchema();
fs = s.getFields();
DataBag bag = bagFactory.newDefaultBag();
JsonToken innerTok;
while ((innerTok = p.nextToken()) != JsonToken.END_ARRAY) {
if (innerTok != JsonToken.START_OBJECT) {
log.warn("Bad bag tuple field, could not find start of "
+ "object, field " + fieldnum);
return null;
}
t = tupleFactory.newTuple(fs.length);
for (int j = 0; j < fs.length; j++) {
t.set(j, readField(p, fs[j], j));
}
if (p.nextToken() != JsonToken.END_OBJECT) {
log.warn("Bad bag tuple field, could not find end of "
+ "object, field " + fieldnum);
return null;
}
bag.add(t);
}
return bag;
default:
throw new IOException("Unknown type in input schema: " +
field.getType());
}
}
//------------------------------------------------------------------------
/**
* This method will be called by Pig both in the front end and back end to
* pass a unique signature to the {@link LoadFunc}. The signature can be used
* to store into the {@link UDFContext} any information which the
* {@link LoadFunc} needs to store between various method invocations in the
* front end and back end. A use case is to store {@link RequiredFieldList}
* passed to it in {@link LoadPushDown#pushProjection(RequiredFieldList)} for
* use in the back end before returning tuples in {@link LoadFunc#getNext()}.
     * This method will be called before other methods in {@link LoadFunc}.
* @param signature a unique signature to identify this LoadFunc
*/
public void setUDFContextSignature(String signature) {
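        // Save the signature; prepareToRead and getSchema use it to locate
        // this loader's property set in the UDFContext.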
udfcSignature = signature;
}
/**
* Get a schema for the data to be loaded.
* @param location Location as returned by
* {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
* @param job The {@link Job} object - this should be used only to obtain
* cluster properties through {@link Job#getConfiguration()} and not to
* set/query any runtime job information.
* @return schema for the data to be loaded. This schema should represent
* all tuples of the returned data. If the schema is unknown or it is
* not possible to return a schema that represents all returned data,
* then null should be returned. The schema should not be affected by
     * pushProjection, i.e., getSchema should always return the original
     * schema even after pushProjection.
* @throws IOException if an exception occurs while determining the schema
*/
public ResourceSchema getSchema(String location, Job job)
throws IOException {
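        // The schema is assumed to have been written by JsonStorage as a
        // single line in a side file named _schema under the data directory,
        // in a form that Utils.getSchemaFromString can parse (for example,
        // something like a:int,b:chararray).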
// Open the schema file and read the schema
// Get an HDFS handle.
FileSystem fs = FileSystem.get(job.getConfiguration());
DataInputStream in = fs.open(new Path(location + "/_schema"));
        String line = in.readLine();
        in.close();
        if (line == null) {
            throw new IOException("Unable to read schema from file " +
                location + "/_schema");
        }
        // Parse the schema
        ResourceSchema s =
            new ResourceSchema(Utils.getSchemaFromString(line));
// Now that we have determined the schema, store it in our
// UDFContext properties object so we have it when we need it on the
// backend
UDFContext udfc = UDFContext.getUDFContext();
Properties p =
udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
p.setProperty("pig.jsonloader.schema", line);
return s;
}
/**
* Get statistics about the data to be loaded. If no statistics are
* available, then null should be returned.
* @param location Location as returned by
* {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
* @param job The {@link Job} object - this should be used only to obtain
* cluster properties through {@link Job#getConfiguration()} and not to set/query
* any runtime job information.
* @return statistics about the data to be loaded. If no statistics are
* available, then null should be returned.
* @throws IOException if an exception occurs while retrieving statistics
*/
public ResourceStatistics getStatistics(String location, Job job)
throws IOException {
// We don't implement this one.
return null;
}
/**
* Find what columns are partition keys for this input.
* @param location Location as returned by
* {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
* @param job The {@link Job} object - this should be used only to obtain
* cluster properties through {@link Job#getConfiguration()} and not to
* set/query any runtime job information.
* @return array of field names of the partition keys. Implementations
* should return null to indicate that there are no partition keys
* @throws IOException if an exception occurs while retrieving partition keys
*/
public String[] getPartitionKeys(String location, Job job)
throws IOException {
// We don't have partitions
return null;
}
/**
* Set the filter for partitioning. It is assumed that this filter
* will only contain references to fields given as partition keys in
* getPartitionKeys. So if the implementation returns null in
* {@link #getPartitionKeys(String, Job)}, then this method is not
     * called by the Pig runtime. It is likewise not called
     * if there are no partition filter conditions.
* @param partitionFilter that describes filter for partitioning
* @throws IOException if the filter is not compatible with the storage
* mechanism or contains non-partition fields.
*/
public void setPartitionFilter(Expression partitionFilter)
throws IOException {
// We don't have partitions
}
}