/*
* Copyright 2008-2009 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package voldemort.store.readonly.mr.utils;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Static helpers for discovering the Avro schema of data stored on a Hadoop
 * {@link FileSystem}.
 */
public class AvroUtils {

    /**
     * Pull the schema off of the given file (if it is a file). If it is a
     * directory or a wildcard pattern, pull schemas off of the matched
     * entries. Entries for which {@code HadoopUtils.shouldPathBeIgnored}
     * returns true are skipped.
     *
     * @param fs The filesystem to use
     * @param path The path from which to get the schema
     * @param checkSameSchema when true, every non-ignored entry is read and
     *        all schemas must be equal; when false, the schema of the first
     *        non-ignored entry is returned without looking at the rest
     * @return The schema of this file or of its subfiles
     * @throws RuntimeException wrapping any failure, including I/O errors
     *         while reading a file and the {@link IllegalArgumentException}s
     *         raised when no files match, no non-ignored file is found, or
     *         the schemas differ
     */
    private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {
        try {
            if(fs.isFile(path)) {
                return readSchemaFromFile(fs, path);
            }

            FileStatus[] statuses;
            if(fs.isDirectory(path)) {
                // this is a directory, get schemas from all subfiles
                statuses = fs.listStatus(path);
            } else {
                // this is wildcard path, get schemas from all matched files
                statuses = fs.globStatus(path);
            }
            if(statuses == null || statuses.length == 0) {
                throw new IllegalArgumentException("No files found in path pattern "
                                                   + path.toUri().getPath());
            }

            List<Schema> schemas = new ArrayList<Schema>();
            for(FileStatus status: statuses) {
                if(HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                    continue;
                }
                if(!checkSameSchema) {
                    // return first valid schema w/o checking all files
                    return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                }
                schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
            }

            if(schemas.isEmpty()) {
                throw new IllegalArgumentException("No Valid metadata file found for Path:"
                                                   + path.toString());
            }

            // now check that all the schemas are the same
            Schema schema = schemas.get(0);
            for(int i = 1; i < schemas.size(); i++) {
                if(!schema.equals(schemas.get(i))) {
                    throw new IllegalArgumentException("The directory "
                                                       + path.toString()
                                                       + " contains heterogenous schemas: found both '"
                                                       + schema.toString() + "' and '"
                                                       + schemas.get(i).toString() + "'.");
                }
            }
            return schema;
        } catch(Exception e) {
            // Callers of this method only ever see unchecked exceptions, so
            // every failure (checked or not) is wrapped with its cause intact.
            throw new RuntimeException(e);
        }
    }

    /**
     * Opens a single Avro container file, reads the schema from its header
     * and closes the file again.
     *
     * @param fs The filesystem to use
     * @param path Path of the Avro file to read
     * @return The schema stored in the file
     * @throws IOException if the file cannot be opened or read as an Avro
     *         data file
     */
    private static Schema readSchemaFromFile(FileSystem fs, Path path) throws IOException {
        BufferedInputStream inStream = new BufferedInputStream(fs.open(path));
        DataFileStream<Object> reader = null;
        try {
            reader = new DataFileStream<Object>(inStream, new GenericDatumReader<Object>());
            return reader.getSchema();
        } finally {
            // Closing the reader closes the wrapped stream; if the reader was
            // never constructed, the raw stream must be closed directly.
            if(reader != null) {
                reader.close();
            } else {
                inStream.close();
            }
        }
    }

    /**
     * Returns the Avro schema found at the given path, resolving the
     * filesystem from the path with a default {@link Configuration}. When the
     * path is a directory or wildcard pattern, all non-ignored files must
     * share the same schema.
     *
     * @param path File, directory or wildcard path to inspect
     * @return The schema of the file, or the common schema of the matched
     *         files
     * @throws IOException if the filesystem for the path cannot be obtained
     * @throws RuntimeException if the schema cannot be read, no usable file is
     *         found, or the files carry differing schemas
     */
    public static Schema getAvroSchemaFromPath(Path path) throws IOException {
        return getSchemaFromPath(path.getFileSystem(new Configuration()), path, true);
    }
}