/**
* Copyright 2011 Twitter, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.twitter.pycascading;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import cascading.scheme.Scheme;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
/**
* A Cascading Scheme that stores header information for an output dataset. It
* records all formatting information so that later on the tuple field names and
* types can be reloaded without having to specify them explicitly.
*
* It also stores the original scheme object so that at load time we don't have
* to worry about that either.
*
* @author Gabor Szabo
*/
public class MetaScheme extends Scheme {
  private static final long serialVersionUID = 8194175541999063797L;

  // Names of the metadata files written next to the data in the output path.
  private static final String schemeFileName = ".pycascading_scheme";
  private static final String headerFileName = ".pycascading_header";
  private static final String typeFileName = ".pycascading_types";

  // The wrapped scheme that does the actual sinking of tuples.
  private Scheme scheme;
  // Folder under which the metadata files are created.
  private String outputPath;
  // True until this instance has attempted to write the header and scheme files.
  private boolean firstLine = true;
  // True until this instance has attempted to write the type file.
  private boolean typeFileToWrite = true;

  /**
   * Call this to get the original Cascading scheme that the data was written
   * in.
   *
   * The scheme and its fields are read back with Java serialization from the
   * .pycascading_scheme file, and the fields are set as the source fields of
   * the returned scheme.
   *
   * @param inputPath
   *          The path to where the scheme information was stored (normally the
   *          same as the path to the data)
   * @return The Cascading scheme that was used when the data was written.
   * @throws IOException
   *           if the scheme file cannot be opened or read, or if a class
   *           referenced by the serialized objects cannot be found (the
   *           ClassNotFoundException is attached as the cause)
   */
  public static Scheme getSourceScheme(String inputPath) throws IOException {
    Path path = new Path(inputPath + "/" + schemeFileName);
    FileSystem fs = path.getFileSystem(new Configuration());
    FSDataInputStream file = fs.open(path);
    try {
      // NOTE(review): this is native Java deserialization of whatever is
      // stored at inputPath; only point it at locations whose contents are
      // trusted.
      ObjectInputStream ois = new ObjectInputStream(file);
      Scheme scheme = (Scheme) ois.readObject();
      Fields fields = (Fields) ois.readObject();
      scheme.setSourceFields(fields);
      return scheme;
    } catch (ClassNotFoundException e) {
      throw new IOException("Could not read PyCascading file header: " + inputPath + "/"
          + schemeFileName, e);
    } finally {
      // Closing the underlying stream also releases everything the
      // ObjectInputStream holds, on success as well as on failure.
      file.close();
    }
  }

  /**
   * Returns the scheme that will store field information and the scheme in
   * outputPath. When the first tuple is sunk, a file called
   * .pycascading_header is generated, which stores the names of the fields in
   * a TAB-delimited format, together with .pycascading_scheme (the serialized
   * scheme and fields) and .pycascading_types (one line per field with the
   * class name of its value).
   *
   * @param scheme
   *          The Cascading scheme to be used to store the data
   * @param outputPath
   *          Path where the metainformation about the scheme and field names
   *          should be stored
   * @return A scheme that can be used to sink the data into
   * @throws IOException
   */
  public static Scheme getSinkScheme(Scheme scheme, String outputPath) throws IOException {
    return new MetaScheme(scheme, outputPath);
  }

  /**
   * Creates a sink scheme that wraps the given scheme.
   *
   * @param scheme
   *          The Cascading scheme that tuples are delegated to
   * @param outputPath
   *          Path under which the metadata files are written
   * @throws IOException
   */
  protected MetaScheme(Scheme scheme, String outputPath) throws IOException {
    this.scheme = scheme;
    this.outputPath = outputPath;
  }

  @Override
  public void sourceInit(Tap tap, JobConf conf) throws IOException {
    // We're returning the original storage scheme, so this should not be called
    // ever.
  }

  @Override
  public Tuple source(Object key, Object value) {
    // This should never be called.
    return null;
  }

  @Override
  public void sinkInit(Tap tap, JobConf conf) throws IOException {
    scheme.sinkInit(tap, conf);
  }

  /**
   * Sinks the tuple through the wrapped scheme. On the first call on this
   * instance it also tries to write the header, scheme, and type metadata
   * files, using the first tuple to determine field names and value classes.
   *
   * Writing the metadata is best effort: I/O errors while creating or writing
   * those files are ignored, and no further attempt is made by this instance.
   *
   * @param tupleEntry
   *          The tuple to be written
   * @param outputCollector
   *          The collector passed on to the wrapped scheme
   * @throws IOException
   *           if the file system for the output path cannot be obtained, or if
   *           the wrapped scheme fails to sink the tuple
   */
  @Override
  public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException {
    // TODO: do it so such that we don't need to specify /user/gabor if the path
    // doesn't start with /
    if (firstLine) {
      writeHeaderFile(tupleEntry);
      writeSchemeFile(tupleEntry);
      firstLine = false;
    }
    if (typeFileToWrite) {
      writeTypeFile(tupleEntry);
      typeFileToWrite = false;
    }
    scheme.sink(tupleEntry, outputCollector);
  }

  /**
   * Creates the named metadata file under outputPath if it does not exist yet.
   *
   * We're trying to create the file by just one of the mappers/reducers, the
   * one that can do it first; everybody else gets null back.
   *
   * @return an open stream to the freshly created file, or null if the file
   *         already existed or could not be created
   * @throws IOException
   *           if the file system for the path cannot be obtained
   */
  private FSDataOutputStream createMetaFile(String fileName) throws IOException {
    Path path = new Path(outputPath + "/" + fileName);
    FileSystem fs = path.getFileSystem(new Configuration());
    try {
      if (fs.createNewFile(path)) {
        return fs.create(path, true);
      }
    } catch (IOException ignored) {
      // Best effort: failing to create a metadata file must not fail the sink.
    }
    return null;
  }

  /** Closes the stream, ignoring failures, as metadata writing is best effort. */
  private static void closeQuietly(FSDataOutputStream stream) {
    try {
      stream.close();
    } catch (IOException ignored) {
      // Nothing sensible can be done here; the data sink itself is unaffected.
    }
  }

  /** Writes the TAB-delimited field names of the tuple into the header file. */
  private void writeHeaderFile(TupleEntry tupleEntry) throws IOException {
    FSDataOutputStream stream = createMetaFile(headerFileName);
    if (stream == null) {
      return;
    }
    try {
      boolean firstField = true;
      for (Comparable<?> field : tupleEntry.getFields()) {
        if (firstField) {
          firstField = false;
        } else {
          stream.writeBytes("\t");
        }
        stream.writeBytes(field.toString());
      }
      stream.writeBytes("\n");
    } catch (IOException ignored) {
      // Best effort: a partially written header must not fail the sink.
    } finally {
      closeQuietly(stream);
    }
  }

  /** Serializes the wrapped scheme followed by the tuple's fields into the scheme file. */
  private void writeSchemeFile(TupleEntry tupleEntry) throws IOException {
    FSDataOutputStream stream = createMetaFile(schemeFileName);
    if (stream == null) {
      return;
    }
    try {
      ObjectOutputStream ostream = new ObjectOutputStream(stream);
      ostream.writeObject(scheme);
      ostream.writeObject(tupleEntry.getFields());
      // Push the serialized bytes into the underlying stream before it is
      // closed in the finally block.
      ostream.flush();
    } catch (IOException ignored) {
      // Best effort: a failed scheme dump must not fail the sink.
    } finally {
      closeQuietly(stream);
    }
  }

  /**
   * Writes one line per tuple element into the type file: the field name and a
   * TAB (only when every element has a name), followed by the class name of
   * the element's value. Null values are recorded as java.lang.Object.
   */
  private void writeTypeFile(TupleEntry tupleEntry) throws IOException {
    FSDataOutputStream stream = createMetaFile(typeFileName);
    if (stream == null) {
      return;
    }
    try {
      // If there are fewer field names than values, we don't have names for
      // the fields and only the class names are written.
      boolean fieldsUnnamed = tupleEntry.getFields().size() < tupleEntry.size();
      for (int i = 0; i < tupleEntry.size(); i++) {
        String fieldName = fieldsUnnamed ? "" : tupleEntry.getFields().get(i) + "\t";
        Object object = tupleEntry.getObject(i);
        Class<?> objectClass = (object == null ? Object.class : object.getClass());
        stream.writeBytes(fieldName + objectClass.getName() + "\n");
      }
    } catch (IOException ignored) {
      // Best effort: a partially written type file must not fail the sink.
    } finally {
      closeQuietly(stream);
    }
  }
}