/*
* Copyright 2008-2009 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package voldemort.store.readonly.mr.serialization;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import voldemort.store.readonly.mr.utils.HadoopUtils;

/**
 * Extends {@link SequenceFileInputFormat} to support our JSON-based
 * serialization format.
 *
 * Reads the key and value schemas from each input SequenceFile's metadata and
 * saves them in the job configuration under the following keys:
 * <ul>
 * <li>"mapper.input.key.schema"</li>
 * <li>"mapper.input.value.schema"</li>
 * </ul>
*
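 * A minimal usage sketch for a classic mapred driver (the driver class and
 * input path below are illustrative placeholders, not part of this class):
 *
 * <pre>
 * JobConf conf = new JobConf(MyJobDriver.class);
 * conf.setInputFormat(JsonSequenceFileInputFormat.class);
 * FileInputFormat.setInputPaths(conf, new Path("/example/input"));
 * // Once a record reader is created, mappers can read the schemas via
 * // conf.get("mapper.input.key.schema") and
 * // conf.get("mapper.input.value.schema")
 * </pre>
 *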
* @author bbansal
*
*/
public class JsonSequenceFileInputFormat extends
SequenceFileInputFormat<BytesWritable, BytesWritable> {
    protected static final Logger log = Logger.getLogger(JsonSequenceFileInputFormat.class);

    @Override
    public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split,
                                                                      JobConf conf,
                                                                      Reporter reporter)
            throws IOException {
        // Use the split's Path directly so filesystem scheme and authority
        // information is preserved (toUri().getPath() would drop it)
        Path inputPath = ((FileSplit) split).getPath();
        log.info("Input file path: " + inputPath);

        SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf),
                                                             inputPath,
                                                             conf);
        try {
            SequenceFile.Metadata meta = reader.getMetadata();
            Text keySchema = meta.get(new Text("key.schema"));
            Text valueSchema = meta.get(new Text("value.schema"));
            if(keySchema == null || valueSchema == null || keySchema.getLength() == 0
               || valueSchema.getLength() == 0) {
                throw new IOException("Failed to load schema from file: " + inputPath);
            }

            // update the JobConf with the schemas read from the file metadata
            conf.set("mapper.input.key.schema", keySchema.toString());
            conf.set("mapper.input.value.schema", valueSchema.toString());
        } finally {
            reader.close();
        }
        return super.getRecordReader(split, conf, reporter);
    }
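
    /**
     * Lists all input files by recursively expanding each configured input
     * directory, rather than listing only its immediate children.
     */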
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
String dirs = job.get("mapred.input.dir", "");
String[] list = StringUtils.split(dirs);
List<FileStatus> status = new ArrayList<FileStatus>();
        for(String dir: list) {
            status.addAll(getAllSubFileStatus(job, new Path(dir)));
        }
return status.toArray(new FileStatus[0]);
}
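
    /**
     * Recursively collects the statuses of all files under the given path,
     * skipping any path that {@link HadoopUtils#shouldPathBeIgnored(Path)}
     * marks as ignorable.
     */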
private List<FileStatus> getAllSubFileStatus(JobConf inputConf, Path filterMemberPath)
throws IOException {
List<FileStatus> list = new ArrayList<FileStatus>();
FileSystem fs = filterMemberPath.getFileSystem(inputConf);
FileStatus[] subFiles = fs.listStatus(filterMemberPath);
if(null != subFiles) {
if(fs.isDirectory(filterMemberPath)) {
for(FileStatus subFile: subFiles) {
if(!HadoopUtils.shouldPathBeIgnored(subFile.getPath())) {
list.addAll(getAllSubFileStatus(inputConf, subFile.getPath()));
}
}
} else {
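                // listStatus() on a single file returns a one-element array
                // containing that file's own status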
if(subFiles.length > 0 && !HadoopUtils.shouldPathBeIgnored(subFiles[0].getPath())) {
list.add(subFiles[0]);
}
}
}
return list;
}
}