Package voldemort.store.readonly.mr.serialization

Source Code of voldemort.store.readonly.mr.serialization.JsonSequenceFileInputFormat

/*
* Copyright 2008-2009 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package voldemort.store.readonly.mr.serialization;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import voldemort.store.readonly.mr.utils.HadoopUtils;

/**
* Extends {@link SequenceFileInputFormat} to support our JSON based
* serialization format.
*
* Reads in a SequenceFile Read out the schema from Metadata and save it as keys
* in configuration.
* <ul>
* <li>"mapper.input.key.schema"</li>
* <li>"mapper.input.value.schema"</li>
* </ul>
*
* @author bbansal
*
*/
public class JsonSequenceFileInputFormat extends
        SequenceFileInputFormat<BytesWritable, BytesWritable> {

    protected static final Logger log = Logger.getLogger(JsonSequenceFileInputFormat.class.getName());

    @Override
    public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split,
                                                                      JobConf conf,
                                                                      Reporter reporter)
            throws IOException {
        String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
        log.info("Input file path:" + inputPathString);
        Path inputPath = new Path(inputPathString);

        SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf),
                                                             inputPath,
                                                             conf);
        SequenceFile.Metadata meta = reader.getMetadata();

        try {
            Text keySchema = meta.get(new Text("key.schema"));
            Text valueSchema = meta.get(new Text("value.schema"));

            if(0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
                throw new Exception();
            }

            // update Joboconf with schemas
            conf.set("mapper.input.key.schema", keySchema.toString());
            conf.set("mapper.input.value.schema", valueSchema.toString());
        } catch(Exception e) {
            throw new IOException("Failed to Load Schema from file:" + inputPathString + "\n");
        }
        return super.getRecordReader(split, conf, reporter);
    }

    @Override
    protected FileStatus[] listStatus(JobConf job) throws IOException {
        String dirs = job.get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);

        List<FileStatus> status = new ArrayList<FileStatus>();
        for(int i = 0; i < list.length; i++) {
            status.addAll(getAllSubFileStatus(job, new Path(list[i])));
        }

        return status.toArray(new FileStatus[0]);
    }

    private List<FileStatus> getAllSubFileStatus(JobConf inputConf, Path filterMemberPath)
            throws IOException {
        List<FileStatus> list = new ArrayList<FileStatus>();

        FileSystem fs = filterMemberPath.getFileSystem(inputConf);
        FileStatus[] subFiles = fs.listStatus(filterMemberPath);

        if(null != subFiles) {
            if(fs.isDirectory(filterMemberPath)) {
                for(FileStatus subFile: subFiles) {
                    if(!HadoopUtils.shouldPathBeIgnored(subFile.getPath())) {
                        list.addAll(getAllSubFileStatus(inputConf, subFile.getPath()));
                    }
                }
            } else {
                if(subFiles.length > 0 && !HadoopUtils.shouldPathBeIgnored(subFiles[0].getPath())) {
                    list.add(subFiles[0]);
                }
            }
        }

        return list;
    }
}
TOP

Related Classes of voldemort.store.readonly.mr.serialization.JsonSequenceFileInputFormat

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.