Package com.asakusafw.directio.hive.orc

Source Code of com.asakusafw.directio.hive.orc.OrcFileInput

/**
* Copyright 2011-2014 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.directio.hive.orc;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

import com.asakusafw.directio.hive.serde.DataModelDescriptor;
import com.asakusafw.directio.hive.serde.DataModelDriver;
import com.asakusafw.directio.hive.serde.DataModelMapping;
import com.asakusafw.runtime.directio.Counter;
import com.asakusafw.runtime.io.ModelInput;

/**
* An implementation of {@link ModelInput} for reading ORCFile.
* @param <T> the data model type
* @since 0.7.0
*/
public class OrcFileInput<T> implements ModelInput<T> {

    static final Log LOG = LogFactory.getLog(OrcFileInput.class);

    private final DataModelDescriptor descriptor;

    private final DataModelMapping configuration;

    private final FileSystem fileSystem;

    private final Path path;

    private final long offset;

    private final long fragmentSize;

    private final Counter counter;

    private DataModelDriver driver;

    private RecordReader currentReader;

    private Object structBuffer;

    private long lastCount;

    /**
     * Creates a new instance.
     * @param descriptor the target data model descriptor
     * @param fileSystem the file system to open the target path
     * @param path the path to the target file
     * @param offset starting stream offset
     * @param fragmentSize suggested fragment bytes count, or {@code -1} as infinite
     * @param counter the current counter
     */
    public OrcFileInput(
            DataModelDescriptor descriptor,
            FileSystem fileSystem, Path path,
            long offset, long fragmentSize,
            Counter counter) {
        this(descriptor, new DataModelMapping(), fileSystem, path, offset, fragmentSize, counter);
    }

    /**
     * Creates a new instance.
     * @param descriptor the target data model descriptor
     * @param configuration the {@link DataModelDriver} configuration
     * @param fileSystem the file system to open the target path
     * @param path the path to the target file
     * @param offset starting stream offset
     * @param fragmentSize suggested fragment bytes count, or {@code -1} as infinite
     * @param counter the current counter
     */
    public OrcFileInput(
            DataModelDescriptor descriptor,
            DataModelMapping configuration,
            FileSystem fileSystem, Path path,
            long offset, long fragmentSize,
            Counter counter) {
        this.descriptor = descriptor;
        this.configuration = configuration;
        this.fileSystem = fileSystem;
        this.path = path;
        this.offset = offset;
        this.fragmentSize = fragmentSize;
        this.counter = counter;
    }

    @Override
    public boolean readTo(T model) throws IOException {
        RecordReader reader = prepare();
        if (reader.hasNext() == false) {
            return false;
        }
        Object buf = reader.next(structBuffer);
        driver.set(model, buf);
        structBuffer = buf;

        advanceCounter((long) (fragmentSize * reader.getProgress()));
        return true;
    }

    private RecordReader prepare() throws IOException {
        RecordReader reader = currentReader;
        if (reader == null) {
            if (LOG.isInfoEnabled()) {
                LOG.info(MessageFormat.format(
                        "Loading ORCFile metadata ({0}): {1}",
                        descriptor.getDataModelClass().getSimpleName(),
                        path));
            }
            Reader orc = OrcFile.createReader(fileSystem, path);
            StructObjectInspector sourceInspector = (StructObjectInspector) orc.getObjectInspector();
            driver = new DataModelDriver(descriptor, sourceInspector, configuration);
            if (LOG.isInfoEnabled()) {
                LOG.info(MessageFormat.format(
                        "Loading ORCFile contents ({0}): path={1}, range={2}+{3}",
                        descriptor.getDataModelClass().getSimpleName(),
                        path,
                        offset,
                        fragmentSize));
            }
            boolean[] availableColumns = computeAvailableColumns(orc);
            reader = orc.rows(offset, fragmentSize, availableColumns);
            currentReader = reader;
        }
        return reader;
    }

    private boolean[] computeAvailableColumns(Reader orc) {
        assert driver != null;
        StructObjectInspector inspector = driver.getSourceInspector();
        List<? extends StructField> all = inspector.getAllStructFieldRefs();
        Set<StructField> projected = new HashSet<StructField>(driver.getSourceFields());
        // the first type must be a root type
        boolean[] availables = new boolean[all.size() + 1];
        availables[0] = true;
        for (int i = 0, n = all.size(); i < n; i++) {
            StructField field = all.get(i);
            boolean available = projected.contains(field);
            if (LOG.isDebugEnabled()) {
                LOG.debug(MessageFormat.format(
                        "ORCFile projection: column={0}, include={1}",
                        field.getFieldName(),
                        available));
            }
            availables[i + 1] = available;
        }
        return availables;
    }

    private void advanceCounter(long nextCount) {
        long deltaCount = nextCount - lastCount;
        if (deltaCount > 0) {
            counter.add(deltaCount);
            lastCount = nextCount;
        }
    }

    @Override
    public void close() throws IOException {
        if (currentReader != null) {
            currentReader.close();
        }
    }
}
TOP

Related Classes of com.asakusafw.directio.hive.orc.OrcFileInput

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.