Package org.apache.pig.builtin

Source Code of org.apache.pig.builtin.TestOrcStorage

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.builtin;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.data.BinSedesTuple;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.test.Util;
import org.joda.time.DateTime;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

public class TestOrcStorage {
    final protected static Log LOG = LogFactory.getLog(TestOrcStorage.class);

    final private static String basedir = "test/org/apache/pig/builtin/orc/";
    final private static String outbasedir = System.getProperty("user.dir") + "/build/test/TestOrcStorage/";

    private static String INPUT1 = outbasedir + "TestOrcStorage_1";
    private static String OUTPUT1 = outbasedir + "TestOrcStorage_2";
    private static String OUTPUT2 = outbasedir + "TestOrcStorage_3";
    private static String OUTPUT3 = outbasedir + "TestOrcStorage_4";
    private static String OUTPUT4 = outbasedir + "TestOrcStorage_5";

    private static PigServer pigServer = null;
    private static FileSystem fs;

    @BeforeClass
    public static void oneTimeSetup(){
        if(Util.WINDOWS){
            INPUT1 = INPUT1.replace("\\", "/");
            OUTPUT1 = OUTPUT1.replace("\\", "/");
            OUTPUT2 = OUTPUT2.replace("\\", "/");
            OUTPUT3 = OUTPUT3.replace("\\", "/");
            OUTPUT4 = OUTPUT4.replace("\\", "/");
        }
    }

    @Before
    public void setup() throws ExecException, IOException {
        pigServer = new PigServer(ExecType.LOCAL);
        fs = FileSystem.get(ConfigurationUtil.toConfiguration(pigServer.getPigContext().getProperties()));
        deleteTestFiles();
        pigServer.mkdirs(outbasedir);
        generateInputFiles();
    }

    @After
    public void teardown() throws IOException {
        if(pigServer != null) {
            pigServer.shutdown();
        }
        deleteTestFiles();
    }

    private void generateInputFiles() throws IOException {
        String[] input = {"65536\tworld", "1\thello"};
        Util.createLocalInputFile(INPUT1, input);
    }

    private static void deleteTestFiles() throws IOException {
        Util.deleteDirectory(new File(outbasedir));
    }

    @Test
    public void testSimpleLoad() throws Exception {
        pigServer.registerQuery("A = load '" + basedir + "orc-file-11-format.orc'" + " using OrcStorage();" );
        Schema s = pigServer.dumpSchema("A");
        assertEquals(s.toString(), "{boolean1: boolean,byte1: int,short1: int,int1: int,long1: long," +
                "float1: float,double1: double,bytes1: bytearray,string1: chararray," +
                "middle: (list: {(int1: int,string1: chararray)}),list: {(int1: int,string1: chararray)}," +
                "map: map[(int1: int,string1: chararray)],ts: datetime,decimal1: bigdecimal}");
        Iterator<Tuple> iter = pigServer.openIterator("A");

        verifyData(new Path(basedir + "orc-file-11-format.orc"), iter, fs, 7500);
    }

    @Test
    public void testJoinWithPruning() throws Exception {
        pigServer.registerQuery("A = load '" + basedir + "orc-file-11-format.orc'" + " using OrcStorage();" );
        pigServer.registerQuery("B = foreach A generate int1, string1;");
        pigServer.registerQuery("C = order B by int1;");
        pigServer.registerQuery("D = limit C 10;");
        pigServer.registerQuery("E = load '" + INPUT1 + "' as (e0:int, e1:chararray);");
        pigServer.registerQuery("F = join D by int1, E by e0;");
        Iterator<Tuple> iter = pigServer.openIterator("F");
        int count=0;
        Tuple t=null;
        while (iter.hasNext()) {
            t = iter.next();
            assertEquals(t.size(), 4);
            count++;
        }
        assertEquals(count, 10);
    }

    @Test
    // See PIG-4195
    public void testCharVarchar() throws Exception {
        pigServer.registerQuery("A = load '" + basedir + "charvarchar.orc'" + " using OrcStorage();" );
        Schema schema = pigServer.dumpSchema("A");
        assertEquals(schema.size(), 4);
        assertEquals(schema.getField(0).type, DataType.CHARARRAY);
        assertEquals(schema.getField(1).type, DataType.CHARARRAY);
        Iterator<Tuple> iter = pigServer.openIterator("A");
        int count=0;
        Tuple t=null;
        while (iter.hasNext()) {
            t = iter.next();
            assertEquals(t.size(), 4);
            assertTrue(t.get(0) instanceof String);
            assertTrue(t.get(1) instanceof String);
            assertEquals(((String)t.get(1)).length(), 20);
            count++;
        }
        assertEquals(count, 10000);
    }

    @Test
    // See PIG-4218
    public void testNullMapKey() throws Exception {
        pigServer.registerQuery("A = load '" + basedir + "nullmapkey.orc'" + " using OrcStorage();" );
        Iterator<Tuple> iter = pigServer.openIterator("A");
        assertEquals(iter.next().toString(), "([hello#world])");
        assertEquals(iter.next().toString(), "([])");
        assertFalse(iter.hasNext());
    }

    @Test
    public void testSimpleStore() throws Exception {
        pigServer.registerQuery("A = load '" + INPUT1 + "' as (a0:int, a1:chararray);");
        pigServer.store("A", OUTPUT1, "OrcStorage");
        Path outputFilePath = new Path(new Path(OUTPUT1), "part-m-00000");
        Reader reader = OrcFile.createReader(fs, outputFilePath);
        assertEquals(reader.getNumberOfRows(), 2);

        RecordReader rows = reader.rows(null);
        Object row = rows.next(null);
        StructObjectInspector soi = (StructObjectInspector)reader.getObjectInspector();
        IntWritable intWritable = (IntWritable)soi.getStructFieldData(row,
                soi.getAllStructFieldRefs().get(0));
        Text text = (Text)soi.getStructFieldData(row,
                soi.getAllStructFieldRefs().get(1));
        assertEquals(intWritable.get(), 65536);
        assertEquals(text.toString(), "world");

        row = rows.next(null);
        intWritable = (IntWritable)soi.getStructFieldData(row,
                soi.getAllStructFieldRefs().get(0));
        text = (Text)soi.getStructFieldData(row,
                soi.getAllStructFieldRefs().get(1));
        assertEquals(intWritable.get(), 1);
        assertEquals(text.toString(), "hello");

        // A bug in ORC InputFormat does not allow empty file in input directory
        fs.delete(new Path(OUTPUT1, "_SUCCESS"), true);

        // Read the output file back
        pigServer.registerQuery("A = load '" + OUTPUT1 + "' using OrcStorage();");
        Schema s = pigServer.dumpSchema("A");
        assertEquals(s.toString(), "{a0: int,a1: chararray}");
        Iterator<Tuple> iter = pigServer.openIterator("A");
        Tuple t = iter.next();
        assertEquals(t.size(), 2);
        assertEquals(t.get(0), 65536);
        assertEquals(t.get(1), "world");

        t = iter.next();
        assertEquals(t.size(), 2);
        assertEquals(t.get(0), 1);
        assertEquals(t.get(1), "hello");

        assertFalse(iter.hasNext());
        rows.close();
    }

    @Test
    public void testMultiStore() throws Exception {
        pigServer.setBatchOn();
        pigServer.registerQuery("A = load '" + INPUT1 + "' as (a0:int, a1:chararray);");
        pigServer.registerQuery("B = order A by a0;");
        pigServer.registerQuery("store B into '" + OUTPUT2 + "' using OrcStorage();");
        pigServer.registerQuery("store B into '" + OUTPUT3 +"' using OrcStorage('-c SNAPPY');");
        pigServer.executeBatch();

        Path outputFilePath = new Path(new Path(OUTPUT2), "part-r-00000");
        Reader reader = OrcFile.createReader(fs, outputFilePath);
        assertEquals(reader.getNumberOfRows(), 2);
        assertEquals(reader.getCompression(), CompressionKind.ZLIB);

        Path outputFilePath2 = new Path(new Path(OUTPUT3), "part-r-00000");
        reader = OrcFile.createReader(fs, outputFilePath2);
        assertEquals(reader.getNumberOfRows(), 2);
        assertEquals(reader.getCompression(), CompressionKind.SNAPPY);

        verifyData(outputFilePath, outputFilePath2, fs, 2);
    }

    @Test
    public void testMultipleLoadStore() throws Exception {
        pigServer.registerQuery("A = load '" + basedir + "orc-file-11-format.orc'" + " using OrcStorage();" );
        pigServer.registerQuery("store A into '" + OUTPUT1 + "' using OrcStorage();");
        pigServer.registerQuery("B = load '" + OUTPUT1 + "' using OrcStorage();");
        verifyData(new Path(basedir + "orc-file-11-format.orc"), pigServer.openIterator("B"), fs, 7500);
    }

    @Test
    public void testLoadStoreMoreDataType() throws Exception {
        pigServer.registerQuery("A = load '" + basedir + "orc-file-11-format.orc'" + " using OrcStorage();" );
        pigServer.registerQuery("B = foreach A generate boolean1..double1, '' as bytes1, string1..;");
        pigServer.store("B", OUTPUT4, "OrcStorage");

        // A bug in ORC InputFormat does not allow empty file in input directory
        fs.delete(new Path(OUTPUT4, "_SUCCESS"), true);

        pigServer.registerQuery("A = load '" + OUTPUT4 + "' using OrcStorage();" );
        Iterator<Tuple> iter = pigServer.openIterator("A");
        Tuple t = iter.next();
        assertTrue(t.toString().startsWith("(false,1,1024,65536,9223372036854775807,1.0,-15.0," +
                ",hi,({(1,bye),(2,sigh)}),{(3,good),(4,bad)},[],"));
        assertTrue(t.get(12).toString().matches("2000-03-12T15:00:00.000.*"));
        assertTrue(t.toString().endsWith(",12345678.6547456)"));
    }

    private void verifyData(Path orcFile, Iterator<Tuple> iter, FileSystem fs, int expectedTotalRows) throws Exception {

        int expectedRows = 0;
        int actualRows = 0;
        Reader orcReader = OrcFile.createReader(fs, orcFile);
        ObjectInspector oi = orcReader.getObjectInspector();
        StructObjectInspector soi = (StructObjectInspector) oi;

        RecordReader reader = orcReader.rows(null);
        Object row = null;

        while (reader.hasNext()) {
            row = reader.next(row);
            expectedRows++;
            List<?> orcRow = soi.getStructFieldsDataAsList(row);
            if (!iter.hasNext()) {
                break;
            }
            Tuple t = iter.next();
            assertEquals(orcRow.size(), t.size());
            actualRows++;

            for (int i = 0; i < orcRow.size(); i++) {
                Object expected = orcRow.get(i);
                Object actual = t.get(i);
                compareData(expected, actual);
            }
        }
        assertFalse(iter.hasNext());
        assertEquals(expectedRows, actualRows);
        assertEquals(expectedTotalRows, actualRows);

    }

    private void verifyData(Path orcFile, Path pigOrcFile, FileSystem fs, int expectedTotalRows) throws Exception {

        int expectedRows = 0;
        int actualRows = 0;
        Reader orcReaderExpected = OrcFile.createReader(fs, orcFile);
        StructObjectInspector soiExpected = (StructObjectInspector) orcReaderExpected.getObjectInspector();
        Reader orcReaderActual = OrcFile.createReader(fs, orcFile);
        StructObjectInspector soiActual = (StructObjectInspector) orcReaderActual.getObjectInspector();

        RecordReader readerExpected = orcReaderExpected.rows(null);
        Object expectedRow = null;
        RecordReader readerActual = orcReaderActual.rows(null);
        Object actualRow = null;

        while (readerExpected.hasNext()) {
            expectedRow = readerExpected.next(expectedRow);
            expectedRows++;
            List<?> orcRowExpected = soiExpected.getStructFieldsDataAsList(expectedRow);
            if (!readerActual.hasNext()) {
                break;
            }
            actualRow = readerActual.next(actualRow);
            actualRows++;
            List<?> orcRowActual = soiActual.getStructFieldsDataAsList(actualRow);
            assertEquals(orcRowExpected.size(), orcRowActual.size());

            for (int i = 0; i < orcRowExpected.size(); i++) {
                assertEquals(orcRowExpected.get(i), orcRowActual.get(i));
            }
        }
        assertFalse(readerActual.hasNext());
        assertEquals(expectedRows, actualRows);
        assertEquals(expectedTotalRows, actualRows);

        readerExpected.close();
        readerActual.close();
    }

    @SuppressWarnings("rawtypes")
    private void compareData(Object expected, Object actual) {
        if (expected instanceof Text) {
            assertEquals(String.class, actual.getClass());
            assertEquals(expected.toString(), actual);
        } else if (expected instanceof ShortWritable) {
            assertEquals(Integer.class, actual.getClass());
            assertEquals((int)((ShortWritable) expected).get(), actual);
        } else if (expected instanceof IntWritable) {
            assertEquals(Integer.class, actual.getClass());
            assertEquals(((IntWritable) expected).get(), actual);
        } else if (expected instanceof LongWritable) {
            assertEquals(Long.class, actual.getClass());
            assertEquals(((LongWritable) expected).get(), actual);
        } else if (expected instanceof FloatWritable) {
            assertEquals(Float.class, actual.getClass());
            assertEquals(((FloatWritable) expected).get(), actual);
        } else if (expected instanceof HiveDecimalWritable) {
            assertEquals(BigDecimal.class, actual.getClass());
            assertEquals(((HiveDecimalWritable) expected).toString(), actual.toString());
        } else if (expected instanceof DoubleWritable) {
            assertEquals(Double.class, actual.getClass());
            assertEquals(((DoubleWritable) expected).get(), actual);
        } else if (expected instanceof BooleanWritable) {
            assertEquals(Boolean.class, actual.getClass());
            assertEquals(((BooleanWritable) expected).get(), actual);
        } else if (expected instanceof TimestampWritable) {
            assertEquals(DateTime.class, actual.getClass());
            assertEquals(((TimestampWritable) expected).getTimestamp().getTime(),
                    ((DateTime) actual).getMillis());
        } else if (expected instanceof BytesWritable) {
            assertEquals(DataByteArray.class, actual.getClass());
            BytesWritable bw = (BytesWritable) expected;
            assertEquals(new DataByteArray(bw.getBytes(), 0, bw.getLength()), actual);
        } else if (expected instanceof ByteWritable) {
            assertEquals(Integer.class, actual.getClass());
            assertEquals((int) ((ByteWritable) expected).get(), actual);
        } else if (expected instanceof OrcStruct) {
            assertEquals(BinSedesTuple.class, actual.getClass());
            // TODO: compare actual values. No getters in OrcStruct
        } else if (expected instanceof ArrayList) {
            assertEquals(DefaultDataBag.class, actual.getClass());
            // TODO: compare actual values. No getters in OrcStruct
        } else if (expected instanceof HashMap) {
            assertEquals(HashMap.class, actual.getClass());
            assertEquals(((HashMap) expected).size(), ((HashMap) actual).size());
            // TODO: compare actual values. No getters in OrcStruct
        } else if (expected == null) {
            assertEquals(expected, actual);
        } else {
            Assert.fail("Unknown object type: " + expected.getClass().getName());
        }
    }

}
TOP

Related Classes of org.apache.pig.builtin.TestOrcStorage

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.