Package

Source Code of Example1

/*
*
*  Copyright © 2010, 2011 Inadco, Inc. All rights reserved.
*     Licensed under the Apache License, Version 2.0 (the "License");
*     you may not use this file except in compliance with the License.
*     You may obtain a copy of the License at
*         http://www.apache.org/licenses/LICENSE-2.0
*     Unless required by applicable law or agreed to in writing, software
*     distributed under the License is distributed on an "AS IS" BASIS,
*     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*     See the License for the specific language governing permissions and
*     limitations under the License.
*/

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.math.BigInteger;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Deque;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Random;
import java.util.TimeZone;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.pig.ExecType;
import org.apache.pig.impl.PigContext;
import org.apache.pig.tools.grunt.Grunt;
import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;

import com.google.protobuf.ByteString;
import com.inadco.hb.example1.codegen.Example1.CompilerInput;
import com.inadco.hbl.client.AggregateQuery;
import com.inadco.hbl.client.AggregateResult;
import com.inadco.hbl.client.AggregateResultSet;
import com.inadco.hbl.client.HblAdmin;
import com.inadco.hbl.client.HblException;
import com.inadco.hbl.client.HblQueryClient;
import com.inadco.hbl.client.PreparedAggregateQuery;
import com.inadco.hbl.client.PreparedAggregateResult;
import com.inadco.hbl.compiler.Pig8CubeIncrementalCompilerBean;
import com.inadco.hbl.math.aggregators.OnlineCannyAvgSummarizer;
import com.inadco.hbl.util.HblUtil;
import com.inadco.hbl.util.IOUtil;

/**
* To run, use the hadoop command line.
*
* @author dmitriy
*
*/

public class Example1 extends Configured implements Tool {

    public static void main(String[] args) throws Throwable {

        ToolRunner.run(new Example1(), args);

    }

    private static ExecType      EXEC_TYPE  = ExecType.MAPREDUCE;
    private static final boolean QUERY_ONLY = false;

    private HblQueryClient       queryClient;
    private Deque<Closeable>     closeables = new ArrayDeque<Closeable>();

    @Override
    public int run(String[] args) throws Exception {
        try {

            // script resource
            Resource cubeModelRsrc = new ClassPathResource("example1.yaml");

            /*
             * deploy cube schema (optionally dropping the existing one)
             * WARNING: would drop existing cube!!
             */
            HblAdmin hblAdmin = new HblAdmin(cubeModelRsrc);
            if (!QUERY_ONLY) {
                hblAdmin.dropCube(getConf());
                hblAdmin.deployCube(getConf());
            }

            String cubeName = hblAdmin.getCube().getName();

            /*
             * prepare incremental simulated input and select work dir for the
             * compiler job
             */

            FileSystem dfs =
                EXEC_TYPE == ExecType.MAPREDUCE ? FileSystem.get(getConf()) : FileSystem.getLocal(getConf());
            Path workPath = new Path(dfs.getWorkingDirectory(), "hbltemp-" + System.currentTimeMillis());
            Path inputPath = new Path(dfs.getWorkingDirectory(), "sample1-input" + System.currentTimeMillis());

            simulateInput(dfs, inputPath);

            // run compiler for the model
            Pig8CubeIncrementalCompilerBean compiler =
                new Pig8CubeIncrementalCompilerBean(
                    getConf(),
                    cubeName,
                    new ClassPathResource("example1-preambula.pig"),
                    5);
            /*
             * test fact compile time exclusion to allow merging different fact
             * stream sources
             */

            compiler.setMeasureExclude(new HashSet<String>(Arrays.asList("excludedMeasure")));

            // or:
            // compiler.setMeasureInclude(new
            // HashSet<String>(Arrays.asList("impCnt", "click")));

            /*
             * this is the version that uses model from resource instead of hbl
             * system table.
             */
            // new Pig8CubeIncrementalCompilerBean(cubeModelRsrc, new
            // ClassPathResource("example1-preambula.pig"), 5);

            String script = compiler.preparePigSource(workPath.toString());

            // ////////////////////////////////////
            // ------------- debug: dump the script
            Path dumpDir = new Path(inputPath, "__debug");
            dfs.mkdirs(dumpDir);
            Path scriptDumpPath = new Path(dumpDir, "compiler.pig");
            System.out.printf("script saved at:%s\n", scriptDumpPath.toString());
            FSDataOutputStream fsdos = dfs.create(scriptDumpPath);
            try {
                fsdos.writeUTF(script);
            } finally {
                fsdos.close();
            }
            // ------------- debug: dump the script
            // ////////////////////////////////////

            if (!QUERY_ONLY)
                runScript(script, inputPath);

            queryClient = new HblQueryClient(getConf());
            closeables.addFirst(queryClient);

            testClient1(cubeName);
            testClient2(cubeName);
            testClient3(cubeName);
            testClient4(cubeName);

            // query based tests
            testClient5(cubeName);
            testClient6(cubeName);
            testClient7(cubeName);
            testClient8(cubeName);
            testClient9(cubeName);

            return 0;

        } finally {
            IOUtil.closeAllQuietly(closeables);
        }
    }

    private void testClient1(String cubeName) throws IOException, HblException {
        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            byte ids[][] = new byte[2][];
            ids[0] = new byte[16];
            ids[1] = new byte[16];
            HblUtil.incrementKey(ids[1], 0, 16);

            /*
             * this should be equivalent to select aggr_func(impCnt),
             * aggr_func(click) from ... where dim1<=ids[0] and dim1>=ids[0]
             * group by dim1
             */

            AggregateQuery query = queryClient.createQuery();
            query.setCube(cubeName).addMeasure("impCnt").addMeasure("click");
            query.addClosedSlice("dim1", ids[0], ids[0]).addGroupBy("dim1");
            AggregateResultSet rs = query.execute();
            closeables.addFirst(rs);
            while (rs.hasNext()) {
                rs.next();
                AggregateResult ar = rs.current();
                System.out.printf("%032X sum/cnt: impCnt %.4f/%d, click %.4f/%d\n",
                                  new BigInteger(1, (byte[]) ar.getGroupMember("dim1")),
                                  ar.getAggregate("impCnt", "SUM"),
                                  ar.getAggregate("impCnt", "COUNT"),
                                  ar.getAggregate("click", "SUM"),
                                  ar.getAggregate("click", "COUNT"));
            }

            closeables.remove(rs);
            rs.close();

        } finally {
            IOUtil.closeAll(closeables);
        }

    }

    private void testClient2(String cubeName) throws IOException, HblException {
        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            byte ids[][] = new byte[2][];
            ids[0] = new byte[16];
            ids[1] = new byte[16];
            HblUtil.incrementKey(ids[1], 0, 16);

            /**
             * Now, more difficult. try to hit both keys lifetime. This will
             * result in composite key filtering with a restart (most
             * fundamental composite key filtering capability but only one part
             * of the key). This now passes too.
             *
             */
            AggregateQuery query = queryClient.createQuery();

            query.setCube(cubeName).addMeasure("impCnt").addMeasure("click");
            query.addClosedSlice("dim1", ids[0], ids[1]).addGroupBy("dim1");
            AggregateResultSet rs = query.execute();
            closeables.addFirst(rs);
            while (rs.hasNext()) {
                rs.next();
                AggregateResult ar = rs.current();
                System.out.printf("%032X sum/cnt: impCnt %.4f/%d, click %.4f/%d\n",
                                  new BigInteger(1, (byte[]) ar.getGroupMember("dim1")),
                                  ar.getAggregate("impCnt", "SUM"),
                                  ar.getAggregate("impCnt", "COUNT"),
                                  ar.getAggregate("click", "SUM"),
                                  ar.getAggregate("click", "COUNT"));
            }
            closeables.remove(rs);
            rs.close();

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    private void testClient3(String cubeName) throws IOException, HblException {
        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {
            HblQueryClient queryClient = new HblQueryClient(getConf());
            closeables.addFirst(queryClient);

            byte ids[][] = new byte[2][];
            ids[0] = new byte[16];
            ids[1] = new byte[16];
            HblUtil.incrementKey(ids[1], 0, 16);

            /**
             * same as client2 but print the summaries separately (no grouping).
             * This is obviously not terribly useful, the queries have got to
             * have group specification -- -- unless we group up all of it.
             */
            AggregateQuery query = queryClient.createQuery();

            query.setCube(cubeName).addMeasure("impCnt").addMeasure("click");
            query.addClosedSlice("dim1", ids[0], ids[1])/* .addGroupBy("dim1") */;
            AggregateResultSet rs = query.execute();
            closeables.addFirst(rs);
            while (rs.hasNext()) {
                rs.next();
                AggregateResult ar = rs.current();
                System.out.printf("%s sum/cnt: impCnt %.4f/%d, click %.4f/%d\n",
                // new BigInteger(1,(byte[])ar.getGroupMember("dim1")),
                                  "no-group",
                                  ar.getAggregate("impCnt", "SUM"),
                                  ar.getAggregate("impCnt", "COUNT"),
                                  ar.getAggregate("click", "SUM"),
                                  ar.getAggregate("click", "COUNT"));
            }
            closeables.remove(rs);
            rs.close();

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    private void testClient4(String cubeName) throws IOException, HblException {
        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            AggregateQuery query = queryClient.createQuery();
            query.setCube(cubeName);
            for (int i = 0; i < 5; i++) {

                byte ids[][] = new byte[2][];
                ids[0] = new byte[16];
                ids[1] = new byte[16];
                HblUtil.incrementKey(ids[1], 0, 16);

                /*
                 * will try to also constrain for half-open [1:00am,3:00am)
                 */

                GregorianCalendar startTime = IOUtil.tryClone(START_BASE);
                GregorianCalendar endTime = IOUtil.tryClone(START_BASE);

                /*
                 * this will be in local time, whereas example was generated in
                 * UTC. So for PST we get actually normally hours from 9,10 am.
                 * which would result in impression count of 17 for key 00000,
                 * and 21 ifor key 000001.
                 */
                startTime.add(Calendar.HOUR_OF_DAY, 1);
                endTime.add(Calendar.HOUR_OF_DAY, 3);
                // recalculate the calendars
                startTime.getTimeInMillis();
                endTime.getTimeInMillis();

                /*
                 * same as client2 but print the summaries separately (no
                 * grouping).
                 */

                query.addMeasure("impCnt").addMeasure("click");
                query.addClosedSlice("dim1", ids[0], ids[1]).addGroupBy("dim1");
                query.addHalfOpenSlice("impressionTime", startTime, endTime);

                long ms = System.currentTimeMillis();
                AggregateResultSet rs = query.execute();
                closeables.addFirst(rs);
                while (rs.hasNext()) {
                    rs.next();
                    AggregateResult ar = rs.current();
                    System.out.printf("%032X sum/cnt: impCnt %.4f/%d, click %.4f/%d\n",
                                      new BigInteger(1, (byte[]) ar.getGroupMember("dim1")),
                                      ar.getAggregate("impCnt", "SUM"),
                                      ar.getAggregate("impCnt", "COUNT"),
                                      ar.getAggregate("click", "SUM"),
                                      ar.getAggregate("click", "COUNT"));
                }
                closeables.remove(rs);
                rs.close();

                System.out.printf("query+printout complete in %d ms\n", System.currentTimeMillis() - ms);
            }

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    private void testClient5(String cubeName) throws IOException, HblException {

        System.out.println("Test5:\n\n");

        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            byte ids[][] = new byte[2][];
            ids[0] = new byte[16];
            ids[1] = new byte[16];
            HblUtil.incrementKey(ids[1], 0, 16);

            /*
             * will try to also constrain for half-open [1:00am,3:00am)
             */

            GregorianCalendar startTime = IOUtil.tryClone(START_BASE);
            GregorianCalendar endTime = IOUtil.tryClone(START_BASE);

            // our actual example generated facts in utc zone of that day.
            startTime.setTimeZone(TimeZone.getTimeZone("UTC"));
            endTime.setTimeZone(TimeZone.getTimeZone("UTC"));

            // flush
            startTime.getTimeInMillis();
            endTime.getTimeInMillis();

            // modify time-of-day-wise
            startTime.add(Calendar.HOUR_OF_DAY, 1);
            endTime.add(Calendar.HOUR_OF_DAY, 3);
            // recalculate the calendars
            startTime.getTimeInMillis();
            endTime.getTimeInMillis();

            PreparedAggregateQuery query = queryClient.createPreparedQuery();

            /*
             * test reuse of the prepared query. Should speedup stuff exactly as
             * prepared query is supposed to do. we also have an option of
             * re-preparing query at any time, but we still need to run reset()
             * to clean out stuff like parameters initialized and execution.
             * reset() does not necessarily cancel previously existing AST tree
             * of the query, only prepare() updates that. but prepare does
             * reset() implicitly, so if we re-prepared the query, the previous
             * parameter set cannot be used.
             */
            long ms = System.currentTimeMillis();
            query.prepare("select dim1, SUM(impCnt) as ?, COUNT(impCnt) as ?, SUM(click) as clickSum, "
                + "COUNT(click) as clickCnt, cannyAvg7d(clickTimeSeries) as ctr " +

                "from Example1 where dim1 in [?] " + ", impressionTime in [?,?) " + ", dim2 in [ '1' ]"
                + "group by dim1");
            System.out.printf("query prepared in %d ms\n", System.currentTimeMillis() - ms);

            for (int i = 0; i < 5; i++) {

                /**
                 * same as client2 but print the summaries separately (no
                 * grouping).
                 *
                 */
                ms = System.currentTimeMillis();

                // demo: can parameterize aliases
                // or measure names in the select expression.
                query.setHblParameter(0, "impSum");
                query.setHblParameter(1, "impCnt");

                query.setHblParameter(2, ids[1]);
                // query.setHblParameter(3, ids[1]);
                query.setHblParameter(3, startTime);
                query.setHblParameter(4, endTime);

                // query.addMeasure("impCnt").addMeasure("click");
                // query.addClosedSlice("dim1",ids[0],ids[1]).addGroupBy("dim1");
                // query.addHalfOpenSlice("impressionTime", startTime, endTime);

                AggregateResultSet rs = query.execute();
                closeables.addFirst(rs);
                while (rs.hasNext()) {
                    rs.next();
                    PreparedAggregateResult ar = (PreparedAggregateResult) rs.current();

                    OnlineCannyAvgSummarizer ctrSum = (OnlineCannyAvgSummarizer) ar.getObject("ctr");
                    double wctr = ctrSum == null ? 0 : ctrSum.getValue();

                    Double impSum = (Double) ar.getObject("impSum");
                    if (impSum == null)
                        impSum = new Double(0);
                    Double clickSum = (Double) ar.getObject("clickSum");
                    if (clickSum == null)
                        clickSum = new Double(0);

                    System.out.printf("%032X sum/cnt: impCnt %.4f/%d, click %.4f/%d, ctr: %.4f, weighted ctr: %.4f \n",

                                      new BigInteger(1, (byte[]) ar.getObject(0)),
                                      ar.getObject("impSum"),
                                      ar.getObject("impCnt"),
                                      ar.getObject("clickSum"),
                                      ar.getObject("clickCnt"),
                                      clickSum / impSum,
                                      wctr);
                }
                closeables.remove(rs);
                rs.close();

                System.out.printf("query+printout complete in %d ms\n", System.currentTimeMillis() - ms);
            }

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    private void testClient6(String cubeName) throws IOException, HblException {

        System.out.println("Test6:\n\n");

        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            PreparedAggregateQuery query = queryClient.createPreparedQuery();

            /*
             * test reuse of the prepared query. Should speedup stuff exactly as
             * prepared query is supposed to do. we also have an option of
             * re-preparing query at any time, but we still need to run reset()
             * to clean out stuff like parameters initialized and execution.
             * reset() does not necesserily cancel previously existing AST tree
             * of the query, only prepare() updates that. but prepare does
             * reset() implicitly, so if we re-prepared the query, the previous
             * parameter set cannot be used.
             */
            long ms = System.currentTimeMillis();
            query.prepare("select SUM(impCnt) as imp, SUM(click) as click, cannyAvg7d(clickTimeSeries) as wctr7d,"
                + "cannyAvg90d(clickTimeSeries) as wctr90d " + " " + "from Example1 where dim1 in [?]");
            System.out.printf("query prepared in %d ms\n", System.currentTimeMillis() - ms);

            for (int i = 0; i < 3; i++) {

                /*
                 * same as client2 but print the summaries separately (no
                 * grouping).
                 */
                ms = System.currentTimeMillis();

                query.setHblParameter(0, i);

                AggregateResultSet rs = query.execute();
                closeables.addFirst(rs);
                while (rs.hasNext()) {
                    rs.next();
                    PreparedAggregateResult ar = (PreparedAggregateResult) rs.current();
                    System.out.printf("dim1: %032X impCnt %.4f clickCnt %.4f ctr %.4f wctr 7d %.4f, wctr90d %.4f \n",
                                      i, /*
                                          * new BigInteger(1, (byte[])
                                          * ar.getObject("dim1")),
                                          */
                                      ar.getObject("imp"),
                                      ar.getObject("click"),
                                      (Double) ar.getObject("click") / (Double) ar.getObject("imp"),
                                      ((OnlineCannyAvgSummarizer) ar.getObject(/* 2 */"wctr7d")).getValue(),
                                      ((OnlineCannyAvgSummarizer) ar.getObject("wctr90d")).getValue());
                }
                closeables.remove(rs);
                rs.close();

                System.out.printf("query+printout complete in %d ms\n", System.currentTimeMillis() - ms);
            }

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    /**
     * Month - spanning test
     *
     * @param cubeName
     * @throws IOException
     * @throws HblException
     */
    private void testClient7(String cubeName) throws IOException, HblException {

        System.out.println("Test7:\n\n");

        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            byte ids[][] = new byte[2][];
            ids[0] = new byte[16];
            ids[1] = new byte[16];
            HblUtil.incrementKey(ids[1], 0, 16);

            /**
             * will try to also constrain for half-open [1:00am,3:00am)
             */

            GregorianCalendar startTime = IOUtil.tryClone(START_BASE);
            GregorianCalendar endTime = IOUtil.tryClone(START_BASE);

            // our actual example generated facts in utc zone of that day.
            startTime.setTimeZone(TimeZone.getTimeZone("UTC"));
            endTime.setTimeZone(TimeZone.getTimeZone("UTC"));

            // flush
            startTime.getTimeInMillis();
            endTime.getTimeInMillis();

            // modify time-of-day-wise
            startTime.add(Calendar.HOUR_OF_DAY, 1);
            endTime.add(Calendar.HOUR_OF_DAY, 3);
            endTime.add(Calendar.MONTH, 9);

            // recalculate the calendars
            startTime.getTimeInMillis();
            endTime.getTimeInMillis();

            PreparedAggregateQuery query = queryClient.createPreparedQuery();

            /*
             * test reuse of the prepared query. Should speedup stuff exactly as
             * prepared query is supposed to do. we also have an option of
             * re-preparing query at any time, but we still need to run reset()
             * to clean out stuff like parameters initialized and execution.
             * reset() does not necessarily cancel previously existing AST tree
             * of the query, only prepare() updates that. but prepare does
             * reset() implicitly, so if we re-prepared the query, the previous
             * parameter set cannot be used.
             */
            long ms = System.currentTimeMillis();
            query.prepare("select dim1, SUM(impCnt) as ?, COUNT(impCnt) as ?, SUM(click) as clickSum, "
                + "COUNT(click) as clickCnt, cannyAvg7d(clickTimeSeries) as ctr " +

                "from Example1 where impressionTime in [?,?), dim1 in [?] " + "group by dim1");
            System.out.printf("query prepared in %d ms\n", System.currentTimeMillis() - ms);

            // warm up helps?
            for (int k = 0; k < 200; k++) {

                for (int i = 0; i < 2; i++) {

                    /**
                     * same as client2 but print the summaries separately (no
                     * grouping).
                     *
                     */
                    ms = System.currentTimeMillis();

                    // demo: can parameterize aliases
                    // or measure names in the select expression.
                    query.setHblParameter(0, "impSum");
                    query.setHblParameter(1, "impCnt");

                    query.setHblParameter(2, startTime);
                    query.setHblParameter(3, endTime);

                    query.setHblParameter(4, i);

                    AggregateResultSet rs = query.execute();
                    closeables.addFirst(rs);
                    while (rs.hasNext()) {
                        rs.next();
                        PreparedAggregateResult ar = (PreparedAggregateResult) rs.current();

                        OnlineCannyAvgSummarizer ctrSum = (OnlineCannyAvgSummarizer) ar.getObject("ctr");
                        Double wctr = ctrSum == null ? null : ctrSum.getValue();

                        Double impSum = (Double) ar.getObject("impSum");
                        if (impSum == null)
                            impSum = new Double(0);
                        Double clickSum = (Double) ar.getObject("clickSum");
                        if (clickSum == null)
                            clickSum = new Double(0);

                        System.out
                            .printf("%032X sum/cnt: impCnt %.4f/%d, click %.4f/%d, ctr: %.4f, weighted ctr: %.4f \n",

                                    new BigInteger(1, (byte[]) ar.getObject(0)),
                                    ar.getObject("impSum"),
                                    ar.getObject("impCnt"),
                                    ar.getObject("clickSum"),
                                    ar.getObject("clickCnt"),
                                    clickSum / impSum,
                                    wctr);
                    }
                    closeables.remove(rs);
                    rs.close();

                    System.out.printf("query+printout complete in %d ms\n", System.currentTimeMillis() - ms);
                }
            }

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    private void testClient8(String cubeName) throws IOException, HblException {

        System.out.println("Test8:\n\n");

        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            byte ids[][] = new byte[2][];
            ids[0] = new byte[16];
            ids[1] = new byte[16];
            HblUtil.incrementKey(ids[1], 0, 16);

            /**
             * will try to also constrain for half-open [1:00am,3:00am)
             */

            GregorianCalendar startTime = IOUtil.tryClone(START_BASE);
            GregorianCalendar endTime = IOUtil.tryClone(START_BASE);

            // our actual example generated facts in utc zone of that day.
            startTime.setTimeZone(TimeZone.getTimeZone("UTC"));
            endTime.setTimeZone(TimeZone.getTimeZone("UTC"));

            // flush
            startTime.getTimeInMillis();
            endTime.getTimeInMillis();

            // modify time-of-day-wise
            startTime.add(Calendar.HOUR_OF_DAY, 1);
            endTime.add(Calendar.HOUR_OF_DAY, 3);
            endTime.add(Calendar.MONTH, 9);

            // recalculate the calendars
            startTime.getTimeInMillis();
            endTime.getTimeInMillis();

            PreparedAggregateQuery query = queryClient.createPreparedQuery();

            /*
             * test reuse of the prepared query. Should speedup stuff exactly as
             * prepared query is supposed to do. we also have an option of
             * re-preparing query at any time, but we still need to run reset()
             * to clean out stuff like parameters initialized and execution.
             * reset() does not necessarily cancel previously existing AST tree
             * of the query, only prepare() updates that. but prepare does
             * reset() implicitly, so if we re-prepared the query, the previous
             * parameter set cannot be used.
             */
            long ms = System.currentTimeMillis();
            query.prepare("select dim1, charDim1, SUM(impCnt) as ?, COUNT(impCnt) as ?, SUM(click) as clickSum, "
                + "COUNT(click) as clickCnt, cannyAvg7d(clickTimeSeries) as ctr " +

                "from Example1 where impressionTime in [?,?), dim1 in [?] " + "group by dim1, charDim1");
            System.out.printf("query prepared in %d ms\n", System.currentTimeMillis() - ms);

            for (int i = 0; i < 2; i++) {

                /**
                 * same as client2 but print the summaries separately (no
                 * grouping).
                 *
                 */
                ms = System.currentTimeMillis();

                // demo: can parameterize aliases
                // or measure names in the select expression.
                query.setHblParameter(0, "impSum");
                query.setHblParameter(1, "impCnt");

                query.setHblParameter(2, startTime);
                query.setHblParameter(3, endTime);

                query.setHblParameter(4, i);

                AggregateResultSet rs = query.execute();
                closeables.addFirst(rs);
                while (rs.hasNext()) {
                    rs.next();
                    PreparedAggregateResult ar = (PreparedAggregateResult) rs.current();

                    OnlineCannyAvgSummarizer ctrSum = (OnlineCannyAvgSummarizer) ar.getObject("ctr");
                    Double wctr = ctrSum == null ? null : ctrSum.getValue();

                    // sum has semantics of being NULL for sum of all NULLs
                    Double impSum = (Double) ar.getObject("impSum");
                    if (impSum == null)
                        impSum = new Double(0);
                    Double clickSum = (Double) ar.getObject("clickSum");
                    if (clickSum == null)
                        clickSum = new Double(0);

                    System.out
                        .printf("%032X (charDim=%s) sum/cnt: impCnt %.4f/%d, click %.4f/%d, ctr: %.4f, weighted ctr: %.4f \n",

                                new BigInteger(1, (byte[]) ar.getObject(0)),
                                ar.getObject("charDim1"),
                                ar.getObject("impSum"),
                                ar.getObject("impCnt"),
                                ar.getObject("clickSum"),
                                ar.getObject("clickCnt"),
                                clickSum / impSum,
                                wctr);
                }
                closeables.remove(rs);
                rs.close();

                System.out.printf("query+printout complete in %d ms\n", System.currentTimeMillis() - ms);
            }

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    private void testClient9(String cubeName) throws IOException, HblException {

        System.out.println("Test9:\n\n");

        Deque<Closeable> closeables = new ArrayDeque<Closeable>();
        try {

            PreparedAggregateQuery query = queryClient.createPreparedQuery();

            /*
             * test reuse of the prepared query. Should speedup stuff exactly as
             * prepared query is supposed to do. we also have an option of
             * re-preparing query at any time, but we still need to run reset()
             * to clean out stuff like parameters initialized and execution.
             * reset() does not necessarily cancel previously existing AST tree
             * of the query, only prepare() updates that. but prepare does
             * reset() implicitly, so if we re-prepared the query, the previous
             * parameter set cannot be used.
             */
            long ms = System.currentTimeMillis();
            query.prepare("select dim1, dim2, charDim1, SUM(impCnt) as impCnt "
                + "from Example1 group by dim1, charDim1,dim2");
            System.out.printf("query prepared in %d ms\n", System.currentTimeMillis() - ms);

            ms = System.currentTimeMillis();

            // demo: can parameterize aliases
            // or measure names in the select expression.

            AggregateResultSet rs = query.execute();
            closeables.addFirst(rs);
            while (rs.hasNext()) {
                rs.next();
                PreparedAggregateResult ar = (PreparedAggregateResult) rs.current();

                System.out.printf("dim1=%032X (cdim1=%s) dim2=%032X\n",

                new BigInteger(1, (byte[]) ar.getObject("dim1")), ar.getObject("charDim1"), new BigInteger(
                    1,
                    (byte[]) ar.getObject("dim2")));
            }
            closeables.remove(rs);
            rs.close();

            System.out.printf("query+printout complete in %d ms\n", System.currentTimeMillis() - ms);

        } finally {
            IOUtil.closeAll(closeables);
        }
    }

    /**
     * Number of one-hour steps simulateInput generates per simulated month
     * (24 * 5, i.e. five days' worth of hours).
     */
    private static final int               N          = 24 * 5;
    /**
     * Threshold compared against Random.nextDouble() in simulateInput: a
     * generated record gets click=1 when the random draw falls below it.
     */
    private static final double            clickRate  = 0.25;
    /**
     * Base date of the simulated impressions. java.util.Calendar months are
     * 0-based, so month 8 is September: the base is September 1, 2011.
     * simulateInput clones this value for every simulated month rather than
     * modifying it.
     */
    private static final GregorianCalendar START_BASE = new GregorianCalendar(2011, 8, 1);

    private void simulateInput(FileSystem fs, Path inputDir) throws IOException {
        Deque<Closeable> closeables = new ArrayDeque<Closeable>();

        byte[] idBytes = new byte[16];

        ByteString[] id = new ByteString[2];
        id[0] = ByteString.copyFrom(idBytes);
        HblUtil.incrementKey(idBytes, 0, idBytes.length);
        id[1] = ByteString.copyFrom(idBytes);

        Random rnd = new Random();

        try {

            // how many months to simulate
            int months = 10;

            Path inpFile = new Path(inputDir, "example1");
            fs.mkdirs(inputDir);
            SequenceFile.Writer w =
                SequenceFile.createWriter(fs, getConf(), inpFile, IntWritable.class, BytesWritable.class);
            closeables.addFirst(w);
            IntWritable iw = new IntWritable();
            BytesWritable bw = new BytesWritable();

            for (int mo = 0; mo < months; mo++) {
                GregorianCalendar start = IOUtil.tryClone(START_BASE);
                start.setTimeZone(TimeZone.getTimeZone("UTC"));
                // flush the cal
                start.getTimeInMillis();
                start.add(Calendar.MONTH, mo);

                for (int i = 0; i < N; i++) {
                    for (int k = 0; k < 2; k++) {
                        for (int j = 0; j < i + k + 1; j++) {
                            CompilerInput.Builder inp = CompilerInput.newBuilder();
                            inp.setDim1(id[k]);
                            inp.setCharDim1("dim1-as-"+(k+1));
                            inp.setDim2(id[k]);
                            //                            inp.setCharDim2("dim2-as-" + (k + 1));
                            inp.clearCharDim2(); // simulate NULL
                            inp.setDim3(id[k]);
                            inp.setCharDim3("dim3-as-" + (k + 1));
                            inp.setImpressionTime(start.getTimeInMillis());
                            inp.setImpCnt(1);
                            if (rnd.nextDouble() < clickRate)
                                inp.setClick(1);
                            byte[] b = inp.build().toByteArray();
                            bw.set(b, 0, b.length);
                            w.append(iw, bw);
                        }
                    }
                    start.add(Calendar.HOUR_OF_DAY, 1);
                }
            }

            // simulate deficient input
            // simulate empty time, should be substituted
            // by 01/01/1970 00:00:00
            CompilerInput.Builder inp = CompilerInput.newBuilder();
            inp.setDim1(id[0]);
            inp.setDim2(id[0]);
            inp.setDim3(id[0]);
            byte[] b = inp.build().toByteArray();
            bw.set(b, 0, b.length);
            w.append(iw, bw);

        } finally {
            IOUtil.closeAll(closeables);
        }

    }

    private void runScript(String script, Path inputPath) throws IOException {

        try {
            /*
             * this is a pig-version-specific hack to use grunt and its
             * preprocessors in sort of embedded mode. AFAIK it's not official
             * Pig's way to do this
             */
            PigContext pc = new PigContext();

            pc.setExecType(EXEC_TYPE);
            pc.getProperties().setProperty("pig.logfile", "pig.log");
            pc.getProperties().setProperty(PigContext.JOB_NAME, "sample1-compiler-run");

            /*
             * HACK! we probably should get the location of the job jar thru
             * some other way in this example. Since this is just an sample
             * (meaning one's real pipeline framework should figure its own way
             * to configure job jars and kick off pig scripts) and not really
             * part of the real deal, we probably can afford not to go out of
             * our way with this right now.
             */

            File[] jobJars = new File("target").listFiles(new FileFilter() {

                @Override
                public boolean accept(File pathname) {
                    return pathname.getName().matches("sample-\\d+\\.\\d+\\.\\d+(-SNAPSHOT)?-hadoop-job.jar");
                }
            });
            if (jobJars == null || jobJars.length == 0)
                throw new IOException(
                    "hadoop job jar was not found, please rebuild and run from $HBL_HOME/sample location..");

            for (File f : jobJars)
                pc.addJar(f.getAbsolutePath());

            /*
             * pig-preprocess. We specified hbl input as $input in the
             * preambula, so we now need to substitute it using Grunt's
             * preprocessor.
             */

            ParameterSubstitutionPreprocessor psp = new ParameterSubstitutionPreprocessor(512);
            StringWriter sw = new StringWriter();
            BufferedReader br = new BufferedReader(new StringReader(script));
            psp.genSubstitutedFile(br, sw, new String[] { "input=" + inputPath }, null);
            sw.close();

            script = sw.toString();
            sw = null;
            br = null;

            Grunt grunt = new Grunt(new BufferedReader(new StringReader(script)), pc);

            int[] codes = grunt.exec();

            int failed = codes[1];
            int succeeded = codes[0];

            System.out.printf("pig jobs failed:%d, pig jobs succeeded:%d.\n", failed, succeeded);

            if (failed != 0)
                throw new IOException("Pig script execution failed, some jobs failed. Check the pig log for errors.");

        } catch (Throwable thr) {
            // sorry, Grunt really declares Throwable to be thrown
            if (thr instanceof IOException)
                throw (IOException) thr;
            throw new IOException(thr);
        }

    }

}
TOP

Related Classes of Example1

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.