Package com.cloudera.science.pig

Source Code of com.cloudera.science.pig.Quantile

/**
* Copyright 2011 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.science.pig;

import java.io.IOException;
import java.util.List;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

import com.cloudera.science.quantile.MunroPatersonQuantileEstimator;
import com.cloudera.science.quantile.QuantileEstimator;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

/**
* A Pig UDF for computing quantiles from a {@link DataBag} of numeric values.
* It returns a {@code DataBag} of {@link Tuple} values with two fields:
* a zero-indexed integer that represents the quantile and a double value
* that is actual quantile estimate.
*
* <p>By default, this class uses the Munro-Paterson algorithm for estimating
* quantiles in a streaming fashion. Subclasses may override the
* {@code createEstimator} method in this class to return a different
* implementation.
*
*/
public class Quantile extends EvalFunc<DataBag> {

  private final TupleFactory tupleFactory = TupleFactory.getInstance();
  private final BagFactory bagFactory = BagFactory.getInstance();
  private final int numQuantiles;
 
  public Quantile(String numQuantiles) {
    this.numQuantiles = Integer.valueOf(numQuantiles);
  }
 
  protected QuantileEstimator createEstimator() {
    return new MunroPatersonQuantileEstimator(numQuantiles);
  }
 
  @Override
  public DataBag exec(Tuple input) throws IOException {
    DataBag output = bagFactory.newDefaultBag();
    Object o = input.get(0);
    if (!(o instanceof DataBag)) {
      throw new IOException("Expected input to be a bag, but got: " + o.getClass().getName());
    }
    DataBag inputBag = (DataBag) o;
    QuantileEstimator estimator = createEstimator();
   
    for (Tuple t : inputBag) {
      if (t != null && t.get(0) != null) {
        estimator.add(((Number) t.get(0)).doubleValue());
      }
    }
   
    List<Double> quantiles = estimator.getQuantiles();
    for (int i = 0; i < quantiles.size(); i++) {
      output.add(tupleFactory.newTuple(ImmutableList.of(i, quantiles.get(i))));
    }
    return output;
  }

  private boolean isNumeric(byte pigType) {
    return pigType == DataType.DOUBLE || pigType == DataType.FLOAT ||
        pigType == DataType.INTEGER || pigType == DataType.LONG;
  }
 
  @Override
  public Schema outputSchema(Schema input) {
    if (input.size() != 1) {
      throw new IllegalArgumentException("Expected a bag; input has > 1 field");
    }
    try {
      if (input.getField(0).type != DataType.BAG) {
        throw new IllegalArgumentException("Expected a bag; found: " +
            DataType.findTypeName(input.getField(0).type));
      }
      if (input.getField(0).schema.size() != 1) {
        throw new IllegalArgumentException("The bag must contain a single field");
      }
      byte bagType = input.getField(0).schema.getField(0).type;
      if (bagType == DataType.TUPLE) {
        bagType = input.getField(0).schema.getField(0).schema.getField(0).type;
      }
      if (!isNumeric(bagType)) {
        throw new IllegalArgumentException("The bag's field must be a numeric type");
      }
     
      List<FieldSchema> fields = Lists.newArrayList(new FieldSchema("quantile", DataType.INTEGER),
          new FieldSchema("value", DataType.DOUBLE));
      Schema tupleSchema = new Schema(fields);
     
      FieldSchema tupleFieldSchema = new FieldSchema("t", tupleSchema,
          DataType.TUPLE);
     
      Schema bagSchema = new Schema(tupleFieldSchema);
      bagSchema.setTwoLevelAccessRequired(true);
      FieldSchema bagFieldSchema = new FieldSchema("b", bagSchema, DataType.BAG);
      return new Schema(bagFieldSchema);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
}
TOP

Related Classes of com.cloudera.science.pig.Quantile

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.