if(!parseParameters(args)) {
return;
}
final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// get customer data set: (custkey, name, address, nationkey, acctbal)
DataSet<Tuple5<Integer, String, String, Integer, Double>> customers = getCustomerDataSet(env);
// get orders data set: (orderkey, custkey, orderdate)
DataSet<Tuple3<Integer, Integer, String>> orders = getOrdersDataSet(env);
// get lineitem data set: (orderkey, extendedprice, discount, returnflag)
DataSet<Tuple4<Integer, Double, Double, String>> lineitems = getLineitemDataSet(env);
// get nation data set: (nationkey, name)
DataSet<Tuple2<Integer, String>> nations = getNationsDataSet(env);
// orders filtered by year: (orderkey, custkey)
DataSet<Tuple2<Integer, Integer>> ordersFilteredByYear =
// filter by year
orders.filter(
new FilterFunction<Tuple3<Integer,Integer, String>>() {
@Override
public boolean filter(Tuple3<Integer, Integer, String> order) {
    // Field f2 is the order date; its first four characters are parsed as the year.
    final String yearPrefix = order.f2.substring(0, 4);
    // Keep only orders whose year is strictly after 1990.
    return Integer.parseInt(yearPrefix) > 1990;
}
})
// project fields out that are no longer required
.project(0,1).types(Integer.class, Integer.class);
// lineitems filtered by flag: (orderkey, extendedprice, discount)
DataSet<Tuple3<Integer, Double, Double>> lineitemsFilteredByFlag =
// filter by flag
lineitems.filter(new FilterFunction<Tuple4<Integer, Double, Double, String>>() {
@Override
public boolean filter(Tuple4<Integer, Double, Double, String> lineitem) throws Exception {
    // Field f3 is the return flag; only line items flagged "R" pass the filter.
    final String returnFlag = lineitem.f3;
    return returnFlag.equals("R");
}
})
// project fields out that are no longer required
.project(0,1,2).types(Integer.class, Double.class, Double.class);
// join orders with lineitems: (custkey, extendedprice, discount)
DataSet<Tuple3<Integer, Double, Double>> lineitemsOfCustomerKey =
ordersFilteredByYear.joinWithHuge(lineitemsFilteredByFlag)
.where(0).equalTo(0)
.projectFirst(1).projectSecond(1,2)
.types(Integer.class, Double.class, Double.class);
// aggregate for revenue: (custkey, revenue)
DataSet<Tuple2<Integer, Double>> revenueOfCustomerKey = lineitemsOfCustomerKey
// calculate the revenue for each item
.map(new MapFunction<Tuple3<Integer, Double, Double>, Tuple2<Integer, Double>>() {
@Override
public Tuple2<Integer, Double> map(Tuple3<Integer, Double, Double> item) {
    // Input layout: (custkey, extendedprice, discount).
    final double extendedPrice = item.f1;
    final double discount = item.f2;
    // revenue per item = l_extendedprice * (1 - l_discount)
    final double revenue = extendedPrice * (1 - discount);
    // Emit (custkey, revenue) so revenues can be summed per customer downstream.
    return new Tuple2<Integer, Double>(item.f0, revenue);
}
})
// aggregate the revenues per item to revenue per customer
.groupBy(0).aggregate(Aggregations.SUM, 1);
// join customer with nation (custkey, name, address, nationname, acctbal)
DataSet<Tuple5<Integer, String, String, String, Double>> customerWithNation = customers
.joinWithTiny(nations)
.where(3).equalTo(0)
.projectFirst(0,1,2).projectSecond(1).projectFirst(4)
.types(Integer.class, String.class, String.class, String.class, Double.class);
// join customer (with nation) with revenue (custkey, name, address, nationname, acctbal, revenue)
DataSet<Tuple6<Integer, String, String, String, Double, Double>> customerWithRevenue =
customerWithNation.join(revenueOfCustomerKey)
.where(0).equalTo(0)
.projectFirst(0,1,2,3,4).projectSecond(1)
.types(Integer.class, String.class, String.class, String.class, Double.class, Double.class);
// emit result
customerWithRevenue.writeAsCsv(outputPath);
// execute program
env.execute("TPCH Query 10 Example");
}