@Override
public int run(String[] args) throws Exception {
// Construct an HDFS dataset repository rooted at /tmp/data
DatasetRepository fsRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");
// Construct an HCatalog dataset repository using external Hive tables
DatasetRepository hcatRepo = DatasetRepositories.open("repo:hive:/tmp/data");
// Turn debug on while in development.
getPipeline().enableDebug();
getPipeline().getConfiguration().set("crunch.log.job.progress", "true");
// Load the events dataset and get the correct partition to sessionize
Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
Dataset<StandardEvent> partition;
if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
partition = getLatestPartition(eventsDataset);
} else {
partition = getPartitionForURI(eventsDataset, args[0]);
}
// Create a parallel collection from the working partition
PCollection<StandardEvent> events = read(
CrunchDatasets.asSource(partition, StandardEvent.class));
// Process the events into sessions, using a combiner
PCollection<Session> sessions = events
.parallelDo(new DoFn<StandardEvent, Session>() {
@Override
public void process(StandardEvent event, Emitter<Session> emitter) {
emitter.emit(Session.newBuilder()
.setUserId(event.getUserId())
.setSessionId(event.getSessionId())
.setIp(event.getIp())
.setStartTimestamp(event.getTimestamp())
.setDuration(0)
.setSessionEventCount(1)
.build());
}
}, Avros.specifics(Session.class))
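// Key each single-event session by (userId, sessionId) so records from the same session end up in one group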
.by(new MapFn<Session, Pair<Long, String>>() {
@Override
public Pair<Long, String> map(Session session) {
return Pair.of(session.getUserId(), session.getSessionId());
}
}, Avros.pairs(Avros.longs(), Avros.strings()))
.groupByKey()
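// Merge each group of single-event sessions into one combined Session per (userId, sessionId) key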
.combineValues(new CombineFn<Pair<Long, String>, Session>() {
@Override
public void process(Pair<Pair<Long, String>, Iterable<Session>> pairIterable,
Emitter<Pair<Pair<Long, String>, Session>> emitter) {
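// Track the earliest start, latest end, and total event count across the group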
String ip = null;
long startTimestamp = Long.MAX_VALUE;
long endTimestamp = Long.MIN_VALUE;
int sessionEventCount = 0;
for (Session s : pairIterable.second()) {
ip = s.getIp();
startTimestamp = Math.min(startTimestamp, s.getStartTimestamp());
endTimestamp = Math.max(endTimestamp, s.getStartTimestamp() + s.getDuration());
sessionEventCount += s.getSessionEventCount();
}
emitter.emit(Pair.of(pairIterable.first(), Session.newBuilder()
.setUserId(pairIterable.first().first())
.setSessionId(pairIterable.first().second())
.setIp(ip)
.setStartTimestamp(startTimestamp)
.setDuration(endTimestamp - startTimestamp)
.setSessionEventCount(sessionEventCount)
.build()));
}
})
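// Drop the grouping key, keeping only the combined Session values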
.parallelDo(new DoFn<Pair<Pair<Long, String>, Session>, Session>() {
@Override
public void process(Pair<Pair<Long, String>, Session> pairSession,
Emitter<Session> emitter) {
emitter.emit(pairSession.second());
}
}, Avros.specifics(Session.class));
// Write the sessions to the "sessions" Dataset
getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")),
Target.WriteMode.APPEND);
return run().succeeded() ? 0 : 1;
}