/**
* Copyright 2012 LiveRamp
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.liveramp.cascading_ext.bloom;
import cascading.flow.Flow;
import cascading.flow.FlowStep;
import cascading.flow.FlowStepStrategy;
import cascading.flow.planner.BaseFlowStep;
import cascading.stats.FlowStepStats;
import com.liveramp.cascading_ext.assembly.CreateBloomFilter;
import com.liveramp.cascading_ext.counters.Counters;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import java.util.List;
import java.util.Map;
/**
 * {@link FlowStepStrategy} that adjusts the {@link JobConf} of flow steps taking part in a bloom
 * filter assembly.
 * <p>
 * A step is recognised by the properties present in its configuration:
 * <ul>
 * <li>if {@link BloomProps#TARGET_BLOOM_FILTER_ID} is set, the reducer count and sort-record
 * percentage are taken from {@link BloomProps};</li>
 * <li>if {@link BloomProps#SOURCE_BLOOM_FILTER_ID} is set, the step is the filter which needs to
 * use the bloom filter: it is configured from the counters of the predecessor step that carries the
 * same ID as its target, and the filter is written out and registered in the distributed cache.</li>
 * </ul>
 * Both properties may be present on the same step, in which case both adjustments are applied.
 */
public class BloomAssemblyStrategy implements FlowStepStrategy<JobConf> {
  private static final Logger LOG = Logger.getLogger(BloomAssemblyStrategy.class);

  /**
   * Applies the bloom-specific configuration to {@code flowStep}, if it is marked with either of
   * the bloom filter ID properties. Steps carrying neither property are left untouched.
   *
   * @param flow             the flow the step belongs to (not used here)
   * @param predecessorSteps the steps handed in by the planner as predecessors of {@code flowStep};
   *                         searched for the step that generated the required bloom filter
   * @param flowStep         the step whose configuration is adjusted in place
   * @throws RuntimeException if preparing the bloom filter for a consuming step fails
   */
  @Override
  public void apply(Flow<JobConf> flow, List<FlowStep<JobConf>> predecessorSteps, FlowStep<JobConf> flowStep) {
    JobConf conf = flowStep.getConfig();

    // the step is marked as the target of a bloom filter ID (presumably the job creating the
    // filter parts -- confirm against CreateBloomFilter)
    String targetBloomID = conf.get(BloomProps.TARGET_BLOOM_FILTER_ID);
    if (targetBloomID != null) {
      prepareBloomFilterBuilder(flowStep);
    }

    // the job is the filter which needs to use the bloom filter
    String sourceBloomID = conf.get(BloomProps.SOURCE_BLOOM_FILTER_ID);
    if (sourceBloomID != null) {
      buildBloomfilter(sourceBloomID, flowStep, predecessorSteps);
    }
  }

  /**
   * Sets {@code mapred.reduce.tasks} and {@code io.sort.record.percent} on the step's configuration
   * from the values {@link BloomProps} reports for that configuration.
   *
   * @param currentStep the step whose configuration is updated
   */
  private void prepareBloomFilterBuilder(FlowStep<JobConf> currentStep) {
    JobConf currentStepConf = currentStep.getConfig();
    currentStepConf.set("mapred.reduce.tasks", Integer.toString(BloomProps.getNumSplits(currentStepConf)));
    currentStepConf.set("io.sort.record.percent", Double.toString(BloomProps.getIOSortPercent(currentStepConf)));
  }

  /**
   * Merges bloom filter parts created across multiple splits of the keys and put the result in the
   * distributed cache.
   * <p>
   * The current step always gets {@code io.sort.mb} (from {@link BloomProps}) and
   * {@code mapred.job.reuse.jvm.num.tasks=-1}. For every predecessor step whose
   * {@link BloomProps#TARGET_BLOOM_FILTER_ID} equals {@code bloomID}, the sampled key/tuple size
   * counters of that step are averaged and used to derive bloom filter properties, the
   * distributed-cache properties for the required filter path are added (appended with a comma if
   * the key already has a value), and the filter is written to the required path using the
   * predecessor's configuration. If no predecessor matches, a warning is logged and nothing else
   * is changed.
   *
   * @param bloomID          ID of the bloom filter the current step consumes
   * @param currentStep      the consuming step; its configuration is updated in place
   * @param predecessorSteps candidate steps that may have generated the filter
   * @throws RuntimeException wrapping any exception raised while configuring or writing the filter
   */
  private void buildBloomfilter(String bloomID, FlowStep<JobConf> currentStep, List<FlowStep<JobConf>> predecessorSteps) {
    try {
      JobConf currentStepConf = currentStep.getConfig();
      currentStepConf.set("io.sort.mb", Integer.toString(BloomProps.getBufferSize(currentStepConf)));
      // -1 is Hadoop's setting for reusing a task JVM without limit
      currentStepConf.set("mapred.job.reuse.jvm.num.tasks", "-1");

      String requiredBloomPath = currentStepConf.get(BloomProps.REQUIRED_BLOOM_FILTER_PATH);
      boolean foundGeneratingStep = false;

      for (FlowStep<JobConf> step : predecessorSteps) {
        JobConf prevStepConf = step.getConfig();
        String targetBloomID = prevStepConf.get(BloomProps.TARGET_BLOOM_FILTER_ID);
        if (!bloomID.equals(targetBloomID)) {
          continue;
        }

        foundGeneratingStep = true;
        LOG.info("Found step generating required bloom filter: " + targetBloomID);

        // Extract the counters from the previous job to approximate the average key/tuple size
        FlowStepStats stats = ((BaseFlowStep<JobConf>) step).getFlowStepStats();

        // Collect some of the stats gathered. This will help configure the bloom filter
        long numSampled = Counters.get(stats, CreateBloomFilter.StatsCounters.TOTAL_SAMPLED_TUPLES);
        long keySizeSum = Counters.get(stats, CreateBloomFilter.StatsCounters.KEY_SIZE_SUM);
        long matchSizeSum = Counters.get(stats, CreateBloomFilter.StatsCounters.TUPLE_SIZE_SUM);

        int avgKeySize = averageSize(keySizeSum, numSampled);
        int avgMatchSize = averageSize(matchSizeSum, numSampled);
        LOG.info("Avg key size ~= " + avgKeySize);
        LOG.info("Avg match size ~= " + avgMatchSize);

        for (Map.Entry<String, String> entry : BloomUtil.getPropertiesForBloomFilter(avgMatchSize, avgKeySize).entrySet()) {
          currentStepConf.set(entry.getKey(), entry.getValue());
        }

        // Put merged result in distributed cache
        LOG.info("Adding dist cache properties to config:");
        for (Map.Entry<String, String> prop : BloomUtil.getPropertiesForDistCache(requiredBloomPath).entrySet()) {
          LOG.info(prop.getKey() + " = " + prop.getValue());
          appendProperty(currentStepConf, prop.getKey(), prop.getValue());
        }

        BloomUtil.writeFilterToHdfs(prevStepConf, requiredBloomPath);
      }

      if (!foundGeneratingStep) {
        // Without a generating step none of the bloom filter properties above were set; make the
        // situation visible instead of returning silently.
        LOG.warn("No predecessor step found generating required bloom filter: " + bloomID);
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to create bloom filter!", e);
    }
  }

  /**
   * Returns the truncated integer average {@code sizeSum / sampleCount}, or 0 when nothing was
   * sampled.
   */
  private static int averageSize(long sizeSum, long sampleCount) {
    if (sampleCount == 0) {
      return 0;
    }
    return (int) (sizeSum / sampleCount);
  }

  /**
   * Sets {@code key} to {@code value} on {@code conf}; if the key already has a value, the new value
   * is appended after a comma so the existing entries are kept.
   */
  private static void appendProperty(JobConf conf, String key, String value) {
    String previousProperty = conf.get(key);
    if (previousProperty != null) {
      LOG.info("found already existing value for key: " + key + ", found " + previousProperty + ". Appending.");
      conf.set(key, previousProperty + "," + value);
    } else {
      conf.set(key, value);
    }
  }
}