if(in==null || in.size()==0)
return null;
Integer numQuantiles = null;
DataBag samples = null;
ArrayList<Tuple> quantilesList = new ArrayList<Tuple>();
InternalMap weightedParts = new InternalMap();
// the sample file contains a single tuple of the form:
// (numQuantiles, bag of samples)
// where numQuantiles is the degree of reduce parallelism
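// e.g. with 5 reducers and 100 collected samples (illustrative):
// in = (5, {(s0), (s1), ..., (s99)})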
try{
numQuantiles = (Integer)in.get(0);
samples = (DataBag)in.get(1);
long numSamples = samples.size();
long toSkip = numSamples / numQuantiles;
if(toSkip == 0) {
// there are fewer samples than quantiles;
// clamp numQuantiles down to numSamples
numQuantiles = (int)numSamples;
toSkip = 1;
}
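// First pass: keep every toSkip-th sample as a quantile boundary.
// e.g. with 100 samples and numQuantiles = 5, this selects the
// samples at indices 19, 39, 59, 79 and 99.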
long ind=0, j=-1, nextQuantile = toSkip-1;
for (Tuple it : samples) {
if (ind==nextQuantile){
++j;
quantilesList.add(it);
nextQuantile+=toSkip;
if(j==numQuantiles-1)
break;
}
ind++;
if (ind % 1000 == 0) progress();
}
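// Second pass: for each sample element that equals a quantile
// boundary, count how many of its copies fall into each partition.
// These counts decide how duplicate keys that straddle a boundary
// are split across partitions in the final sort reduce job.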
long i=-1;
Map<Tuple,CountingMap<Integer>> contribs = new HashMap<Tuple, CountingMap<Integer>>();
for (Tuple it : samples){
++i;
if (i % 1000 == 0) progress();
int partInd = (int)(i / toSkip); // which partition this sample falls in
if(partInd==numQuantiles) break;
// quantilesList holds, for each partition, the sample element that
// is the last element of that partition. For example: if numQuantiles
// is 5 and the number of samples is 100, then toSkip = 20 and
// quantiles[0] = sample[19] // the 20th element
// quantiles[1] = sample[39] // the 40th element
// and so on. For any sample element with index between 0 and 19,
// partInd will be 0. We want to check whether a sample element that
// lies between indices 0 and 19 is also the 19th element (quantiles[0]).
// Such an element might spread over both the 0th and the 1st
// partition, and we are looking for the contributions it makes to
// each partition.
// First, we only check sample elements in partitions other than the
// last one, i.e. partInd < numQuantiles - 1 (partInd is 0-indexed).
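// Worked example (illustrative): if samples 17-20 all hold the same
// key, indices 17-19 match quantiles[0] and land in partInd 0, while
// index 20 lands in partInd 1, so contribs for that key ends up as
// {0: 3, 1: 1}.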
if(partInd<numQuantiles-1 && areEqual(it,quantilesList.get(partInd))){
if(!contribs.containsKey(it)){
CountingMap<Integer> cm = new CountingMap<Integer>();
cm.put(partInd, 1);
contribs.put(it, cm);
}
else
contribs.get(it).put(partInd, 1);
}
else{
// we are either in the last partition (last quantile), or the
// sample element we are currently processing is not the same as
// the boundary element in quantilesList for this partition.
// If we haven't seen this sample item before, it is not an
// element that crosses partitions - so ignore it.
if(!contribs.containsKey(it))
continue;
else
// we have seen this sample before (in a previous partInd), so
// add to the contribution associated with it - had we not seen
// it in a previous partInd, it would not be in the contribs map
// (because of the check above). This "key" (represented by the
// sample item) can go to either the previous partInd or this
// partInd in the final sort reduce stage; the amount of
// contribution to each partInd is what will influence that choice.
contribs.get(it).put(partInd, 1);
}
}
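// Final pass: convert the raw per-partition counts of each straddling
// key into a probability vector over all partitions.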
int k = 0;
for(Entry<Tuple, CountingMap<Integer>> ent : contribs.entrySet()){
if (k++ % 1000 == 0) progress();
Tuple key = ent.getKey(); // a sample item which repeats across partitions
// this map will have the contributions of the sample item to the different partitions
CountingMap<Integer> value = ent.getValue();
long total = value.getTotalCount();
Tuple probVec = mTupleFactory.newTuple(numQuantiles.intValue());
// initialize all contribution fractions for different
// partitions to 0.0
for (int l = 0; l < numQuantiles; l++) {
probVec.set(l, 0.0f);
}
// for each partition that this sample item is present in,
// compute the fraction of the total occurrences for that
// partition - this will be the probability with which we
// will pick this partition in the final sort reduce job
// for this sample item
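// Continuing the worked example: contribs {0: 3, 1: 1} has a total
// of 4, so probVec becomes (0.75, 0.25, 0.0, 0.0, 0.0).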
for (Entry<Integer,Integer> valEnt : value.entrySet()) {
probVec.set(valEnt.getKey(), (float)valEnt.getValue()/total);
}
weightedParts.put(key, probVec);
}
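// Package the results: the quantile boundaries and, for each
// straddling key, its partition-probability vector.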
output.put(QUANTILES_LIST, mBagFactory.newDefaultBag(quantilesList));
output.put(WEIGHTED_PARTS, weightedParts);
return output;
}catch (Exception e){