convertToArray(quantilesList);
long i = -1;
// Maps a sample value to the per-partition contributions recorded for it.
// Only values that matched a partition's boundary element (quantiles[partInd])
// at least once ever get an entry.
Map<PigNullableWritable, CountingMap<Integer>> contribs =
    new HashMap<PigNullableWritable, CountingMap<Integer>>();
for (Tuple it : samples) {
    ++i;
    PigNullableWritable sample = getPigNullableWritable(it);
    // Which partition this sample position falls in. A primitive narrowing
    // cast yields the same value as boxing into a Long and calling intValue().
    int partInd = (int) (i / toSkip);
    if (partInd == numQuantiles) {
        // Positions at or beyond numQuantiles * toSkip are not examined.
        break;
    }
    // The quantiles array holds, for each partition, the sample element that
    // is the last element of that partition. For example, if numQuantiles is
    // 5 and the number of samples is 100, then toSkip = 20 and
    //   quantiles[0] = sample[19] // the 20th element
    //   quantiles[1] = sample[39] // the 40th element
    // and so on. Every sample at positions 0..19 has partInd == 0. We want to
    // detect a sample in positions 0..19 that is equal to the element at
    // position 19 (quantiles[0]): such a value might spread over the 0th and
    // 1st partitions, and we are collecting the contributions such values
    // make to each partition.
    // The boundary comparison is only made for partitions other than the
    // last one, i.e. partInd < numQuantiles - 1 (partInd is 0-indexed).
    boolean matchesBoundary =
        partInd < numQuantiles - 1 && areEqual(sample, quantiles[partInd]);

    CountingMap<Integer> counts = contribs.get(sample);
    if (counts == null) {
        if (!matchesBoundary) {
            // We are either in the last partition, or this sample differs
            // from the boundary element of its partition. Since it has not
            // been seen as a boundary value earlier, it is not a value that
            // crosses partitions, so ignore it.
            continue;
        }
        // First time this value is seen as a partition's boundary element.
        counts = new CountingMap<Integer>();
        contribs.put(sample, counts);
    }
    // Either the sample matches the boundary of the current partition, or it
    // was already recorded for a previous partInd. In the latter case it can
    // go to the previous partInd or to this partInd in the final sort reduce
    // stage; that is where the amount of contribution to each partInd will
    // matter and influence the choice.
    counts.put(partInd, 1);
}
for(Entry<PigNullableWritable, CountingMap<Integer>> ent : contribs.entrySet()){
PigNullableWritable key = ent.getKey(); // sample item which repeats
// this map will have the contributions of the sample item to the different partitions
CountingMap<Integer> value = ent.getValue();
long total = value.getTotalCount();