// Reservoir-style sampling over the records of one input split.
// NOTE(review): this looks adapted from Hadoop's InputSampler.RandomSampler —
// confirm against that reference when changing the sampling math.
long readElement = 0;
while (reader.next(key, value))
{
collector.clear();
// Parse the raw <key, value> record into a Tuple once, so it can be
// merged with each key the join-mapper emits below.
Tuple tuple = mapper.parse(key, value);
readElement++;
// Hard cap on records read from a single split. Both operands are
// widened to long before multiplying so the cap does not overflow int
// for large numSamples * proportion.
if (readElement> (((long)numSamples)*((long)proportion)) )
{
// a split might be very big (ex: a large gz file),
// so we cap how many records we read from any one split
// instead of scanning it to the end.
break;
}
// Keep this record with probability freq (Bernoulli trial per record).
if (r.nextDouble() <= freq)
{
if (samples.size() < numSamples)
{
// Reservoir not yet full: run the join-map and add every emitted
// key to the sample set.
mapper.joinmap(key, value, collector, Reporter.NULL);
// joinmap function might generate more than one output key
// per <code>key</code> input.
for( Tuple t:collector.getOutKey() )
{
// Merge the parsed record with the emitted key, then wrap it in
// the composite join key used for sorting/partitioning.
Tuple mt = Tuple.merge(tuple, t);
DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
samples.add(nkey);
}
}
else
{
// When exceeding the maximum number of samples, replace
// a random element with this one, then adjust the
// frequency to reflect the possibility of existing
// elements being pushed out
// NOTE(review): the frequency adjustment described above is not
// visible in this chunk — presumably it follows below; verify.
mapper.joinmap(key, value, collector, Reporter.NULL);
for( Tuple t:collector.getOutKey() )
{
int ind = r.nextInt(numSamples);
// NOTE(review): r.nextInt(numSamples) returns a value in
// [0, numSamples), so this guard is always true (dead check).
// The same pattern appears in Hadoop's InputSampler.RandomSampler;
// it is harmless but could be removed.
if (ind != numSamples)
{
Tuple mt = Tuple.merge(tuple, t);
DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
// Overwrite a uniformly-random existing sample (reservoir replace).
samples.set(ind, nkey);
}
}