// TODO Only power-of-two array lengths are supported so far
dir = (dir != 0) ? 1 : 0;
CLKernel sortlocal1 = kernels.get(BITONIC_SORT_LOCAL1);
CLKernel sortlocal = kernels.get(BITONIC_SORT_LOCAL);
CLKernel mergeGlobal = kernels.get(BITONIC_MERGE_GLOBAL);
CLKernel mergeLocal = kernels.get(BITONIC_MERGE_LOCAL);
if (arrayLength <= LOCAL_SIZE_LIMIT) {
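// Precondition (not checked here; the original assertion is left disabled):
// (batch * arrayLength) % LOCAL_SIZE_LIMIT == 0, so that the global work size
// computed below is a whole multiple of the local work size.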
//Launch bitonicSortLocal
sortlocal.putArgs(dstKey, srcKey)
.putArg(arrayLength).putArg(dir).rewind();
int localWorkSize = LOCAL_SIZE_LIMIT / 2;
int globalWorkSize = batch * arrayLength / 2;
queue.put1DRangeKernel(sortlocal, 0, globalWorkSize, localWorkSize);
} else {
//Launch bitonicSortLocal1
sortlocal1.setArgs(dstKey, srcKey);
int localWorkSize = LOCAL_SIZE_LIMIT / 2;
int globalWorkSize = batch * arrayLength / 2;
queue.put1DRangeKernel(sortlocal1, 0, globalWorkSize, localWorkSize);
for (int size = 2 * LOCAL_SIZE_LIMIT; size <= arrayLength; size <<= 1) {
for (int stride = size / 2; stride > 0; stride >>= 1) {
if (stride >= LOCAL_SIZE_LIMIT) {
//Launch bitonicMergeGlobal
mergeGlobal.putArgs(dstKey, dstKey)
.putArg(arrayLength).putArg(size).putArg(stride).putArg(dir).rewind();
globalWorkSize = batch * arrayLength / 2;
queue.put1DRangeKernel(mergeGlobal, 0, globalWorkSize, 0);
} else {
//Launch bitonicMergeLocal
mergeLocal.putArgs(dstKey, dstKey)
.putArg(arrayLength).putArg(stride).putArg(size).putArg(dir).rewind();
localWorkSize = LOCAL_SIZE_LIMIT / 2;
globalWorkSize = batch * arrayLength / 2;