// select the fastest device (by theoretical peak FLOPS)
CLDevice device = context.getMaxFlopsDevice();
out.println("using "+device);
// create a command queue on the selected device.
CLCommandQueue queue = device.createCommandQueue();
int elementCount = 1444477; // Length of arrays to process
int localWorkSize = min(device.getMaxWorkGroupSize(), 256); // local work size, capped at the device's maximum work-group size
int globalWorkSize = roundUp(localWorkSize, elementCount); // rounded up to the nearest multiple of the localWorkSize
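// roundUp is a small helper defined elsewhere in the class; a minimal sketch
// of what it is assumed to do (pad globalSize to the next multiple of groupSize):
//
//     private static int roundUp(int groupSize, int globalSize) {
//         int r = globalSize % groupSize;
//         return r == 0 ? globalSize : globalSize + groupSize - r;
//     }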
// load sources, create and build program
CLProgram program = context.createProgram(HelloJOCL.class.getResourceAsStream("VectorAdd.cl")).build();
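// VectorAdd.cl is assumed to contain a kernel along these lines (a sketch, not
// necessarily the shipped file): each work item adds one pair of elements and
// returns early for the padding work items beyond elementCount:
//
//     kernel void VectorAdd(global const float* a, global const float* b,
//                           global float* c, int numElements) {
//         int iGID = get_global_id(0);
//         if (iGID >= numElements)
//             return;
//         c[iGID] = a[iGID] + b[iGID];
//     }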
// A and B are input buffers, C receives the result; all three are sized to the
// padded globalWorkSize rather than elementCount
CLBuffer<FloatBuffer> clBufferA = context.createFloatBuffer(globalWorkSize, READ_ONLY);
CLBuffer<FloatBuffer> clBufferB = context.createFloatBuffer(globalWorkSize, READ_ONLY);
CLBuffer<FloatBuffer> clBufferC = context.createFloatBuffer(globalWorkSize, WRITE_ONLY);
out.println("used device memory: "
+ (clBufferA.getCLSize()+clBufferB.getCLSize()+clBufferC.getCLSize())/1000000 +"MB");
// fill the input buffers with random numbers
// (just to have test data; the seeds are fixed, so results do not change between runs).
fillBuffer(clBufferA.getBuffer(), 12345);
fillBuffer(clBufferB.getBuffer(), 67890);
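// fillBuffer is another local helper; a plausible sketch (an assumption, defined
// elsewhere in the class) that fills the buffer with seeded random floats via
// java.util.Random and rewinds it so the device write starts at position 0:
//
//     private static void fillBuffer(FloatBuffer buffer, int seed) {
//         Random rnd = new Random(seed);
//         while (buffer.remaining() != 0)
//             buffer.put(rnd.nextFloat() * 100);
//         buffer.rewind();
//     }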
// get a reference to the kernel function named 'VectorAdd'
// and bind the buffers and the element count to its parameters.
CLKernel kernel = program.createCLKernel("VectorAdd");
kernel.putArgs(clBufferA, clBufferB, clBufferC).putArg(elementCount);
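// putArgs binds the buffers to the kernel's parameters in declaration order;
// elementCount is passed last so the kernel can ignore the padded work items
// between elementCount and globalWorkSize.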
// asynchronous writes of the input data to the device, the kernel launch,
// and a blocking read to fetch the computed results.
long time = nanoTime();
queue.putWriteBuffer(clBufferA, false)
.putWriteBuffer(clBufferB, false)
.put1DRangeKernel(kernel, 0, globalWorkSize, localWorkSize)
.putReadBuffer(clBufferC, true);
time = nanoTime() - time;
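// a minimal wrap-up sketch (assumption): report the elapsed time, peek at a few
// results, and release the context; in the full program the release belongs in
// a finally block so resources are also freed on errors
out.println("computation took: " + (time / 1000000) + "ms");
out.println("a+b=c results snapshot: ");
for (int i = 0; i < 10; i++)
    out.print(clBufferC.getBuffer().get() + ", ");
out.println("...; " + clBufferC.getBuffer().remaining() + " more");
context.release(); // also releases the queue, program, kernel, and buffers created from this context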