// Set the Argument values
ret = cl.clSetKernelArg(kernel, 0, is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devSrcA)); checkError("on clSetKernelArg0", ret);
ret = cl.clSetKernelArg(kernel, 1, is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devSrcB)); checkError("on clSetKernelArg1", ret);
ret = cl.clSetKernelArg(kernel, 2, is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devDst)); checkError("on clSetKernelArg2", ret);
ret = cl.clSetKernelArg(kernel, 3, SIZEOF_INT, wrap(ELEMENT_COUNT)); checkError("on clSetKernelArg3", ret);
out.println("used device memory: "+ (srcA.capacity()+srcB.capacity()+dest.capacity())/1000000 +"MB");
// Asynchronous write of data to GPU device
ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcA, CL.CL_FALSE, 0, srcA.capacity(), srcA, 0, null, null);