// find all available OpenCL platforms
ret = cl.clGetPlatformIDs(0, null, intBuffer);
checkForError(ret);
assertTrue(intBuffer.get(0) > 0);
NativeSizeBuffer pb = NativeSizeBuffer.allocateDirect(intBuffer.get(0));
ret = cl.clGetPlatformIDs(pb.capacity(), pb, null);
checkForError(ret);
long platform = pb.get(0);
NativeSizeBuffer properties = NativeSizeBuffer.allocateDirect(3).put(CL.CL_CONTEXT_PLATFORM)
.put(platform).put(0) // 0 terminated array
.rewind();
long context = cl.clCreateContextFromType(properties, CL.CL_DEVICE_TYPE_ALL, null, null);
out.println("context handle: "+context);
checkError("on clCreateContextFromType", ret);
NativeSizeBuffer longBuffer = NativeSizeBuffer.allocateDirect(1);
ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, 0, null, longBuffer);
checkError("on clGetContextInfo", ret);
int deviceCount = (int) (longBuffer.get(0) / (is32Bit() ? 4 : 8));
out.println("context created with " + deviceCount + " devices");
ByteBuffer bb = newDirectByteBuffer(4096);
ret = cl.clGetContextInfo(context, CL.CL_CONTEXT_DEVICES, bb.capacity(), bb, null);
checkError("on clGetContextInfo", ret);
for (int i = 0; i < deviceCount; i++) {
out.println("device id: " + (is32Bit()?bb.getInt():bb.getLong()));
}
// use a random device
int offset = new Random().nextInt(deviceCount);
out.println("using device# " + offset);
offset *= (is32Bit() ? 4 : 8);
long device = is32Bit()?bb.getInt(offset):bb.getLong(offset);
ret = cl.clGetDeviceInfo(device, CL.CL_DEVICE_MAX_WORK_GROUP_SIZE, bb.capacity(), bb, null);
checkError("on clGetDeviceInfo", ret);
int maxWGS = bb.getInt();
out.println("max WGS: " + maxWGS);
// Create a command-queue
long commandQueue = cl.clCreateCommandQueue(context, device, 0, intBuffer);
checkError("on clCreateCommandQueue", intBuffer.get(0));
int localWorkSize = Math.min(128, maxWGS); // set and log Global and Local work size dimensions
int globalWorkSize = roundUp(localWorkSize, ELEMENT_COUNT); // rounded up to the nearest multiple of the LocalWorkSize
out.println("allocateing buffers of size: "+globalWorkSize);
ByteBuffer srcA = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
ByteBuffer srcB = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
ByteBuffer dest = newDirectByteBuffer(globalWorkSize*SIZEOF_INT);
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
long devSrcA = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcA.capacity(), null, intBuffer);
checkError("on clCreateBuffer", intBuffer.get(0));
long devSrcB = cl.clCreateBuffer(context, CL.CL_MEM_READ_ONLY, srcB.capacity(), null, intBuffer);
checkError("on clCreateBuffer", intBuffer.get(0));
long devDst = cl.clCreateBuffer(context, CL.CL_MEM_WRITE_ONLY, dest.capacity(), null, intBuffer);
checkError("on clCreateBuffer", intBuffer.get(0));
// Create the program
NativeSizeBuffer lengths = NativeSizeBuffer.allocateDirect(1).put(programSource.length()).rewind();
final long program = cl.clCreateProgramWithSource(context, 1, new String[] {programSource}, lengths, intBuffer);
out.println("program id: "+program);
checkError("on clCreateProgramWithSource", intBuffer.get(0));
// tests if the callback is called
final CountDownLatch latch = new CountDownLatch(1);
BuildProgramCallback callback = new BuildProgramCallback() {
@Override
public void buildFinished(long cl_program) {
try{
assertEquals(program, cl_program);
}finally{
latch.countDown();
}
}
};
// spec: building programs is not threadsafe (see loadtest)
synchronized(CLProgram.class) {
// Build the program
ret = cl.clBuildProgram(program, 0, null, null, callback);
checkError("on clBuildProgram", ret);
out.println("waiting for program to build...");
latch.await();
}
out.println("done");
// Read program infos
bb.rewind();
ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_NUM_DEVICES, bb.capacity(), bb, null);
checkError("on clGetProgramInfo1", ret);
out.println("program associated with "+bb.getInt(0)+" device(s)");
ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, 0, null, longBuffer);
checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret);
out.println("program source length (cl): "+longBuffer.get(0));
out.println("program source length (java): "+programSource.length());
ret = cl.clGetProgramInfo(program, CL.CL_PROGRAM_SOURCE, bb.capacity(), bb, null);
checkError("on clGetProgramInfo CL_PROGRAM_SOURCE", ret);
out.println("program source:\n" + clString2JavaString(bb, (int)longBuffer.get(0)));
// Check program status
ret = cl.clGetProgramBuildInfo(program, device, CL.CL_PROGRAM_BUILD_STATUS, bb.capacity(), bb, null);
checkError("on clGetProgramBuildInfo1", ret);
out.println("program build status: " + CLProgram.Status.valueOf(bb.getInt(0)));
assertEquals("build status", CL.CL_BUILD_SUCCESS, bb.getInt(0));
// Read build log
ret = cl.clGetProgramBuildInfo(program, device, CL.CL_PROGRAM_BUILD_LOG, 0, null, longBuffer);
checkError("on clGetProgramBuildInfo2", ret);
out.println("program log length: " + longBuffer.get(0));
bb.rewind();
ret = cl.clGetProgramBuildInfo(program, device, CL.CL_PROGRAM_BUILD_LOG, bb.capacity(), bb, null);
checkError("on clGetProgramBuildInfo3", ret);
out.println("log:\n" + clString2JavaString(bb, (int)longBuffer.get(0)));
// Create the kernel
long kernel = cl.clCreateKernel(program, "VectorAdd", intBuffer);
out.println("kernel id: "+kernel);
checkError("on clCreateKernel", intBuffer.get(0));
// srcA.limit(elementCount*SIZEOF_FLOAT);
// srcB.limit(elementCount*SIZEOF_FLOAT);
fillBuffer(srcA, 23456);
fillBuffer(srcB, 46987);
// Set the Argument values
ret = cl.clSetKernelArg(kernel, 0, is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devSrcA)); checkError("on clSetKernelArg0", ret);
ret = cl.clSetKernelArg(kernel, 1, is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devSrcB)); checkError("on clSetKernelArg1", ret);
ret = cl.clSetKernelArg(kernel, 2, is32Bit()?SIZEOF_INT:SIZEOF_LONG, wrap(devDst)); checkError("on clSetKernelArg2", ret);
ret = cl.clSetKernelArg(kernel, 3, SIZEOF_INT, wrap(ELEMENT_COUNT)); checkError("on clSetKernelArg3", ret);
out.println("used device memory: "+ (srcA.capacity()+srcB.capacity()+dest.capacity())/1000000 +"MB");
// Asynchronous write of data to GPU device
ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcA, CL.CL_FALSE, 0, srcA.capacity(), srcA, 0, null, null);
checkError("on clEnqueueWriteBuffer", ret);
ret = cl.clEnqueueWriteBuffer(commandQueue, devSrcB, CL.CL_FALSE, 0, srcB.capacity(), srcB, 0, null, null);
checkError("on clEnqueueWriteBuffer", ret);
// Launch kernel
NativeSizeBuffer gWS = NativeSizeBuffer.allocateDirect(1).put(globalWorkSize).rewind();
NativeSizeBuffer lWS = NativeSizeBuffer.allocateDirect(1).put(localWorkSize).rewind();
ret = cl.clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, gWS, lWS, 0, null, null);
checkError("on clEnqueueNDRangeKernel", ret);
// Synchronous/blocking read of results
ret = cl.clEnqueueReadBuffer(commandQueue, devDst, CL.CL_TRUE, 0, dest.capacity(), dest, 0, null, null);