// A single kernel instance may be shared between threads, provided that binding its
// arguments (setArgs) and enqueuing it (enqueueNDRange) happen together inside one
// synchronized block, so another thread cannot swap the arguments in between.
synchronized (kernel) {
    // NOTE(review): the argument binding below is disabled; presumably the arguments were
    // already set earlier - confirm. If re-enabled, it must stay inside this block.
    //kernel.setArgs(b1,b2,b3);
    // Global size is A.length (one work-item per element of A); local work-group size is 1.
    kernelCompletion = kernel.enqueueNDRange(queue, new int[] { A.length }, new int[] { 1 } );
}
// No explicit kernelCompletion.waitFor() here: the completion event is handed to the read
// as a dependency, so the read is ordered after the kernel without a separate host-side wait.
FloatBuffer f = b3.read(queue, kernelCompletion);
for(int i=0;i<A.length;i++) {
System.out.println( A[i] + " * " + B[i] + " = " + f.get(i));