*/
public static DenseDoubleMatrix multiply(Pointer a, Pointer b,
MatrixDimension dim) {
// Prepare the pointer for the result in DEVICE memory
Pointer deviceResultPointer = new Pointer();
int resMatrixSize = dim.getM() * dim.getN();
int transA = dim.isTransposeA() ? cublasOperation.CUBLAS_OP_T
: cublasOperation.CUBLAS_OP_N;
int transB = dim.isTransposeB() ? cublasOperation.CUBLAS_OP_T
: cublasOperation.CUBLAS_OP_N;
if (CUBLAS2_AVAILABLE) {
JCuda.cudaMalloc(deviceResultPointer, Sizeof.DOUBLE * resMatrixSize);
Pointer alpha = Pointer.to(new double[] { 1.0d });
Pointer beta = Pointer.to(new double[] { 0.0d });
JCublas2.cublasDgemm(handle, transA, transB, dim.getM(), dim.getN(),
dim.getK(), alpha, a, dim.getLdA(), b, dim.getLdB(), beta,
deviceResultPointer, dim.getLdC());
freePointer(alpha);
freePointer(beta);