if (p<0) {
A = inverse(A);
p = -p;
}
if (p==0) return DoubleFactory2D.dense.identity(A.rows());
DoubleMatrix2D T = A.like(); // temporary
if (p==1) return T.assign(A); // safes one auxiliary matrix allocation
if (p==2) {
blas.dgemm(false,false,1,A,A,0,T); // mult(A,A); // safes one auxiliary matrix allocation
return T;
}
int k = cern.colt.bitvector.QuickBitVector.mostSignificantBit(p); // index of highest bit in state "true"
/*
this is the naive version:
DoubleMatrix2D B = A.copy();
for (int i=0; i<p-1; i++) {
B = mult(B,A);
}
return B;
*/
// here comes the optimized version:
//cern.colt.Timer timer = new cern.colt.Timer().start();
int i=0;
while (i<=k && (p & (1<<i)) == 0) { // while (bit i of p == false)
// A = mult(A,A); would allocate a lot of temporary memory
blas.dgemm(false,false,1,A,A,0,T); // A.zMult(A,T);
DoubleMatrix2D swap = A; A = T; T = swap; // swap A with T
i++;
}
DoubleMatrix2D B = A.copy();
i++;
for (; i<=k; i++) {
// A = mult(A,A); would allocate a lot of temporary memory
blas.dgemm(false,false,1,A,A,0,T); // A.zMult(A,T);
DoubleMatrix2D swap = A; A = T; T = swap; // swap A with T
if ((p & (1<<i)) != 0) { // if (bit i of p == true)
// B = mult(B,A); would allocate a lot of temporary memory
blas.dgemm(false,false,1,B,A,0,T); // B.zMult(A,T);
swap = B; B = T; T = swap; // swap B with T