2 #ifdef NATIVE_LAPACK_LIB
22 #ifdef NATIVE_LAPACK_LIB
23 static cublasHandle_t handle;
25 static bool cublas_init =
false;
30 #ifdef NATIVE_LAPACK_LIB
31 cublasStatus_t error = cublasCreate(&handle);
32 if (error != CUBLAS_STATUS_SUCCESS)
errorQuda(
"cublasCreate failed with error %d", error);
41 #ifdef NATIVE_LAPACK_LIB
42 cublasStatus_t error = cublasDestroy(handle);
43 if (error != CUBLAS_STATUS_SUCCESS)
44 errorQuda(
"\nError indestroying cublas context, error code = %d\n", error);
51 template <
typename EigenMatrix,
typename Float>
52 __host__
void checkEigen(std::complex<Float> *A_h, std::complex<Float> *Ainv_h,
int n, uint64_t batch)
54 EigenMatrix A = EigenMatrix::Zero(n, n);
55 EigenMatrix Ainv = EigenMatrix::Zero(n, n);
56 for (
int j = 0; j < n; j++) {
57 for (
int k = 0; k < n; k++) {
58 A(k, j) = A_h[batch * n * n + j * n + k];
59 Ainv(k, j) = Ainv_h[batch * n * n + j * n + k];
64 EigenMatrix unit = EigenMatrix::Identity(n, n);
65 EigenMatrix prod = A * Ainv;
66 Float L2norm = ((prod - unit).
norm() / (n * n));
67 printfQuda(
"cuBLAS: Norm of (A * Ainv - I) batch %lu = %e\n", batch, L2norm);
75 #ifdef NATIVE_LAPACK_LIB
78 printfQuda(
"BatchInvertMatrix (native - cuBLAS): Nc = %d, batch = %lu\n", n, batch);
82 gettimeofday(&
start, NULL);
84 size_t size = 2 * n * n *
prec * batch;
91 std::complex<float> *A_h
93 static_cast<std::complex<float> *
>(A_d));
100 memset(info_array,
'0', batch *
sizeof(
int));
103 typedef cuFloatComplex C;
105 C **Ainv_array = A_array + batch;
107 C **Ainv_array_h = A_array_h + batch;
108 for (uint64_t i = 0; i < batch; i++) {
109 A_array_h[i] =
static_cast<C *
>(A_d) + i * n * n;
110 Ainv_array_h[i] =
static_cast<C *
>(Ainv_d) + i * n * n;
112 qudaMemcpy(A_array, A_array_h, 2 * batch *
sizeof(C *), cudaMemcpyHostToDevice);
114 cublasStatus_t error = cublasCgetrfBatched(handle, n, A_array, n, dipiv, dinfo_array, batch);
117 if (error != CUBLAS_STATUS_SUCCESS)
118 errorQuda(
"\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);
120 qudaMemcpy(info_array, dinfo_array, batch *
sizeof(
int), cudaMemcpyDeviceToHost);
121 for (uint64_t i = 0; i < batch; i++) {
122 if (info_array[i] < 0) {
123 errorQuda(
"%lu argument had an illegal value or another error occured, such as memory allocation failed",
125 }
else if (info_array[i] > 0) {
126 errorQuda(
"%lu factorization completed but the factor U is exactly singular", i);
130 error = cublasCgetriBatched(handle, n, (
const C **)A_array, n, dipiv, Ainv_array, n, dinfo_array, batch);
133 if (error != CUBLAS_STATUS_SUCCESS)
134 errorQuda(
"\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);
136 qudaMemcpy(info_array, dinfo_array, batch *
sizeof(
int), cudaMemcpyDeviceToHost);
138 for (uint64_t i = 0; i < batch; i++) {
139 if (info_array[i] < 0) {
140 errorQuda(
"%lu argument had an illegal value or another error occured, such as memory allocation failed",
142 }
else if (info_array[i] > 0) {
143 errorQuda(
"%lu factorization completed but the factor U is exactly singular", i);
152 std::complex<float> *Ainv_h =
static_cast<std::complex<float> *
>(
pool_pinned_malloc(size));
153 qudaMemcpy((
void *)Ainv_h, Ainv_d, size, cudaMemcpyDeviceToHost);
155 for (uint64_t i = 0; i < batch; i++) { checkEigen<MatrixXcf, float>(A_h, Ainv_h, n, i); }
160 errorQuda(
"%s not implemented for precision=%d", __func__,
prec);
164 qudaMemcpy(Ainv, Ainv_d, size, cudaMemcpyDeviceToHost);
174 gettimeofday(&
stop, NULL);
177 double time = ds + 0.000001 * dus;
180 printfQuda(
"Batched matrix inversion completed in %f seconds with GFLOPS = %f\n", time, 1e-9 *
flops / time);
184 errorQuda(
"Native BLAS not built. Please build and use native BLAS or use generic BLAS");
#define FLOPS_CGETRF(m_, n_)
void * memset(void *s, int c, size_t n)
enum QudaPrecision_s QudaPrecision
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
enum QudaFieldLocation_s QudaFieldLocation
#define pool_pinned_malloc(size)
#define pool_device_malloc(size)
#define pool_pinned_free(ptr)
#define pool_device_free(ptr)
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const uint64_t batch, QudaPrecision precision, QudaFieldLocation location)
Batch inversion the matrix field using an LU decomposition method.
void init()
Create the BLAS context.
void destroy()
Destroy the BLAS context.
void stop()
Stop profiling.
void start()
Start profiling.
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
FloatingPoint< float > Float
#define qudaMemcpy(dst, src, count, kind)
#define qudaDeviceSynchronize()
QudaVerbosity getVerbosity()