#ifdef NATIVE_LAPACK_LIB
// ...

#ifdef NATIVE_LAPACK_LIB
  // cuBLAS context, created once and shared by all native BLAS/LAPACK entry points
  static cublasHandle_t handle;
  static bool cublas_init = false;
#ifdef NATIVE_LAPACK_LIB
      cublasStatus_t error = cublasCreate(&handle);
      if (error != CUBLAS_STATUS_SUCCESS) errorQuda("cublasCreate failed with error %d", error);
#ifdef NATIVE_LAPACK_LIB
      cublasStatus_t error = cublasDestroy(handle);
      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in destroying cublas context, error code = %d\n", error);
  template <typename EigenMatrix, typename Float>
  __host__ void checkEigen(std::complex<Float> *A_h, std::complex<Float> *Ainv_h, int n, uint64_t batch)
  {
    EigenMatrix A = EigenMatrix::Zero(n, n);
    EigenMatrix Ainv = EigenMatrix::Zero(n, n);
    // Copy the batch-th n x n matrix and its candidate inverse (column-major layout)
    for (int j = 0; j < n; j++) {
      for (int k = 0; k < n; k++) {
        A(k, j) = A_h[batch * n * n + j * n + k];
        Ainv(k, j) = Ainv_h[batch * n * n + j * n + k];
      }
    }

    // Check the result: the Frobenius norm of (A * Ainv - I), normalised by the
    // number of matrix entries, should be close to machine epsilon
    EigenMatrix unit = EigenMatrix::Identity(n, n);
    EigenMatrix prod = A * Ainv;
    Float L2norm = ((prod - unit).norm() / (n * n));
    printfQuda("cuBLAS: Norm of (A * Ainv - I) batch %lu = %e\n", batch, L2norm);
  }
#ifdef NATIVE_LAPACK_LIB
    printfQuda("BatchInvertMatrix (native - cuBLAS): Nc = %d, batch = %lu\n", n, batch);

    gettimeofday(&start, NULL);

    // Bytes per batched array: factor of 2 for the real and imaginary parts
    size_t size = 2 * n * n * prec * batch;
      // Host-side view of the input, used later for the Eigen verification: pinned
      // host memory if the data lives on the device, otherwise the input directly
      std::complex<float> *A_h
        = (location == QUDA_CUDA_FIELD_LOCATION ? static_cast<std::complex<float> *>(pool_pinned_malloc(size)) :
                                                  static_cast<std::complex<float> *>(A_d));

      // Zero-initialise the host info array (the original passed the character '0',
      // i.e. byte 0x30, to memset; integer 0 is what is meant here)
      memset(info_array, 0, batch * sizeof(int));
      typedef cuFloatComplex C;
      C **Ainv_array = A_array + batch;
      C **Ainv_array_h = A_array_h + batch;
      // Build the per-matrix pointer tables expected by the batched cuBLAS routines
      for (uint64_t i = 0; i < batch; i++) {
        A_array_h[i] = static_cast<C *>(A_d) + i * n * n;
        Ainv_array_h[i] = static_cast<C *>(Ainv_d) + i * n * n;
      }
      qudaMemcpy(A_array, A_array_h, 2 * batch * sizeof(C *), cudaMemcpyHostToDevice);
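      // Note (editorial): the cuBLAS batched LU routines (cublasCgetrfBatched /
      // cublasCgetriBatched) take an array of device pointers, one per matrix,
      // rather than a base pointer plus stride. The loop above carves the
      // contiguous A_d and Ainv_d allocations into per-matrix pointers on the
      // host; since Ainv_array_h sits immediately after A_array_h, a single
      // qudaMemcpy of 2 * batch pointers publishes both tables to the device.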
      cublasStatus_t error = cublasCgetrfBatched(handle, n, A_array, n, dipiv, dinfo_array, batch);

      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);
      qudaMemcpy(info_array, dinfo_array, batch * sizeof(int), cudaMemcpyDeviceToHost);
      for (uint64_t i = 0; i < batch; i++) {
        if (info_array[i] < 0) {
          errorQuda("%lu argument had an illegal value or another error occurred, such as memory allocation failed", i);
        } else if (info_array[i] > 0) {
          errorQuda("%lu factorization completed but the factor U is exactly singular", i);
        }
      }
      error = cublasCgetriBatched(handle, n, (const C **)A_array, n, dipiv, Ainv_array, n, dinfo_array, batch);

      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);
      qudaMemcpy(info_array, dinfo_array, batch * sizeof(int), cudaMemcpyDeviceToHost);

      for (uint64_t i = 0; i < batch; i++) {
        if (info_array[i] < 0) {
          errorQuda("%lu argument had an illegal value or another error occurred, such as memory allocation failed", i);
        } else if (info_array[i] > 0) {
          errorQuda("%lu factorization completed but the factor U is exactly singular", i);
        }
      }
      std::complex<float> *Ainv_h = static_cast<std::complex<float> *>(pool_pinned_malloc(size));
      qudaMemcpy((void *)Ainv_h, Ainv_d, size, cudaMemcpyDeviceToHost);

      // Verify each inverse on the host with Eigen
      for (uint64_t i = 0; i < batch; i++) { checkEigen<MatrixXcf, float>(A_h, Ainv_h, n, i); }
      errorQuda("%s not implemented for precision=%d", __func__, prec);

    // ...
    qudaMemcpy(Ainv, Ainv_d, size, cudaMemcpyDeviceToHost);

    // ...
    gettimeofday(&stop, NULL);
    double time = ds + 0.000001 * dus;
    printfQuda("Batched matrix inversion completed in %f seconds with GFLOPS = %f\n", time, 1e-9 * flops / time);

#else
    errorQuda("Native BLAS not built. Please build and use native BLAS or use generic BLAS");
#endif
#ifdef NATIVE_LAPACK_LIB
    gettimeofday(&start, NULL);
    // Sanity checks on the BLAS parameters: all of m, n, k must be positive
    int min_dim = std::min(blas_param.m, std::min(blas_param.n, blas_param.k));
    if (min_dim <= 0) {
      errorQuda("BLAS dims must be positive: m=%d, n=%d, k=%d", blas_param.m, blas_param.n, blas_param.k);
    }

    if (min_stride < 0) {
      errorQuda("BLAS strides must be positive or zero: a_stride=%d, b_stride=%d, c_stride=%d", blas_param.a_stride,
                blas_param.b_stride, blas_param.c_stride);
    }

    if (min_offset < 0) {
      errorQuda("BLAS offsets must be positive or zero: a_offset=%d, b_offset=%d, c_offset=%d", blas_param.a_offset,
                blas_param.b_offset, blas_param.c_offset);
    }

    // Leading-dimension checks. For column-major data, op(A) is m x k, op(B) is
    // k x n and C is m x n, so each leading dimension must cover the stored rows
    // of its operand; the row-major branch swaps the roles of m/k and k/n.
    if (blas_param.data_order == QUDA_BLAS_DATAORDER_COL) {
      if (blas_param.trans_a == QUDA_BLAS_OP_N) {
        if (blas_param.lda < std::max(1, blas_param.m))
          errorQuda("lda=%d must be >= max(1,m=%d)", blas_param.lda, blas_param.m);
      } else {
        if (blas_param.lda < std::max(1, blas_param.k))
          errorQuda("lda=%d must be >= max(1,k=%d)", blas_param.lda, blas_param.k);
      }
      if (blas_param.trans_b == QUDA_BLAS_OP_N) {
        if (blas_param.ldb < std::max(1, blas_param.k))
          errorQuda("ldb=%d must be >= max(1,k=%d)", blas_param.ldb, blas_param.k);
      } else {
        if (blas_param.ldb < std::max(1, blas_param.n))
          errorQuda("ldb=%d must be >= max(1,n=%d)", blas_param.ldb, blas_param.n);
      }
      if (blas_param.ldc < std::max(1, blas_param.m))
        errorQuda("ldc=%d must be >= max(1,m=%d)", blas_param.ldc, blas_param.m);
    } else {
      // Row-major data order
      if (blas_param.trans_a == QUDA_BLAS_OP_N) {
        if (blas_param.lda < std::max(1, blas_param.k))
          errorQuda("lda=%d must be >= max(1,k=%d)", blas_param.lda, blas_param.k);
      } else {
        if (blas_param.lda < std::max(1, blas_param.m))
          errorQuda("lda=%d must be >= max(1,m=%d)", blas_param.lda, blas_param.m);
      }
      if (blas_param.trans_b == QUDA_BLAS_OP_N) {
        if (blas_param.ldb < std::max(1, blas_param.n))
          errorQuda("ldb=%d must be >= max(1,n=%d)", blas_param.ldb, blas_param.n);
      } else {
        if (blas_param.ldb < std::max(1, blas_param.k))
          errorQuda("ldb=%d must be >= max(1,k=%d)", blas_param.ldb, blas_param.k);
      }
      if (blas_param.ldc < std::max(1, blas_param.n))
        errorQuda("ldc=%d must be >= max(1,n=%d)", blas_param.ldc, blas_param.n);
    }
    // A max_stride of 0 (all user strides zero) is promoted to 1 so the batch
    // count is left unchanged
    if (max_stride == 0) max_stride = 1;

    // Number of strided-batched GEMM launches
    const uint64_t batch = blas_param.batch_count / max_stride;

    // Per-matrix footprint of each operand, in elements
    unsigned int A_batch_size = blas_param.lda * blas_param.k;
    unsigned int B_batch_size = blas_param.ldb * blas_param.n;
    unsigned int C_batch_size = blas_param.ldc * blas_param.n;

    // Element strides between consecutive batch matrices: a user stride of 0
    // advances by exactly one matrix
    unsigned int a_stride = blas_param.a_stride == 0 ? A_batch_size : A_batch_size * blas_param.a_stride;
    unsigned int b_stride = blas_param.b_stride == 0 ? B_batch_size : B_batch_size * blas_param.b_stride;
    unsigned int c_stride = blas_param.c_stride == 0 ? C_batch_size : C_batch_size * blas_param.c_stride;

    // Total transfer sizes in bytes
    size_t sizeAarr = A_batch_size * data_size * batch;
    size_t sizeBarr = B_batch_size * data_size * batch;
    size_t sizeCarr = C_batch_size * data_size * batch;

    // Stage the host operands on the device
    qudaMemcpy(A_d, A_data, sizeAarr, cudaMemcpyHostToDevice);
    qudaMemcpy(B_d, B_data, sizeBarr, cudaMemcpyHostToDevice);
    qudaMemcpy(C_d, C_data, sizeCarr, cudaMemcpyHostToDevice);
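    // Worked example (editorial, not from the original source): for 16 x 16
    // column-major matrices with lda = ldb = ldc = 16, batch_count = 64 and all
    // user strides left at 0, max_stride is promoted to 1, so batch = 64, each
    // *_batch_size is 256 elements, and GEMM i reads A_d + i * 256 and
    // B_d + i * 256 and updates C_d + i * 256.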
    // Default to non-transposed operands; overridden below according to
    // blas_param.trans_a / blas_param.trans_b
    cublasOperation_t trans_a = CUBLAS_OP_N;
    cublasOperation_t trans_b = CUBLAS_OP_N;
      typedef cuDoubleComplex Z;

      const Z alpha = make_double2((double)(static_cast<std::complex<double>>(blas_param.alpha).real()),
                                   (double)(static_cast<std::complex<double>>(blas_param.alpha).imag()));

      const Z beta = make_double2((double)(static_cast<std::complex<double>>(blas_param.beta).real()),
                                  (double)(static_cast<std::complex<double>>(blas_param.beta).imag()));
      cublasStatus_t error;

      error = cublasZgemmStridedBatched(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k,
                                        &alpha, (Z *)A_d + blas_param.a_offset, blas_param.lda, a_stride,
                                        (Z *)B_d + blas_param.b_offset, blas_param.ldb, b_stride, &beta,
                                        (Z *)C_d + blas_param.c_offset, blas_param.ldc, c_stride, batch);

      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in cuBLASZGEMMStridedBatched, error code = %d\n", error);
      error = cublasZgemm(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k, &alpha,
                          (Z *)A_d + blas_param.a_offset, blas_param.lda, (Z *)B_d + blas_param.b_offset,
                          blas_param.ldb, &beta, (Z *)C_d + blas_param.c_offset, blas_param.ldc);

      if (error != CUBLAS_STATUS_SUCCESS) errorQuda("\nError in cuBLASZGEMM, error code = %d\n", error);
      typedef cuFloatComplex C;

      const C alpha = make_float2((float)(static_cast<std::complex<double>>(blas_param.alpha).real()),
                                  (float)(static_cast<std::complex<double>>(blas_param.alpha).imag()));

      const C beta = make_float2((float)(static_cast<std::complex<double>>(blas_param.beta).real()),
                                 (float)(static_cast<std::complex<double>>(blas_param.beta).imag()));
      cublasStatus_t error;

      error = cublasCgemmStridedBatched(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k,
                                        &alpha, (C *)A_d + blas_param.a_offset, blas_param.lda, a_stride,
                                        (C *)B_d + blas_param.b_offset, blas_param.ldb, b_stride, &beta,
                                        (C *)C_d + blas_param.c_offset, blas_param.ldc, c_stride, batch);

      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in cuBLASCGEMMStridedBatched, error code = %d\n", error);
      error = cublasCgemm(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k, &alpha,
                          (C *)A_d + blas_param.a_offset, blas_param.lda, (C *)B_d + blas_param.b_offset,
                          blas_param.ldb, &beta, (C *)C_d + blas_param.c_offset, blas_param.ldc);

      if (error != CUBLAS_STATUS_SUCCESS) errorQuda("\nError in cuBLASCGEMM, error code = %d\n", error);
      typedef double D;

      const D alpha = (D)(static_cast<std::complex<double>>(blas_param.alpha).real());
      const D beta = (D)(static_cast<std::complex<double>>(blas_param.beta).real());
      cublasStatus_t error;

      error = cublasDgemmStridedBatched(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k,
                                        &alpha, (D *)A_d + blas_param.a_offset, blas_param.lda, a_stride,
                                        (D *)B_d + blas_param.b_offset, blas_param.ldb, b_stride, &beta,
                                        (D *)C_d + blas_param.c_offset, blas_param.ldc, c_stride, batch);

      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in cuBLASDGEMMStridedBatched, error code = %d\n", error);
      error = cublasDgemm(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k, &alpha,
                          (D *)A_d + blas_param.a_offset, blas_param.lda, (D *)B_d + blas_param.b_offset,
                          blas_param.ldb, &beta, (D *)C_d + blas_param.c_offset, blas_param.ldc);

      if (error != CUBLAS_STATUS_SUCCESS) errorQuda("\nError in cuBLASDGEMM, error code = %d\n", error);
      typedef float S;

      const S alpha = (S)(static_cast<std::complex<float>>(blas_param.alpha).real());
      const S beta = (S)(static_cast<std::complex<float>>(blas_param.beta).real());
      cublasStatus_t error;

      error = cublasSgemmStridedBatched(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k,
                                        &alpha, (S *)A_d + blas_param.a_offset, blas_param.lda, a_stride,
                                        (S *)B_d + blas_param.b_offset, blas_param.ldb, b_stride, &beta,
                                        (S *)C_d + blas_param.c_offset, blas_param.ldc, c_stride, batch);

      if (error != CUBLAS_STATUS_SUCCESS)
        errorQuda("\nError in cuBLASSGEMMStridedBatched, error code = %d\n", error);
      error = cublasSgemm(handle, trans_a, trans_b, blas_param.m, blas_param.n, blas_param.k, &alpha,
                          (S *)A_d + blas_param.a_offset, blas_param.lda, (S *)B_d + blas_param.b_offset,
                          blas_param.ldb, &beta, (S *)C_d + blas_param.c_offset, blas_param.ldc);

      if (error != CUBLAS_STATUS_SUCCESS) errorQuda("\nError in cuBLASSGEMM, error code = %d\n", error);
    // Copy the result back to the host
    qudaMemcpy(C_data, C_d, sizeCarr, cudaMemcpyDeviceToHost);

    // ...
    gettimeofday(&stop, NULL);
    double time = ds + 0.000001 * dus;
    printfQuda("Batched matrix GEMM completed in %f seconds with GFLOPS = %f\n", time, 1e-9 * flops / time);

#else
    errorQuda("Native BLAS not built. Please build and use native BLAS or use generic BLAS");
#endif