v0.9.0/doc/blas__cublas_8cu_source.html

 #include <blas_cublas.h>
 #include <malloc_quda.h>

 // Operation-count estimates for LU factorization (getrf) and for inversion
 // from LU factors (getri), used by the flop accounting in this file.
 // NOTE(review): the formulas appear to follow the standard LAPACK operation
 // counts (LAPACK Working Note 41) -- confirm against that reference.

 // getrf multiplication / addition counts with the dimensions already ordered:
 // lo_ is the dimension selected by the (m_ < n_) test, hi_ is the other one.
 #define GETRF_MULS_ORDERED(lo_, hi_) \
     (0.5 * (lo_) * ((lo_) * ((hi_) - (1./3.) * (lo_) - 1. ) + (hi_)) + (2. / 3.) * (lo_))
 #define GETRF_ADDS_ORDERED(lo_, hi_) \
     (0.5 * (lo_) * ((lo_) * ((hi_) - (1./3.) * (lo_)      ) - (hi_)) + (1. / 6.) * (lo_))

 // Multiplications and additions for an m_ x n_ getrf. When m_ == n_ the
 // second arm is taken, i.e. n_ plays the role of lo_.
 #define FMULS_GETRF(m_, n_) ( ((m_) < (n_)) ? GETRF_MULS_ORDERED(m_, n_) : GETRF_MULS_ORDERED(n_, m_) )
 #define FADDS_GETRF(m_, n_) ( ((m_) < (n_)) ? GETRF_ADDS_ORDERED(m_, n_) : GETRF_ADDS_ORDERED(n_, m_) )

 // Combine multiplication and addition counts into a flop total for complex
 // data. The weights 6 and 2 presumably count the real flops in one complex
 // multiply and one complex add -- TODO confirm.
 #define FLOPS_COMPLEX(muls_, adds_) (6. * (muls_) + 2.0 * (adds_))

 // Flop totals for complex getrf; the double (Z) and single (C) precision
 // variants use the same count.
 #define FLOPS_ZGETRF(m_, n_) FLOPS_COMPLEX(FMULS_GETRF((double)(m_), (double)(n_)), FADDS_GETRF((double)(m_), (double)(n_)))
 #define FLOPS_CGETRF(m_, n_) FLOPS_COMPLEX(FMULS_GETRF((double)(m_), (double)(n_)), FADDS_GETRF((double)(m_), (double)(n_)))

 // Multiplications and additions for an n_ x n_ getri.
 #define FMULS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) + 0.5)) )
 #define FADDS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) - 1.5)) )

 // Flop totals for complex getri; again identical for Z and C.
 #define FLOPS_ZGETRI(n_) FLOPS_COMPLEX(FMULS_GETRI((double)(n_)), FADDS_GETRI((double)(n_)))
 #define FLOPS_CGETRI(n_) FLOPS_COMPLEX(FMULS_GETRI((double)(n_)), FADDS_GETRI((double)(n_)))

 namespace quda {

   namespace cublas {

     // Mini kernel to set the arrays of pointers needed for batched cublas.
     //
     // Entry b of each output array is set to the corresponding base pointer
     // advanced by b * batch_offset elements of T. Only blockIdx.x is used as
     // the entry index, so the grid's x-dimension must equal the number of
     // entries to fill (threadIdx is ignored).
     //
     // output_array_a / output_array_b : pointer arrays to fill, one entry per block
     // input_a / input_b               : base pointers the entries are derived from
     // batch_offset                    : distance between consecutive entries, in elements of T
     template<typename T>
     __global__ void set_pointer(T **output_array_a, T *input_a, T **output_array_b, T *input_b, int batch_offset)
     {
       // Widen before multiplying: blockIdx.x * batch_offset would otherwise be
       // evaluated in 32-bit unsigned arithmetic and wrap once the element
       // offset reaches 2^32, yielding pointers into the wrong matrix.
       const size_t offset = static_cast<size_t>(blockIdx.x) * batch_offset;
       output_array_a[blockIdx.x] = input_a + offset;
       output_array_b[blockIdx.x] = input_b + offset;
     }

     /**
        @brief Invert a batch of n x n complex matrices using the batched
        cuBLAS LU factorization (getrfBatched) followed by getriBatched.

        Matrices are stored back to back, n*n complex elements apart. When
        location is QUDA_CUDA_FIELD_LOCATION, A and Ainv are used directly as
        device pointers; when it is QUDA_CPU_FIELD_LOCATION they are staged
        through temporary device buffers and the result is copied back to Ainv.
        The factorization runs on A's device storage (in place, per the cuBLAS
        getrfBatched contract), so a device-resident A is overwritten.

        Only QUDA_SINGLE_PRECISION is implemented; any other precision
        (including QUDA_DOUBLE_PRECISION) is reported through errorQuda.

        @param[out] Ainv Storage for the inverted matrices
        @param[in,out] A Matrices to invert
        @param[in] n Dimension of each square matrix
        @param[in] batch Number of matrices
        @param[in] prec Precision of the data; also used as a multiplier in the
        buffer-size computation (presumably bytes per real number -- TODO confirm)
        @param[in] location Whether A and Ainv are host or device pointers
        @return Estimated flop count of the work performed (0 if built without
        CUBLAS_LIB or if there is nothing to invert)
     */
     // FIXME do this in pipelined fashion to reduce memory overhead.
     long long BatchInvertMatrix(void *Ainv, void* A, const int n, const int batch, QudaPrecision prec, QudaFieldLocation location)
     {
       long long flops = 0;
 #ifdef CUBLAS_LIB
       if (n < 0 || batch < 0) errorQuda("%s called with invalid n=%d or batch=%d", __func__, n, batch);

       // Nothing to invert; returning here also avoids a zero-block kernel
       // launch, which is an invalid launch configuration.
       if (n == 0 || batch == 0) return flops;

       timeval start, stop;
       gettimeofday(&start, NULL);

       // Widen to size_t before multiplying so the byte count cannot overflow
       // int for large batches (factor 2 = real and imaginary parts).
       const size_t size = 2 * static_cast<size_t>(n) * n * prec * batch;

       const bool host_data = (location == QUDA_CPU_FIELD_LOCATION);

       void *A_d = location == QUDA_CUDA_FIELD_LOCATION ? A : pool_device_malloc(size);
       void *Ainv_d = location == QUDA_CUDA_FIELD_LOCATION ? Ainv : pool_device_malloc(size);
       if (host_data) qudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);

       // Pivot indices (n per matrix) and per-matrix status codes, plus a
       // pinned host mirror of the status codes for inspection on the host.
       int *dipiv = static_cast<int*>(pool_device_malloc(static_cast<size_t>(batch) * n * sizeof(int)));
       int *dinfo_array = static_cast<int*>(pool_device_malloc(batch*sizeof(int)));
       int *info_array = static_cast<int*>(pool_pinned_malloc(batch*sizeof(int)));

       cublasHandle_t handle;
       cublasStatus_t error = cublasCreate(&handle);
       if (error != CUBLAS_STATUS_SUCCESS) errorQuda("cublasCreate failed");

       if (prec == QUDA_SINGLE_PRECISION) {
         typedef cuFloatComplex C;
         C **A_array = static_cast<C**>(pool_device_malloc(batch*sizeof(C*)));
         C **Ainv_array = static_cast<C**>(pool_device_malloc(batch*sizeof(C*)));

         // Build the per-matrix pointer arrays the batched cuBLAS API expects.
         set_pointer<C><<<batch,1>>>(A_array, (C*)A_d, Ainv_array, (C*)Ainv_d, n*n);
         cudaError_t launch_error = cudaGetLastError();
         if (launch_error != cudaSuccess)
           errorQuda("set_pointer kernel launch failed: %s", cudaGetErrorString(launch_error));

         error = cublasCgetrfBatched(handle, n, A_array, n, dipiv, dinfo_array, batch);
         flops += batch*FLOPS_CGETRF(n,n);

         if (error != CUBLAS_STATUS_SUCCESS)
           errorQuda("\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);

         // A positive status after the factorization is only a warning here;
         // the same condition after the inversion below is treated as an error.
         qudaMemcpy(info_array, dinfo_array, batch*sizeof(int), cudaMemcpyDeviceToHost);
         for (int i=0; i<batch; i++) {
           if (info_array[i] < 0) {
             errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
           } else if (info_array[i] > 0) {
             warningQuda("%d factorization completed but the factor U is exactly singular", i);
           }
         }

         error = cublasCgetriBatched(handle, n, (const C**)A_array, n, dipiv,
                                     Ainv_array, n, dinfo_array, batch);
         flops += batch*FLOPS_CGETRI(n);

         if (error != CUBLAS_STATUS_SUCCESS)
           errorQuda("\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);

         qudaMemcpy(info_array, dinfo_array, batch*sizeof(int), cudaMemcpyDeviceToHost);

         for (int i=0; i<batch; i++) {
           if (info_array[i] < 0) {
             errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
           } else if (info_array[i] > 0) {
             errorQuda("%d factorization completed but the factor U is exactly singular", i);
           }
         }

         pool_device_free(Ainv_array);
         pool_device_free(A_array);

       } else {
         // Covers QUDA_DOUBLE_PRECISION as well: there is no double-precision
         // path, and silently skipping the inversion would leave Ainv unwritten.
         errorQuda("%s not implemented for precision=%d", __func__, prec);
       }

       error = cublasDestroy(handle);
       if (error != CUBLAS_STATUS_SUCCESS)
         errorQuda("\nError indestroying cublas context, error code = %d\n", error);

       if (host_data) {
         // Return the result to the caller's host buffer and release the staging buffers.
         qudaMemcpy(Ainv, Ainv_d, size, cudaMemcpyDeviceToHost);
         pool_device_free(Ainv_d);
         pool_device_free(A_d);
       }

       pool_device_free(dipiv);
       pool_device_free(dinfo_array);
       pool_pinned_free(info_array);

       // Synchronize before taking the stop time.
       qudaDeviceSynchronize();
       gettimeofday(&stop, NULL);
       long ds = stop.tv_sec - start.tv_sec;
       long dus = stop.tv_usec - start.tv_usec;
       double time = ds + 0.000001*dus;

       printfQuda("Batched matrix inversion completed in %f seconds with GFLOPS = %f\n",
                  time, 1e-9 * flops / time);
 #endif // CUBLAS_LIB

       return flops;
     }

   } // namespace cublas

 } // namespace quda
FLOPS_CGETRI
#define FLOPS_CGETRI(n_)
Definition: blas_cublas.cu:18

qudaMemcpy
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32

pool_pinned_free
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:116

QudaPrecision
enum QudaPrecision_s QudaPrecision

quda::cublas::set_pointer
__global__ void set_pointer(T **output_array_a, T *input_a, T **output_array_b, T *input_b, int batch_offset)
Definition: blas_cublas.cu:26

timeval::tv_sec
__darwin_time_t tv_sec
Definition: CMakeCUDACompilerId.cpp1.ii:4833

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:297

start
cudaEvent_t start
Definition: CMakeCUDACompilerId.cpp1.ii:2453

handle
unsigned int unsigned long long handle
Definition: CMakeCUDACompilerId.cpp1.ii:2320

malloc_quda.h

quda
Definition: blas_cublas.h:6

timeval::tv_usec
__darwin_suseconds_t tv_usec
Definition: CMakeCUDACompilerId.cpp1.ii:4834

time
time_t time(time_t *)

quda::cublas::BatchInvertMatrix
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
Definition: blas_cublas.cu:33

pool_device_malloc
#define pool_device_malloc(size)
Definition: malloc_quda.h:113

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

warningQuda
#define warningQuda(...)
Definition: util_quda.h:101

timeval
Definition: CMakeCUDACompilerId.cpp1.ii:4831

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

FLOPS_CGETRF
#define FLOPS_CGETRF(m_, n_)
Definition: blas_cublas.cu:12

pool_pinned_malloc
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:115

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

QudaFieldLocation
enum QudaFieldLocation_s QudaFieldLocation

blas_cublas.h

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

n
int n
Definition: CMakeCUDACompilerId.cpp1.ii:8086

printfQuda
#define printfQuda(...)
Definition: util_quda.h:84

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

size
size_t size
Definition: CMakeCUDACompilerId.cpp1.ii:2289

e
return e
Definition: CMakeCUDACompilerId.cpp1.ii:3026

pool_device_free
#define pool_device_free(ptr)
Definition: malloc_quda.h:114

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:296

prec
QudaPrecision prec
Definition: test_util.cpp:1615