QUDA  0.9.0
blas_cublas.cu
Go to the documentation of this file.
1 #include <blas_cublas.h>
2 #include <malloc_quda.h>
3 
4 #define FMULS_GETRF(m_, n_) ( ((m_) < (n_)) \
5  ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) - 1. ) + (n_)) + (2. / 3.) * (m_)) \
6  : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) - 1. ) + (m_)) + (2. / 3.) * (n_)) )
7 #define FADDS_GETRF(m_, n_) ( ((m_) < (n_)) \
8  ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) ) - (n_)) + (1. / 6.) * (m_)) \
9  : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) ) - (m_)) + (1. / 6.) * (n_)) )
10 
11 #define FLOPS_ZGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) )
12 #define FLOPS_CGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) )
13 
14 #define FMULS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) + 0.5)) )
15 #define FADDS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) - 1.5)) )
16 
17 #define FLOPS_ZGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) )
18 #define FLOPS_CGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) )
19 
20 namespace quda {
21 
22  namespace cublas {
23 
24  // mini kernel to set the array of pointers needed for batched cublas
25  template<typename T>
26  __global__ void set_pointer(T **output_array_a, T *input_a, T **output_array_b, T *input_b, int batch_offset)
27  {
28  output_array_a[blockIdx.x] = input_a + blockIdx.x * batch_offset;
29  output_array_b[blockIdx.x] = input_b + blockIdx.x * batch_offset;
30  }
31 
32  // FIXME do this in pipelined fashion to reduce memory overhead.
33  long long BatchInvertMatrix(void *Ainv, void* A, const int n, const int batch, QudaPrecision prec, QudaFieldLocation location)
34  {
35  long long flops = 0;
36 #ifdef CUBLAS_LIB
37  timeval start, stop;
38  gettimeofday(&start, NULL);
39 
40  size_t size = 2*n*n*prec*batch;
41 
42  void *A_d = location == QUDA_CUDA_FIELD_LOCATION ? A : pool_device_malloc(size);
43  void *Ainv_d = location == QUDA_CUDA_FIELD_LOCATION ? Ainv : pool_device_malloc(size);
44  if (location == QUDA_CPU_FIELD_LOCATION) qudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice);
45 
46  int *dipiv = static_cast<int*>(pool_device_malloc(batch*n*sizeof(int)));
47  int *dinfo_array = static_cast<int*>(pool_device_malloc(batch*sizeof(int)));
48  int *info_array = static_cast<int*>(pool_pinned_malloc(batch*sizeof(int)));
49 
50  cublasHandle_t handle;
51  cublasStatus_t error = cublasCreate(&handle);
52  if (error != CUBLAS_STATUS_SUCCESS) errorQuda("cublasCreate failed");
53 
54  if (prec == QUDA_SINGLE_PRECISION) {
55  typedef cuFloatComplex C;
56  C **A_array = static_cast<C**>(pool_device_malloc(batch*sizeof(C*)));
57  C **Ainv_array = static_cast<C**>(pool_device_malloc(batch*sizeof(C*)));
58 
59  set_pointer<C><<<batch,1>>>(A_array, (C*)A_d, Ainv_array, (C*)Ainv_d, n*n);
60 
61  error = cublasCgetrfBatched(handle, n, A_array, n, dipiv, dinfo_array, batch);
62  flops += batch*FLOPS_CGETRF(n,n);
63 
64  if (error != CUBLAS_STATUS_SUCCESS)
65  errorQuda("\nError in LU decomposition (cublasCgetrfBatched), error code = %d\n", error);
66 
67  qudaMemcpy(info_array, dinfo_array, batch*sizeof(int), cudaMemcpyDeviceToHost);
68  for (int i=0; i<batch; i++) {
69  if (info_array[i] < 0) {
70  errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
71  } else if (info_array[i] > 0) {
72  warningQuda("%d factorization completed but the factor U is exactly singular", i);
73  }
74  }
75 
76  error = cublasCgetriBatched(handle, n, (const C**)A_array, n, dipiv,
77  Ainv_array, n, dinfo_array, batch);
78  flops += batch*FLOPS_CGETRI(n);
79 
80  if (error != CUBLAS_STATUS_SUCCESS)
81  errorQuda("\nError in matrix inversion (cublasCgetriBatched), error code = %d\n", error);
82 
83  qudaMemcpy(info_array, dinfo_array, batch*sizeof(int), cudaMemcpyDeviceToHost);
84 
85  for (int i=0; i<batch; i++) {
86  if (info_array[i] < 0) {
87  errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
88  } else if (info_array[i] > 0) {
89  errorQuda("%d factorization completed but the factor U is exactly singular", i);
90  }
91  }
92 
93  pool_device_free(Ainv_array);
94  pool_device_free(A_array);
95 
96  } else if (prec == QUDA_DOUBLE_PRECISION) {
97 
98  } else {
99  errorQuda("%s not implemented for precision=%d", __func__, prec);
100  }
101 
102  error = cublasDestroy(handle);
103  if (error != CUBLAS_STATUS_SUCCESS)
104  errorQuda("\nError indestroying cublas context, error code = %d\n", error);
105 
106  if (location == QUDA_CPU_FIELD_LOCATION) {
107  qudaMemcpy(Ainv, Ainv_d, size, cudaMemcpyDeviceToHost);
108  pool_device_free(Ainv_d);
109  pool_device_free(A_d);
110  }
111 
112  pool_device_free(dipiv);
113  pool_device_free(dinfo_array);
114  pool_pinned_free(info_array);
115 
117  gettimeofday(&stop, NULL);
118  long ds = stop.tv_sec - start.tv_sec;
119  long dus = stop.tv_usec - start.tv_usec;
120  double time = ds + 0.000001*dus;
121 
122  printfQuda("Batched matrix inversion completed in %f seconds with GFLOPS = %f\n",
123  time, 1e-9 * flops / time);
124 #endif // CUBLAS_LIB
125 
126  return flops;
127  }
128 
129  } // namespace cublas
130 
131 } // namespace quda
#define FLOPS_CGETRI(n_)
Definition: blas_cublas.cu:18
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:116
enum QudaPrecision_s QudaPrecision
__global__ void set_pointer(T **output_array_a, T *input_a, T **output_array_b, T *input_b, int batch_offset)
Definition: blas_cublas.cu:26
__darwin_time_t tv_sec
#define errorQuda(...)
Definition: util_quda.h:90
cudaEvent_t start
unsigned int unsigned long long handle
__darwin_suseconds_t tv_usec
time_t time(time_t *)
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
Definition: blas_cublas.cu:33
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
#define warningQuda(...)
Definition: util_quda.h:101
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define FLOPS_CGETRF(m_, n_)
Definition: blas_cublas.cu:12
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:115
enum QudaFieldLocation_s QudaFieldLocation
#define printfQuda(...)
Definition: util_quda.h:84
unsigned long long flops
Definition: blas_quda.cu:42
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
QudaPrecision prec
Definition: test_util.cpp:1615