QUDA  0.9.0
blas_magma_nopiv.cu
Go to the documentation of this file.
1 #include <blas_magma.h>
2 #include <string.h>
3 
4 #include <vector>
5 #include <algorithm>
6 
7 #include <util_quda.h>
8 #include <quda_internal.h>
9 
10 #ifndef MAX
11 #define MAX(a, b) (a > b) ? a : b;
12 #endif
13 
14 #define MAGMA_17 //default version version of the MAGMA library
15 
16 #ifdef MAGMA_LIB
17 #include <magma.h>
18 
19 #ifdef MAGMA_14
20 
21 #define _cV 'V'
22 #define _cU 'U'
23 
24 #define _cR 'R'
25 #define _cL 'L'
26 
27 #define _cC 'C'
28 #define _cN 'N'
29 
30 #define _cNV 'N'
31 
32 #else
33 
34 #define _cV MagmaVec
35 #define _cU MagmaUpper
36 
37 #define _cR MagmaRight
38 #define _cL MagmaLeft
39 
40 #define _cC MagmaConjTrans
41 #define _cN MagmaNoTrans
42 
43 #define _cNV MagmaNoVec
44 
45 #endif
46 
47 #endif
48 
49 //Column major format: Big matrix times Little matrix.
50 
51 #ifdef MAGMA_LIB
52 
53 
54 void OpenMagma(){
55 
56 #ifdef MAGMA_LIB
57  magma_int_t err = magma_init();
58 
59  if(err != MAGMA_SUCCESS) errorQuda("\nError: cannot initialize MAGMA library\n");
60 
61  int major, minor, micro;
62 
63  magma_version( &major, &minor, &micro);
64  printfQuda("\nMAGMA library version: %d.%d\n\n", major, minor);
65 #else
66  errorQuda("\nError: MAGMA library was not compiled, check your compilation options...\n");
67 #endif
68 
69  return;
70 }
71 
72 void CloseMagma(){
73 
74 #ifdef MAGMA_LIB
75  if(magma_finalize() != MAGMA_SUCCESS) errorQuda("\nError: cannot close MAGMA library\n");
76 #else
77  errorQuda("\nError: MAGMA library was not compiled, check your compilation options...\n");
78 #endif
79 
80  return;
81 }
82 
83 #define FMULS_GETRF(m_, n_) ( ((m_) < (n_)) \
84  ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) - 1. ) + (n_)) + (2. / 3.) * (m_)) \
85  : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) - 1. ) + (m_)) + (2. / 3.) * (n_)) )
86 #define FADDS_GETRF(m_, n_) ( ((m_) < (n_)) \
87  ? (0.5 * (m_) * ((m_) * ((n_) - (1./3.) * (m_) ) - (n_)) + (1. / 6.) * (m_)) \
88  : (0.5 * (n_) * ((n_) * ((m_) - (1./3.) * (n_) ) - (m_)) + (1. / 6.) * (n_)) )
89 
90 #define FLOPS_ZGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) )
91 #define FLOPS_CGETRF(m_, n_) (6. * FMULS_GETRF((double)(m_), (double)(n_)) + 2.0 * FADDS_GETRF((double)(m_), (double)(n_)) )
92 
93 #define FMULS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) + 0.5)) )
94 #define FADDS_GETRI(n_) ( (n_) * ((5. / 6.) + (n_) * ((2. / 3.) * (n_) - 1.5)) )
95 
96 #define FLOPS_ZGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) )
97 #define FLOPS_CGETRI(n_) (6. * FMULS_GETRI((double)(n_)) + 2.0 * FADDS_GETRI((double)(n_)) )
98 
99 void BlasMagmaArgs::BatchInvertMatrix(void *Ainv_h, void* A_h, const int n, const int batch, const int prec)
100 {
101 #ifdef MAGMA_LIB
102  printfQuda("%s with n=%d and batch=%d\n", __func__, n, batch);
103 
104  magma_queue_t queue = 0;
105 
106  size_t size = 2*n*n*prec*batch;
107  void *A_d = device_malloc(size);
108  void *Ainv_d = device_malloc(size);
109  qudaMemcpy(A_d, A_h, size, cudaMemcpyHostToDevice);
110 
111  magma_int_t **dipiv_array = static_cast<magma_int_t**>(device_malloc(batch*sizeof(magma_int_t*)));
112  magma_int_t *dipiv_tmp = static_cast<magma_int_t*>(device_malloc(batch*n*sizeof(magma_int_t)));
113  set_ipointer(dipiv_array, dipiv_tmp, 1, 0, 0, n, batch, queue);
114 
115  magma_int_t *no_piv_array = static_cast<magma_int_t*>(safe_malloc(batch*n*sizeof(magma_int_t)));
116 
117  for (int i=0; i<batch; i++) {
118  for (int j=0; j<n; j++) {
119  no_piv_array[i*n + j] = j+1;
120  }
121  }
122  qudaMemcpy(dipiv_tmp, no_piv_array, batch*n*sizeof(magma_int_t), cudaMemcpyHostToDevice);
123 
124  host_free(no_piv_array);
125 
126  magma_int_t *dinfo_array = static_cast<magma_int_t*>(device_malloc(batch*sizeof(magma_int_t)));
127  magma_int_t *info_array = static_cast<magma_int_t*>(safe_malloc(batch*sizeof(magma_int_t)));
128  magma_int_t err;
129 
130  // FIXME do this in pipelined fashion to reduce memory overhead.
131  if (prec == 4) {
132  magmaFloatComplex **A_array = static_cast<magmaFloatComplex**>(device_malloc(batch*sizeof(magmaFloatComplex*)));
133  magmaFloatComplex **Ainv_array = static_cast<magmaFloatComplex**>(device_malloc(batch*sizeof(magmaFloatComplex*)));
134 
135  cset_pointer(A_array, static_cast<magmaFloatComplex*>(A_d), n, 0, 0, n*n, batch, queue);
136  cset_pointer(Ainv_array, static_cast<magmaFloatComplex*>(Ainv_d), n, 0, 0, n*n, batch, queue);
137 
138  double magma_time = magma_sync_wtime(queue);
139  //err = magma_cgetrf_batched(n, n, A_array, n, dipiv_array, dinfo_array, batch, queue);
140  err = magma_cgetrf_nopiv_batched(n, n, A_array, n, dinfo_array, batch, queue);
141  magma_time = magma_sync_wtime(queue) - magma_time;
142  printfQuda("LU factorization completed in %f seconds with GFLOPS = %f\n",
143  magma_time, 1e-9 * batch * FLOPS_CGETRF(n,n) / magma_time);
144 
145  if(err != 0) errorQuda("\nError in LU decomposition (magma_cgetrf), error code = %d\n", err);
146 
147  qudaMemcpy(info_array, dinfo_array, batch*sizeof(magma_int_t), cudaMemcpyDeviceToHost);
148  for (int i=0; i<batch; i++) {
149  if (info_array[i] < 0) {
150  errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
151  } else if (info_array[i] > 0) {
152  errorQuda("%d factorization completed but the factor U is exactly singular", i);
153  }
154  }
155 
156  magma_time = magma_sync_wtime(queue);
157  err = magma_cgetri_outofplace_batched(n, A_array, n, dipiv_array, Ainv_array, n, dinfo_array, batch, queue);
158  magma_time = magma_sync_wtime(queue) - magma_time;
159  printfQuda("Matrix inversion completed in %f seconds with GFLOPS = %f\n",
160  magma_time, 1e-9 * batch * FLOPS_CGETRI(n) / magma_time);
161 
162  if(err != 0) errorQuda("\nError in matrix inversion (magma_cgetri), error code = %d\n", err);
163 
164  qudaMemcpy(info_array, dinfo_array, batch*sizeof(magma_int_t), cudaMemcpyDeviceToHost);
165 
166  for (int i=0; i<batch; i++) {
167  if (info_array[i] < 0) {
168  errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
169  } else if (info_array[i] > 0) {
170  errorQuda("%d factorization completed but the factor U is exactly singular", i);
171  }
172  }
173 
174  device_free(Ainv_array);
175  device_free(A_array);
176  } else if (prec == 8) {
177  magmaDoubleComplex **A_array = static_cast<magmaDoubleComplex**>(device_malloc(batch*sizeof(magmaDoubleComplex*)));
178  zset_pointer(A_array, static_cast<magmaDoubleComplex*>(A_d), n, 0, 0, n*n, batch, queue);
179 
180  magmaDoubleComplex **Ainv_array = static_cast<magmaDoubleComplex**>(device_malloc(batch*sizeof(magmaDoubleComplex*)));
181  zset_pointer(Ainv_array, static_cast<magmaDoubleComplex*>(Ainv_d), n, 0, 0, n*n, batch, queue);
182 
183  double magma_time = magma_sync_wtime(queue);
184  err = magma_zgetrf_batched(n, n, A_array, n, dipiv_array, dinfo_array, batch, queue);
185  magma_time = magma_sync_wtime(queue) - magma_time;
186  printfQuda("LU factorization completed in %f seconds with GFLOPS = %f\n",
187  magma_time, 1e-9 * batch * FLOPS_ZGETRF(n,n) / magma_time);
188 
189  if(err != 0) errorQuda("\nError in LU decomposition (magma_zgetrf), error code = %d\n", err);
190 
191  qudaMemcpy(info_array, dinfo_array, batch*sizeof(magma_int_t), cudaMemcpyDeviceToHost);
192  for (int i=0; i<batch; i++) {
193  if (info_array[i] < 0) {
194  errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
195  } else if (info_array[i] > 0) {
196  errorQuda("%d factorization completed but the factor U is exactly singular", i);
197  }
198  }
199 
200  magma_time = magma_sync_wtime(queue);
201  err = magma_zgetri_outofplace_batched(n, A_array, n, dipiv_array, Ainv_array, n, dinfo_array, batch, queue);
202  magma_time = magma_sync_wtime(queue) - magma_time;
203  printfQuda("Matrix inversion completed in %f seconds with GFLOPS = %f\n",
204  magma_time, 1e-9 * batch * FLOPS_ZGETRI(n) / magma_time);
205 
206  if(err != 0) errorQuda("\nError in matrix inversion (magma_cgetri), error code = %d\n", err);
207 
208  qudaMemcpy(info_array, dinfo_array, batch*sizeof(magma_int_t), cudaMemcpyDeviceToHost);
209 
210  for (int i=0; i<batch; i++) {
211  if (info_array[i] < 0) {
212  errorQuda("%d argument had an illegal value or another error occured, such as memory allocation failed", i);
213  } else if (info_array[i] > 0) {
214  errorQuda("%d factorization completed but the factor U is exactly singular", i);
215  }
216  }
217 
218  device_free(Ainv_array);
219  device_free(A_array);
220  } else {
221  errorQuda("%s not implemented for precision=%d", __func__, prec);
222  }
223 
224  qudaMemcpy(Ainv_h, Ainv_d, size, cudaMemcpyDeviceToHost);
225 
226  device_free(dipiv_tmp);
227  device_free(dipiv_array);
228  device_free(dinfo_array);
229  host_free(info_array);
230  device_free(Ainv_d);
231  device_free(A_d);
232 
233 #endif
234  return;
235 }
236 
237 
238 
239 #ifdef MAGMA_LIB
240 
241 #undef _cV
242 #undef _cU
243 #undef _cR
244 #undef _cL
245 #undef _cC
246 #undef _cN
247 #undef _cNV
248 
249 #endif
#define FLOPS_CGETRI(n_)
Definition: blas_cublas.cu:18
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
#define errorQuda(...)
Definition: util_quda.h:90
#define host_free(ptr)
Definition: malloc_quda.h:59
void CloseMagma()
Definition: blas_magma.cu:326
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
Definition: blas_cublas.cu:33
void OpenMagma()
Definition: blas_magma.cu:310
cudaError_t err
#define safe_malloc(size)
Definition: malloc_quda.h:54
#define FLOPS_CGETRF(m_, n_)
Definition: blas_cublas.cu:12
#define FLOPS_ZGETRF(m_, n_)
Definition: blas_cublas.cu:11
#define FLOPS_ZGETRI(n_)
Definition: blas_cublas.cu:17
#define printfQuda(...)
Definition: util_quda.h:84
#define device_malloc(size)
Definition: malloc_quda.h:52
QudaPrecision prec
Definition: test_util.cpp:1615
#define device_free(ptr)
Definition: malloc_quda.h:57