QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
blas_quda.cu
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <cstring> // needed for memset
4 
5 #include <float_vector.h>
6 
7 #include <tune_quda.h>
8 #include <typeinfo>
9 
10 #include <quda_internal.h>
11 #include <blas_quda.h>
12 #include <color_spinor_field.h>
13 #include <face_quda.h> // this is where the MPI / QMP depdendent code is
14 
15 #define checkSpinor(a, b) \
16  { \
17  if (a.Precision() != b.Precision()) \
18  errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
19  if (a.Length() != b.Length()) \
20  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
21  if (a.Stride() != b.Stride()) \
22  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
23  }
24 
25 #define checkLength(a, b) \
26  { \
27  if (a.Length() != b.Length()) \
28  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
29  if (a.Stride() != b.Stride()) \
30  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
31  }
32 
33 namespace quda {
34 
35 #include <texture.h>
36 
37  unsigned long long blas_flops;
38  unsigned long long blas_bytes;
39 
41 
42  static cudaStream_t *blasStream;
43 
44  static struct {
45  const char *vol_str;
46  const char *aux_str;
48  } blasStrings;
49 
50  void initReduce();
51  void endReduce();
52 
53  void initBlas()
54  {
55  blasStream = &streams[Nstream-1];
56  initReduce();
57  }
58 
59  void endBlas(void)
60  {
61  endReduce();
62  }
63 
64  cudaStream_t* getBlasStream() { return blasStream; }
65 
66 #include <blas_core.h>
67 #include <blas_mixed_core.h>
68 
72  template <typename Float2, typename FloatN>
73  struct axpby {
74  const Float2 a;
75  const Float2 b;
76  axpby(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
77  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = a.x*x + b.x*y; }
78  static int streams() { return 3; }
79  static int flops() { return 3; }
80  };
81 
82  void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y) {
83  blasCuda<axpby,0,1,0,0>(make_double2(a, 0.0), make_double2(b, 0.0), make_double2(0.0, 0.0),
84  x, y, x, x);
85  }
86 
90  template <typename Float2, typename FloatN>
91  struct xpy {
92  xpy(const Float2 &a, const Float2 &b, const Float2 &c) { ; }
93  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y += x ; }
94  static int streams() { return 3; }
95  static int flops() { return 1; }
96  };
97 
99  blasCuda<xpy,0,1,0,0>(make_double2(1.0, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0),
100  x, y, x, x);
101  }
102 
106  template <typename Float2, typename FloatN>
107  struct axpy {
108  const Float2 a;
109  axpy(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
110  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = a.x*x + y; }
111  static int streams() { return 3; }
112  static int flops() { return 2; }
113  };
114 
116  if (x.Precision() != y.Precision()) {
117  // call hacked mixed precision kernel
118  mixed::blasCuda<axpy,0,1,0,0>(make_double2(a,0.0), make_double2(1.0,0.0), make_double2(0.0,0.0),
119  x, y, x, x);
120  } else {
121  blasCuda<axpy,0,1,0,0>(make_double2(a, 0.0), make_double2(1.0, 0.0), make_double2(0.0, 0.0),
122  x, y, x, x);
123  }
124  }
125 
129  template <typename Float2, typename FloatN>
130  struct xpay {
131  const Float2 a;
132  xpay(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
133  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y = x + a.x*y; }
134  static int streams() { return 3; }
135  static int flops() { return 2; }
136  };
137 
139  blasCuda<xpay,0,1,0,0>(make_double2(a,0.0), make_double2(0.0, 0.0), make_double2(0.0, 0.0),
140  x, y, x, x);
141  }
142 
146  template <typename Float2, typename FloatN>
147  struct mxpy {
148  mxpy(const Float2 &a, const Float2 &b, const Float2 &c) { ; }
149  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { y -= x; }
150  static int streams() { return 3; }
151  static int flops() { return 1; }
152  };
153 
155  blasCuda<mxpy,0,1,0,0>(make_double2(1.0, 0.0), make_double2(1.0, 0.0),
156  make_double2(0.0, 0.0), x, y, x, x);
157  }
158 
162  template <typename Float2, typename FloatN>
163  struct ax {
164  const Float2 a;
165  ax(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
166  __device__ void operator()(FloatN &x, const FloatN &y, const FloatN &z, const FloatN &w) { x *= a.x; }
167  static int streams() { return 2; }
168  static int flops() { return 1; }
169  };
170 
171  void axCuda(const double &a, cudaColorSpinorField &x) {
172  blasCuda<ax,1,0,0,0>(make_double2(a, 0.0), make_double2(0.0, 0.0),
173  make_double2(0.0, 0.0), x, x, x, x);
174  }
175 
180  __device__ void caxpy_(const float2 &a, const float4 &x, float4 &y) {
181  y.x += a.x*x.x; y.x -= a.y*x.y;
182  y.y += a.y*x.x; y.y += a.x*x.y;
183  y.z += a.x*x.z; y.z -= a.y*x.w;
184  y.w += a.y*x.z; y.w += a.x*x.w;
185  }
186 
187  __device__ void caxpy_(const float2 &a, const float2 &x, float2 &y) {
188  y.x += a.x*x.x; y.x -= a.y*x.y;
189  y.y += a.y*x.x; y.y += a.x*x.y;
190  }
191 
192  __device__ void caxpy_(const double2 &a, const double2 &x, double2 &y) {
193  y.x += a.x*x.x; y.x -= a.y*x.y;
194  y.y += a.y*x.x; y.y += a.x*x.y;
195  }
196 
197  template <typename Float2, typename FloatN>
198  struct caxpy {
199  const Float2 a;
200  caxpy(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
201  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
202  { caxpy_(a, x, y); }
203  static int streams() { return 3; }
204  static int flops() { return 4; }
205  };
206 
208  blasCuda<caxpy,0,1,0,0>(make_double2(REAL(a), IMAG(a)),
209  make_double2(0.0, 0.0),
210  make_double2(0.0, 0.0), x, y, x, x);
211  }
212 
217  __device__ void caxpby_(const float2 &a, const float4 &x, const float2 &b, float4 &y)
218  { float4 yy;
219  yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
220  yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
221  yy.z = a.x*x.z; yy.z -= a.y*x.w; yy.z += b.x*y.z; yy.z -= b.y*y.w;
222  yy.w = a.y*x.z; yy.w += a.x*x.w; yy.w += b.y*y.z; yy.w += b.x*y.w;
223  y = yy; }
224 
225  __device__ void caxpby_(const float2 &a, const float2 &x, const float2 &b, float2 &y)
226  { float2 yy;
227  yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
228  yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
229  y = yy; }
230 
231  __device__ void caxpby_(const double2 &a, const double2 &x, const double2 &b, double2 &y)
232  { double2 yy;
233  yy.x = a.x*x.x; yy.x -= a.y*x.y; yy.x += b.x*y.x; yy.x -= b.y*y.y;
234  yy.y = a.y*x.x; yy.y += a.x*x.y; yy.y += b.y*y.x; yy.y += b.x*y.y;
235  y = yy; }
236 
237  template <typename Float2, typename FloatN>
238  struct caxpby {
239  const Float2 a;
240  const Float2 b;
241  caxpby(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
242  __device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w) { caxpby_(a, x, b, y); }
243  static int streams() { return 3; }
244  static int flops() { return 7; }
245  };
246 
248  blasCuda<caxpby,0,1,0,0>(make_double2(REAL(a),IMAG(a)), make_double2(REAL(b), IMAG(b)),
249  make_double2(0.0, 0.0), x, y, x, x);
250  }
251 
256  __device__ void cxpaypbz_(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z) {
257  float4 zz;
258  zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
259  zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
260  zz.z = x.z + a.x*y.z; zz.z -= a.y*y.w; zz.z += b.x*z.z; zz.z -= b.y*z.w;
261  zz.w = x.w + a.y*y.z; zz.w += a.x*y.w; zz.w += b.y*z.z; zz.w += b.x*z.w;
262  z = zz;
263  }
264 
265  __device__ void cxpaypbz_(const float2 &x, const float2 &a, const float2 &y, const float2 &b, float2 &z) {
266  float2 zz;
267  zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
268  zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
269  z = zz;
270  }
271 
272  __device__ void cxpaypbz_(const double2 &x, const double2 &a, const double2 &y, const double2 &b, double2 &z) {
273  double2 zz;
274  zz.x = x.x + a.x*y.x; zz.x -= a.y*y.y; zz.x += b.x*z.x; zz.x -= b.y*z.y;
275  zz.y = x.y + a.y*y.x; zz.y += a.x*y.y; zz.y += b.y*z.x; zz.y += b.x*z.y;
276  z = zz;
277  }
278 
279  template <typename Float2, typename FloatN>
280  struct cxpaypbz {
281  const Float2 a;
282  const Float2 b;
283  cxpaypbz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
284  __device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, FloatN &w)
285  { cxpaypbz_(x, a, y, b, z); }
286  static int streams() { return 4; }
287  static int flops() { return 8; }
288  };
289 
291  const Complex &b, cudaColorSpinorField &z) {
292  blasCuda<cxpaypbz,0,0,1,0>(make_double2(REAL(a),IMAG(a)), make_double2(REAL(b), IMAG(b)),
293  make_double2(0.0, 0.0), x, y, z, z);
294  }
295 
299  template <typename Float2, typename FloatN>
300  struct axpyBzpcx {
301  const Float2 a;
302  const Float2 b;
303  const Float2 c;
304  axpyBzpcx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
305  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
306  { y += a.x*x; x = b.x*z + c.x*x; }
307  static int streams() { return 5; }
308  static int flops() { return 10; }
309  };
310 
311  void axpyBzpcxCuda(const double &a, cudaColorSpinorField& x, cudaColorSpinorField& y, const double &b,
312  cudaColorSpinorField& z, const double &c) {
313  if (x.Precision() != y.Precision()) {
314  // call hacked mixed precision kernel
315  mixed::blasCuda<axpyBzpcx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0),
316  make_double2(c,0.0), x, y, z, x);
317  } else {
318  // swap arguments around
319  blasCuda<axpyBzpcx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0),
320  make_double2(c,0.0), x, y, z, x);
321  }
322  }
323 
327  template <typename Float2, typename FloatN>
328  struct axpyZpbx {
329  const Float2 a;
330  const Float2 b;
331  axpyZpbx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
332  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
333  { y += a.x*x; x = z + b.x*x; }
334  static int streams() { return 5; }
335  static int flops() { return 8; }
336  };
337 
339  cudaColorSpinorField& z, const double &b) {
340  if (x.Precision() != y.Precision()) {
341  // call hacked mixed precision kernel
342  mixed::blasCuda<axpyZpbx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0), make_double2(0.0,0.0),
343  x, y, z, x);
344  } else {
345  // swap arguments around
346  blasCuda<axpyZpbx,1,1,0,0>(make_double2(a,0.0), make_double2(b,0.0), make_double2(0.0,0.0),
347  x, y, z, x);
348  }
349  }
350 
354  template <typename Float2, typename FloatN>
355  struct caxpbypzYmbw {
356  const Float2 a;
357  const Float2 b;
358  caxpbypzYmbw(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
359  __device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, const FloatN &w)
360  { caxpy_(a, x, z); caxpy_(b, y, z); caxpy_(-b, w, y); }
361 
362  static int streams() { return 6; }
363  static int flops() { return 12; }
364  };
365 
368  blasCuda<caxpbypzYmbw,0,1,1,0>(make_double2(REAL(a),IMAG(a)), make_double2(REAL(b),IMAG(b)),
369  make_double2(0.0,0.0), x, y, z, w);
370  }
371 
375  template <typename Float2, typename FloatN>
376  struct cabxpyAx {
377  const Float2 a;
378  const Float2 b;
379  cabxpyAx(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
380  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
381  { x *= a.x; caxpy_(b, x, y); }
382  static int streams() { return 4; }
383  static int flops() { return 5; }
384  };
385 
386  void cabxpyAxCuda(const double &a, const Complex &b,
388  // swap arguments around
389  blasCuda<cabxpyAx,1,1,0,0>(make_double2(a,0.0), make_double2(REAL(b),IMAG(b)),
390  make_double2(0.0,0.0), x, y, x, x);
391  }
392 
396  template <typename Float2, typename FloatN>
397  struct caxpbypz {
398  const Float2 a;
399  const Float2 b;
400  caxpbypz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
401  __device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, const FloatN &w)
402  { caxpy_(a, x, z); caxpy_(b, y, z); }
403  static int streams() { return 4; }
404  static int flops() { return 5; }
405  };
406 
407  void caxpbypzCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b,
409  blasCuda<caxpbypz,0,0,1,0>(make_double2(REAL(a),IMAG(a)), make_double2(REAL(b),IMAG(b)),
410  make_double2(0.0,0.0), x, y, z, z);
411  }
412 
416  template <typename Float2, typename FloatN>
417  struct caxpbypczpw {
418  const Float2 a;
419  const Float2 b;
420  const Float2 c;
421  caxpbypczpw(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b), c(c) { ; }
422  __device__ void operator()(const FloatN &x, const FloatN &y, const FloatN &z, FloatN &w)
423  { caxpy_(a, x, w); caxpy_(b, y, w); caxpy_(c, z, w); }
424 
425  static int streams() { return 4; }
426  static int flops() { return 5; }
427  };
428 
432  blasCuda<caxpbypczpw,0,0,0,1>(make_double2(REAL(a),IMAG(a)), make_double2(REAL(b),IMAG(b)),
433  make_double2(REAL(c),IMAG(c)), x, y, z, w);
434  }
435 
442  template <typename Float2, typename FloatN>
443  struct caxpyxmaz {
444  Float2 a;
445  caxpyxmaz(const Float2 &a, const Float2 &b, const Float2 &c) : a(a) { ; }
446  __device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
447  { caxpy_(a, x, y); x-= a.x*z; }
448  static int streams() { return 5; }
449  static int flops() { return 8; }
450  };
451 
454  blasCuda<caxpyxmaz,1,1,0,0>(make_double2(REAL(a), IMAG(a)), make_double2(0.0, 0.0),
455  make_double2(0.0, 0.0), x, y, z, x);
456  }
457 
469  template <typename Float2, typename FloatN>
470  struct tripleCGUpdate {
471  Float2 a, b;
472  tripleCGUpdate(const Float2 &a, const Float2 &b, const Float2 &c) : a(a), b(b) { ; }
473  __device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, FloatN &w)
474  //{ y -= a.x*x; z += a.x*w; w = y + b.x*w; }
475  { y += a.x*w; z -= a.x*x; w = z + b.x*w; }
476  static int streams() { return 7; }
477  static int flops() { return 6; }
478  };
479 
480  void tripleCGUpdateCuda(const double &a, const double &b, cudaColorSpinorField &x,
482  if (x.Precision() != y.Precision()) {
483  // call hacked mixed precision kernel
484  mixed::blasCuda<tripleCGUpdate,0,1,1,1>(make_double2(a,0.0), make_double2(b,0.0),
485  make_double2(0.0,0.0), x, y, z, w);
486  } else {
487  blasCuda<tripleCGUpdate,0,1,1,1>(make_double2(a, 0.0), make_double2(b, 0.0),
488  make_double2(0.0, 0.0), x, y, z, w);
489  }
490  }
491 
492 } // namespace quda
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:93
static int flops()
total number of input and output streams
Definition: blas_quda.cu:204
const Float2 c
Definition: blas_quda.cu:420
void caxpyCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:207
axpby(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:76
__device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, const FloatN &w)
Definition: blas_quda.cu:359
xpy(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:92
static int streams()
Definition: blas_quda.cu:94
int y[4]
static int flops()
total number of input and output streams
Definition: blas_quda.cu:335
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:149
axpyZpbx(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:331
caxpbypczpw(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:421
const Float2 a
Definition: blas_quda.cu:131
__device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, FloatN &w)
Definition: blas_quda.cu:284
void initReduce()
Definition: reduce_quda.cu:52
const char * aux_str
Definition: blas_quda.cu:46
char aux_tmp[TuneKey::aux_n]
Definition: blas_quda.cu:47
void endReduce()
Definition: reduce_quda.cu:85
static int flops()
total number of input and output streams
Definition: blas_quda.cu:363
__device__ void operator()(const FloatN &x, const FloatN &y, const FloatN &z, FloatN &w)
Definition: blas_quda.cu:422
static int streams()
Definition: blas_quda.cu:167
unsigned long long blas_bytes
Definition: blas_quda.cu:38
const Float2 a
Definition: blas_quda.cu:377
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * streams
void axpbyCuda(const double &a, cudaColorSpinorField &x, const double &b, cudaColorSpinorField &y)
Definition: blas_quda.cu:82
static int streams()
Definition: blas_quda.cu:403
__device__ void caxpy_(const float2 &a, const float4 &x, float4 &y)
Definition: blas_quda.cu:180
void axpyZpbxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, const double &b)
Definition: blas_quda.cu:338
const Float2 b
Definition: blas_quda.cu:330
const int Nstream
mxpy(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:148
const Float2 c
Definition: blas_quda.cu:303
xpay(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:132
static int streams()
Definition: blas_quda.cu:150
const Float2 a
Definition: blas_quda.cu:108
__device__ void cxpaypbz_(const float4 &x, const float2 &a, const float4 &y, const float2 &b, float4 &z)
Definition: blas_quda.cu:256
const Float2 a
Definition: blas_quda.cu:164
static int flops()
total number of input and output streams
Definition: blas_quda.cu:168
static int streams()
Definition: blas_quda.cu:203
static int flops()
total number of input and output streams
Definition: blas_quda.cu:287
static int flops()
total number of input and output streams
Definition: blas_quda.cu:426
static int streams()
Definition: blas_quda.cu:78
__device__ void operator()(const FloatN &x, const FloatN &y, FloatN &z, const FloatN &w)
Definition: blas_quda.cu:401
static int streams()
Definition: blas_quda.cu:425
static int flops()
total number of input and output streams
Definition: blas_quda.cu:151
axpyBzpcx(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:304
void cabxpyAxCuda(const double &a, const Complex &b, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:386
const Float2 a
Definition: blas_quda.cu:329
__device__ void operator()(FloatN &x, const FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:166
static int flops()
total number of input and output streams
Definition: blas_quda.cu:404
void axpyBzpcxCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y, const double &b, cudaColorSpinorField &z, const double &c)
Definition: blas_quda.cu:311
void caxpyXmazCuda(const Complex &a, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z)
Definition: blas_quda.cu:452
const Float2 b
Definition: blas_quda.cu:240
const Float2 b
Definition: blas_quda.cu:378
caxpyxmaz(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:445
cxpaypbz(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:283
static int streams()
Definition: blas_quda.cu:334
#define REAL(a)
Definition: quda_internal.h:86
static int flops()
total number of input and output streams
Definition: blas_quda.cu:308
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:242
const Float2 b
Definition: blas_quda.cu:75
void mxpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:154
__device__ void operator()(const FloatN &x, FloatN &y, FloatN &z, FloatN &w)
Definition: blas_quda.cu:473
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:305
const Float2 a
Definition: blas_quda.cu:356
__device__ void caxpby_(const float2 &a, const float4 &x, const float2 &b, float4 &y)
Definition: blas_quda.cu:217
const char * vol_str
Definition: blas_quda.cu:45
#define IMAG(a)
Definition: quda_internal.h:87
static int flops()
total number of input and output streams
Definition: blas_quda.cu:112
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:133
caxpby(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:241
const Float2 b
Definition: blas_quda.cu:357
caxpbypzYmbw(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:358
const Float2 a
Definition: blas_quda.cu:239
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:332
const Float2 b
Definition: blas_quda.cu:399
void axpyCuda(const double &a, cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:115
const Float2 a
Definition: blas_quda.cu:74
void caxpbypczpwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
Definition: blas_quda.cu:429
static int flops()
total number of input and output streams
Definition: blas_quda.cu:244
static int flops()
total number of input and output streams
Definition: blas_quda.cu:135
static int streams()
Definition: blas_quda.cu:476
int x[4]
cabxpyAx(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:379
cudaStream_t * getBlasStream()
Definition: blas_quda.cu:64
unsigned long long blas_flops
Definition: blas_quda.cu:37
static int streams()
Definition: blas_quda.cu:382
void xpyCuda(cudaColorSpinorField &x, cudaColorSpinorField &y)
Definition: blas_quda.cu:98
axpy(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:109
static int streams()
Definition: blas_quda.cu:307
void endBlas(void)
Definition: blas_quda.cu:59
tripleCGUpdate(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:472
static int streams()
Definition: blas_quda.cu:286
QudaPrecision Precision() const
void cxpaypbzCuda(cudaColorSpinorField &, const Complex &b, cudaColorSpinorField &y, const Complex &c, cudaColorSpinorField &z)
Definition: blas_quda.cu:290
ax(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:165
static const int aux_n
Definition: tune_key.h:12
caxpbypz(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:400
const Float2 a
Definition: blas_quda.cu:199
caxpy(const Float2 &a, const Float2 &b, const Float2 &c)
Definition: blas_quda.cu:200
static int flops()
total number of input and output streams
Definition: blas_quda.cu:449
void caxpbypzCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &)
Definition: blas_quda.cu:407
const Float2 a
Definition: blas_quda.cu:418
const Float2 b
Definition: blas_quda.cu:302
const Float2 b
Definition: blas_quda.cu:419
static int streams()
Definition: blas_quda.cu:362
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:77
static int flops()
total number of input and output streams
Definition: blas_quda.cu:95
void zeroCuda(cudaColorSpinorField &a)
Definition: blas_quda.cu:40
static int streams()
Definition: blas_quda.cu:134
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:380
static int flops()
total number of input and output streams
Definition: blas_quda.cu:477
void tripleCGUpdateCuda(const double &alpha, const double &beta, cudaColorSpinorField &q, cudaColorSpinorField &r, cudaColorSpinorField &x, cudaColorSpinorField &p)
Definition: blas_quda.cu:480
void caxpbyCuda(const Complex &a, cudaColorSpinorField &x, const Complex &b, cudaColorSpinorField &y)
Definition: blas_quda.cu:247
const Float2 b
Definition: blas_quda.cu:282
static int streams()
Definition: blas_quda.cu:111
static int streams()
Definition: blas_quda.cu:243
void xpayCuda(cudaColorSpinorField &x, const double &a, cudaColorSpinorField &y)
Definition: blas_quda.cu:138
const Float2 a
Definition: blas_quda.cu:281
const Float2 a
Definition: blas_quda.cu:398
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:201
static int flops()
total number of input and output streams
Definition: blas_quda.cu:79
void axCuda(const double &a, cudaColorSpinorField &x)
Definition: blas_quda.cu:171
const Float2 a
Definition: blas_quda.cu:301
void initBlas()
Definition: blas_quda.cu:53
void caxpbypzYmbwCuda(const Complex &, cudaColorSpinorField &, const Complex &, cudaColorSpinorField &, cudaColorSpinorField &, cudaColorSpinorField &)
Definition: blas_quda.cu:366
__device__ void operator()(const FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:110
static int streams()
Definition: blas_quda.cu:448
__device__ void operator()(FloatN &x, FloatN &y, const FloatN &z, const FloatN &w)
Definition: blas_quda.cu:446
static int flops()
total number of input and output streams
Definition: blas_quda.cu:383