QUDA  0.9.0
multi_blas_quda.cu
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <cstring> // needed for memset
4 #include <typeinfo>
5 
6 #include <tune_quda.h>
7 #include <quda_internal.h>
8 #include <float_vector.h>
9 #include <blas_quda.h>
10 #include <color_spinor_field.h>
12 
13 #define checkSpinor(a, b) \
14  { \
15  if (a.Precision() != b.Precision()) \
16  errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
17  if (a.Length() != b.Length()) \
18  errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \
19  if (a.Stride() != b.Stride()) \
20  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
21  }
22 
23 #define checkLength(a, b) \
24  { \
25  if (a.Length() != b.Length()) \
26  errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \
27  if (a.Stride() != b.Stride()) \
28  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
29  }
30 
31 namespace quda {
32 
33  namespace blas {
34 
35  namespace multi {
36 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
37 #include <texture.h>
38  }
39 
40  cudaStream_t* getStream();
41 
42  static struct {
43  const char *vol_str;
44  const char *aux_str;
45  char aux_tmp[TuneKey::aux_n];
46  } blasStrings;
47 
48  template <int writeX, int writeY, int writeZ, int writeW>
49  struct write {
50  static constexpr int X = writeX;
51  static constexpr int Y = writeY;
52  static constexpr int Z = writeZ;
53  static constexpr int W = writeW;
54  };
55 
56 #include <multi_blas_core.cuh>
57 #include <multi_blas_core.h>
58 #include <multi_blas_mixed_core.h>
59 
60 
61  template <int NXZ, typename Float2, typename FloatN>
63 
65  virtual __device__ __host__ void init() { ; }
66 
68  virtual __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j) = 0;
69  };
70 
71 
76  __device__ __host__ inline void _caxpy(const float2 &a, const float4 &x, float4 &y) {
77  y.x += a.x*x.x; y.x -= a.y*x.y;
78  y.y += a.y*x.x; y.y += a.x*x.y;
79  y.z += a.x*x.z; y.z -= a.y*x.w;
80  y.w += a.y*x.z; y.w += a.x*x.w;
81  }
82 
83  __device__ __host__ inline void _caxpy(const float2 &a, const float2 &x, float2 &y) {
84  y.x += a.x*x.x; y.x -= a.y*x.y;
85  y.y += a.y*x.x; y.y += a.x*x.y;
86  }
87 
88  __device__ __host__ inline void _caxpy(const double2 &a, const double2 &x, double2 &y) {
89  y.x += a.x*x.x; y.x -= a.y*x.y;
90  y.y += a.y*x.x; y.y += a.x*x.y;
91  }
92 
93  template<int NXZ, typename Float2, typename FloatN>
94  struct multicaxpy_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
95  const int NYW;
96  // ignore parameter arrays since we place them in constant memory
98  const coeff_array<Complex> &c, int NYW) : NYW(NYW)
99  { }
100 
101  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
102  {
103 #ifdef __CUDA_ARCH__
104  Float2 *a = reinterpret_cast<Float2*>(Amatrix_d); // fetch coefficient matrix from constant memory
105  _caxpy(a[MAX_MULTI_BLAS_N*j+i], x, y);
106 #else
107  Float2 *a = reinterpret_cast<Float2*>(Amatrix_h);
108  _caxpy(a[NYW*j+i], x, y);
109 #endif
110  }
111 
112  int streams() { return 2*NYW + NXZ*NYW; }
113  int flops() { return 4*NXZ*NYW; }
114  };
115 
116 
117  void caxpy_recurse(const Complex *a_, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y,
118  int i_idx ,int j_idx, int upper) {
119 
120  if (y.size() > MAX_MULTI_BLAS_N) // if greater than max single-kernel size, recurse.
121  {
122  // We need to split up 'a' carefully since it's row-major.
123  Complex* tmpmajor = new Complex[x.size()*y.size()];
124  Complex* tmpmajor0 = &tmpmajor[0];
125  Complex* tmpmajor1 = &tmpmajor[x.size()*(y.size()/2)];
126  std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
127  std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
128 
129  const unsigned int xlen = x.size();
130  const unsigned int ylen0 = y.size()/2;
131  const unsigned int ylen1 = y.size() - y.size()/2;
132 
133  int count = 0, count0 = 0, count1 = 0;
134  for (unsigned int i = 0; i < xlen; i++)
135  {
136  for (unsigned int j = 0; j < ylen0; j++)
137  tmpmajor0[count0++] = a_[count++];
138  for (unsigned int j = 0; j < ylen1; j++)
139  tmpmajor1[count1++] = a_[count++];
140  }
141 
142  caxpy_recurse(tmpmajor0, x, y0, i_idx, 2*j_idx+0, upper);
143  caxpy_recurse(tmpmajor1, x, y1, i_idx, 2*j_idx+1, upper);
144 
145  delete[] tmpmajor;
146  }
147  else
148  {
149  // if at the bottom of recursion,
150  // return if on lower left for upper triangular,
151  // return if on upper right for lower triangular.
152  if (x.size() <= MAX_MULTI_BLAS_N) {
153  if (upper == 1 && j_idx < i_idx) { return; }
154  if (upper == -1 && j_idx > i_idx) { return; }
155  }
156 
157  // mark true since we will copy the "a" matrix into constant memory
158  coeff_array<Complex> a(a_, true), b, c;
159 
160  if (x[0]->Precision() == y[0]->Precision())
161  {
162  switch (x.size()) {
163  case 1:
164  multiblasCuda<1,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
165  break;
166  #if MAX_MULTI_BLAS_N >= 2
167  case 2:
168  multiblasCuda<2,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
169  break;
170  #if MAX_MULTI_BLAS_N >= 3
171  case 3:
172  multiblasCuda<3,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
173  break;
174  #if MAX_MULTI_BLAS_N >= 4
175  case 4:
176  multiblasCuda<4,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
177  break;
178  #if MAX_MULTI_BLAS_N >= 5
179  case 5:
180  multiblasCuda<5,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
181  break;
182  #if MAX_MULTI_BLAS_N >= 6
183  case 6:
184  multiblasCuda<6,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
185  break;
186  #if MAX_MULTI_BLAS_N >= 7
187  case 7:
188  multiblasCuda<7,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
189  break;
190  #if MAX_MULTI_BLAS_N >= 8
191  case 8:
192  multiblasCuda<8,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
193  break;
194  #if MAX_MULTI_BLAS_N >= 9
195  case 9:
196  multiblasCuda<9,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
197  break;
198  #if MAX_MULTI_BLAS_N >= 10
199  case 10:
200  multiblasCuda<10,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
201  break;
202  #if MAX_MULTI_BLAS_N >= 11
203  case 11:
204  multiblasCuda<11,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
205  break;
206  #if MAX_MULTI_BLAS_N >= 12
207  case 12:
208  multiblasCuda<12,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
209  break;
210  #if MAX_MULTI_BLAS_N >= 13
211  case 13:
212  multiblasCuda<13,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
213  break;
214  #if MAX_MULTI_BLAS_N >= 14
215  case 14:
216  multiblasCuda<14,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
217  break;
218  #if MAX_MULTI_BLAS_N >= 15
219  case 15:
220  multiblasCuda<15,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
221  break;
222  #if MAX_MULTI_BLAS_N >= 16
223  case 16:
224  multiblasCuda<16,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
225  break;
226  #endif // 16
227  #endif // 15
228  #endif // 14
229  #endif // 13
230  #endif // 12
231  #endif // 11
232  #endif // 10
233  #endif // 9
234  #endif // 8
235  #endif // 7
236  #endif // 6
237  #endif // 5
238  #endif // 4
239  #endif // 3
240  #endif // 2
241  default:
242  // split the problem in half and recurse
243  const Complex *a0 = &a_[0];
244  const Complex *a1 = &a_[(x.size()/2)*y.size()];
245 
246  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
247  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
248 
249  caxpy_recurse(a0, x0, y, 2*i_idx+0, j_idx, upper);
250  caxpy_recurse(a1, x1, y, 2*i_idx+1, j_idx, upper);
251  break;
252  }
253  }
254  else // precisions don't agree.
255  {
256  switch (x.size()) {
257  case 1:
258  mixed::multiblasCuda<1,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
259  break;
260  #if MAX_MULTI_BLAS_N >= 2
261  case 2:
262  mixed::multiblasCuda<2,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
263  break;
264  #if MAX_MULTI_BLAS_N >= 3
265  case 3:
266  mixed::multiblasCuda<3,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
267  break;
268  #if MAX_MULTI_BLAS_N >= 4
269  case 4:
270  mixed::multiblasCuda<4,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
271  break;
272  #if MAX_MULTI_BLAS_N >= 5
273  case 5:
274  mixed::multiblasCuda<5,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
275  break;
276  #if MAX_MULTI_BLAS_N >= 6
277  case 6:
278  mixed::multiblasCuda<6,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
279  break;
280  #if MAX_MULTI_BLAS_N >= 7
281  case 7:
282  mixed::multiblasCuda<7,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
283  break;
284  #if MAX_MULTI_BLAS_N >= 8
285  case 8:
286  mixed::multiblasCuda<8,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
287  break;
288  #if MAX_MULTI_BLAS_N >= 9
289  case 9:
290  mixed::multiblasCuda<9,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
291  break;
292  #if MAX_MULTI_BLAS_N >= 10
293  case 10:
294  mixed::multiblasCuda<10,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
295  break;
296  #if MAX_MULTI_BLAS_N >= 11
297  case 11:
298  mixed::multiblasCuda<11,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
299  break;
300  #if MAX_MULTI_BLAS_N >= 12
301  case 12:
302  mixed::multiblasCuda<12,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
303  break;
304  #if MAX_MULTI_BLAS_N >= 13
305  case 13:
306  mixed::multiblasCuda<13,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
307  break;
308  #if MAX_MULTI_BLAS_N >= 14
309  case 14:
310  mixed::multiblasCuda<14,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
311  break;
312  #if MAX_MULTI_BLAS_N >= 15
313  case 15:
314  mixed::multiblasCuda<15,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
315  break;
316  #if MAX_MULTI_BLAS_N >= 16
317  case 16:
318  mixed::multiblasCuda<16,multicaxpy_,write<0,1,0,0> >(a, b, c, x, y, x, y);
319  break;
320  #endif // 16
321  #endif // 15
322  #endif // 14
323  #endif // 13
324  #endif // 12
325  #endif // 11
326  #endif // 10
327  #endif // 9
328  #endif // 8
329  #endif // 7
330  #endif // 6
331  #endif // 5
332  #endif // 4
333  #endif // 3
334  #endif // 2
335  default:
336  // split the problem in half and recurse
337  const Complex *a0 = &a_[0];
338  const Complex *a1 = &a_[(x.size()/2)*y.size()];
339 
340  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
341  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
342 
343  caxpy_recurse(a0, x0, y, 2*i_idx+0, j_idx, upper);
344  caxpy_recurse(a1, x1, y, 2*i_idx+1, j_idx, upper);
345  break;
346  }
347  }
348  } // end if (y.size() > MAX_MULTI_BLAS_N)
349  }
350 
351  void caxpy(const Complex *a_, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y) {
352  // Enter a recursion.
353  // Pass a, x, y. (0,0) indexes the tiles. false specifies the matrix is unstructured.
354  caxpy_recurse(a_, x, y, 0, 0, 0);
355  }
356 
357  void caxpy_U(const Complex *a_, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y) {
358  // Enter a recursion.
359  // Pass a, x, y. (0,0) indexes the tiles. 1 indicates the matrix is upper-triangular,
360  // which lets us skip some tiles.
361  if (x.size() != y.size())
362  {
363  errorQuda("An optimal block caxpy_U with non-square 'a' has not yet been implemented. Use block caxpy instead.\n");
364  return;
365  }
366  caxpy_recurse(a_, x, y, 0, 0, 1);
367  }
368 
369  void caxpy_L(const Complex *a_, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y) {
370  // Enter a recursion.
371  // Pass a, x, y. (0,0) indexes the tiles. -1 indicates the matrix is lower-triangular
372  // which lets us skip some tiles.
373  if (x.size() != y.size())
374  {
375  errorQuda("An optimal block caxpy_L with non-square 'a' has not yet been implemented. Use block caxpy instead.\n");
376  return;
377  }
378  caxpy_recurse(a_, x, y, 0, 0, -1);
379  }
380 
381 
382  void caxpy(const Complex *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy(a, x.Components(), y.Components()); }
383 
384  void caxpy_U(const Complex *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy_U(a, x.Components(), y.Components()); }
385 
386  void caxpy_L(const Complex *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy_L(a, x.Components(), y.Components()); }
387 
391  template<int NXZ, typename Float2, typename FloatN>
392  struct multicaxpyz_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
393  const int NYW;
394  // ignore parameter arrays since we place them in constant memory
396  const coeff_array<Complex> &c, int NYW) : NYW(NYW)
397  { }
398 
399  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
400  {
401 #ifdef __CUDA_ARCH__
402  Float2 *a = reinterpret_cast<Float2*>(Amatrix_d); // fetch coefficient matrix from constant memory
403  if (j==0) w = y;
404  _caxpy(a[MAX_MULTI_BLAS_N*j+i], x, w);
405 #else
406  Float2 *a = reinterpret_cast<Float2*>(Amatrix_h);
407  if (j==0) w = y;
408  _caxpy(a[NYW*j+i], x, w);
409 #endif
410  }
411 
412  int streams() { return 2*NYW + NXZ*NYW; }
413  int flops() { return 4*NXZ*NYW; }
414  };
415 
416  void caxpyz_recurse(const Complex *a_, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y, std::vector<ColorSpinorField*> &z, int i, int j, int pass, int upper) {
417 
418  if (y.size() > MAX_MULTI_BLAS_N) // if greater than max single-kernel size, recurse.
419  {
420  // We need to split up 'a' carefully since it's row-major.
421  Complex* tmpmajor = new Complex[x.size()*y.size()];
422  Complex* tmpmajor0 = &tmpmajor[0];
423  Complex* tmpmajor1 = &tmpmajor[x.size()*(y.size()/2)];
424  std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
425  std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
426 
427  std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
428  std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());
429 
430  const unsigned int xlen = x.size();
431  const unsigned int ylen0 = y.size()/2;
432  const unsigned int ylen1 = y.size() - y.size()/2;
433 
434  int count = 0, count0 = 0, count1 = 0;
435  for (unsigned int i_ = 0; i_ < xlen; i_++)
436  {
437  for (unsigned int j = 0; j < ylen0; j++)
438  tmpmajor0[count0++] = a_[count++];
439  for (unsigned int j = 0; j < ylen1; j++)
440  tmpmajor1[count1++] = a_[count++];
441  }
442 
443  caxpyz_recurse(tmpmajor0, x, y0, z0, i, 2*j+0, pass, upper);
444  caxpyz_recurse(tmpmajor1, x, y1, z1, i, 2*j+1, pass, upper);
445 
446  delete[] tmpmajor;
447  }
448  else
449  {
450  // if at bottom of recursion check where we are
451  if (x.size() <= MAX_MULTI_BLAS_N) {
452  if (pass==1) {
453  if (i!=j)
454  {
455  if (upper == 1 && j < i) { return; } // upper right, don't need to update lower left.
456  if (upper == -1 && i < j) { return; } // lower left, don't need to update upper right.
457  caxpy(a_, x, z); return; // off diagonal
458  }
459  return;
460  } else {
461  if (i!=j) return; // We're on the first pass, so we only want to update the diagonal.
462  }
463  }
464 
465  // mark true since we will copy the "a" matrix into constant memory
466  coeff_array<Complex> a(a_, true), b, c;
467 
468  if (x[0]->Precision() == y[0]->Precision())
469  {
470  switch (x.size()) {
471  case 1:
472  multiblasCuda<1,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
473  break;
474  #if MAX_MULTI_BLAS_N >= 2
475  case 2:
476  multiblasCuda<2,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
477  break;
478  #if MAX_MULTI_BLAS_N >= 3
479  case 3:
480  multiblasCuda<3,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
481  break;
482  #if MAX_MULTI_BLAS_N >= 4
483  case 4:
484  multiblasCuda<4,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
485  break;
486  #if MAX_MULTI_BLAS_N >= 5
487  case 5:
488  multiblasCuda<5,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
489  break;
490  #if MAX_MULTI_BLAS_N >= 6
491  case 6:
492  multiblasCuda<6,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
493  break;
494  #if MAX_MULTI_BLAS_N >= 7
495  case 7:
496  multiblasCuda<7,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
497  break;
498  #if MAX_MULTI_BLAS_N >= 8
499  case 8:
500  multiblasCuda<8,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
501  break;
502  #if MAX_MULTI_BLAS_N >= 9
503  case 9:
504  multiblasCuda<9,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
505  break;
506  #if MAX_MULTI_BLAS_N >= 10
507  case 10:
508  multiblasCuda<10,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
509  break;
510  #if MAX_MULTI_BLAS_N >= 11
511  case 11:
512  multiblasCuda<11,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
513  break;
514  #if MAX_MULTI_BLAS_N >= 12
515  case 12:
516  multiblasCuda<12,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
517  break;
518  #if MAX_MULTI_BLAS_N >= 13
519  case 13:
520  multiblasCuda<13,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
521  break;
522  #if MAX_MULTI_BLAS_N >= 14
523  case 14:
524  multiblasCuda<14,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
525  break;
526  #if MAX_MULTI_BLAS_N >= 15
527  case 15:
528  multiblasCuda<15,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
529  break;
530  #if MAX_MULTI_BLAS_N >= 16
531  case 16:
532  multiblasCuda<16,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
533  break;
534  #endif // 16
535  #endif // 15
536  #endif // 14
537  #endif // 13
538  #endif // 12
539  #endif // 11
540  #endif // 10
541  #endif // 9
542  #endif // 8
543  #endif // 7
544  #endif // 6
545  #endif // 5
546  #endif // 4
547  #endif // 3
548  #endif // 2
549  default:
550  // split the problem in half and recurse
551  const Complex *a0 = &a_[0];
552  const Complex *a1 = &a_[(x.size()/2)*y.size()];
553 
554  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
555  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
556 
557  caxpyz_recurse(a0, x0, y, z, 2*i+0, j, pass, upper);
558  caxpyz_recurse(a1, x1, y, z, 2*i+1, j, pass, upper); // b/c we don't want to re-zero z.
559  break;
560  }
561  }
562  else // precisions don't agree.
563  {
564  switch (x.size()) {
565  case 1:
566  mixed::multiblasCuda<1,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
567  break;
568  #if MAX_MULTI_BLAS_N >= 2
569  case 2:
570  mixed::multiblasCuda<2,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
571  break;
572  #if MAX_MULTI_BLAS_N >= 3
573  case 3:
574  mixed::multiblasCuda<3,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
575  break;
576  #if MAX_MULTI_BLAS_N >= 4
577  case 4:
578  mixed::multiblasCuda<4,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
579  break;
580  #if MAX_MULTI_BLAS_N >= 5
581  case 5:
582  mixed::multiblasCuda<5,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
583  break;
584  #if MAX_MULTI_BLAS_N >= 6
585  case 6:
586  mixed::multiblasCuda<6,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
587  break;
588  #if MAX_MULTI_BLAS_N >= 7
589  case 7:
590  mixed::multiblasCuda<7,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
591  break;
592  #if MAX_MULTI_BLAS_N >= 8
593  case 8:
594  mixed::multiblasCuda<8,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
595  break;
596  #if MAX_MULTI_BLAS_N >= 9
597  case 9:
598  mixed::multiblasCuda<9,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
599  break;
600  #if MAX_MULTI_BLAS_N >= 10
601  case 10:
602  mixed::multiblasCuda<10,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
603  break;
604  #if MAX_MULTI_BLAS_N >= 11
605  case 11:
606  mixed::multiblasCuda<11,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
607  break;
608  #if MAX_MULTI_BLAS_N >= 12
609  case 12:
610  mixed::multiblasCuda<12,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
611  break;
612  #if MAX_MULTI_BLAS_N >= 13
613  case 13:
614  mixed::multiblasCuda<13,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
615  break;
616  #if MAX_MULTI_BLAS_N >= 14
617  case 14:
618  mixed::multiblasCuda<14,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
619  break;
620  #if MAX_MULTI_BLAS_N >= 15
621  case 15:
622  mixed::multiblasCuda<15,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
623  break;
624  #if MAX_MULTI_BLAS_N >= 16
625  case 16:
626  mixed::multiblasCuda<16,multicaxpyz_,write<0,0,0,1> >(a, b, c, x, y, x, z);
627  break;
628  #endif // 16
629  #endif // 15
630  #endif // 14
631  #endif // 13
632  #endif // 12
633  #endif // 11
634  #endif // 10
635  #endif // 9
636  #endif // 8
637  #endif // 7
638  #endif // 6
639  #endif // 5
640  #endif // 4
641  #endif // 3
642  #endif // 2
643  default:
644  // split the problem in half and recurse
645  const Complex *a0 = &a_[0];
646  const Complex *a1 = &a_[(x.size()/2)*y.size()];
647 
648  std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
649  std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
650 
651  caxpyz_recurse(a0, x0, y, z, 2*i+0, j, pass, upper);
652  caxpyz_recurse(a1, x1, y, z, 2*i+1, j, pass, upper);
653  break;
654  }
655  }
656  } // end if (y.size() > MAX_MULTI_BLAS_N)
657  }
658 
659  void caxpyz(const Complex *a, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y, std::vector<ColorSpinorField*> &z) {
660  // first pass does the caxpyz on the diagonal
661  caxpyz_recurse(a, x, y, z, 0, 0, 0, 0);
662  // second pass does caxpy on the off diagonals
663  caxpyz_recurse(a, x, y, z, 0, 0, 1, 0);
664  }
665 
666  void caxpyz_U(const Complex *a, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y, std::vector<ColorSpinorField*> &z) {
667  // a is upper triangular.
668  // first pass does the caxpyz on the diagonal
669  caxpyz_recurse(a, x, y, z, 0, 0, 0, 1);
670  // second pass does caxpy on the off diagonals
671  caxpyz_recurse(a, x, y, z, 0, 0, 1, 1);
672  }
673 
674  void caxpyz_L(const Complex *a, std::vector<ColorSpinorField*> &x, std::vector<ColorSpinorField*> &y, std::vector<ColorSpinorField*> &z) {
675  // a is upper triangular.
676  // first pass does the caxpyz on the diagonal
677  caxpyz_recurse(a, x, y, z, 0, 0, 0, -1);
678  // second pass does caxpy on the off diagonals
679  caxpyz_recurse(a, x, y, z, 0, 0, 1, -1);
680  }
681 
682 
684  caxpyz(a, x.Components(), y.Components(), z.Components());
685  }
686 
688  caxpyz_U(a, x.Components(), y.Components(), z.Components());
689  }
690 
692  caxpyz_L(a, x.Components(), y.Components(), z.Components());
693  }
694 
698  template<int NXZ, typename Float2, typename FloatN>
699  struct multi_axpyBzpcx_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
700  typedef typename scalar<Float2>::type real;
701  const int NYW;
703 
705  : NYW(NYW) , a{ }, b{ }, c{ } {
706  // copy arguments into the functor
707  for (int i=0; i<NYW; i++) { this->a[i] = a.data[i]; this->b[i] = b.data[i]; this->c[i] = c.data[i]; }
708  }
709  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
710  {
711  y += a[i] * w;
712  w = b[i] * x + c[i] * w;
713  }
714  int streams() { return 4*NYW + NXZ; }
715  int flops() { return 5*NXZ*NYW; }
716  };
717 
718  void axpyBzpcx(const double *a_, std::vector<ColorSpinorField*> &x_, std::vector<ColorSpinorField*> &y_,
719  const double *b_, ColorSpinorField &z_, const double *c_) {
720 
721  if (y_.size() <= MAX_MULTI_BLAS_N) {
722  // swizzle order since we are writing to x_ and y_, but the
723  // multi-blas only allow writing to y and w, and moreover the
724  // block width of y and w must match, and x and z must match.
725  std::vector<ColorSpinorField*> &y = y_;
726  std::vector<ColorSpinorField*> &w = x_;
727 
728  // wrap a container around the third solo vector
729  std::vector<ColorSpinorField*> x;
730  x.push_back(&z_);
731 
732  // we will curry the parameter arrays into the functor
733  coeff_array<double> a(a_,false), b(b_,false), c(c_,false);
734 
735  if (x[0]->Precision() != y[0]->Precision() ) {
736  mixed::multiblasCuda<1,multi_axpyBzpcx_,write<0,1,0,1> >(a, b, c, x, y, x, w);
737  } else {
738  multiblasCuda<1,multi_axpyBzpcx_,write<0,1,0,1> >(a, b, c, x, y, x, w);
739  }
740  } else {
741  // split the problem in half and recurse
742  const double *a0 = &a_[0];
743  const double *b0 = &b_[0];
744  const double *c0 = &c_[0];
745 
746  std::vector<ColorSpinorField*> x0(x_.begin(), x_.begin() + x_.size()/2);
747  std::vector<ColorSpinorField*> y0(y_.begin(), y_.begin() + y_.size()/2);
748 
749  axpyBzpcx(a0, x0, y0, b0, z_, c0);
750 
751  const double *a1 = &a_[y_.size()/2];
752  const double *b1 = &b_[y_.size()/2];
753  const double *c1 = &c_[y_.size()/2];
754 
755  std::vector<ColorSpinorField*> x1(x_.begin() + x_.size()/2, x_.end());
756  std::vector<ColorSpinorField*> y1(y_.begin() + y_.size()/2, y_.end());
757 
758  axpyBzpcx(a1, x1, y1, b1, z_, c1);
759  }
760  }
761 
765  template<int NXZ, typename Float2, typename FloatN>
766  struct multi_caxpyBxpz_ : public MultiBlasFunctor<NXZ, Float2, FloatN>
767  {
768  typedef typename scalar<Float2>::type real;
769  const int NYW;
770 
772  { }
773 
774  // i loops over NYW, j loops over NXZ
775  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
776  {
777 #ifdef __CUDA_ARCH__
778  Float2 *a = reinterpret_cast<Float2*>(Amatrix_d); // fetch coefficient matrix from constant memory
779  Float2 *b = reinterpret_cast<Float2*>(Bmatrix_d); // fetch coefficient matrix from constant memory
780  _caxpy(a[MAX_MULTI_BLAS_N*j], x, y); _caxpy(b[MAX_MULTI_BLAS_N*j], x, w); // b/c we swizzled z into w.
781 #else
782  Float2 *a = reinterpret_cast<Float2*>(Amatrix_h);
783  Float2 *b = reinterpret_cast<Float2*>(Bmatrix_h);
784  _caxpy(a[j], x, y); _caxpy(b[j], x, w); // b/c we swizzled z into w.
785 #endif
786  }
787  int streams() { return 4*NYW + NXZ; }
788  int flops() { return 8*NXZ*NYW; }
789  };
790 
791  void caxpyBxpz(const Complex *a_, std::vector<ColorSpinorField*> &x_, ColorSpinorField &y_,
792  const Complex *b_, ColorSpinorField &z_)
793  {
794 
795  const int xsize = x_.size();
796  if (xsize <= MAX_MULTI_BLAS_N) // only swizzle if we have to.
797  {
798  // swizzle order since we are writing to y_ and z_, but the
799  // multi-blas only allow writing to y and w, and moreover the
800  // block width of y and w must match, and x and z must match.
801  // Also, wrap a container around them.
802  std::vector<ColorSpinorField*> y;
803  y.push_back(&y_);
804  std::vector<ColorSpinorField*> w;
805  w.push_back(&z_);
806 
807  // we're reading from x
808  std::vector<ColorSpinorField*> &x = x_;
809 
810  // put a and b into constant space
811  coeff_array<Complex> a(a_,true), b(b_,true), c;
812 
813  if (x[0]->Precision() != y[0]->Precision() )
814  {
815  switch(xsize)
816  {
817  case 1:
818  mixed::multiblasCuda<1,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
819  break;
820 #if MAX_MULTI_BLAS_N >= 2
821  case 2:
822  mixed::multiblasCuda<2,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
823  break;
824 #if MAX_MULTI_BLAS_N >= 3
825  case 3:
826  mixed::multiblasCuda<3,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
827  break;
828 #if MAX_MULTI_BLAS_N >= 4
829  case 4:
830  mixed::multiblasCuda<4,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
831  break;
832 #if MAX_MULTI_BLAS_N >= 5
833  case 5:
834  mixed::multiblasCuda<5,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
835  break;
836 #if MAX_MULTI_BLAS_N >= 6
837  case 6:
838  mixed::multiblasCuda<6,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
839  break;
840 #if MAX_MULTI_BLAS_N >= 7
841  case 7:
842  mixed::multiblasCuda<7,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
843  break;
844 #if MAX_MULTI_BLAS_N >= 8
845  case 8:
846  mixed::multiblasCuda<8,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
847  break;
848 #if MAX_MULTI_BLAS_N >= 9
849  case 9:
850  mixed::multiblasCuda<9,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
851  break;
852 #if MAX_MULTI_BLAS_N >= 10
853  case 10:
854  mixed::multiblasCuda<10,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
855  break;
856 #if MAX_MULTI_BLAS_N >= 11
857  case 11:
858  mixed::multiblasCuda<11,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
859  break;
860 #if MAX_MULTI_BLAS_N >= 12
861  case 12:
862  mixed::multiblasCuda<12,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
863  break;
864 #if MAX_MULTI_BLAS_N >= 13
865  case 13:
866  mixed::multiblasCuda<13,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
867  break;
868 #if MAX_MULTI_BLAS_N >= 14
869  case 14:
870  mixed::multiblasCuda<14,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
871  break;
872 #if MAX_MULTI_BLAS_N >= 15
873  case 15:
874  mixed::multiblasCuda<15,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
875  break;
876 #if MAX_MULTI_BLAS_N >= 16
877  case 16:
878  mixed::multiblasCuda<16,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
879  break;
880 #endif // 16
881 #endif // 15
882 #endif // 14
883 #endif // 13
884 #endif // 12
885 #endif // 11
886 #endif // 10
887 #endif // 9
888 #endif // 8
889 #endif // 7
890 #endif // 6
891 #endif // 5
892 #endif // 4
893 #endif // 3
894 #endif // 2
895  default:
896  // we can't hit the default, it ends up in the else below.
897  break;
898  }
899  }
900  else
901  {
902  switch(xsize)
903  {
904  case 1:
905  multiblasCuda<1,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
906  break;
907 #if MAX_MULTI_BLAS_N >= 2
908  case 2:
909  multiblasCuda<2,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
910  break;
911 #if MAX_MULTI_BLAS_N >= 3
912  case 3:
913  multiblasCuda<3,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
914  break;
915 #if MAX_MULTI_BLAS_N >= 4
916  case 4:
917  multiblasCuda<4,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
918  break;
919 #if MAX_MULTI_BLAS_N >= 5
920  case 5:
921  multiblasCuda<5,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
922  break;
923 #if MAX_MULTI_BLAS_N >= 6
924  case 6:
925  multiblasCuda<6,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
926  break;
927 #if MAX_MULTI_BLAS_N >= 7
928  case 7:
929  multiblasCuda<7,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
930  break;
931 #if MAX_MULTI_BLAS_N >= 8
932  case 8:
933  multiblasCuda<8,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
934  break;
935 #if MAX_MULTI_BLAS_N >= 9
936  case 9:
937  multiblasCuda<9,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
938  break;
939 #if MAX_MULTI_BLAS_N >= 10
940  case 10:
941  multiblasCuda<10,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
942  break;
943 #if MAX_MULTI_BLAS_N >= 11
944  case 11:
945  multiblasCuda<11,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
946  break;
947 #if MAX_MULTI_BLAS_N >= 12
948  case 12:
949  multiblasCuda<12,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
950  break;
951 #if MAX_MULTI_BLAS_N >= 13
952  case 13:
953  multiblasCuda<13,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
954  break;
955 #if MAX_MULTI_BLAS_N >= 14
956  case 14:
957  multiblasCuda<14,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
958  break;
959 #if MAX_MULTI_BLAS_N >= 15
960  case 15:
961  multiblasCuda<15,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
962  break;
963 #if MAX_MULTI_BLAS_N >= 16
964  case 16:
965  multiblasCuda<16,multi_caxpyBxpz_,write<0,1,0,1> >(a, b, c, x, y, x, w);
966  break;
967 #endif // 16
968 #endif // 15
969 #endif // 14
970 #endif // 13
971 #endif // 12
972 #endif // 11
973 #endif // 10
974 #endif // 9
975 #endif // 8
976 #endif // 7
977 #endif // 6
978 #endif // 5
979 #endif // 4
980 #endif // 3
981 #endif // 2
982  default:
983  // we can't hit the default, it ends up in the else below.
984  break;
985  }
986  }
987  }
988  else
989  {
990  // split the problem in half and recurse
991  const Complex *a0 = &a_[0];
992  const Complex *b0 = &b_[0];
993 
994  std::vector<ColorSpinorField*> x0(x_.begin(), x_.begin() + x_.size()/2);
995 
996  caxpyBxpz(a0, x0, y_, b0, z_);
997 
998  const Complex *a1 = &a_[x_.size()/2];
999  const Complex *b1 = &b_[x_.size()/2];
1000 
1001  std::vector<ColorSpinorField*> x1(x_.begin() + x_.size()/2, x_.end());
1002 
1003  caxpyBxpz(a1, x1, y_, b1, z_);
1004  }
1005  }
1006 
1007  } // namespace blas
1008 
1009 } // namespace quda
void caxpyz(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
void caxpyz_U(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
const char * aux_str
Definition: blas_quda.cu:57
virtual __device__ __host__ void init()
pre-computation routine before the main loop
multi_axpyBzpcx_(const coeff_array< double > &a, const coeff_array< double > &b, const coeff_array< double > &c, int NYW)
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
char aux_tmp[TuneKey::aux_n]
Definition: blas_quda.cu:58
#define errorQuda(...)
Definition: util_quda.h:90
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
std::complex< double > Complex
Definition: eig_variables.h:13
real a[MAX_MULTI_BLAS_N]
multicaxpyz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
multi_caxpyBxpz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
double y0(double) __attribute__((availability(macosx
void caxpy_U(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y)
Compute the block "caxpy_U" with over the set of ColorSpinorFields. E.g., it computes.
int flops()
total number of input and output streams
static constexpr int Y
void caxpyBxpz(const Complex &, ColorSpinorField &, ColorSpinorField &, const Complex &, ColorSpinorField &)
Definition: blas_quda.cu:438
__device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y)
Definition: blas_quda.cu:219
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
#define b
real b[MAX_MULTI_BLAS_N]
cudaStream_t * getStream()
Definition: blas_quda.cu:75
real c[MAX_MULTI_BLAS_N]
double y1(double) __attribute__((availability(macosx
static struct quda::blas::@4 blasStrings
static constexpr int Z
scalar< Float2 >::type real
int int int w
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
void caxpy_recurse(const Complex *a_, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, int i_idx, int j_idx, int upper)
static constexpr int W
static constexpr int X
void caxpy(const Complex &a, ColorSpinorField &x, ColorSpinorField &y)
Definition: blas_quda.cu:246
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
const char * vol_str
Definition: blas_quda.cu:56
scalar< Float2 >::type real
void caxpy_L(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y)
Compute the block "caxpy_L" with over the set of ColorSpinorFields. E.g., it computes.
static signed char * Amatrix_h
void caxpyz_L(const Complex *a, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z)
Compute the block "caxpyz" with over the set of ColorSpinorFields. E.g., it computes.
int flops()
total number of input and output streams
#define MAX_MULTI_BLAS_N
Definition: quda_internal.h:49
void axpyBzpcx(const double &a, ColorSpinorField &x, ColorSpinorField &y, const double &b, ColorSpinorField &z, const double &c)
Definition: blas_quda.cu:356
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
static const int aux_n
Definition: tune_key.h:12
virtual __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)=0
where the reduction is usually computed and any auxiliary operations
const void * c
void caxpyz_recurse(const Complex *a_, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, int i, int j, int pass, int upper)
multicaxpy_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
int flops()
total number of input and output streams
int flops()
total number of input and output streams
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
#define a
static signed char * Bmatrix_h