QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
multi_blas_core.cuh
Go to the documentation of this file.
1 #pragma once
2 
4 #include <blas_helper.cuh>
5 
6 namespace quda
7 {
8 
9  namespace blas
10  {
11 
#define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
#include <texture.h>

  // storage for matrix coefficients: raw byte buffers that the functors
  // below reinterpret_cast to the concrete coefficient type (Float2, etc.)
#define MAX_MATRIX_SIZE 4096
  // device-side coefficient matrices, resident in __constant__ memory so a
  // warp reading the same coefficient gets a broadcast load
  static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE];
  static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE];
  static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE];

  // host-side mirrors of the coefficient matrices, used by the __host__
  // code paths of the functors (see the #ifdef __CUDA_ARCH__ branches)
  static signed char *Amatrix_h;
  static signed char *Bmatrix_h;
  static signed char *Cmatrix_h;

#if CUDA_VERSION < 9000
  // as a performance work around we put the argument struct into
  // __constant__ memory to prevent the compiler from spilling
  // registers on older CUDA
  static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE];
#endif
31 
  /**
     @brief Parameter struct for the generic multi-blas kernel.

     Holds NXZ input vectors (X/Z, compile-time count) and NYW output
     vectors (Y/W, runtime count stored in fixed-size arrays), plus the
     functor applied to each element and the vector length.
  */
  template <int NXZ, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Functor>
  struct MultiBlasArg {
    const int NYW;               // runtime number of y/w vectors
    SpinorX X[NXZ];              // input vectors, compile-time count
    SpinorY Y[MAX_MULTI_BLAS_N]; // output vectors, first NYW entries valid
    SpinorZ Z[NXZ];              // input vectors, compile-time count
    SpinorW W[MAX_MULTI_BLAS_N]; // output vectors, first NYW entries valid
    Functor f;                   // element-wise operation to apply
    const int length;            // number of elements per vector (per parity)

    // NOTE(review): assumes NYW <= MAX_MULTI_BLAS_N — the Y/W copy loop
    // below would otherwise write out of bounds; caller must enforce this
    MultiBlasArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Functor f, int NYW, int length) :
    NYW(NYW),
    f(f),
    length(length)
    {
      // copy the accessor objects by value into this kernel-argument struct
      for (int i = 0; i < NXZ; ++i) {
        this->X[i] = X[i];
        this->Z[i] = Z[i];
      }
      for (int i = 0; i < NYW; ++i) {
        this->Y[i] = Y[i];
        this->W[i] = W[i];
      }
    }
  };
67 
  /**
     @brief Generic multi-blas kernel with four loads and up to four stores.

     Launch layout (grounded in the indexing below): the x grid dimension
     strides over vector elements via a grid-stride loop, the y dimension
     indexes the NYW output vectors, and blockIdx.z carries the parity.
     FloatN is the vectorized element type and M the number of FloatN
     values per site.
  */
  template <typename FloatN, int M, int NXZ, typename Arg> __global__ void multiBlasKernel(Arg arg_)
  {
#if CUDA_VERSION >= 9000
  Arg &arg = arg_;
#else
  // on older CUDA the argument struct was staged in constant memory
  // before launch to avoid register spilling; read it back from there
  Arg &arg = *((Arg *)arg_buffer);
#endif

  // use i to loop over elements in kernel
  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int k = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int parity = blockIdx.z;

  // functor pre-computation hook; runs in every thread, before the k bound check
  arg.f.init();
  if (k >= arg.NYW) return;

  while (idx < arg.length) {

    FloatN x[M], y[M], z[M], w[M];
    // load this thread's y/w outputs once, accumulate over all NXZ inputs
    arg.Y[k].load(y, idx, parity);
    arg.W[k].load(w, idx, parity);

#pragma unroll
    for (int l = 0; l < NXZ; l++) {
      arg.X[l].load(x, idx, parity);
      arg.Z[l].load(z, idx, parity);

#pragma unroll
      for (int j = 0; j < M; j++) arg.f(x[j], y[j], z[j], w[j], k, l);
    }
    // write back accumulated outputs, then grid-stride to the next element
    arg.Y[k].save(y, idx, parity);
    arg.W[k].save(w, idx, parity);

    idx += gridDim.x * blockDim.x;
  }
  }
109 
110  template <typename T> struct coeff_array {
111  const T *data;
112  const bool use_const;
113  coeff_array() : data(nullptr), use_const(false) {}
114  coeff_array(const T *data, bool use_const) : data(data), use_const(use_const) {}
115  };
116 
  /**
     @brief Abstract base class for the multi-blas functors below.
     In multiBlasKernel the first index argument is the y/w (NYW) vector
     index and the second is the x/z (NXZ) vector index.
  */
  template <int NXZ, typename Float2, typename FloatN> struct MultiBlasFunctor {

    //! pre-computation routine called before the main loop
    virtual __device__ __host__ void init() { ; }

    //! apply the operation to one element; i indexes y/w, j indexes x/z
    // NOTE(review): the kernel holds the concrete functor type as a template
    // parameter, so these calls dispatch statically despite being virtual
    virtual __device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
    = 0;
  };
126 
131  __device__ __host__ inline void _caxpy(const float2 &a, const float4 &x, float4 &y)
132  {
133  y.x += a.x * x.x;
134  y.x -= a.y * x.y;
135  y.y += a.y * x.x;
136  y.y += a.x * x.y;
137  y.z += a.x * x.z;
138  y.z -= a.y * x.w;
139  y.w += a.y * x.z;
140  y.w += a.x * x.w;
141  }
142 
143  __device__ __host__ inline void _caxpy(const float2 &a, const float2 &x, float2 &y)
144  {
145  y.x += a.x * x.x;
146  y.x -= a.y * x.y;
147  y.y += a.y * x.x;
148  y.y += a.x * x.y;
149  }
150 
151  __device__ __host__ inline void _caxpy(const double2 &a, const double2 &x, double2 &y)
152  {
153  y.x += a.x * x.x;
154  y.x -= a.y * x.y;
155  y.y += a.y * x.x;
156  y.y += a.x * x.y;
157  }
158 
159  template <int NXZ, typename Float2, typename FloatN>
160  struct multicaxpy_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
161  const int NYW;
162  // ignore parameter arrays since we place them in constant memory
164  NYW(NYW)
165  {
166  }
167 
168  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
169  {
170 #ifdef __CUDA_ARCH__
171  Float2 *a = reinterpret_cast<Float2 *>(Amatrix_d); // fetch coefficient matrix from constant memory
172  _caxpy(a[MAX_MULTI_BLAS_N * j + i], x, y);
173 #else
174  Float2 *a = reinterpret_cast<Float2 *>(Amatrix_h);
175  _caxpy(a[NYW * j + i], x, y);
176 #endif
177  }
178 
179  int streams() { return 2 * NYW + NXZ * NYW; }
180  int flops() { return 4 * NXZ * NYW; }
181  };
182 
186  template <int NXZ, typename Float2, typename FloatN>
187  struct multicaxpyz_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
188  const int NYW;
189  // ignore parameter arrays since we place them in constant memory
191  NYW(NYW)
192  {
193  }
194 
195  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
196  {
197 #ifdef __CUDA_ARCH__
198  Float2 *a = reinterpret_cast<Float2 *>(Amatrix_d); // fetch coefficient matrix from constant memory
199  if (j == 0) w = y;
200  _caxpy(a[MAX_MULTI_BLAS_N * j + i], x, w);
201 #else
202  Float2 *a = reinterpret_cast<Float2 *>(Amatrix_h);
203  if (j == 0) w = y;
204  _caxpy(a[NYW * j + i], x, w);
205 #endif
206  }
207 
208  int streams() { return 2 * NYW + NXZ * NYW; }
209  int flops() { return 4 * NXZ * NYW; }
210  };
211 
215  template <int NXZ, typename Float2, typename FloatN>
216  struct multi_axpyBzpcx_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
217  typedef typename scalar<Float2>::type real;
218  const int NYW;
220 
222  NYW(NYW),
223  a {},
224  b {},
225  c {}
226  {
227  // copy arguments into the functor
228  for (int i = 0; i < NYW; i++) {
229  this->a[i] = a.data[i];
230  this->b[i] = b.data[i];
231  this->c[i] = c.data[i];
232  }
233  }
234  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
235  {
236  y += a[i] * w;
237  w = b[i] * x + c[i] * w;
238  }
239  int streams() { return 4 * NYW + NXZ; }
240  int flops() { return 5 * NXZ * NYW; }
241  };
242 
246  template <int NXZ, typename Float2, typename FloatN>
247  struct multi_caxpyBxpz_ : public MultiBlasFunctor<NXZ, Float2, FloatN> {
248  typedef typename scalar<Float2>::type real;
249  const int NYW;
250 
252  const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
253  NYW(NYW)
254  {
255  }
256 
257  // i loops over NYW, j loops over NXZ
258  __device__ __host__ inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
259  {
260 #ifdef __CUDA_ARCH__
261  Float2 *a = reinterpret_cast<Float2 *>(Amatrix_d); // fetch coefficient matrix from constant memory
262  Float2 *b = reinterpret_cast<Float2 *>(Bmatrix_d); // fetch coefficient matrix from constant memory
263  _caxpy(a[MAX_MULTI_BLAS_N * j], x, y);
264  _caxpy(b[MAX_MULTI_BLAS_N * j], x, w); // b/c we swizzled z into w.
265 #else
266  Float2 *a = reinterpret_cast<Float2 *>(Amatrix_h);
267  Float2 *b = reinterpret_cast<Float2 *>(Bmatrix_h);
268  _caxpy(a[j], x, y);
269  _caxpy(b[j], x, w); // b/c we swizzled z into w.
270 #endif
271  }
272  int streams() { return 4 * NYW + NXZ; }
273  int flops() { return 8 * NXZ * NYW; }
274  };
275 
276  } // namespace blas
277 
278 } // namespace quda
virtual __device__ __host__ void init()
pre-computation routine before the main loop
multi_axpyBzpcx_(const coeff_array< double > &a, const coeff_array< double > &b, const coeff_array< double > &c, int NYW)
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
SpinorY Y[MAX_MULTI_BLAS_N]
Parameter struct for generic multi-blas kernel.
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
multicaxpyz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
multi_caxpyBxpz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
int flops()
total number of input and output streams
__device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y)
Definition: blas_core.cuh:110
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
coeff_array(const T *data, bool use_const)
scalar< Float2 >::type real
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
static signed char * Bmatrix_h
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
__global__ void multiBlasKernel(Arg arg_)
Generic multi-blas kernel with four loads and up to four stores.
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
scalar< Float2 >::type real
int flops()
total number of input and output streams
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE]
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
SpinorW W[MAX_MULTI_BLAS_N]
MultiBlasArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Functor f, int NYW, int length)
#define MAX_MATRIX_SIZE
multicaxpy_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
int flops()
total number of input and output streams
int flops()
total number of input and output streams
static signed char * Amatrix_h
QudaParity parity
Definition: covdev_test.cpp:54
static signed char * Cmatrix_h
#define MAX_MULTI_BLAS_N