QUDA 1.0.0
multi_reduce_core.cuh
#pragma once

#include <blas_helper.cuh>
#include <cub_helper.cuh>

//#define WARP_MULTI_REDUCE

namespace quda
{

  namespace blas
  {

#define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
#include <texture.h>

    // storage for matrix coefficients
#define MAX_MATRIX_SIZE 4096
    static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE];
    static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE];
    static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE];

    static signed char *Amatrix_h;
    static signed char *Bmatrix_h;
    static signed char *Cmatrix_h;

#if CUDA_VERSION < 9000
    // as a performance workaround we put the argument struct into
    // __constant__ memory to prevent the compiler from spilling
    // registers on older CUDA
    static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE];
#endif
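
    // On CUDA < 9.0 the host side is expected to stage the argument struct
    // into arg_buffer before launching the kernel below. A minimal sketch of
    // that step, assuming the launch wrapper has the fully-typed arg in hand
    // (the real call site lives in QUDA's multi-reduce launch code, not here):
    //
    //   cudaMemcpyToSymbol(arg_buffer, &arg, sizeof(arg));
    //   // ... then launch multiReduceKernel as usual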

    /**
       Parameter struct for generic multi-blas kernel.
    */
    template <int NXZ, typename ReduceType, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW,
        typename Reducer>
    struct MultiReduceArg : public ReduceArg<vector_type<ReduceType, NXZ>> {

      const int NYW;
      SpinorX X[NXZ];
      SpinorY Y[MAX_MULTI_BLAS_N];
      SpinorZ Z[NXZ];
      SpinorW W[MAX_MULTI_BLAS_N];
      Reducer r;
      const int length;
      MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r, int NYW, int length) :
          NYW(NYW),
          r(r),
          length(length)
      {
        for (int i = 0; i < NXZ; ++i) {
          this->X[i] = X[i];
          this->Z[i] = Z[i];
        }

        for (int i = 0; i < NYW; ++i) {
          this->Y[i] = Y[i];
          this->W[i] = W[i];
        }
      }
    };
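
    // Illustrative construction of the parameter struct (the spinor accessor
    // types and the reducer are assumed to be provided by the calling front
    // end; this is a sketch, not the actual call site):
    //
    //   Reducer r(a, b, c, NYW);
    //   MultiReduceArg<NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer>
    //       arg(X, Y, Z, W, r, NYW, length);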

#ifdef WARP_MULTI_REDUCE
    template <typename ReduceType, typename FloatN, int M, int NXZ, typename Arg>
#else
    template <int block_size, typename ReduceType, typename FloatN, int M, int NXZ, typename Arg>
#endif
    __global__ void multiReduceKernel(Arg arg_)
    {
#if CUDA_VERSION >= 9000
      Arg &arg = arg_;
#else
      Arg &arg = *((Arg *)arg_buffer);
#endif
      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
      unsigned int k = blockIdx.y * blockDim.y + threadIdx.y;
      unsigned int parity = blockIdx.z;

      if (k >= arg.NYW) return; // safe since k are different thread blocks

      vector_type<ReduceType, NXZ> sum;

      while (idx < arg.length) {

        FloatN x[M], y[M], z[M], w[M];

        arg.Y[k].load(y, idx, parity);
        arg.W[k].load(w, idx, parity);

        // Each NYW owns its own thread.
        // The NXZ's are all in the same thread block,
        // so they can share the same memory.
#pragma unroll
        for (int l = 0; l < NXZ; l++) {
          arg.X[l].load(x, idx, parity);
          arg.Z[l].load(z, idx, parity);

          arg.r.pre();

#pragma unroll
          for (int j = 0; j < M; j++) arg.r(sum[l], x[j], y[j], z[j], w[j], k, l);

          arg.r.post(sum[l]);
        }

        arg.Y[k].save(y, idx, parity);
        arg.W[k].save(w, idx, parity);

        idx += gridDim.x * blockDim.x;
      }

#ifdef WARP_MULTI_REDUCE
      ::quda::warp_reduce<vector_type<ReduceType, NXZ>>(arg, sum, arg.NYW * parity + k);
#else
      ::quda::reduce<block_size, vector_type<ReduceType, NXZ>>(arg, sum, arg.NYW * parity + k);
#endif
    } // multiReduceKernel
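
    // Index mapping recap: threadIdx.x/blockIdx.x sweep the vector sites in a
    // grid-stride loop, the y dimension enumerates the NYW right-hand vectors,
    // and blockIdx.z selects the parity. A hypothetical launch, with grid and
    // block shapes chosen only for illustration:
    //
    //   dim3 block(block_size, NYW, 1);
    //   dim3 grid(gridSize, 1, nParity);
    //   multiReduceKernel<block_size, ReduceType, FloatN, M, NXZ><<<grid, block>>>(arg);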

    template <typename T> struct coeff_array {
      const T *data;
      const bool use_const;
      coeff_array() : data(nullptr), use_const(false) {}
      coeff_array(const T *data, bool use_const) : data(data), use_const(use_const) {}
    };

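    // Usage sketch (names are illustrative): the front end wraps an optional
    // host-side coefficient array and records whether it should be staged
    // through the __constant__ matrix buffers declared above:
    //
    //   coeff_array<Complex> a(a_host, true); // a_host: hypothetical host pointer
    //   coeff_array<Complex> b;               // empty: functor takes no b coefficients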

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN> struct MultiReduceFunctor {

      //! pre-computation routine called before the "M-loop"
      virtual __device__ __host__ void pre() { ; }

      //! where the reduction is usually computed and any auxiliary operations
      virtual __device__ __host__ void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
          = 0;

      //! post-computation routine called after the "M-loop"
      virtual __device__ __host__ void post(ReduceType &sum) { ; }
    };

    /**
       Returns the real dot product of x and y.
    */
    template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
    {
      sum += (ReduceType)a.x * (ReduceType)b.x;
      sum += (ReduceType)a.y * (ReduceType)b.y;
    }

    template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
    {
      sum += (ReduceType)a.x * (ReduceType)b.x;
      sum += (ReduceType)a.y * (ReduceType)b.y;
    }

    template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
    {
      sum += (ReduceType)a.x * (ReduceType)b.x;
      sum += (ReduceType)a.y * (ReduceType)b.y;
      sum += (ReduceType)a.z * (ReduceType)b.z;
      sum += (ReduceType)a.w * (ReduceType)b.w;
    }
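
    // All three overloads accumulate the Euclidean product of the packed real
    // components, sum += a.x*b.x + a.y*b.y (+ a.z*b.z + a.w*b.w for float4);
    // casting to ReduceType lets the accumulation run in higher precision
    // than the operands.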

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
    struct Dot : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
      typedef typename scalar<Float2>::type real;
      const int NYW;
      Dot(const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
          NYW(NYW)
      {
        ;
      }
      __device__ __host__ void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      {
        dot_<ReduceType>(sum, x, y);
      }
      static int streams() { return 2; }
      static int flops() { return 2; }
    };

    /**
       Returns the complex dot product of x and y.
    */
    template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
    {
      typedef typename scalar<ReduceType>::type scalar;
      sum.x += (scalar)a.x * (scalar)b.x;
      sum.x += (scalar)a.y * (scalar)b.y;
      sum.y += (scalar)a.x * (scalar)b.y;
      sum.y -= (scalar)a.y * (scalar)b.x;
    }

    template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const float2 &a, const float2 &b)
    {
      typedef typename scalar<ReduceType>::type scalar;
      sum.x += (scalar)a.x * (scalar)b.x;
      sum.x += (scalar)a.y * (scalar)b.y;
      sum.y += (scalar)a.x * (scalar)b.y;
      sum.y -= (scalar)a.y * (scalar)b.x;
    }

    template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const float4 &a, const float4 &b)
    {
      typedef typename scalar<ReduceType>::type scalar;
      sum.x += (scalar)a.x * (scalar)b.x;
      sum.x += (scalar)a.y * (scalar)b.y;
      sum.x += (scalar)a.z * (scalar)b.z;
      sum.x += (scalar)a.w * (scalar)b.w;
      sum.y += (scalar)a.x * (scalar)b.y;
      sum.y -= (scalar)a.y * (scalar)b.x;
      sum.y += (scalar)a.z * (scalar)b.w;
      sum.y -= (scalar)a.w * (scalar)b.z;
    }
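
    // Sign convention: the imaginary part accumulates a.x*b.y - a.y*b.x, so
    // each overload computes sum += conj(a) * b, the standard sesquilinear
    // inner product, with the float4 overload handling two packed complex
    // numbers at once.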

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
    struct Cdot : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
      typedef typename scalar<Float2>::type real;
      const int NYW;
      Cdot(const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
          NYW(NYW)
      {
        ;
      }
      __device__ __host__ inline void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      {
        cdot_<ReduceType>(sum, x, y);
      }
      static int streams() { return 2; }
      static int flops() { return 4; }
    };

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
    struct CdotCopy : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
      typedef typename scalar<Float2>::type real;
      const int NYW;
      CdotCopy(const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
          NYW(NYW)
      {
        ;
      }
      __device__ __host__ inline void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      {
        cdot_<ReduceType>(sum, x, y);
        if (i == j) w = y;
      }
      static int streams() { return 2; }
      static int flops() { return 4; }
    };
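
    // CdotCopy fuses two operations: it accumulates the same conj(x)*y dot
    // products as Cdot, and on the diagonal entries of the tile (i == j) it
    // also copies y into w, avoiding a separate copy pass over the vectors.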

  } // namespace blas

} // namespace quda