QUDA  0.9.0
multi_reduce_core.cuh
Go to the documentation of this file.
// Convert a device-side reduction result to the host-facing double /
// double2 / double3 types (identity when the type is already double-based).
__host__ __device__ inline double set(double &x) { return x;}
__host__ __device__ inline double2 set(double2 &x) { return x;}
__host__ __device__ inline double3 set(double3 &x) { return x;}
// Component-wise accumulate b into a; used on the host to combine the
// partial results of the two parities (see multiReduceLaunch).
__host__ __device__ inline void sum(double &a, double &b) { a += b; }
__host__ __device__ inline void sum(double2 &a, double2 &b) { a.x += b.x; a.y += b.y; }
__host__ __device__ inline void sum(double3 &a, double3 &b) { a.x += b.x; a.y += b.y; a.z += b.z; }
7 
#ifdef QUAD_SUM
// When double-double (quad-precision) reductions are enabled, the
// conversion/accumulation helpers take the high word ("head") of each
// double-double component, discarding the low-order correction term.
__host__ __device__ inline double set(doubledouble &a) { return a.head(); }
__host__ __device__ inline double2 set(doubledouble2 &a) { return make_double2(a.x.head(),a.y.head()); }
__host__ __device__ inline double3 set(doubledouble3 &a) { return make_double3(a.x.head(),a.y.head(),a.z.head()); }
__host__ __device__ inline void sum(double &a, doubledouble &b) { a += b.head(); }
__host__ __device__ inline void sum(double2 &a, doubledouble2 &b) { a.x += b.x.head(); a.y += b.y.head(); }
__host__ __device__ inline void sum(double3 &a, doubledouble3 &b) { a.x += b.x.head(); a.y += b.y.head(); a.z += b.z.head(); }
#endif
16 
//#define WARP_MULTI_REDUCE

// Inter-block completion counter and last-block flag.
// NOTE(review): neither is referenced in this view -- presumably consumed
// by the reduction helpers brought in below (launch_kernel.cuh and the
// quda::reduce machinery); confirm before removing.
__device__ static unsigned int count = 0;
__shared__ static bool isLastBlockDone;
21 
22 #include <launch_kernel.cuh>
23 
24 
/**
   Parameter struct for the generic multi-reduction kernel.  Carries the
   NXZ x/z input spinors, up to MAX_MULTI_BLAS_N y/w spinors (only the
   first NYW entries are populated), and the reduction functor.
 */
template <int NXZ, typename ReduceType, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename Reducer>
struct MultiReduceArg : public ReduceArg<vector_type<ReduceType,NXZ> > {

  const int NYW;    // runtime number of y/w spinors in use (<= MAX_MULTI_BLAS_N)
  SpinorX X[NXZ];
  SpinorY Y[MAX_MULTI_BLAS_N];
  SpinorZ Z[NXZ];
  SpinorW W[MAX_MULTI_BLAS_N];
  Reducer r;        // functor applied per site (see compute())
  const int length; // extent of the per-parity computation (caller passes length/nParity)
  MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r, int NYW, int length)
    : NYW(NYW), r(r), length(length) {

    // copy the fixed-size x/z sets
    for (int i=0; i<NXZ; ++i)
    {
      this->X[i] = X[i];
      this->Z[i] = Z[i];
    }

    // only the first NYW y/w entries are copied; the rest stay default
    for (int i=0; i<NYW; ++i)
    {
      this->Y[i] = Y[i];
      this->W[i] = W[i];
    }
  }
};
63 
// storage for matrix coefficients
#define MAX_MATRIX_SIZE 4096
// Device __constant__ buffers holding the a/b/c coefficient matrices as
// raw bytes; filled via cudaMemcpyToSymbolAsync in multiReduceCuda when
// the corresponding coeff_array has use_const set.
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE];
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE];
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE];

// Host-side shadow pointers to the original (unpadded) coefficient
// arrays, recorded alongside each constant-memory upload.
static signed char *Amatrix_h;
static signed char *Bmatrix_h;
static signed char *Cmatrix_h;
73 
// 'sum' should be an array of length NXZ...?
/**
   Grid-stride worker for a single (compile-time) y/w index k: for each
   site it loads the k-th y/w fields once, accumulates the reduction of
   every one of the NXZ x/z fields into sum[l], and stores y/w back so
   functors that modify y or w take effect.
 */
template<int k, int NXZ, typename FloatN, int M, typename ReduceType, typename Arg>
__device__ inline void compute(vector_type<ReduceType,NXZ> &sum, Arg &arg, int idx, int parity) {

  constexpr int kmod = k; // there's an old warning about silencing an out-of-bounds compiler warning,
                          // but I never seem to get it, and I'd need to compare against NYW anyway,
                          // which we can't really get at here b/c it's not a const, and I don't want
                          // to fix that. It works fine based on the switch in the function below.

  // grid-stride loop over sites (stride applied at the bottom)
  while (idx < arg.length) {

    FloatN x[M], y[M], z[M], w[M];

    arg.Y[kmod].load(y, idx, parity);
    arg.W[kmod].load(w, idx, parity);

    // Each NYW owns its own thread.
    // The NXZ's are all in the same thread block,
    // so they can share the same memory.
#pragma unroll
    for (int l=0; l < NXZ; l++) {
      arg.X[l].load(x, idx, parity);
      arg.Z[l].load(z, idx, parity);

      arg.r.pre();

#pragma unroll
      for (int j=0; j<M; j++) arg.r(sum[l], x[j], y[j], z[j], w[j], k, l);

      arg.r.post(sum[l]);
    }

    arg.Y[kmod].save(y, idx, parity);
    arg.W[kmod].save(w, idx, parity);

    idx += gridDim.x*blockDim.x;
  }

}
113 
#ifdef WARP_MULTI_REDUCE
template<typename ReduceType, typename FloatN, int M, int NXZ,
         typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
#else
template<int block_size, typename ReduceType, typename FloatN, int M, int NXZ,
         typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
#endif
// NOTE(review): the __global__ multiReduceKernel(MultiReduceArg<...> arg)
// signature line is elided in this extract (see the declaration index).

  // thread x indexes the site, thread y indexes the y/w field, block z the parity
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int k = blockIdx.y*blockDim.y + threadIdx.y;
  unsigned int parity = blockIdx.z;

  if (k >= arg.NYW) return; // safe since k are different thread blocks

  vector_type<ReduceType,NXZ> sum;

  // Turn the runtime index k into a compile-time template parameter so
  // compute<k,...> can address arg.Y/arg.W with a constant; cases above
  // MAX_MULTI_BLAS_N are compiled out.
  switch(k) {
  case 0: compute< 0,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 2
  case 1: compute< 1,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 3
  case 2: compute< 2,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 4
  case 3: compute< 3,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 5
  case 4: compute< 4,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 6
  case 5: compute< 5,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 7
  case 6: compute< 6,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 8
  case 7: compute< 7,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 9
  case 8: compute< 8,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 10
  case 9: compute< 9,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 11
  case 10: compute<10,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 12
  case 11: compute<11,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 13
  case 12: compute<12,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 14
  case 13: compute<13,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 15
  case 14: compute<14,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#if MAX_MULTI_BLAS_N >= 16
  case 15: compute<15,NXZ,FloatN,M,ReduceType>(sum,arg,i,parity); break;
#endif //16
#endif //15
#endif //14
#endif //13
#endif //12
#endif //11
#endif //10
#endif //9
#endif //8
#endif //7
#endif //6
#endif //5
#endif //4
#endif //3
#endif //2
}

// combine the per-thread partials; output slot is indexed by (parity, k)
#ifdef WARP_MULTI_REDUCE
  ::quda::warp_reduce<vector_type<ReduceType,NXZ> >(arg, sum, arg.NYW*parity + k);
#else
  ::quda::reduce<block_size, vector_type<ReduceType,NXZ> >(arg, sum, arg.NYW*parity + k);
#endif

} // multiReduceKernel
187 
/**
   Launch the multi-reduction kernel with the given tuning parameters,
   wait for the device result, and unpack it (transposed, with the two
   parities summed when tp.grid.z==2) into result[NXZ*NYW].
 */
template<typename doubleN, typename ReduceType, typename FloatN, int M, int NXZ,
         typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
void multiReduceLaunch(doubleN result[],
// NOTE(review): the MultiReduceArg<...> &arg parameter line is elided in
// this extract (see the declaration index for the full signature).
                       const TuneParam &tp, const cudaStream_t &stream) {

  if(tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
    errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);

  // ESW: this is where the multireduce kernel is called...?
#ifdef WARP_MULTI_REDUCE
  multiReduceKernel<ReduceType,FloatN,M,NXZ><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
#else
  LAUNCH_KERNEL_LOCAL_PARITY(multiReduceKernel, tp, stream, arg, ReduceType, FloatN, M, NXZ);
#endif

#if (defined(_MSC_VER) && defined(_WIN64) || defined(__LP64__))
  if(deviceProp.canMapHostMemory){
  // NOTE(review): an event-record line is elided here; spin until the
  // recorded reduction event signals completion.
    while(cudaSuccess != qudaEventQuery(*getReduceEvent())) {}
  } else
#endif
  { qudaMemcpy(getHostReduceBuffer(), getMappedHostReduceBuffer(), tp.grid.z*sizeof(ReduceType)*NXZ*arg.NYW, cudaMemcpyDeviceToHost); }

  // need to transpose for same order with vector thread reduction
  for (int i=0; i<NXZ; i++) {
    for (int j=0; j<arg.NYW; j++) {
      result[i*arg.NYW+j] = set(((ReduceType*)getHostReduceBuffer())[j*NXZ+i]);
      // second parity's partial lives NXZ*NYW entries further along
      if (tp.grid.z==2) sum(result[i*arg.NYW+j], ((ReduceType*)getHostReduceBuffer())[NXZ*arg.NYW+j*NXZ+i]);
    }
  }
}
220 
namespace detail
{
  // Terminal holder: materializes a digit pack as a static,
  // NUL-terminated character array at compile time.
  template<unsigned... Ds>
  struct to_chars { static const char value[]; };

  template<unsigned... Ds>
  const char to_chars<Ds...>::value[] = {('0' + Ds)..., 0};

  // Recursively peel decimal digits off N (least-significant first),
  // prepending each to the accumulated pack until N reaches zero.
  template<unsigned N, unsigned... Ds>
  struct explode : explode<N / 10, N % 10, Ds...> {};

  // Base case: nothing left to peel -- hand the pack to to_chars.
  template<unsigned... Ds>
  struct explode<0, Ds...> : to_chars<Ds...> {};
}

// Compile-time unsigned-to-string: num_to_string<N>::value is the
// decimal rendering of N (e.g. num_to_string<42>::value == "42").
template<unsigned Num>
struct num_to_string : detail::explode<Num / 10, Num % 10> {};
238 
/**
   Tunable wrapper that owns the MultiReduceArg, drives autotuning, and
   launches the multi-reduction via multiReduceLaunch.
 */
template<int NXZ, typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX,
         typename SpinorY, typename SpinorZ, typename SpinorW, typename Reducer>
class MultiReduceCuda : public Tunable {

private:
  const int NYW;
  // NOTE(review): the MultiReduceArg<...> arg member declaration is
  // elided in this extract (see the declaration index).
  doubleN *result;
  int nParity;

  // host pointer used for backing up fields when tuning
  // don't curry into the Spinors to minimize parameter size
  // NOTE(review): the Y_h/W_h/Ynorm_h/Wnorm_h backup-array declarations
  // are elided in this extract.
  std::vector<ColorSpinorField*> &y, &w;

  // this kernel uses no tunable shared memory
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    // NOTE(review): a shared_bytes assignment line is elided here
    return false;
  }

  // we only launch thread blocks up to size 512 since the autotuner
  // favours smaller blocks and this helps with compile time
  unsigned int maxBlockSize() const { return deviceProp.maxThreadsPerBlock / 2; }

public:
  // length is divided by nParity so arg.length is the per-parity extent
  MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[],
                  Reducer &r, int NYW, int length, int nParity, std::vector<ColorSpinorField*> &y, std::vector<ColorSpinorField*> &w)
    : NYW(NYW), arg(X, Y, Z, W, r, NYW, length/nParity), nParity(nParity), result(result),
      Y_h(), W_h(), Ynorm_h(), Wnorm_h(), y(y), w(w) { }

  inline TuneKey tuneKey() const {
    char name[TuneKey::name_n];
    // NOTE(review): the line initializing name (strcpy of the NXZ
    // prefix) is elided in this extract; strcat below appends to it.
    strcat(name, std::to_string(NYW).c_str());
    strcat(name, typeid(arg.r).name());
    char aux[TuneKey::aux_n];
    strcpy(aux, "policy_kernel,");
    strcat(aux,blasStrings.aux_tmp);
    return TuneKey(blasStrings.vol_str, name, aux);
  }

  void apply(const cudaStream_t &stream){
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    multiReduceLaunch<doubleN,ReduceType,FloatN,M,NXZ>(result,arg,tp,stream);
  }

  // Should these be NYW?
#ifdef WARP_MULTI_REDUCE
  // sweep block.y over the NYW dimension, keeping grid.y consistent
  bool advanceBlockDim(TuneParam &param) const {
    if (param.block.y < NYW) {
      param.block.y++;
      param.grid.y = (NYW + param.block.y - 1) / param.block.y;
      return true;
    } else {
      param.block.y = 1;
      param.grid.y = NYW;
      return false;
    }
  }
#endif

  bool advanceGridDim(TuneParam &param) const {
    bool rtn = Tunable::advanceGridDim(param);
    if (NYW > deviceProp.maxGridSize[1]) errorQuda("N=%d is greater than the maximum support grid size", NYW);
    return rtn;
  }

  void initTuneParam(TuneParam &param) const {
    Tunable::initTuneParam(param);
    param.block.y = 1;
    param.grid.y = NYW;       // one grid row per y/w field
    param.grid.z = nParity;   // one grid slice per parity
  }

  void defaultTuneParam(TuneParam &param) const {
    Tunable::defaultTuneParam(param);
    param.block.y = 1;
    param.grid.y = NYW;
    param.grid.z = nParity;
  }

  // back up the mutable y/w fields before tuning trials...
  void preTune() {
    for(int i=0; i<NYW; ++i){
      arg.Y[i].backup(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
      arg.W[i].backup(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
    }
  }

  // ...and restore them afterwards so tuning has no side effects
  void postTune() {
    for(int i=0; i<NYW; ++i){
      arg.Y[i].restore(&Y_h[i], &Ynorm_h[i], y[i]->Bytes(), y[i]->NormBytes());
      arg.W[i].restore(&W_h[i], &Wnorm_h[i], w[i]->Bytes(), w[i]->NormBytes());
    }
  }

  // Need to check this!
  // The NYW seems right?
  long long flops() const { return NYW*NXZ*arg.r.flops()*vec_length<FloatN>::value*(long long)arg.length*nParity*M; }
  long long bytes() const {
    size_t bytes = NYW*NXZ*arg.X[0].Precision()*vec_length<FloatN>::value*M;
    if (arg.X[0].Precision() == QUDA_HALF_PRECISION) bytes += NYW*NXZ*sizeof(float);
    return arg.r.streams()*bytes*arg.length*nParity; }
  int tuningIter() const { return 3; }
};
357 
/**
   Lightweight wrapper for an optional host-side coefficient array;
   use_const selects staging through __constant__ memory (see the
   a/b/c handling in multiReduceCuda).
 */
template <typename T>
struct coeff_array {
  const T *data;        // host pointer to the coefficients (may be nullptr)
  const bool use_const; // if true (and data set), coefficients are copied to constant memory
  coeff_array() : data(nullptr), use_const(false) { }
  // NOTE(review): a (data, use_const) constructor exists but its line is
  // elided in this extract (see the declaration index).
};
365 
366 
367 
/**
   Host driver for a block multi-reduction: validates the input field
   sets, stages the optional a/b/c coefficient matrices into constant
   memory, wraps the fields in spinor accessors, and launches the tuned
   multi-reduction, accumulating NXZ*NYW results into result[].
 */
template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename yType,
          int M, int NXZ, template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write, typename T>
void multiReduceCuda(doubleN result[], const reduce::coeff_array<T> &a, const reduce::coeff_array<T> &b, const reduce::coeff_array<T> &c,
                     std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
                     std::vector<ColorSpinorField*>& z, std::vector<ColorSpinorField*>& w,
                     int length) {

  const int NYW = y.size();

  int nParity = x[0]->SiteSubset();
  // zero the output so per-parity partials can be accumulated into it
  memset(result, 0, NXZ*NYW*sizeof(doubleN));

  const int N_MAX = NXZ > NYW ? NXZ : NYW;
  const int N_MIN = NXZ < NYW ? NXZ : NYW;

  static_assert(MAX_MULTI_BLAS_N*MAX_MULTI_BLAS_N <= QUDA_MAX_MULTI_REDUCE, "MAX_MULTI_BLAS_N^2 exceeds maximum number of reductions");
  static_assert(MAX_MULTI_BLAS_N <= 16, "MAX_MULTI_BLAS_N exceeds maximum size 16");
  if (N_MAX > MAX_MULTI_BLAS_N) errorQuda("Spinor vector length exceeds max size (%d > %d)", N_MAX, MAX_MULTI_BLAS_N);

  if (NXZ*NYW*sizeof(Complex) > MAX_MATRIX_SIZE)
    errorQuda("A matrix exceeds max size (%lu > %d)", NXZ*NYW*sizeof(Complex), MAX_MATRIX_SIZE);

  // check field compatibility pairwise against x (best-effort: bail on
  // non-native layouts rather than erroring)
  for (int i=0; i<N_MIN; i++) {
    checkSpinor(*x[i],*y[i]); checkSpinor(*x[i],*z[i]); checkSpinor(*x[i],*w[i]);
    if (!x[i]->isNative()) {
      warningQuda("Reductions on non-native fields are not supported\n");
      return;
    }
  }

  typedef typename scalar<RegType>::type Float;
  typedef typename vector<Float,2>::type Float2;
  typedef vector<Float,2> vec2;

  // FIXME - if NXZ=1 no need to copy entire array
  // FIXME - do we really need strided access here?
  if (a.data && a.use_const) {
    Float2 A[MAX_MATRIX_SIZE/sizeof(Float2)];
    // since the kernel doesn't know the width of the matrix at compile
    // time we stride it (by MAX_MULTI_BLAS_N) and copy the padded matrix to GPU
    for (int i=0; i<NXZ; i++) for (int j=0; j<NYW; j++)
      A[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(a.data[NYW * i + j]));

    cudaMemcpyToSymbolAsync(Amatrix_d, A, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
    Amatrix_h = reinterpret_cast<signed char*>(const_cast<T*>(a.data));
  }

  if (b.data && b.use_const) {
    Float2 B[MAX_MATRIX_SIZE/sizeof(Float2)];
    // since the kernel doesn't know the width of the matrix at compile
    // time we stride it and copy the padded matrix to GPU
    for (int i=0; i<NXZ; i++) for (int j=0; j<NYW; j++)
      B[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(b.data[NYW * i + j]));

    cudaMemcpyToSymbolAsync(Bmatrix_d, B, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
    Bmatrix_h = reinterpret_cast<signed char*>(const_cast<T*>(b.data));
  }

  if (c.data && c.use_const) {
    Float2 C[MAX_MATRIX_SIZE/sizeof(Float2)];
    // since the kernel doesn't know the width of the matrix at compile
    // time we stride it and copy the padded matrix to GPU
    for (int i=0; i<NXZ; i++) for (int j=0; j<NYW; j++)
      C[MAX_MULTI_BLAS_N * i + j] = make_Float2<Float2>(Complex(c.data[NYW * i + j]));

    cudaMemcpyToSymbolAsync(Cmatrix_d, C, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, *getStream());
    Cmatrix_h = reinterpret_cast<signed char*>(const_cast<T*>(c.data));
  }

  // build the tuning-key strings from the field metadata
  blasStrings.vol_str = x[0]->VolString();
  strcpy(blasStrings.aux_tmp, x[0]->AuxString());
  if (typeid(StoreType) != typeid(yType)) {
    strcat(blasStrings.aux_tmp, ",");
    strcat(blasStrings.aux_tmp, y[0]->AuxString());
  }

  // accessor wrappers: x/z are read-only textures, y/w are writable
  multi::SpinorTexture<RegType,StoreType,M,0> X[NXZ];
  multi::Spinor<RegType,yType,M,write::Y,1> Y[MAX_MULTI_BLAS_N];
  multi::SpinorTexture<RegType,StoreType,M,2> Z[NXZ];
  multi::Spinor<RegType,StoreType,M,write::W,3> W[MAX_MULTI_BLAS_N];

  for (int i=0; i<NXZ; i++) {
    X[i].set(*dynamic_cast<cudaColorSpinorField *>(x[i]));
    Z[i].set(*dynamic_cast<cudaColorSpinorField *>(z[i]));
  }
  for (int i=0; i<NYW; i++) {
    Y[i].set(*dynamic_cast<cudaColorSpinorField *>(y[i]));
    W[i].set(*dynamic_cast<cudaColorSpinorField *>(w[i]));
  }

  // since the block dot product and the block norm use the same functors, we need to distinguish them
  bool is_norm = false;
  if (NXZ==NYW) {
    is_norm = true;
    for (int i=0; i<NXZ; i++) {
      if (x[i]->V() != y[i]->V() || x[i]->V() != z[i]->V() || x[i]->V() != w[i]->V()) {
        is_norm = false;
        break;
      }
    }
  }
  if (is_norm) strcat(blasStrings.aux_tmp, ",norm");

  Reducer<NXZ, ReduceType, Float2, RegType> r(a, b, c, NYW);

  MultiReduceCuda<NXZ,doubleN,ReduceType,RegType,M,
    multi::SpinorTexture<RegType,StoreType,M,0>,
    multi::Spinor<RegType,yType,M,write::Y,1>,
    multi::SpinorTexture<RegType,StoreType,M,2>,
    multi::Spinor<RegType,StoreType,M,write::W,3>,
    decltype(r) >
    reduce(result, X, Y, Z, W, r, NYW, length, x[0]->SiteSubset(), y, w);
  reduce.apply(*blas::getStream());

  // account for the work performed
  blas::bytes += reduce.bytes();
  blas::flops += reduce.flops();

  checkCudaError();

  return;
}
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
dim3 dim3 blockDim
long long bytes() const
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
cudaStream_t stream
void * getHostReduceBuffer()
Definition: reduce_quda.cu:75
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaDeviceProp deviceProp
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
#define QUDA_MAX_MULTI_REDUCE
Maximum number of simultaneous reductions that can take place. This number may be increased if needed...
std::complex< double > Complex
Definition: eig_variables.h:13
char * Ynorm_h[MAX_MULTI_BLAS_N]
char * Wnorm_h[MAX_MULTI_BLAS_N]
char * Y_h[MAX_MULTI_BLAS_N]
bool advanceGridDim(TuneParam &param) const
Parameter struct for generic multi-blas kernel.
char * strcpy(char *__dst, const char *__src)
void multiReduceLaunch(doubleN result[], MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > &arg, const TuneParam &tp, const cudaStream_t &stream)
char * strcat(char *__s1, const char *__s2)
static __shared__ bool isLastBlockDone
void * getMappedHostReduceBuffer()
Definition: reduce_quda.cu:74
static signed char * Bmatrix_h
std::vector< ColorSpinorField * > & w
coeff_array(const T *data, bool use_const)
void defaultTuneParam(TuneParam &param) const
__global__ void multiReduceKernel(MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
cudaStream_t * getStream()
Definition: blas_quda.cu:75
#define MAX_MATRIX_SIZE
char * W_h[MAX_MULTI_BLAS_N]
__host__ __device__ void sum(double &a, double &b)
static struct quda::blas::@4 blasStrings
int int int w
int V
Definition: test_util.cpp:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define warningQuda(...)
Definition: util_quda.h:101
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
virtual bool advanceSharedBytes(TuneParam &param) const
static __device__ unsigned int count
std::vector< ColorSpinorField * > & y
int Z[4]
Definition: test_util.cpp:27
cudaEvent_t * getReduceEvent()
Definition: reduce_quda.cu:76
unsigned int sharedBytesPerBlock(const TuneParam &param) const
TuneKey tuneKey() const
static signed char * Cmatrix_h
void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
Definition: reduce_quda.cu:40
void * memset(void *__b, int __c, size_t __len)
static signed char * Amatrix_h
void initTuneParam(TuneParam &param) const
SpinorY Y[MAX_MULTI_BLAS_N]
unsigned int sharedBytesPerThread() const
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:163
#define MAX_MULTI_BLAS_N
Definition: quda_internal.h:49
MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg
long long flops() const
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
void apply(const cudaStream_t &stream)
unsigned long long flops
Definition: blas_quda.cu:42
MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r, int NYW, int length)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
unsigned int maxBlockSize() const
void size_t length
MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Reducer &r, int NYW, int length, int nParity, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &w)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
static const char value[]
const void * c
SpinorW W[MAX_MULTI_BLAS_N]
#define checkCudaError()
Definition: util_quda.h:129
__device__ void compute(vector_type< ReduceType, NXZ > &sum, Arg &arg, int idx, int parity)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
void multiReduceCuda(doubleN result[], const reduce::coeff_array< T > &a, const reduce::coeff_array< T > &b, const reduce::coeff_array< T > &c, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int length)
QudaParity parity
Definition: covdev_test.cpp:53
const bool use_const
#define a
unsigned long long bytes
Definition: blas_quda.cu:43