QUDA v0.7.0
A library for QCD on GPUs
reduce_core_cub.h
__host__ __device__ void zero(double &x) { x = 0.0; }
__host__ __device__ void zero(double2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(double3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
__device__ void copytoshared(double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(volatile double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(volatile double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copyfromshared(double &x, const double *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(double2 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(double3 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceType &sum, ReduceSimpleType *s, const int i, const int block) { }
template<> __device__ void add<double,double>(double &sum, double *s, const int i, const int block)
{ sum += s[i]; }
template<> __device__ void add<double2,double>(double2 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<double3,double>(double3 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceSimpleType *s, const int i, const int j, const int block) { }
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(volatile ReduceSimpleType *s, const int i, const int j, const int block) { }

template<> __device__ void add<double,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }
template<> __device__ void add<double,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<double2,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<double2,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<double3,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<double3,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
#if (__COMPUTE_CAPABILITY__ < 130)
__host__ __device__ void zero(doublesingle &x) { x = 0.0; }
__host__ __device__ void zero(doublesingle2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(doublesingle3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle x, const int block) { s[i] = x; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle x, const int block) { s[i].a.x = x.a.x; s[i].a.y = x.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y;
  s[i+2*block].a.x = x.z.a.x; s[i+2*block].a.y = x.z.a.y; }
__device__ void copyfromshared(doublesingle &x, const doublesingle *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(doublesingle2 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(doublesingle3 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

template<> __device__ void add<doublesingle,doublesingle>(doublesingle &sum, doublesingle *s, const int i, const int block)
{ sum += s[i]; }
template<> __device__ void add<doublesingle2,doublesingle>(doublesingle2 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<doublesingle3,doublesingle>(doublesingle3 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

template<> __device__ void add<doublesingle,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; }
template<> __device__ void add<doublesingle,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<doublesingle2,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<doublesingle2,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<doublesingle3,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<doublesingle3,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
#endif

__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;
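// count and isLastBlockDone implement the single-launch "threadfence reduction"
// pattern used by reduceKernel below: every block writes its partial result to
// global memory, issues a __threadfence(), and atomically increments count; the
// block that observes count reach gridDim.x-1 knows all partial results are
// visible and performs the final accumulation into arg.complete[0].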

template <typename ReduceType, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
struct ReduceArg {
  SpinorX X;
  SpinorY Y;
  SpinorZ Z;
  SpinorW W;
  SpinorV V;
  Reducer r;
  ReduceType *partial;
  ReduceType *complete;
  const int length;
  ReduceArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r,
            ReduceType *partial, ReduceType *complete, int length)
    : X(X), Y(Y), Z(Z), W(W), V(V), r(r), partial(partial), complete(complete), length(length) { ; }
};

template <int block_size, typename ReduceType, typename ReduceSimpleType,
          typename FloatN, int M, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
__global__ void reduceKernel(ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  ReduceType sum;
  zero(sum);
  while (i < arg.length) {
    FloatN x[M], y[M], z[M], w[M], v[M];
    arg.X.load(x, i);
    arg.Y.load(y, i);
    arg.Z.load(z, i);
    arg.W.load(w, i);
    arg.V.load(v, i);

#if (__COMPUTE_CAPABILITY__ >= 200)
    arg.r.pre();
#endif

#pragma unroll
    for (int j=0; j<M; j++) arg.r(sum, x[j], y[j], z[j], w[j], v[j]);

#if (__COMPUTE_CAPABILITY__ >= 200)
    arg.r.post(sum);
#endif

    arg.X.save(x, i);
    arg.Y.save(y, i);
    arg.Z.save(z, i);
    arg.W.save(w, i);
    arg.V.save(v, i);

    i += gridSize;
  }

  //#define CUB_REDUCTION
#ifndef CUB_REDUCTION

  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;
  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
  __syncthreads();

  // now reduce using the first warp only
  if (tid<warpSize) {
    // Warp raking
#pragma unroll
    for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

    // Intra-warp reduction
    volatile ReduceSimpleType *sv = s;
    copytoshared(sv, 0, sum, block_size);

    if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
    if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
    if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
    if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
    if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

    // warpSize generic warp reduction - open64 can't handle it, only nvvm
    //#pragma unroll
    //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
  }

  // write result for this block to global mem
  if (tid == 0) {
    ReduceType tmp;
    copyfromshared(tmp, s, 0, block_size);
    arg.partial[blockIdx.x] = tmp;

    __threadfence(); // flush result

    // increment global block counter
    unsigned int value = atomicInc(&count, gridDim.x);

    // Determine if this block is the last block to be done
    isLastBlockDone = (value == (gridDim.x-1));
  }

  __syncthreads();

  // Finish the reduction if last block
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;

    ReduceType sum;
    zero(sum);
    while (i < gridDim.x) {
      sum += arg.partial[i];
      i += block_size;
    }

    extern __shared__ ReduceSimpleType sdata[];
    ReduceSimpleType *s = sdata + tid;
    if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
    __syncthreads();

    // now reduce using the first warp only
    if (tid<warpSize) {
      // Warp raking
#pragma unroll
      for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

      // Intra-warp reduction
      volatile ReduceSimpleType *sv = s;
      copytoshared(sv, 0, sum, block_size);

      if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
      if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
      if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
      if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
      if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

      //#pragma unroll
      //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
    }

    // write out the final reduced value
    if (threadIdx.x == 0) {
      ReduceType tmp;
      copyfromshared(tmp, s, 0, block_size);
      arg.complete[0] = tmp;
      count = 0;
    }
  }

#else

  typedef cub::BlockReduce<ReduceType, block_size> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  {
    sum = BlockReduce(temp_storage).Sum(sum);

    if (tid == 0) {
      arg.partial[blockIdx.x] = sum;

      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count, gridDim.x);

      // Determine if this block is the last block to be done
      isLastBlockDone = (value == (gridDim.x-1));
    }
  }

  __syncthreads();

  // Finish the reduction if last block
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;

    ReduceType sum;
    zero(sum);
    while (i < gridDim.x) {
      sum += arg.partial[i];
      i += block_size;
    }

    sum = BlockReduce(temp_storage).Sum(sum);

    // write out the final reduced value
    if (threadIdx.x == 0) {
      arg.complete[0] = sum;
      count = 0;
    }
  }
#endif

}
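
/* The CUB_REDUCTION path above uses cub::BlockReduce.  As a self-contained
   illustration of that idiom, independent of QUDA's Spinor/Reducer machinery
   (a sketch only; the kernel and array names are ours, not part of this file):

   #include <cub/cub.cuh>

   template <int BLOCK>
   __global__ void blockSum(const double *in, double *partial, int n) {
     typedef cub::BlockReduce<double, BLOCK> BlockReduce;
     __shared__ typename BlockReduce::TempStorage temp_storage;

     int i = blockIdx.x*BLOCK + threadIdx.x;
     double v = (i < n) ? in[i] : 0.0;

     // every thread contributes one value; the block-wide sum is returned to thread 0
     double sum = BlockReduce(temp_storage).Sum(v);
     if (threadIdx.x == 0) partial[blockIdx.x] = sum;
   }
*/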
template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename SpinorV, typename Reducer>
doubleN reduceLaunch(ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> &arg,
                     const TuneParam &tp, const cudaStream_t &stream) {
  if (tp.grid.x > REDUCE_MAX_BLOCKS)
    errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, REDUCE_MAX_BLOCKS);

  switch (tp.block.x) {
  case 32:
    reduceKernel<32,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 64:
    reduceKernel<64,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 96:
    reduceKernel<96,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 128:
    reduceKernel<128,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 160:
    reduceKernel<160,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 192:
    reduceKernel<192,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 224:
    reduceKernel<224,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 256:
    reduceKernel<256,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 288:
    reduceKernel<288,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 320:
    reduceKernel<320,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 352:
    reduceKernel<352,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 384:
    reduceKernel<384,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 416:
    reduceKernel<416,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 448:
    reduceKernel<448,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 480:
    reduceKernel<480,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 512:
    reduceKernel<512,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 544:
    reduceKernel<544,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 576:
    reduceKernel<576,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 608:
    reduceKernel<608,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 640:
    reduceKernel<640,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 672:
    reduceKernel<672,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 704:
    reduceKernel<704,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 736:
    reduceKernel<736,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 768:
    reduceKernel<768,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 800:
    reduceKernel<800,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 832:
    reduceKernel<832,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 864:
    reduceKernel<864,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 896:
    reduceKernel<896,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 928:
    reduceKernel<928,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 960:
    reduceKernel<960,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 992:
    reduceKernel<992,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 1024:
    reduceKernel<1024,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  default:
    errorQuda("Reduction not implemented for %d threads", tp.block.x);
  }
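
  /* block_size must be a compile-time constant (it is a template parameter of
     reduceKernel and of cub::BlockReduce), so the block size chosen at run time
     by the autotuner has to be dispatched through the explicit switch above.
     The repetition could be compressed with a helper macro; a sketch only (the
     LAUNCH_CASE name is ours, behavior is unchanged):

     #define LAUNCH_CASE(BLOCK)                                          \
       case BLOCK:                                                       \
         reduceKernel<BLOCK,ReduceType,ReduceSimpleType,FloatN,M>        \
           <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);      \
         break;

     switch (tp.block.x) {
       LAUNCH_CASE(32)  LAUNCH_CASE(64)  LAUNCH_CASE(96)  LAUNCH_CASE(128)
       LAUNCH_CASE(160) LAUNCH_CASE(192) LAUNCH_CASE(224) LAUNCH_CASE(256)
       LAUNCH_CASE(288) LAUNCH_CASE(320) LAUNCH_CASE(352) LAUNCH_CASE(384)
       LAUNCH_CASE(416) LAUNCH_CASE(448) LAUNCH_CASE(480) LAUNCH_CASE(512)
       LAUNCH_CASE(544) LAUNCH_CASE(576) LAUNCH_CASE(608) LAUNCH_CASE(640)
       LAUNCH_CASE(672) LAUNCH_CASE(704) LAUNCH_CASE(736) LAUNCH_CASE(768)
       LAUNCH_CASE(800) LAUNCH_CASE(832) LAUNCH_CASE(864) LAUNCH_CASE(896)
       LAUNCH_CASE(928) LAUNCH_CASE(960) LAUNCH_CASE(992) LAUNCH_CASE(1024)
     default:
       errorQuda("Reduction not implemented for %d threads", tp.block.x);
     }
     #undef LAUNCH_CASE
  */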

#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
  if (deviceProp.canMapHostMemory) {
    cudaEventRecord(reduceEnd, stream);
    while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
  } else
#endif
    { cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost); }

  doubleN cpu_sum;
  zero(cpu_sum);
  cpu_sum += ((ReduceType*)h_reduce)[0];

  const int Nreduce = sizeof(doubleN) / sizeof(double);
  reduceDoubleArray((double*)&cpu_sum, Nreduce);

  return cpu_sum;
}


template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename SpinorV, typename Reducer>
class ReduceCuda : public Tunable {

private:
  ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg;
  doubleN &result;

  // host pointers used for backing up fields when tuning
  // these can't be curried into the Spinors because of the Tesla argument-length restriction
  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;

  unsigned int sharedBytesPerThread() const { return sizeof(ReduceType); }

  // when there is only one warp per block, we need to allocate two warps'
  // worth of shared memory so that we don't index shared memory out of bounds
  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    int warpSize = 32; // FIXME - use device property query
    return 2*warpSize*sizeof(ReduceType);
  }
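
  /* One way to address the FIXME above (a sketch, not part of the original
     file; it assumes the global cudaDeviceProp deviceProp referenced in
     reduceLaunch has already been filled in by QUDA's initialization code):

     unsigned int sharedBytesPerBlock(const TuneParam &param) const {
       return 2*deviceProp.warpSize*sizeof(ReduceType);
     }

     Alternatively, querying the runtime directly with standard CUDA calls
     (warpSizeForDevice is a hypothetical helper name):

     int warpSizeForDevice() {
       int dev; cudaGetDevice(&dev);
       cudaDeviceProp prop;
       cudaGetDeviceProperties(&prop, dev);
       return prop.warpSize; // 32 on all current NVIDIA hardware
     }
  */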

  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
    return false;
  }

public:
  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z,
             SpinorW &W, SpinorV &V, Reducer &r, int length) :
    arg(X, Y, Z, W, V, r, (ReduceType*)d_reduce, (ReduceType*)hd_reduce, length),
    result(result), X_h(0), Y_h(0), Z_h(0), W_h(0), V_h(0),
    Xnorm_h(0), Ynorm_h(0), Znorm_h(0), Wnorm_h(0), Vnorm_h(0)
  { ; }
  virtual ~ReduceCuda() { }

  TuneKey tuneKey() const {
    std::stringstream vol, aux;
    vol << blasConstants.x[0] << "x";
    vol << blasConstants.x[1] << "x";
    vol << blasConstants.x[2] << "x";
    vol << blasConstants.x[3];
    aux << "stride=" << blasConstants.stride << ",prec=" << arg.X.Precision();
    return TuneKey(vol.str(), typeid(arg.r).name(), aux.str());
  }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M>(arg, tp, stream);
  }

  void preTune() {
    size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*arg.X.Stride();
    size_t norm_bytes = (arg.X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*arg.length : 0;
    arg.X.save(&X_h, &Xnorm_h, bytes, norm_bytes);
    arg.Y.save(&Y_h, &Ynorm_h, bytes, norm_bytes);
    arg.Z.save(&Z_h, &Znorm_h, bytes, norm_bytes);
    arg.W.save(&W_h, &Wnorm_h, bytes, norm_bytes);
    arg.V.save(&V_h, &Vnorm_h, bytes, norm_bytes);
  }

  void postTune() {
    size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*arg.X.Stride();
    size_t norm_bytes = (arg.X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*arg.length : 0;
    arg.X.load(&X_h, &Xnorm_h, bytes, norm_bytes);
    arg.Y.load(&Y_h, &Ynorm_h, bytes, norm_bytes);
    arg.Z.load(&Z_h, &Znorm_h, bytes, norm_bytes);
    arg.W.load(&W_h, &Wnorm_h, bytes, norm_bytes);
    arg.V.load(&V_h, &Vnorm_h, bytes, norm_bytes);
  }

  long long flops() const { return arg.r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*arg.length*M; }
  long long bytes() const {
    size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
    if (arg.X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
    return arg.r.streams()*bytes*arg.length; }
  int tuningIter() const { return 3; }
};
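
/* The Reducer template parameter supplies the element-wise operation.  From the
   calls made above (r(sum,x,y,z,w,v), r.pre(), r.post(sum), r.flops(),
   r.streams()) its expected shape is roughly as follows.  This is a
   hypothetical illustration for a double2 register type, accumulating the real
   part of the inner product (x,y); it is not one of QUDA's actual reducers:

   template <typename ReduceType, typename Float2, typename FloatN>
   struct ExampleDot {
     ExampleDot(const Float2 &a, const Float2 &b) { ; }    // coefficients unused here
     __device__ void pre() { ; }                           // per-site setup (sm_20+ path)
     __device__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
                                FloatN &z, FloatN &w, FloatN &v)
     { sum += (ReduceType)x.x*y.x + (ReduceType)x.y*y.y; } // accumulate Re<x,y>
     __device__ void post(ReduceType &sum) { ; }           // per-site finalize (sm_20+ path)
     static int streams() { return 2; }                    // number of fields read + written
     static int flops() { return 2; }                      // flops per real element
   };
*/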


/*
   M values (without / with siteUnroll):

   Wilson
     double  double2  M = 1 / 12
     single  float4   M = 1 / 6
     half    short4   M = 6 / 6

   Staggered
     double  double2  M = 1 / 3
     single  float2   M = 1 / 3
     half    short2   M = 3 / 3
 */

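/* A worked example of how M and reduce_length interact (illustration only, not
   part of the original file): a double-precision Wilson spinor holds 24 reals
   per site (4 spins x 3 colors x 2 for complex), i.e. 12 double2 values.  With
   siteUnroll, M = 12 and reduce_length = x.RealLength() = 24*volume, so the
   thread count passed below is reduce_length/(2*M) = volume, one site per
   thread.  Without siteUnroll, M = 1 and each thread handles one double2. */
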
template <typename doubleN, typename ReduceType, typename ReduceSimpleType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer,
          int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
doubleN reduceCuda(const double2 &a, const double2 &b, cudaColorSpinorField &x,
                   cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
                   cudaColorSpinorField &v) {
  if (x.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
    doubleN even =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
                 writeY,writeZ,writeW,writeV,siteUnroll>
      (a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
    doubleN odd =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
                 writeY,writeZ,writeW,writeV,siteUnroll>
      (a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
    return even + odd;
  }

  checkSpinor(x, y);
  checkSpinor(x, z);
  checkSpinor(x, w);
  checkSpinor(x, v);

  if (!x.isNative()) {
    warningQuda("Reductions on non-native fields are not supported\n");
    doubleN value;
    zero(value);
    return value;
  }

  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = x.X()[d];
  blasConstants.stride = x.Stride();

  int reduce_length = siteUnroll ? x.RealLength() : x.Length();
  doubleN value;

  // FIXME: use traits to encapsulate register type for shorts -
  // will reduce template type parameters from 3 to 2

  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
    if (x.Nspin() == 4) { // Wilson
      const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
      Spinor<double2,double2,double2,M,writeX> X(x);
      Spinor<double2,double2,double2,M,writeY> Y(y);
      Spinor<double2,double2,double2,M,writeZ> Z(z);
      Spinor<double2,double2,double2,M,writeW> W(w);
      Spinor<double2,double2,double2,M,writeV> V(v);
      Reducer<ReduceType, double2, double2> r(a,b);
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
        Spinor<double2,double2,double2,M,writeX>, Spinor<double2,double2,double2,M,writeY>,
        Spinor<double2,double2,double2,M,writeZ>, Spinor<double2,double2,double2,M,writeW>,
        Spinor<double2,double2,double2,M,writeV>, Reducer<ReduceType, double2, double2> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
    } else if (x.Nspin() == 1) { // staggered
      const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
      Spinor<double2,double2,double2,M,writeX> X(x);
      Spinor<double2,double2,double2,M,writeY> Y(y);
      Spinor<double2,double2,double2,M,writeZ> Z(z);
      Spinor<double2,double2,double2,M,writeW> W(w);
      Spinor<double2,double2,double2,M,writeV> V(v);
      Reducer<ReduceType, double2, double2> r(a,b);
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
        Spinor<double2,double2,double2,M,writeX>, Spinor<double2,double2,double2,M,writeY>,
        Spinor<double2,double2,double2,M,writeZ>, Spinor<double2,double2,double2,M,writeW>,
        Spinor<double2,double2,double2,M,writeV>, Reducer<ReduceType, double2, double2> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
    } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
    if (x.Nspin() == 4) { // Wilson
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
      const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
      Spinor<float4,float4,float4,M,writeX,0> X(x);
      Spinor<float4,float4,float4,M,writeY,1> Y(y);
      Spinor<float4,float4,float4,M,writeZ,2> Z(z);
      Spinor<float4,float4,float4,M,writeW,3> W(w);
      Spinor<float4,float4,float4,M,writeV,4> V(v);
      Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,M,
        Spinor<float4,float4,float4,M,writeX,0>, Spinor<float4,float4,float4,M,writeY,1>,
        Spinor<float4,float4,float4,M,writeZ,2>, Spinor<float4,float4,float4,M,writeW,3>,
        Spinor<float4,float4,float4,M,writeV,4>, Reducer<ReduceType, float2, float4> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(4*M));
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else if (x.Nspin() == 1) { // staggered
#ifdef GPU_STAGGERED_DIRAC
      const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
      Spinor<float2,float2,float2,M,writeX,0> X(x);
      Spinor<float2,float2,float2,M,writeY,1> Y(y);
      Spinor<float2,float2,float2,M,writeZ,2> Z(z);
      Spinor<float2,float2,float2,M,writeW,3> W(w);
      Spinor<float2,float2,float2,M,writeV,4> V(v);
      Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,M,
        Spinor<float2,float2,float2,M,writeX,0>, Spinor<float2,float2,float2,M,writeY,1>,
        Spinor<float2,float2,float2,M,writeZ,2>, Spinor<float2,float2,float2,M,writeW,3>,
        Spinor<float2,float2,float2,M,writeV,4>, Reducer<ReduceType, float2, float2> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
  } else {
    if (x.Nspin() == 4) { // Wilson
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
      Spinor<float4,float4,short4,6,writeX,0> X(x);
      Spinor<float4,float4,short4,6,writeY,1> Y(y);
      Spinor<float4,float4,short4,6,writeZ,2> Z(z);
      Spinor<float4,float4,short4,6,writeW,3> W(w);
      Spinor<float4,float4,short4,6,writeV,4> V(v);
      Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,
        Spinor<float4,float4,short4,6,writeX,0>, Spinor<float4,float4,short4,6,writeY,1>,
        Spinor<float4,float4,short4,6,writeZ,2>, Spinor<float4,float4,short4,6,writeW,3>,
        Spinor<float4,float4,short4,6,writeV,4>, Reducer<ReduceType, float2, float4> >
        reduce(value, X, Y, Z, W, V, r, y.Volume());
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else if (x.Nspin() == 1) { // staggered
#ifdef GPU_STAGGERED_DIRAC
      Spinor<float2,float2,short2,3,writeX,0> X(x);
      Spinor<float2,float2,short2,3,writeY,1> Y(y);
      Spinor<float2,float2,short2,3,writeZ,2> Z(z);
      Spinor<float2,float2,short2,3,writeW,3> W(w);
      Spinor<float2,float2,short2,3,writeV,4> V(v);
      Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,
        Spinor<float2,float2,short2,3,writeX,0>, Spinor<float2,float2,short2,3,writeY,1>,
        Spinor<float2,float2,short2,3,writeZ,2>, Spinor<float2,float2,short2,3,writeW,3>,
        Spinor<float2,float2,short2,3,writeV,4>, Reducer<ReduceType, float2, float2> >
        reduce(value, X, Y, Z, W, V, r, y.Volume());
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
    blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x.Volume()*sizeof(float);
  }
  blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x.RealLength()*x.Precision();
  blas_flops += Reducer<ReduceType,double2,double2>::flops()*(unsigned long long)x.RealLength();

  checkCudaError();

  return value;
}