QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
reduce_core.h
Go to the documentation of this file.
1 __host__ __device__ void zero(double &x) { x = 0.0; }
2 __host__ __device__ void zero(double2 &x) { x.x = 0.0; x.y = 0.0; }
3 __host__ __device__ void zero(double3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
4 __device__ void copytoshared(double *s, const int i, const double x, const int block) { s[i] = x; }
5 __device__ void copytoshared(double *s, const int i, const double2 x, const int block)
6 { s[i] = x.x; s[i+block] = x.y; }
7 __device__ void copytoshared(double *s, const int i, const double3 x, const int block)
8 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
9 __device__ void copytoshared(volatile double *s, const int i, const double x, const int block) { s[i] = x; }
10 __device__ void copytoshared(volatile double *s, const int i, const double2 x, const int block)
11 { s[i] = x.x; s[i+block] = x.y; }
12 __device__ void copytoshared(volatile double *s, const int i, const double3 x, const int block)
13 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
14 __device__ void copyfromshared(double &x, const double *s, const int i, const int block) { x = s[i]; }
15 __device__ void copyfromshared(double2 &x, const double *s, const int i, const int block)
16 { x.x = s[i]; x.y = s[i+block]; }
17 __device__ void copyfromshared(double3 &x, const double *s, const int i, const int block)
18 { x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }
19 
20 template<typename ReduceType, typename ReduceSimpleType>
21 __device__ void add(ReduceType &sum, ReduceSimpleType *s, const int i, const int block) { }
22 template<> __device__ void add<double,double>(double &sum, double *s, const int i, const int block)
23 { sum += s[i]; }
24 template<> __device__ void add<double2,double>(double2 &sum, double *s, const int i, const int block)
25 { sum.x += s[i]; sum.y += s[i+block]; }
26 template<> __device__ void add<double3,double>(double3 &sum, double *s, const int i, const int block)
27 { sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }
28 
29 template<typename ReduceType, typename ReduceSimpleType>
30 __device__ void add(ReduceSimpleType *s, const int i, const int j, const int block) { }
31 template<typename ReduceType, typename ReduceSimpleType>
32 __device__ void add(volatile ReduceSimpleType *s, const int i, const int j, const int block) { }
33 
34 template<> __device__ void add<double,double>(double *s, const int i, const int j, const int block)
35 { s[i] += s[j]; }
36 template<> __device__ void add<double,double>(volatile double *s, const int i, const int j, const int block)
37 { s[i] += s[j]; }
38 
39 template<> __device__ void add<double2,double>(double *s, const int i, const int j, const int block)
40 { s[i] += s[j]; s[i+block] += s[j+block];}
41 template<> __device__ void add<double2,double>(volatile double *s, const int i, const int j, const int block)
42 { s[i] += s[j]; s[i+block] += s[j+block];}
43 
44 template<> __device__ void add<double3,double>(double *s, const int i, const int j, const int block)
45 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
46 template<> __device__ void add<double3,double>(volatile double *s, const int i, const int j, const int block)
47 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
48 
49 #if (__COMPUTE_CAPABILITY__ < 130)
50 __host__ __device__ void zero(doublesingle &x) { x = 0.0; }
51 __host__ __device__ void zero(doublesingle2 &x) { x.x = 0.0; x.y = 0.0; }
52 __host__ __device__ void zero(doublesingle3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
53 __device__ void copytoshared(doublesingle *s, const int i, const doublesingle x, const int block) { s[i] = x; }
54 __device__ void copytoshared(doublesingle *s, const int i, const doublesingle2 x, const int block)
55 { s[i] = x.x; s[i+block] = x.y; }
56 __device__ void copytoshared(doublesingle *s, const int i, const doublesingle3 x, const int block)
57 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
58 __device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle x, const int block) { s[i].a.x = x.a.x; s[i].a.y = x.a.y; }
59 __device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
60 { s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y; }
61 __device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
62 { s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y;
63  s[i+2*block].a.x = x.z.a.x; s[i+2*block].a.y = x.z.a.y; }
64 __device__ void copyfromshared(doublesingle &x, const doublesingle *s, const int i, const int block) { x = s[i]; }
65 __device__ void copyfromshared(doublesingle2 &x, const doublesingle *s, const int i, const int block)
66 { x.x = s[i]; x.y = s[i+block]; }
67 __device__ void copyfromshared(doublesingle3 &x, const doublesingle *s, const int i, const int block)
68 { x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }
69 
70 template<> __device__ void add<doublesingle,doublesingle>(doublesingle &sum, doublesingle *s, const int i, const int block)
71 { sum += s[i]; }
72 template<> __device__ void add<doublesingle2,doublesingle>(doublesingle2 &sum, doublesingle *s, const int i, const int block)
73 { sum.x += s[i]; sum.y += s[i+block]; }
74 template<> __device__ void add<doublesingle3,doublesingle>(doublesingle3 &sum, doublesingle *s, const int i, const int block)
75 { sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }
76 
77 template<> __device__ void add<doublesingle,doublesingle>(doublesingle *s, const int i, const int j, const int block)
78 { s[i] += s[j]; }
79 template<> __device__ void add<doublesingle,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
80 { s[i] += s[j]; }
81 
82 template<> __device__ void add<doublesingle2,doublesingle>(doublesingle *s, const int i, const int j, const int block)
83 { s[i] += s[j]; s[i+block] += s[j+block];}
84 template<> __device__ void add<doublesingle2,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
85 { s[i] += s[j]; s[i+block] += s[j+block];}
86 
87 template<> __device__ void add<doublesingle3,doublesingle>(doublesingle *s, const int i, const int j, const int block)
88 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
89 template<> __device__ void add<doublesingle3,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
90 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
91 #endif
92 
93 __device__ unsigned int count = 0;
94 __shared__ bool isLastBlockDone;
95 
99 template <int block_size, typename ReduceType, typename ReduceSimpleType,
100  typename FloatN, int M, typename SpinorX, typename SpinorY,
101  typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
102 __global__ void reduceKernel(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r,
103  ReduceType *partial, ReduceType *complete, int length) {
104  unsigned int tid = threadIdx.x;
105  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
106  unsigned int gridSize = gridDim.x*blockDim.x;
107 
108  ReduceType sum;
109  zero(sum);
110  while (i < length) {
111  FloatN x[M], y[M], z[M], w[M], v[M];
112  X.load(x, i);
113  Y.load(y, i);
114  Z.load(z, i);
115  W.load(w, i);
116  V.load(v, i);
117 
118 #if (__COMPUTE_CAPABILITY__ >= 200)
119  r.pre();
120 #endif
121 
122 #pragma unroll
123  for (int j=0; j<M; j++) r(sum, x[j], y[j], z[j], w[j], v[j]);
124 
125 #if (__COMPUTE_CAPABILITY__ >= 200)
126  r.post(sum);
127 #endif
128 
129  X.save(x, i);
130  Y.save(y, i);
131  Z.save(z, i);
132  W.save(w, i);
133  V.save(v, i);
134 
135  i += gridSize;
136  }
137 
138  extern __shared__ ReduceSimpleType sdata[];
139  ReduceSimpleType *s = sdata + tid;
140  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
141  __syncthreads();
142 
143  // now reduce using the first warp only
144  if (tid<warpSize) {
145  // Warp raking
146 #pragma unroll
147  for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }
148 
149  // Intra-warp reduction
150  volatile ReduceSimpleType *sv = s;
151  copytoshared(sv, 0, sum, block_size);
152 
153  if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
154  if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
155  if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
156  if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
157  if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
158 
159  // warpSize generic warp reduction - open64 can't handle it, only nvvm
160  //#pragma unroll
161  //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
162  }
163 
164  // write result for this block to global mem
165  if (tid == 0) {
166  ReduceType tmp;
167  copyfromshared(tmp, s, 0, block_size);
168  partial[blockIdx.x] = tmp;
169 
170  __threadfence(); // flush result
171 
172  // increment global block counter
173  unsigned int value = atomicInc(&count, gridDim.x);
174 
175  // Determine if this block is the last block to be done
176  isLastBlockDone = (value == (gridDim.x-1));
177  }
178 
179  __syncthreads();
180 
181  // Finish the reduction if last block
182  if (isLastBlockDone) {
183  unsigned int i = threadIdx.x;
184 
185  ReduceType sum;
186  zero(sum);
187  while (i < gridDim.x) {
188  sum += partial[i];
189  i += block_size;
190  }
191 
192  extern __shared__ ReduceSimpleType sdata[];
193  ReduceSimpleType *s = sdata + tid;
194  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
195  __syncthreads();
196 
197  // now reduce using the first warp only
198  if (tid<warpSize) {
199  // Warp raking
200 #pragma unroll
201  for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }
202 
203  // Intra-warp reduction
204  volatile ReduceSimpleType *sv = s;
205  copytoshared(sv, 0, sum, block_size);
206 
207  if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
208  if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
209  if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
210  if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
211  if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
212 
213  //#pragma unroll
214  //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
215  }
216 
217  // write out the final reduced value
218  if (threadIdx.x == 0) {
219  ReduceType tmp;
220  copyfromshared(tmp, s, 0, block_size);
221  complete[0] = tmp;
222  count = 0;
223  }
224  }
225 
226 }
230 template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
231  int M, typename SpinorX, typename SpinorY, typename SpinorZ,
232  typename SpinorW, typename SpinorV, typename Reducer>
233 doubleN reduceLaunch(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r,
234  int len, const TuneParam &tp, const cudaStream_t &stream) {
235  ReduceType *part = (ReduceType*)d_reduce;
236  ReduceType *full = (ReduceType*)hd_reduce;
237 
238  if (tp.grid.x > REDUCE_MAX_BLOCKS)
239  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, REDUCE_MAX_BLOCKS);
240 
241  if (tp.block.x == 32) {
242  reduceKernel<32,ReduceType,ReduceSimpleType,FloatN,M>
243  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
244  } else if (tp.block.x == 64) {
245  reduceKernel<64,ReduceType,ReduceSimpleType,FloatN,M>
246  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
247  } else if (tp.block.x == 96) {
248  reduceKernel<96,ReduceType,ReduceSimpleType,FloatN,M>
249  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
250  } else if (tp.block.x == 128) {
251  reduceKernel<128,ReduceType,ReduceSimpleType,FloatN,M>
252  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
253  } else if (tp.block.x == 160) {
254  reduceKernel<160,ReduceType,ReduceSimpleType,FloatN,M>
255  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
256  } else if (tp.block.x == 192) {
257  reduceKernel<192,ReduceType,ReduceSimpleType,FloatN,M>
258  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
259  } else if (tp.block.x == 224) {
260  reduceKernel<224,ReduceType,ReduceSimpleType,FloatN,M>
261  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
262  } else if (tp.block.x == 256) {
263  reduceKernel<256,ReduceType,ReduceSimpleType,FloatN,M>
264  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
265  } else if (tp.block.x == 288) {
266  reduceKernel<288,ReduceType,ReduceSimpleType,FloatN,M>
267  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
268  } else if (tp.block.x == 320) {
269  reduceKernel<320,ReduceType,ReduceSimpleType,FloatN,M>
270  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
271  } else if (tp.block.x == 352) {
272  reduceKernel<352,ReduceType,ReduceSimpleType,FloatN,M>
273  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
274  } else if (tp.block.x == 384) {
275  reduceKernel<384,ReduceType,ReduceSimpleType,FloatN,M>
276  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
277  } else if (tp.block.x == 416) {
278  reduceKernel<416,ReduceType,ReduceSimpleType,FloatN,M>
279  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
280  } else if (tp.block.x == 448) {
281  reduceKernel<448,ReduceType,ReduceSimpleType,FloatN,M>
282  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
283  } else if (tp.block.x == 480) {
284  reduceKernel<480,ReduceType,ReduceSimpleType,FloatN,M>
285  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
286  } else if (tp.block.x == 512) {
287  reduceKernel<512,ReduceType,ReduceSimpleType,FloatN,M>
288  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
289  } else if (tp.block.x == 544) {
290  reduceKernel<544,ReduceType,ReduceSimpleType,FloatN,M>
291  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
292  } else if (tp.block.x == 576) {
293  reduceKernel<576,ReduceType,ReduceSimpleType,FloatN,M>
294  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
295  } else if (tp.block.x == 608) {
296  reduceKernel<608,ReduceType,ReduceSimpleType,FloatN,M>
297  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
298  } else if (tp.block.x == 640) {
299  reduceKernel<640,ReduceType,ReduceSimpleType,FloatN,M>
300  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
301  } else if (tp.block.x == 672) {
302  reduceKernel<672,ReduceType,ReduceSimpleType,FloatN,M>
303  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
304  } else if (tp.block.x == 704) {
305  reduceKernel<704,ReduceType,ReduceSimpleType,FloatN,M>
306  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
307  } else if (tp.block.x == 736) {
308  reduceKernel<736,ReduceType,ReduceSimpleType,FloatN,M>
309  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
310  } else if (tp.block.x == 768) {
311  reduceKernel<768,ReduceType,ReduceSimpleType,FloatN,M>
312  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
313  } else if (tp.block.x == 800) {
314  reduceKernel<800,ReduceType,ReduceSimpleType,FloatN,M>
315  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
316  } else if (tp.block.x == 832) {
317  reduceKernel<832,ReduceType,ReduceSimpleType,FloatN,M>
318  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
319  } else if (tp.block.x == 864) {
320  reduceKernel<864,ReduceType,ReduceSimpleType,FloatN,M>
321  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
322  } else if (tp.block.x == 896) {
323  reduceKernel<896,ReduceType,ReduceSimpleType,FloatN,M>
324  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
325  } else if (tp.block.x == 928) {
326  reduceKernel<928,ReduceType,ReduceSimpleType,FloatN,M>
327  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
328  } else if (tp.block.x == 960) {
329  reduceKernel<960,ReduceType,ReduceSimpleType,FloatN,M>
330  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
331  } else if (tp.block.x == 992) {
332  reduceKernel<992,ReduceType,ReduceSimpleType,FloatN,M>
333  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
334  } else if (tp.block.x == 1024) {
335  reduceKernel<1024,ReduceType,ReduceSimpleType,FloatN,M>
336  <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(X, Y, Z, W, V, r, part, full, len);
337  } else {
338  errorQuda("Reduction not implemented for %d threads", tp.block.x);
339  }
340 
341 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
342  if(deviceProp.canMapHostMemory) {
343  cudaEventRecord(reduceEnd, stream);
344  while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
345  } else
346 #endif
347  { cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost); }
348 
349  doubleN cpu_sum;
350  zero(cpu_sum);
351  cpu_sum += ((ReduceType*)h_reduce)[0];
352 
353  const int Nreduce = sizeof(doubleN) / sizeof(double);
354  reduceDoubleArray((double*)&cpu_sum, Nreduce);
355 
356  return cpu_sum;
357 }
358 
359 
360 template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
361  int M, typename SpinorX, typename SpinorY, typename SpinorZ,
362  typename SpinorW, typename SpinorV, typename Reducer>
363 class ReduceCuda : public Tunable {
364 
365 private:
366  SpinorX &X;
367  SpinorY &Y;
368  SpinorZ &Z;
369  SpinorW &W;
370  SpinorV &V;
371  Reducer &r;
372  const int length;
373  doubleN &result;
374 
375  // host pointers used for backing up fields when tuning
376  // these can't be curried into the Spinors because of Tesla argument length restriction
377  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
378  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
379 
380  int sharedBytesPerThread() const { return sizeof(ReduceType); }
381 
382  // when there is only one warp per block, we need to allocate two warps
383  // worth of shared memory so that we don't index shared memory out of bounds
384  int sharedBytesPerBlock(const TuneParam &param) const {
385  int warpSize = 32; // FIXME - use device property query
386  return 2*warpSize*sizeof(ReduceType);
387  }
388 
389  virtual bool advanceSharedBytes(TuneParam &param) const
390  {
391  TuneParam next(param);
392  advanceBlockDim(next); // to get next blockDim
393  int nthreads = next.block.x * next.block.y * next.block.z;
394  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
395  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
396  return false;
397  }
398 
399 public:
400  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z,
401  SpinorW &W, SpinorV &V, Reducer &r, int length) :
402  result(result), X(X), Y(Y), Z(Z), W(W), V(V), r(r),
403  X_h(0), Y_h(0), Z_h(0), W_h(0), V_h(0),
404  Xnorm_h(0), Ynorm_h(0), Znorm_h(0), Wnorm_h(0), Vnorm_h(0), length(length)
405  { ; }
406  virtual ~ReduceCuda() { }
407 
408  TuneKey tuneKey() const {
409  std::stringstream vol, aux;
410  vol << blasConstants.x[0] << "x";
411  vol << blasConstants.x[1] << "x";
412  vol << blasConstants.x[2] << "x";
413  vol << blasConstants.x[3];
414  aux << "stride=" << blasConstants.stride << ",prec=" << X.Precision();
415  return TuneKey(vol.str(), typeid(r).name(), aux.str());
416  }
417 
418  void apply(const cudaStream_t &stream) {
419  TuneParam tp = tuneLaunch(*this, getBlasTuning(), getBlasVerbosity());
420  result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M>
421  (X, Y, Z, W, V, r, length, tp, stream);
422  }
423 
424  void preTune() {
425  size_t bytes = X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*X.Stride();
426  size_t norm_bytes = (X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*length : 0;
427  X.save(&X_h, &Xnorm_h, bytes, norm_bytes);
428  Y.save(&Y_h, &Ynorm_h, bytes, norm_bytes);
429  Z.save(&Z_h, &Znorm_h, bytes, norm_bytes);
430  W.save(&W_h, &Wnorm_h, bytes, norm_bytes);
431  V.save(&V_h, &Vnorm_h, bytes, norm_bytes);
432  }
433 
434  void postTune() {
435  size_t bytes = X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*X.Stride();
436  size_t norm_bytes = (X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*length : 0;
437  X.load(&X_h, &Xnorm_h, bytes, norm_bytes);
438  Y.load(&Y_h, &Ynorm_h, bytes, norm_bytes);
439  Z.load(&Z_h, &Znorm_h, bytes, norm_bytes);
440  W.load(&W_h, &Wnorm_h, bytes, norm_bytes);
441  V.load(&V_h, &Vnorm_h, bytes, norm_bytes);
442  }
443 
444  long long flops() const { return r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*length*M; }
445  long long bytes() const {
446  size_t bytes = X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
447  if (X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
448  return r.streams()*bytes*length; }
449 };
450 
451 
452 /*
453  Wilson
454  double double2 M = 1/12
455  single float4 M = 1/6
456  half short4 M = 6/6
457 
458  Staggered
459  double double2 M = 1/3
460  single float2 M = 1/3
461  half short2 M = 3/3
462  */
463 
469 template <typename doubleN, typename ReduceType, typename ReduceSimpleType,
470  template <typename ReducerType, typename Float, typename FloatN> class Reducer,
471  int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
472 doubleN reduceCuda(const double2 &a, const double2 &b, cudaColorSpinorField &x,
473  cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
474  cudaColorSpinorField &v) {
475  if (x.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
476  doubleN even =
477  reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
478  writeY,writeZ,writeW,writeV,siteUnroll>
479  (a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
480  doubleN odd =
481  reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
482  writeY,writeZ,writeW,writeV,siteUnroll>
483  (a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
484  return even + odd;
485  }
486 
487  checkSpinor(x, y);
488  checkSpinor(x, z);
489  checkSpinor(x, w);
490  checkSpinor(x, v);
491 
492  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = x.X()[d];
493  blasConstants.stride = x.Stride();
494 
495  int reduce_length = siteUnroll ? x.RealLength() : x.Length();
496  doubleN value;
497 
498  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
499  if (x.Nspin() == 4){ //wilson
500  const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
506  Reducer<ReduceType, double2, double2> r(a,b);
507  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
510  Spinor<double2,double2,double2,M,writeV>, Reducer<ReduceType, double2, double2> >
511  reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
512  reduce.apply(*getBlasStream());
513  } else if (x.Nspin() == 1){ //staggered
514  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
520  Reducer<ReduceType, double2, double2> r(a,b);
521  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
524  Spinor<double2,double2,double2,M,writeV>, Reducer<ReduceType, double2, double2> >
525  reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
526  reduce.apply(*getBlasStream());
527  } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
528  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
529  if (x.Nspin() == 4){ //wilson
530  const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
536  Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
537  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,M,
540  Spinor<float4,float4,float4,M,writeV,4>, Reducer<ReduceType, float2, float4> >
541  reduce(value, X, Y, Z, W, V, r, reduce_length/(4*M));
542  reduce.apply(*getBlasStream());
543  } else if (x.Nspin() == 1) {
544  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
550  Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
551  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,M,
554  Spinor<float2,float2,float2,M,writeV,4>, Reducer<ReduceType, float2, float2> >
555  reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
556  reduce.apply(*getBlasStream());
557  } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
558  } else {
559  if (x.Nspin() == 4){ //wilson
565  Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
566  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,
569  Spinor<float4,float4,short4,6,writeV,4>, Reducer<ReduceType, float2, float4> >
570  reduce(value, X, Y, Z, W, V, r, y.Volume());
571  reduce.apply(*getBlasStream());
572  } else if (x.Nspin() == 1) {//staggered
578  Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
579  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,
582  Spinor<float2,float2,short2,3,writeV,4>, Reducer<ReduceType, float2, float2> >
583  reduce(value, X, Y, Z, W, V, r, y.Volume());
584  reduce.apply(*getBlasStream());
585  } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
586  blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x.Volume()*sizeof(float);
587  }
588  blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x.RealLength()*x.Precision();
589  blas_flops += Reducer<ReduceType,double2,double2>::flops()*(unsigned long long)x.RealLength();
590 
591  checkCudaError();
592 
593  return value;
594 }
595