QUDA  v0.7.0
A library for QCD on GPUs
reduce_mixed_core.h
1 namespace mixed {
2 
3 __host__ __device__ void zero(double &x) { x = 0.0; }
4 __host__ __device__ void zero(double2 &x) { x.x = 0.0; x.y = 0.0; }
5 __host__ __device__ void zero(double3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
6 __device__ void copytoshared(double *s, const int i, const double x, const int block) { s[i] = x; }
7 __device__ void copytoshared(double *s, const int i, const double2 x, const int block)
8 { s[i] = x.x; s[i+block] = x.y; }
9 __device__ void copytoshared(double *s, const int i, const double3 x, const int block)
10 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
11 __device__ void copytoshared(volatile double *s, const int i, const double x, const int block) { s[i] = x; }
12 __device__ void copytoshared(volatile double *s, const int i, const double2 x, const int block)
13 { s[i] = x.x; s[i+block] = x.y; }
14 __device__ void copytoshared(volatile double *s, const int i, const double3 x, const int block)
15 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
16 __device__ void copyfromshared(double &x, const double *s, const int i, const int block) { x = s[i]; }
17 __device__ void copyfromshared(double2 &x, const double *s, const int i, const int block)
18 { x.x = s[i]; x.y = s[i+block]; }
19 __device__ void copyfromshared(double3 &x, const double *s, const int i, const int block)
20 { x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }
21 
22 template<typename ReduceType, typename ReduceSimpleType>
23 __device__ void add(ReduceType &sum, ReduceSimpleType *s, const int i, const int block) { }
24 template<> __device__ void add<double,double>(double &sum, double *s, const int i, const int block)
25 { sum += s[i]; }
26 template<> __device__ void add<double2,double>(double2 &sum, double *s, const int i, const int block)
27 { sum.x += s[i]; sum.y += s[i+block]; }
28 template<> __device__ void add<double3,double>(double3 &sum, double *s, const int i, const int block)
29 { sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }
30 
31 template<typename ReduceType, typename ReduceSimpleType>
32 __device__ void add(ReduceSimpleType *s, const int i, const int j, const int block) { }
33 template<typename ReduceType, typename ReduceSimpleType>
34 __device__ void add(volatile ReduceSimpleType *s, const int i, const int j, const int block) { }
35 
36 template<> __device__ void add<double,double>(double *s, const int i, const int j, const int block)
37 { s[i] += s[j]; }
38 template<> __device__ void add<double,double>(volatile double *s, const int i, const int j, const int block)
39 { s[i] += s[j]; }
40 
41 template<> __device__ void add<double2,double>(double *s, const int i, const int j, const int block)
42 { s[i] += s[j]; s[i+block] += s[j+block];}
43 template<> __device__ void add<double2,double>(volatile double *s, const int i, const int j, const int block)
44 { s[i] += s[j]; s[i+block] += s[j+block];}
45 
46 template<> __device__ void add<double3,double>(double *s, const int i, const int j, const int block)
47 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
48 template<> __device__ void add<double3,double>(volatile double *s, const int i, const int j, const int block)
49 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
50 
51 #if (__COMPUTE_CAPABILITY__ < 130)
52 __host__ __device__ void zero(doublesingle &x) { x = 0.0; }
53 __host__ __device__ void zero(doublesingle2 &x) { x.x = 0.0; x.y = 0.0; }
54 __host__ __device__ void zero(doublesingle3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
55 __device__ void copytoshared(doublesingle *s, const int i, const doublesingle x, const int block) { s[i] = x; }
56 __device__ void copytoshared(doublesingle *s, const int i, const doublesingle2 x, const int block)
57 { s[i] = x.x; s[i+block] = x.y; }
58 __device__ void copytoshared(doublesingle *s, const int i, const doublesingle3 x, const int block)
59 { s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
60 __device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle x, const int block) { s[i].a.x = x.a.x; s[i].a.y = x.a.y; }
61 __device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
62 { s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y; }
63 __device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
64 { s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y;
65  s[i+2*block].a.x = x.z.a.x; s[i+2*block].a.y = x.z.a.y; }
66 __device__ void copyfromshared(doublesingle &x, const doublesingle *s, const int i, const int block) { x = s[i]; }
67 __device__ void copyfromshared(doublesingle2 &x, const doublesingle *s, const int i, const int block)
68 { x.x = s[i]; x.y = s[i+block]; }
69 __device__ void copyfromshared(doublesingle3 &x, const doublesingle *s, const int i, const int block)
70 { x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }
71 
72 template<> __device__ void add<doublesingle,doublesingle>(doublesingle &sum, doublesingle *s, const int i, const int block)
73 { sum += s[i]; }
74 template<> __device__ void add<doublesingle2,doublesingle>(doublesingle2 &sum, doublesingle *s, const int i, const int block)
75 { sum.x += s[i]; sum.y += s[i+block]; }
76 template<> __device__ void add<doublesingle3,doublesingle>(doublesingle3 &sum, doublesingle *s, const int i, const int block)
77 { sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }
78 
79 template<> __device__ void add<doublesingle,doublesingle>(doublesingle *s, const int i, const int j, const int block)
80 { s[i] += s[j]; }
81 template<> __device__ void add<doublesingle,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
82 { s[i] += s[j]; }
83 
84 template<> __device__ void add<doublesingle2,doublesingle>(doublesingle *s, const int i, const int j, const int block)
85 { s[i] += s[j]; s[i+block] += s[j+block];}
86 template<> __device__ void add<doublesingle2,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
87 { s[i] += s[j]; s[i+block] += s[j+block];}
88 
89 template<> __device__ void add<doublesingle3,doublesingle>(doublesingle *s, const int i, const int j, const int block)
90 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
91 template<> __device__ void add<doublesingle3,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
92 { s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block];}
93 #endif
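The guarded block above duplicates the helpers for the doublesingle type (a two-float head/tail value defined in double_single.h), which stands in for double on devices below compute capability 1.3, where native double precision is unavailable. For orientation only, a generic float-float addition in that spirit is sketched below; dsAdd is an invented name, and this is not QUDA's doublesingle implementation.

// Hypothetical float-float ("double-single") addition, shown only to illustrate
// the idea behind doublesingle: a value is carried as head (x) plus tail (y),
// where the tail holds the rounding error of the head.
__host__ __device__ inline float2 dsAdd(const float2 a, const float2 b)
{
  // Two-sum of the heads, recovering the rounding error of the head addition.
  float t1 = a.x + b.x;
  float e  = t1 - a.x;
  float t2 = ((b.x - e) + (a.x - (t1 - e))) + a.y + b.y;

  // Renormalize back into head/tail form.
  float2 c;
  c.x = t1 + t2;
  c.y = t2 - (c.x - t1);
  return c;
}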
94 
95 #include <launch_kernel.cuh>
96 
97 __device__ unsigned int count = 0;
98 __shared__ bool isLastBlockDone;
99 
100 template <typename ReduceType, typename SpinorX, typename SpinorY,
101  typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
102 struct ReduceArg {
103  SpinorX X;
104  SpinorY Y;
105  SpinorZ Z;
106  SpinorW W;
107  SpinorV V;
108  Reducer r;
109  ReduceType *partial;
110  ReduceType *complete;
111  const int length;
112  ReduceArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r,
113  ReduceType *partial, ReduceType *complete, int length)
114  : X(X), Y(Y), Z(Z), W(W), V(V), r(r), partial(partial), complete(complete), length(length) { ; }
115 };
116 
120 template <int block_size, typename ReduceType, typename ReduceSimpleType,
121  typename FloatN, int M, typename SpinorX, typename SpinorY,
122  typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
123 __global__ void reduceKernel(ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg) {
124  unsigned int tid = threadIdx.x;
125  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
126  unsigned int gridSize = gridDim.x*blockDim.x;
127 
128  ReduceType sum;
129  zero(sum);
130  while (i < arg.length) {
131  FloatN x[M], y[M], z[M], w[M], v[M];
132  arg.X.load(x, i);
133  arg.Y.load(y, i);
134  arg.Z.load(z, i);
135  arg.W.load(w, i);
136  arg.V.load(v, i);
137 
138 #if (__COMPUTE_CAPABILITY__ >= 200)
139  arg.r.pre();
140 #endif
141 
142 #pragma unroll
143  for (int j=0; j<M; j++) arg.r(sum, x[j], y[j], z[j], w[j], v[j]);
144 
145 #if (__COMPUTE_CAPABILITY__ >= 200)
146  arg.r.post(sum);
147 #endif
148 
149  arg.X.save(x, i);
150  arg.Y.save(y, i);
151  arg.Z.save(z, i);
152  arg.W.save(w, i);
153  arg.V.save(v, i);
154 
155  i += gridSize;
156  }
157 
158  extern __shared__ ReduceSimpleType sdata[];
159  ReduceSimpleType *s = sdata + tid;
160  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
161  __syncthreads();
162 
163  // now reduce using the first warp only
164  if (tid<warpSize) {
165  // Warp raking
166 #pragma unroll
167  for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }
168 
169  // Intra-warp reduction
170  volatile ReduceSimpleType *sv = s;
171  copytoshared(sv, 0, sum, block_size);
172 
173  if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
174  if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
175  if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
176  if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
177  if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
178 
179  // warpSize generic warp reduction - open64 can't handle it, only nvvm
180  //#pragma unroll
181  //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
182  }
183 
184  // write result for this block to global mem
185  if (tid == 0) {
186  ReduceType tmp;
187  copyfromshared(tmp, s, 0, block_size);
188  arg.partial[blockIdx.x] = tmp;
189 
190  __threadfence(); // flush result
191 
192  // increment global block counter
193  unsigned int value = atomicInc(&count, gridDim.x);
194 
195  // Determine if this block is the last block to be done
196  isLastBlockDone = (value == (gridDim.x-1));
197  }
198 
199  __syncthreads();
200 
201  // Finish the reduction if last block
202  if (isLastBlockDone) {
203  unsigned int i = threadIdx.x;
204 
205  ReduceType sum;
206  zero(sum);
207  while (i < gridDim.x) {
208  sum += arg.partial[i];
209  i += block_size;
210  }
211 
212  extern __shared__ ReduceSimpleType sdata[];
213  ReduceSimpleType *s = sdata + tid;
214  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
215  __syncthreads();
216 
217  // now reduce using the first warp only
218  if (tid<warpSize) {
219  // Warp raking
220 #pragma unroll
221  for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }
222 
223  // Intra-warp reduction
224  volatile ReduceSimpleType *sv = s;
225  copytoshared(sv, 0, sum, block_size);
226 
227  if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
228  if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
229  if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
230  if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
231  if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
232 
233  //#pragma unroll
234  //for (int i=warpSize/2; i>0; i/=2) { add<ReduceType>(sv, 0, i, block_size); }
235  }
236 
237  // write out the final reduced value
238  if (threadIdx.x == 0) {
239  ReduceType tmp;
240  copyfromshared(tmp, s, 0, block_size);
241  arg.complete[0] = tmp;
242  count = 0;
243  }
244  }
245 
246 }
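reduceKernel above follows the standard "threadfence reduction" pattern: each block accumulates a grid-strided partial sum, reduces it within the block (first raking across warps, then an intra-warp reduction through volatile shared memory), writes the block result to arg.partial, and the last block to retire, detected with atomicInc on the global counter count, reduces the partials into arg.complete and resets the counter. The standalone sketch below shows the same pattern on plain doubles with a simple shared-memory tree reduction in place of QUDA's warp-raking scheme; sumKernel, retirementCount and the buffer names are invented for illustration, and blockDim.x is assumed to equal block_size (a power of two).

__device__ unsigned int retirementCount = 0;

template <int block_size>
__global__ void sumKernel(const double *in, double *partial, double *out, int n)
{
  __shared__ double smem[block_size];
  __shared__ bool amLast;

  // 1) grid-strided accumulation into a per-thread register
  double sum = 0.0;
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x)
    sum += in[i];

  // 2) block-level tree reduction in shared memory
  smem[threadIdx.x] = sum;
  __syncthreads();
  for (int s = block_size/2; s > 0; s >>= 1) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }

  // 3) publish the block result; the last block to retire finishes the job
  if (threadIdx.x == 0) {
    partial[blockIdx.x] = smem[0];
    __threadfence();                                  // make the partial visible
    unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
    amLast = (ticket == gridDim.x - 1);
  }
  __syncthreads();

  if (amLast) {
    double total = 0.0;
    for (int i = threadIdx.x; i < gridDim.x; i += block_size) total += partial[i];
    smem[threadIdx.x] = total;
    __syncthreads();
    for (int s = block_size/2; s > 0; s >>= 1) {
      if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
      __syncthreads();
    }
    if (threadIdx.x == 0) { out[0] = smem[0]; retirementCount = 0; }
  }
}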
250 template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
251  int M, typename SpinorX, typename SpinorY, typename SpinorZ,
252  typename SpinorW, typename SpinorV, typename Reducer>
253 doubleN reduceLaunch(ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> &arg,
254  const TuneParam &tp, const cudaStream_t &stream) {
255  if (tp.grid.x > REDUCE_MAX_BLOCKS)
256  errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, REDUCE_MAX_BLOCKS);
257 
258  LAUNCH_KERNEL(reduceKernel,tp,stream,arg,ReduceType,ReduceSimpleType,FloatN,M);
259 
260 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
261  if(deviceProp.canMapHostMemory) {
262  cudaEventRecord(reduceEnd, stream);
263  while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
264  } else
265 #endif
266  { cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost); }
267 
268  doubleN cpu_sum;
269  zero(cpu_sum);
270  cpu_sum += ((ReduceType*)h_reduce)[0];
271 
272  const int Nreduce = sizeof(doubleN) / sizeof(double);
273  reduceDoubleArray((double*)&cpu_sum, Nreduce);
274 
275  return cpu_sum;
276 }
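reduceLaunch retrieves the final value in one of two ways: on 64-bit builds where the device can map host memory, the kernel has already written its result into page-locked host memory (hd_reduce evidently being the device alias of h_reduce), so the host only spins on the reduceEnd event; otherwise an explicit cudaMemcpy copies the result back. The self-contained sketch below illustrates the mapped-memory path; writeAnswer, h_result and d_result are invented names, and this is not QUDA's allocation code.

#include <cuda_runtime.h>
#include <cstdio>

// Toy kernel standing in for a reduction that writes its result directly
// into mapped (zero-copy) host memory via the device alias pointer.
__global__ void writeAnswer(double *out) { *out = 42.0; }

int main()
{
  cudaSetDeviceFlags(cudaDeviceMapHost);              // enable mapped host memory

  double *h_result = 0, *d_result = 0;
  cudaHostAlloc(&h_result, sizeof(double), cudaHostAllocMapped);
  cudaHostGetDevicePointer(&d_result, h_result, 0);   // device alias of h_result

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

  writeAnswer<<<1, 1, 0, stream>>>(d_result);
  cudaEventRecord(done, stream);
  while (cudaEventQuery(done) != cudaSuccess) { ; }   // busy-wait, as above

  printf("result = %f\n", *h_result);                 // no cudaMemcpy needed

  cudaEventDestroy(done);
  cudaStreamDestroy(stream);
  cudaFreeHost(h_result);
  return 0;
}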
277 
278 
279 template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
280  int M, typename SpinorX, typename SpinorY, typename SpinorZ,
281  typename SpinorW, typename SpinorV, typename Reducer>
282 class ReduceCuda : public Tunable {
283 
284 private:
285  ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg;
286  doubleN &result;
287 
288  // host pointers used for backing up fields when tuning
289  // these can't be curried into the Spinors because of Tesla argument length restriction
290  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
291  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
292  const size_t *bytes_;
293  const size_t *norm_bytes_;
294 
295  unsigned int sharedBytesPerThread() const { return sizeof(ReduceType); }
296 
297  // when there is only one warp per block, we need to allocate two warps
298  // worth of shared memory so that we don't index shared memory out of bounds
299  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
300  int warpSize = 32; // FIXME - use device property query
301  return 2*warpSize*sizeof(ReduceType);
302  }
303 
304  virtual bool advanceSharedBytes(TuneParam &param) const
305  {
306  TuneParam next(param);
307  advanceBlockDim(next); // to get next blockDim
308  int nthreads = next.block.x * next.block.y * next.block.z;
309  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
310  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
311  return false;
312  }
313 
314 public:
315  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z,
316  SpinorW &W, SpinorV &V, Reducer &r, int length,
317  const size_t *bytes, const size_t *norm_bytes) :
318  arg(X, Y, Z, W, V, r, (ReduceType*)d_reduce, (ReduceType*)hd_reduce, length),
319  result(result), X_h(0), Y_h(0), Z_h(0), W_h(0), V_h(0),
320  Xnorm_h(0), Ynorm_h(0), Znorm_h(0), Wnorm_h(0), Vnorm_h(0),
321  bytes_(bytes), norm_bytes_(norm_bytes) { }
322  virtual ~ReduceCuda() { }
323 
324  inline TuneKey tuneKey() const {
325  return TuneKey(blasStrings.vol_str, typeid(arg.r).name(), blasStrings.aux_tmp);
326  }
327 
328  void apply(const cudaStream_t &stream) {
329  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
330  result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M>(arg, tp, stream);
331  }
332 
333  void preTune() {
334  arg.X.save(&X_h, &Xnorm_h, bytes_[0], norm_bytes_[0]);
335  arg.Y.save(&Y_h, &Ynorm_h, bytes_[1], norm_bytes_[1]);
336  arg.Z.save(&Z_h, &Znorm_h, bytes_[2], norm_bytes_[2]);
337  arg.W.save(&W_h, &Wnorm_h, bytes_[3], norm_bytes_[3]);
338  arg.V.save(&V_h, &Vnorm_h, bytes_[4], norm_bytes_[4]);
339  }
340 
341  void postTune() {
342  arg.X.load(&X_h, &Xnorm_h, bytes_[0], norm_bytes_[0]);
343  arg.Y.load(&Y_h, &Ynorm_h, bytes_[1], norm_bytes_[1]);
344  arg.Z.load(&Z_h, &Znorm_h, bytes_[2], norm_bytes_[2]);
345  arg.W.load(&W_h, &Wnorm_h, bytes_[3], norm_bytes_[3]);
346  arg.V.load(&V_h, &Vnorm_h, bytes_[4], norm_bytes_[4]);
347  }
348 
349  long long flops() const { return arg.r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*arg.length*M; }
350  long long bytes() const {
351  size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
352  if (arg.X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
353  return arg.r.streams()*bytes*arg.length; }
354  int tuningIter() const { return 3; }
355 };
356 
357 
358 /*
359  Wilson
360  double double2 M = 1/12
361  single float4 M = 1/6
362  half short4 M = 6/6
363 
364  Staggered
365  double double2 M = 1/3
366  single float2 M = 1/3
367  half short2 M = 3/3
368  */
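A hedged reading of the table above (the a/b entries appear to list M without and with siteUnroll): M counts the FloatN vector elements that make up one lattice site, which follows directly from the site sizes,

  Wilson:    4 (spin) x 3 (color) x 2 (re/im) = 24 reals/site  ->  24/2 = 12 double2,  24/4 = 6 float4 or short4
  Staggered:            3 (color) x 2 (re/im) =  6 reals/site  ->   6/2 =  3 double2, float2 or short2

consistent with M = 12 and M = siteUnroll ? 3 : 1 in reduceCuda() below, and with half precision always unrolling the full site (hence 6/6 and 3/3).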
369 
375 template <typename doubleN, typename ReduceType, typename ReduceSimpleType,
376  template <typename ReducerType, typename Float, typename FloatN> class Reducer,
377  int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
378 doubleN reduceCuda(const double2 &a, const double2 &b, cudaColorSpinorField &x,
379  cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
380  cudaColorSpinorField &v) {
381  if (x.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
382  doubleN even =
383  mixed::reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
384  writeY,writeZ,writeW,writeV,siteUnroll>
385  (a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
386  doubleN odd =
387  mixed::reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
388  writeY,writeZ,writeW,writeV,siteUnroll>
389  (a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
390  return even + odd;
391  }
392 
393  checkSpinor(x, y);
394  checkLength(x, z); // z vector is the high precision one
395  checkSpinor(x, w);
396  checkSpinor(x, v);
397 
398  if (!x.isNative()) {
399  warningQuda("Reductions on non-native fields are not supported\n");
400  doubleN value;
401  zero(value);
402  return value;
403  }
404 
405  blasStrings.vol_str = x.VolString();
406  strcpy(blasStrings.aux_tmp, x.AuxString());
407  strcat(blasStrings.aux_tmp, ",");
408  strcat(blasStrings.aux_tmp, z.AuxString());
409 
410  doubleN value;
411 
412  // FIXME: use traits to encapsulate register type for shorts -
413  // will reduce template type parameters from 3 to 2
414 
415  size_t bytes[] = {x.Bytes(), y.Bytes(), z.Bytes(), w.Bytes(), v.Bytes()};
416  size_t norm_bytes[] = {x.NormBytes(), y.NormBytes(), z.NormBytes(), w.NormBytes(), v.NormBytes()};
417 
418  if (x.Precision() == QUDA_SINGLE_PRECISION && z.Precision() == QUDA_DOUBLE_PRECISION) {
419  if (x.Nspin() == 4){ //wilson
420 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
421  const int M = 12; // determines how much work per thread to do
427  Reducer<ReduceType, double2, double2> r(a,b);
428  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
431  Spinor<double2,double4,float4,M,writeV>, Reducer<ReduceType, double2, double2> >
432  reduce(value, X, Y, Z, W, V, r, y.Volume(), bytes, norm_bytes);
433  reduce.apply(*getBlasStream());
434 #else
435  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
436 #endif
437  } else if (x.Nspin() == 1) { //staggered
438 #ifdef GPU_STAGGERED_DIRAC
439  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
440  const int reduce_length = siteUnroll ? x.RealLength() : x.Length();
446  Reducer<ReduceType, double2, double2> r(a,b);
447  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
450  Spinor<double2,double2,float2,M,writeV>, Reducer<ReduceType, double2, double2> >
451  reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M), bytes, norm_bytes);
452  reduce.apply(*getBlasStream());
453 #else
454  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
455 #endif
456  } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
457  } else if (x.Precision() == QUDA_HALF_PRECISION && z.Precision() == QUDA_DOUBLE_PRECISION) {
458  if (x.Nspin() == 4){ //wilson
459 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
460  const int M = 12; // determines how much work per thread to do
466  Reducer<ReduceType, double2, double2> r(a,b);
467  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
470  Spinor<double2,double4,short4,M,writeV>, Reducer<ReduceType, double2, double2> >
471  reduce(value, X, Y, Z, W, V, r, y.Volume(), bytes, norm_bytes);
472  reduce.apply(*getBlasStream());
473 #else
474  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
475 #endif
476  } else if (x.Nspin() == 1){ //staggered
477 #ifdef GPU_STAGGERED_DIRAC
478  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
479  const int reduce_length = siteUnroll ? x.RealLength() : x.Length();
485  Reducer<ReduceType, double2, double2> r(a,b);
486  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
489  Spinor<double2,double2,float2,M,writeV>, Reducer<ReduceType, double2, double2> >
490  reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M), bytes, norm_bytes);
491  reduce.apply(*getBlasStream());
492 #else
493  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
494 #endif
495  } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
496  } else if (z.Precision() == QUDA_SINGLE_PRECISION) {
497  if (x.Nspin() == 4){ //wilson
498 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
504  Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
505  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,
508  Spinor<float4,float4,short4,6,writeV,4>, Reducer<ReduceType, float2, float4> >
509  reduce(value, X, Y, Z, W, V, r, y.Volume(), bytes, norm_bytes);
510  reduce.apply(*getBlasStream());
511 #else
512  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
513 #endif
514  } else if (x.Nspin() == 1) {//staggered
515 #ifdef GPU_STAGGERED_DIRAC
521  Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
522  ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,
525  Spinor<float2,float2,short2,3,writeV,4>, Reducer<ReduceType, float2, float2> >
526  reduce(value, X, Y, Z, W, V, r, y.Volume(), bytes, norm_bytes);
527  reduce.apply(*getBlasStream());
528 #else
529  errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
530 #endif
531  } else { errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin()); }
532  blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x.Volume()*sizeof(float);
533  }
534  blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x.RealLength()*x.Precision();
535  blas_flops += Reducer<ReduceType,double2,double2>::flops()*(unsigned long long)x.RealLength();
536 
537  checkCudaError();
538 
539  return value;
540 }
541 
542 }