QUDA  0.9.0
reduce_core.cuh
Go to the documentation of this file.
/**
   Conversion helpers used by the reduction framework: `set` converts a
   device-side reduction result into the plain double vector type that
   is returned to the host, and `sum` accumulates a partial result into
   a running total, component by component.
   (Extraction artifact fixed: embedded doxygen line numbers removed so
   the code is compilable again.)
*/
__host__ __device__ inline double set(double &x) { return x; }
__host__ __device__ inline double2 set(double2 &x) { return x; }
__host__ __device__ inline double3 set(double3 &x) { return x; }
__host__ __device__ inline double4 set(double4 &x) { return x; }

// Component-wise accumulation: a += b for each vector component.
__host__ __device__ inline void sum(double &a, double &b) { a += b; }
__host__ __device__ inline void sum(double2 &a, double2 &b) { a.x += b.x; a.y += b.y; }
__host__ __device__ inline void sum(double3 &a, double3 &b) { a.x += b.x; a.y += b.y; a.z += b.z; }
__host__ __device__ inline void sum(double4 &a, double4 &b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
9 
#ifdef QUAD_SUM
// doubledouble (double-double extended-precision) variants of set/sum:
// only the high word (`head()`) survives the conversion back to a plain
// double on the host.
// (Extraction artifact fixed: embedded doxygen line numbers removed.)
__host__ __device__ inline double set(doubledouble &a) { return a.head(); }
__host__ __device__ inline double2 set(doubledouble2 &a) { return make_double2(a.x.head(),a.y.head()); }
__host__ __device__ inline double3 set(doubledouble3 &a) { return make_double3(a.x.head(),a.y.head(),a.z.head()); }
__host__ __device__ inline void sum(double &a, doubledouble &b) { a += b.head(); }
__host__ __device__ inline void sum(double2 &a, doubledouble2 &b) { a.x += b.x.head(); a.y += b.y.head(); }
__host__ __device__ inline void sum(double3 &a, doubledouble3 &b) { a.x += b.x.head(); a.y += b.y.head(); a.z += b.z.head(); }
#endif // QUAD_SUM
18 
// Global counter of thread blocks that have finished their partial
// reduction; presumably used for the standard CUDA "last block done"
// final-pass pattern — TODO confirm in ReduceArg / cub_helper.cuh.
__device__ static unsigned int count = 0;

// Per-block flag: set when this block observes it was the last to finish.
// Shared memory, so every thread in the block can read the verdict.
__shared__ static bool isLastBlockDone;

#include <launch_kernel.cuh>
23 
/**
   Kernel parameter struct for reduceKernel.  Bundles the five spinor
   accessors, the reduction functor and the per-parity length on top of
   the ReduceArg base, which supplies the device-side reduction buffers.
   (Extraction artifact fixed: embedded doxygen line numbers removed.)
*/
template <typename ReduceType, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
struct ReductionArg : public ReduceArg<ReduceType> {
  SpinorX X;
  SpinorY Y;
  SpinorZ Z;
  SpinorW W;
  SpinorV V;
  Reducer r;         // reduction functor applied element by element
  const int length;  // number of sites handled per parity (caller passes length/nParity)
  ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r, int length)
    : X(X), Y(Y), Z(Z), W(W), V(V), r(r), length(length) { ; }
};
37 
/**
   Generic device reduction kernel: grid-stride loop over the vector,
   loading all five fields, applying the reduction functor per element,
   optionally saving modified fields back, then folding the per-thread
   partial into the block/grid reduction via ::quda::reduce.
   grid.y indexes the parity (checkerboard) dimension.

   NOTE(review): the kernel signature (original line 43) and the
   zero-initialization of `sum` (original line 50) were lost in the
   doxygen extraction; both are reconstructed here from this file's own
   symbol index and the ::quda::zero(sum) pattern used by the CPU path.
   Verify against the original reduce_core.cuh.
*/
template <int block_size, typename ReduceType, typename FloatN, int M, typename SpinorX,
          typename SpinorY, typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
__global__ void reduceKernel(ReductionArg<ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer> arg)
{
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int parity = blockIdx.y; // one grid row per parity
  unsigned int gridSize = gridDim.x*blockDim.x;

  ReduceType sum;
  ::quda::zero(sum); // reconstructed — sum must not start uninitialized

  while (i < arg.length) {
    FloatN x[M], y[M], z[M], w[M], v[M];
    arg.X.load(x, i, parity);
    arg.Y.load(y, i, parity);
    arg.Z.load(z, i, parity);
    arg.W.load(w, i, parity);
    arg.V.load(v, i, parity);

    arg.r.pre(); // per-site pre-processing hook of the functor

#pragma unroll
    for (int j=0; j<M; j++) arg.r(sum, x[j], y[j], z[j], w[j], v[j]);

    arg.r.post(sum); // per-site post-processing hook

    arg.X.save(x, i, parity);
    arg.Y.save(y, i, parity);
    arg.Z.save(z, i, parity);
    arg.W.save(w, i, parity);
    arg.V.save(v, i, parity);

    i += gridSize; // grid-stride advance
  }

  // fold this thread's partial into the block/grid-level reduction
  ::quda::reduce<block_size, ReduceType>(arg, sum, parity);
}
78 
79 
/**
   Launch the reduction kernel with the tuned launch parameters and
   return the completed reduction, converted to the host result type.
   When async reduction is disabled, the device result is brought to the
   host either by spinning on an event (zero-copy/mapped-memory path) or
   by a blocking device-to-host copy.

   NOTE(review): the function signature (original line 85, recovered from
   this file's symbol index) and the qudaEventRecord call inside the
   mapped-memory branch (original line 95) were lost in extraction and
   have been reconstructed — verify against the original source.
*/
template <typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX,
          typename SpinorY, typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
doubleN reduceLaunch(ReductionArg<ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer> &arg,
                     const TuneParam &tp, const cudaStream_t &stream) {
  if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
    errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);

  // instantiate and launch the kernel for the tuned block size
  LAUNCH_KERNEL(reduceKernel,tp,stream,arg,ReduceType,FloatN,M);

  if (!commAsyncReduction()) {
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
    if (deviceProp.canMapHostMemory) {
      // mapped-memory path: spin on an event rather than doing a blocking copy
      qudaEventRecord(reduceEnd, stream); // reconstructed — TODO confirm
      while (cudaSuccess != qudaEventQuery(reduceEnd)) { ; }
    } else
#endif
    { qudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost); }
  }
  doubleN cpu_sum = set(((ReduceType*)h_reduce)[0]);
  // NOTE(review): only sizeof(ReduceType) bytes are copied above, yet
  // element [1] is read when grid.y==2 — presumably the second parity is
  // delivered via the mapped-memory path; confirm against the original.
  if (tp.grid.y==2) sum(cpu_sum, ((ReduceType*)h_reduce)[1]); // add other parity if needed
  return cpu_sum;
}
105 
106 
/**
   Tunable host-side driver for the generic reduction kernel: owns the
   kernel argument struct, participates in QUDA's autotuning (backing up
   and restoring the fields around trial launches), and reports flop and
   byte counts for profiling.

   NOTE(review): three spans were lost in the doxygen extraction and are
   reconstructed here from this file's symbol index: the `arg` member,
   the *norm_h backup pointers, and the second operand of the ternary in
   advanceSharedBytes.  Verify against the original reduce_core.cuh.
*/
template <typename doubleN, typename ReduceType, typename FloatN, int M, typename SpinorX,
          typename SpinorY, typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
class ReduceCuda : public Tunable {

private:
  // kernel argument struct (reconstructed from symbol index)
  ReductionArg<ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer> arg;
  doubleN &result;   // host-side destination written by apply()
  const int nParity; // number of parities reduced over (becomes grid.y)

  // host pointers used for backing up fields when tuning
  // these can't be curried into the Spinors because of Tesla argument length restriction
  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h; // reconstructed from symbol index
  // NOTE(review): preTune/postTune index elements [0..4] of these arrays,
  // so the caller must supply five entries each — confirm at the call site.
  const size_t *bytes_;
  const size_t *norm_bytes_;

  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  // Shared memory never needs advancing for this kernel: recompute the
  // requirement for the next block size and report no further candidates.
  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param); // second operand reconstructed
    return false;
  }

public:
  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z,
             SpinorW &W, SpinorV &V, Reducer &r, int length, int nParity,
             const size_t *bytes, const size_t *norm_bytes) :
    arg(X, Y, Z, W, V, r, length/nParity),
    result(result), nParity(nParity), X_h(0), Y_h(0), Z_h(0), W_h(0), V_h(0),
    Xnorm_h(0), Ynorm_h(0), Znorm_h(0), Wnorm_h(0), Vnorm_h(0),
    bytes_(bytes), norm_bytes_(norm_bytes) { }
  virtual ~ReduceCuda() { }

  inline TuneKey tuneKey() const {
    return TuneKey(blasStrings.vol_str, typeid(arg.r).name(), blasStrings.aux_tmp);
  }

  // Autotune (if enabled) and launch, writing the reduction into result.
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    result = reduceLaunch<doubleN,ReduceType,FloatN,M>(arg, tp, stream);
  }

  // Back up all five fields so trial tuning launches can be undone.
  void preTune() {
    arg.X.backup(&X_h, &Xnorm_h, bytes_[0], norm_bytes_[0]);
    arg.Y.backup(&Y_h, &Ynorm_h, bytes_[1], norm_bytes_[1]);
    arg.Z.backup(&Z_h, &Znorm_h, bytes_[2], norm_bytes_[2]);
    arg.W.backup(&W_h, &Wnorm_h, bytes_[3], norm_bytes_[3]);
    arg.V.backup(&V_h, &Vnorm_h, bytes_[4], norm_bytes_[4]);
  }

  // Restore the fields after tuning is complete.
  void postTune() {
    arg.X.restore(&X_h, &Xnorm_h, bytes_[0], norm_bytes_[0]);
    arg.Y.restore(&Y_h, &Ynorm_h, bytes_[1], norm_bytes_[1]);
    arg.Z.restore(&Z_h, &Znorm_h, bytes_[2], norm_bytes_[2]);
    arg.W.restore(&W_h, &Wnorm_h, bytes_[3], norm_bytes_[3]);
    arg.V.restore(&V_h, &Vnorm_h, bytes_[4], norm_bytes_[4]);
  }

  void initTuneParam(TuneParam &param) const {
    Tunable::initTuneParam(param);
    param.grid.y = nParity; // one grid row per parity
  }

  void defaultTuneParam(TuneParam &param) const {
    Tunable::defaultTuneParam(param);
    param.grid.y = nParity;
  }

  long long flops() const { return arg.r.flops()*vec_length<FloatN>::value*arg.length*nParity*M; }

  long long bytes() const
  {
    // bytes for low-precision vector
    size_t base_bytes = arg.X.Precision()*vec_length<FloatN>::value*M;
    if (arg.X.Precision() == QUDA_HALF_PRECISION) base_bytes += sizeof(float);

    // bytes for high precision vector
    size_t extra_bytes = arg.Y.Precision()*vec_length<FloatN>::value*M;
    if (arg.Y.Precision() == QUDA_HALF_PRECISION) extra_bytes += sizeof(float);

    // the factor two here assumes we are reading and writing to the high precision vector
    // this will evaluate correctly for non-mixed kernels since the +2/-2 will cancel out
    return ((arg.r.streams()-2)*base_bytes + 2*extra_bytes)*arg.length*nParity;
  }

  int tuningIter() const { return 3; }
};
200 
201 
// Host entry point: instantiates the device reduction for a given reducer
// and write mask, launches it on the blas stream, and accumulates the
// global byte/flop profiling counters.
//
// NOTE(review): the doxygen extraction this text came from dropped several
// source lines inside this function (original lines 210, 215, 228-233 and
// 243-247) — presumably the field-length checks, the zero-initialisation
// of the early-return value, and the five Spinor accessor declarations
// plus their matching ReduceCuda template arguments.  The surviving code
// is left byte-identical; restore the missing lines from the original
// reduce_core.cuh before compiling.
202 template <typename doubleN, typename ReduceType, typename RegType, typename StoreType, typename zType,
203  int M, template <typename ReducerType, typename Float, typename FloatN> class Reducer,
204  int writeX, int writeY, int writeZ, int writeW, int writeV>
205 doubleN reduceCuda(const double2 &a, const double2 &b,
206  ColorSpinorField &x, ColorSpinorField &y,
207  ColorSpinorField &z, ColorSpinorField &w,
208  ColorSpinorField &v, int length) {
209 
211 
// Device reductions require native field order; bail out otherwise.
212  if (!x.isNative()) {
213  warningQuda("Device reductions on non-native fields is not supported\n");
214  doubleN value;
216  return value;
217  }
218 
// Record the volume/aux strings used to build the autotuning key.
219  blasStrings.vol_str = x.VolString();
220  strcpy(blasStrings.aux_tmp, x.AuxString());
221  if (typeid(StoreType) != typeid(zType)) {
222  strcat(blasStrings.aux_tmp, ",");
223  strcat(blasStrings.aux_tmp, z.AuxString());
224  }
225 
// NOTE(review): only four entries here, yet ReduceCuda::preTune reads
// bytes_[4] / norm_bytes_[4] for the V field — looks out of bounds;
// confirm against the original (a v.Bytes() entry may have been dropped).
226  size_t bytes[] = {x.Bytes(), y.Bytes(), z.Bytes(), w.Bytes()};
227  size_t norm_bytes[] = {x.NormBytes(), y.NormBytes(), z.NormBytes(), w.NormBytes()};
228 
234 
235  typedef typename scalar<RegType>::type Float;
236  typedef typename vector<Float,2>::type Float2;
237  typedef vector<Float,2> vec2;
// Construct the reduction functor from the two scalar coefficients.
238  Reducer<ReduceType, Float2, RegType> r((Float2)vec2(a), (Float2)vec2(b));
239  doubleN value;
240 
// partitions = composite dimension (if any) times the field's site subset.
241  int partitions = (x.IsComposite() ? x.CompositeDim() : 1) * (x.SiteSubset());
242  ReduceCuda<doubleN,ReduceType,RegType,M,
248  Reducer<ReduceType, Float2, RegType> >
249  reduce(value, X, Y, Z, W, V, r, length, partitions, bytes, norm_bytes);
250  reduce.apply(*(blas::getStream()));
251 
// Accumulate global profiling counters.
252  blas::bytes += reduce.bytes();
253  blas::flops += reduce.flops();
254 
// Surface any pending CUDA error from the launch.
255  checkCudaError();
256 
257  return value;
258 }
259 
/**
   Core CPU reduction: loop over parity, checkerboard site, spin and
   color; apply the reduction functor r to each complex component of the
   five fields; write back any component whose writeN flag is set.
   Returns the accumulated ReduceType total.
   (Extraction artifact fixed: embedded doxygen line numbers removed.)
*/
template <typename ReduceType, typename Float2, int writeX, int writeY, int writeZ,
          int writeW, int writeV, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename SpinorV, typename Reducer>
ReduceType genericReduce(SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer r) {

  ReduceType sum;
  ::quda::zero(sum);

  for (int parity=0; parity<X.Nparity(); parity++) {
    for (int x=0; x<X.VolumeCB(); x++) {
      r.pre(); // per-site pre-processing hook of the functor
      for (int s=0; s<X.Nspin(); s++) {
        for (int c=0; c<X.Ncolor(); c++) {
          Float2 X2 = make_Float2<Float2>( X(parity, x, s, c) );
          Float2 Y2 = make_Float2<Float2>( Y(parity, x, s, c) );
          Float2 Z2 = make_Float2<Float2>( Z(parity, x, s, c) );
          Float2 W2 = make_Float2<Float2>( W(parity, x, s, c) );
          Float2 V2 = make_Float2<Float2>( V(parity, x, s, c) );
          r(sum, X2, Y2, Z2, W2, V2);
          // write back only the fields the reducer is declared to modify
          if (writeX) X(parity, x, s, c) = make_Complex(X2);
          if (writeY) Y(parity, x, s, c) = make_Complex(Y2);
          if (writeZ) Z(parity, x, s, c) = make_Complex(Z2);
          if (writeW) W(parity, x, s, c) = make_Complex(W2);
          if (writeV) V(parity, x, s, c) = make_Complex(V2);
        }
      }
      r.post(sum); // per-site post-processing hook
    }
  }

  return sum;
}
298 
// Map a scalar type plus a component count onto the matching CUDA vector
// type; only the two-component specializations are needed here.
// (Extraction artifact fixed: embedded doxygen line numbers removed.)
template<typename, int N> struct vector { };
template<> struct vector<double, 2> { typedef double2 type; };
template<> struct vector<float, 2> { typedef float2 type; };
302 
/**
   CPU reduction with compile-time spin, color and field order: wrap the
   five fields in FieldOrderCB accessors (z may use a different precision
   zFloat) and forward to the element-wise genericReduce.
   (Extraction artifact fixed: embedded doxygen line numbers removed.)
*/
template <typename ReduceType, typename Float, typename zFloat, int nSpin, int nColor, QudaFieldOrder order,
          int writeX, int writeY, int writeZ, int writeW, int writeV, typename R>
ReduceType genericReduce(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z,
                         ColorSpinorField &w, ColorSpinorField &v, R r) {
  colorspinor::FieldOrderCB<Float,nSpin,nColor,1,order> X(x), Y(y), W(w), V(v);
  colorspinor::FieldOrderCB<zFloat,nSpin,nColor,1,order> Z(z); // z may be higher precision
  typedef typename vector<zFloat,2>::type Float2;
  return genericReduce<ReduceType,Float2,writeX,writeY,writeZ,writeW,writeV>(X, Y, Z, W, V, r);
}
312 
/**
   Dispatch the CPU reduction on the run-time number of colors; only the
   listed values are compiled in.  (Extraction artifacts fixed: embedded
   doxygen line numbers removed; typo "implemeneted" in the error message
   corrected.  Original line 343 was lost in extraction — the zero-init
   below is a reconstruction, TODO confirm against the original source.)
*/
template <typename ReduceType, typename Float, typename zFloat, int nSpin, QudaFieldOrder order,
          int writeX, int writeY, int writeZ, int writeW, int writeV, typename R>
ReduceType genericReduce(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z,
                         ColorSpinorField &w, ColorSpinorField &v, R r) {
  ReduceType value;
  ::quda::zero(value); // defined result even on the error path
  switch (x.Ncolor()) {
  case   2: value = genericReduce<ReduceType,Float,zFloat,nSpin,  2,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case   3: value = genericReduce<ReduceType,Float,zFloat,nSpin,  3,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case   4: value = genericReduce<ReduceType,Float,zFloat,nSpin,  4,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case   6: value = genericReduce<ReduceType,Float,zFloat,nSpin,  6,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case   8: value = genericReduce<ReduceType,Float,zFloat,nSpin,  8,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case  12: value = genericReduce<ReduceType,Float,zFloat,nSpin, 12,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case  16: value = genericReduce<ReduceType,Float,zFloat,nSpin, 16,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case  20: value = genericReduce<ReduceType,Float,zFloat,nSpin, 20,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case  24: value = genericReduce<ReduceType,Float,zFloat,nSpin, 24,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case  32: value = genericReduce<ReduceType,Float,zFloat,nSpin, 32,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case  72: value = genericReduce<ReduceType,Float,zFloat,nSpin, 72,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case 576: value = genericReduce<ReduceType,Float,zFloat,nSpin,576,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  default: errorQuda("nColor = %d not implemented",x.Ncolor());
  }
  return value;
}
348 
/**
   Dispatch the CPU reduction on the run-time number of spins; the
   staggered (nSpin = 1) case is only compiled with GPU_STAGGERED_DIRAC.
   (Extraction artifacts fixed: embedded doxygen line numbers removed;
   typo "implemeneted" in the error message corrected.  Original line 353
   was lost in extraction — the zero-init below is a reconstruction based
   on the ::quda::zero pattern used elsewhere in this file; TODO confirm.)
*/
template <typename ReduceType, typename Float, typename zFloat, QudaFieldOrder order,
          int writeX, int writeY, int writeZ, int writeW, int writeV, typename R>
ReduceType genericReduce(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, R r) {
  ReduceType value;
  ::quda::zero(value); // reconstructed — defined result even on the error path
  switch (x.Nspin()) {
  case 4: value = genericReduce<ReduceType,Float,zFloat,4,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
  case 2: value = genericReduce<ReduceType,Float,zFloat,2,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
#ifdef GPU_STAGGERED_DIRAC
  case 1: value = genericReduce<ReduceType,Float,zFloat,1,order,writeX,writeY,writeZ,writeW,writeV,R>(x, y, z, w, v, r); break;
#endif
  default: errorQuda("nSpin = %d not implemented",x.Nspin());
  }
  return value;
}
367 
/**
   Dispatch the CPU reduction on the field order and convert the result
   to the host return type via set().  Unsupported orders warn and return
   zero.  (Extraction artifacts fixed: embedded doxygen line numbers
   removed; typo "implemeneted" in the warning corrected.  Original line
   373 was lost in extraction — the zero-init below is a reconstruction;
   it also guarantees a defined value on the warning path.  TODO confirm
   against the original source.)
*/
template <typename doubleN, typename ReduceType, typename Float, typename zFloat,
          int writeX, int writeY, int writeZ, int writeW, int writeV, typename R>
doubleN genericReduce(ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z,
                      ColorSpinorField &w, ColorSpinorField &v, R r) {
  ReduceType value;
  ::quda::zero(value); // reconstructed — avoids returning an uninitialized value below
  if (x.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
    value = genericReduce<ReduceType,Float,zFloat,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,writeX,writeY,writeZ,writeW,writeV,R>
      (x, y, z, w, v, r);
  } else {
    warningQuda("CPU reductions not implemented for %d field order", x.FieldOrder());
  }
  return set(value);
}
char * Wnorm_h
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
void postTune()
dim3 dim3 blockDim
unsigned int sharedBytesPerBlock(const TuneParam &param) const
cudaStream_t stream
virtual bool advanceSharedBytes(TuneParam &param) const
bool commAsyncReduction()
char * Ynorm_h
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
doubleN reduceLaunch(ReductionArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > &arg, const TuneParam &tp, const cudaStream_t &stream)
Definition: reduce_core.cuh:85
#define errorQuda(...)
Definition: util_quda.h:90
char * Znorm_h
enum QudaFieldOrder_s QudaFieldOrder
static __shared__ bool isLastBlockDone
Definition: reduce_core.cuh:20
__global__ void reduceKernel(ReductionArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > arg)
Definition: reduce_core.cuh:43
void checkLength(const ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:49
char * strcpy(char *__dst, const char *__src)
ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer &r, int length, int nParity, const size_t *bytes, const size_t *norm_bytes)
static int R[4]
char * strcat(char *__s1, const char *__s2)
void apply(const cudaStream_t &stream)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
void initTuneParam(TuneParam &param) const
complex< double > make_Complex(const double2 &a)
Definition: float_vector.h:278
cudaStream_t * getStream()
Definition: blas_quda.cu:75
static cudaEvent_t reduceEnd
Definition: reduce_quda.cu:66
const size_t * norm_bytes_
void preTune()
const int length
Definition: reduce_core.cuh:33
const int nColor
Definition: covdev_test.cpp:77
ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r, int length)
Definition: reduce_core.cuh:34
static struct quda::blas::@4 blasStrings
int int int w
ReductionArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > arg
int V
Definition: test_util.cpp:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define warningQuda(...)
Definition: util_quda.h:101
int Z[4]
Definition: test_util.cpp:27
char * Xnorm_h
int tuningIter() const
doubleN reduceCuda(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, int length)
long long bytes() const
void zero(ColorSpinorField &a)
Definition: blas_quda.cu:45
const size_t * bytes_
#define LAUNCH_KERNEL(kernel, tp, stream, arg,...)
unsigned int sharedBytesPerThread() const
static __device__ unsigned int count
Definition: reduce_core.cuh:19
__host__ __device__ void sum(double &a, double &b)
Definition: reduce_core.cuh:5
char * Vnorm_h
ReduceType genericReduce(SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer r)
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:163
static QudaSumFloat * h_reduce
Definition: reduce_quda.cu:64
TuneKey tuneKey() const
void defaultTuneParam(TuneParam &param) const
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
const void * c
doubleN & result
#define checkCudaError()
Definition: util_quda.h:129
long long flops() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
#define a
static QudaSumFloat * hd_reduce
Definition: reduce_quda.cu:65
const int nParity
unsigned long long bytes
Definition: blas_quda.cu:43
virtual ~ReduceCuda()