QUDA  0.9.0
multi_reduce_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
5 #include <uint_to_char.h>
6 
7 //#define QUAD_SUM
8 #ifdef QUAD_SUM
9 #include <dbldbl.h>
10 #endif
11 
12 #include <cub_helper.cuh>
13 
// Type traits used by the multi-reductions below.
// ScalarType maps a (possibly vector) accumulator type to its underlying
// scalar type, e.g. double2 -> double.
template<typename> struct ScalarType { };
template<> struct ScalarType<double> { typedef double type; };
template<> struct ScalarType<double2> { typedef double type; };
template<> struct ScalarType<double3> { typedef double type; };

// Vec2Type maps a scalar type to its two-component vector counterpart,
// e.g. double -> double2.
template<typename> struct Vec2Type { };
template<> struct Vec2Type<double> { typedef double2 type; };
21 
22 #ifdef QUAD_SUM
23 #define QudaSumFloat doubledouble
24 #define QudaSumFloat2 doubledouble2
25 #define QudaSumFloat3 doubledouble3
26 template<> struct ScalarType<doubledouble> { typedef doubledouble type; };
27 template<> struct ScalarType<doubledouble2> { typedef doubledouble type; };
28 template<> struct ScalarType<doubledouble3> { typedef doubledouble type; };
29 template<> struct Vec2Type<doubledouble> { typedef doubledouble2 type; };
30 #else
31 #define QudaSumFloat double
32 #define QudaSumFloat2 double2
33 #define QudaSumFloat3 double3
34 #endif
35 
36 // work around for Fermi
37 #if (__COMPUTE_CAPABILITY__ < 300)
38 #undef MAX_MULTI_BLAS_N
39 #define MAX_MULTI_BLAS_N 2
40 #endif
41 
// Verify that two fields are compatible for a blas-style reduction:
// both the overall length and the stride must agree, otherwise abort
// via errorQuda.
static void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
{
  if (a.Length() != b.Length()) {
    errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length());
  }
  if (a.Stride() != b.Stride()) {
    errorQuda("strides do not match: %d %d", a.Stride(), b.Stride());
  }
}
48 
// Volume / auxiliary strings used when constructing blas tune keys.
static struct {
  const char *vol_str;
  const char *aux_str;
  // NOTE(review): one member line is missing from this extraction (original
  // line 52); the cross-reference listing suggests it is
  // `char aux_tmp[quda::TuneKey::aux_n];` — TODO confirm against upstream.
} blasStrings;
54 
55 namespace quda {
56 
57  // hooks into tune.cpp variables for policy tuning
58  typedef std::map<TuneKey, TuneParam> map;
59  const map& getTuneCache();
60 
61  void disableProfileCount();
62  void enableProfileCount();
63 
64  void setPolicyTuning(bool);
65 
66  namespace blas {
67 
68  cudaStream_t* getStream();
69  cudaEvent_t* getReduceEvent();
70 
71  template <int writeX, int writeY, int writeZ, int writeW>
72  struct write {
73  static constexpr int X = writeX;
74  static constexpr int Y = writeY;
75  static constexpr int Z = writeZ;
76  static constexpr int W = writeW;
77  };
78 
79  namespace reduce {
80 
81  namespace multi {
82 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
83 #include <texture.h>
84  }
85 
86 #include <multi_reduce_core.cuh>
87 #include <multi_reduce_core.h>
88 
89  } // namespace reduce
90 
 template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
 // NOTE(review): the struct-declaration line (presumably
 // `struct MultiReduceFunctor { ... };` opening, plus any data members) is
 // missing from this extraction — the members below clearly belong to the
 // abstract base functor for the multi-reduce kernels.

 // pre-computation routine called before the "M-loop"
 virtual __device__ __host__ void pre() { ; }

 // where the reduction is usually computed and any auxiliary operations
 // NOTE(review): `__host__` appears twice on this declaration — a duplicate
 // qualifier (harmless to nvcc, but should be deduplicated upstream).
 virtual __device__ __host__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y,
 FloatN &z, FloatN &w, const int i, const int j) = 0;

 // post-computation routine called after the "M-loop"
 virtual __device__ __host__ void post(ReduceType &sum) { ; }

 };
108 
109 
// Accumulate the real inner product of two vector-typed elements into sum.
// The per-component accumulation order matches the other blas reductions
// (component-wise, in declaration order) so rounding is reproducible.
template<typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b) {
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
}

template<typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b) {
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
}

template<typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b) {
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
  sum += static_cast<ReduceType>(a.z) * static_cast<ReduceType>(b.z);
  sum += static_cast<ReduceType>(a.w) * static_cast<ReduceType>(b.w);
}
130 
// Functor computing the real dot product of x and y via dot_ above.
// The coefficient arrays are unused for this reduction.
template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
struct Dot : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
  typedef typename scalar<Float2>::type real;
  const int NYW;
  Dot(const reduce::coeff_array<Complex> &a, const reduce::coeff_array<Complex> &b,
      const reduce::coeff_array<Complex> &c, int NYW) : NYW(NYW) {}
  __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w,
                                      const int i, const int j)
  {
    dot_<ReduceType>(sum, x, y);
  }
  static int streams() { return 2; } // total number of input and output streams
  static int flops() { return 2; }
};
141 
// Compute the matrix of real dot products between the vector sets x and y
// using a single fused multi-reduction kernel, followed by one multi-node
// reduction over all local partial results.  Only available when the s-step
// solver code (SSTEP) is compiled in; cases above 8 are commented out to
// limit compile time.
void reDotProduct(double* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
#ifndef SSTEP
  errorQuda("S-step code not built\n");
#else
  // dispatch on x.size(): each case instantiates the kernel with a
  // compile-time width NXZ equal to the number of x vectors
  switch(x.size()){
  case 1:
    reduce::multiReduceCuda<1,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 2:
    reduce::multiReduceCuda<2,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 3:
    reduce::multiReduceCuda<3,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 4:
    reduce::multiReduceCuda<4,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 5:
    reduce::multiReduceCuda<5,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 6:
    reduce::multiReduceCuda<6,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 7:
    reduce::multiReduceCuda<7,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 8:
    reduce::multiReduceCuda<8,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  /*case 9:
    reduce::multiReduceCuda<9,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 10:
    reduce::multiReduceCuda<10,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 11:
    reduce::multiReduceCuda<11,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 12:
    reduce::multiReduceCuda<12,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 13:
    reduce::multiReduceCuda<13,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 14:
    reduce::multiReduceCuda<14,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 15:
    reduce::multiReduceCuda<15,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;
  case 16:
    reduce::multiReduceCuda<16,double,QudaSumFloat,Dot,0,0,0,0,false>
      (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0), x, y, x, y);
    break;*/
  default:
    errorQuda("Unsupported vector size");
    break;
  }
#endif // SSTEP
  // do a single multi-node reduction only once we have computed all local dot products
  const int Nreduce = x.size()*y.size();
  reduceDoubleArray((double*)result, Nreduce);
}
220 
221 
// Accumulate the complex inner product conj-style pairing of two vector-typed
// elements into sum, where sum.x holds the real part and sum.y the imaginary
// part.  The per-term accumulation order matches the original so rounding is
// reproducible.
template<typename ReduceType>
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b) {
  typedef typename ScalarType<ReduceType>::type scalar;
  sum.x += static_cast<scalar>(a.x) * static_cast<scalar>(b.x);
  sum.x += static_cast<scalar>(a.y) * static_cast<scalar>(b.y);
  sum.y += static_cast<scalar>(a.x) * static_cast<scalar>(b.y);
  sum.y -= static_cast<scalar>(a.y) * static_cast<scalar>(b.x);
}

template<typename ReduceType>
__device__ __host__ void cdot_(ReduceType &sum, const float2 &a, const float2 &b) {
  typedef typename ScalarType<ReduceType>::type scalar;
  sum.x += static_cast<scalar>(a.x) * static_cast<scalar>(b.x);
  sum.x += static_cast<scalar>(a.y) * static_cast<scalar>(b.y);
  sum.y += static_cast<scalar>(a.x) * static_cast<scalar>(b.y);
  sum.y -= static_cast<scalar>(a.y) * static_cast<scalar>(b.x);
}

template<typename ReduceType>
__device__ __host__ void cdot_(ReduceType &sum, const float4 &a, const float4 &b) {
  typedef typename ScalarType<ReduceType>::type scalar;
  sum.x += static_cast<scalar>(a.x) * static_cast<scalar>(b.x);
  sum.x += static_cast<scalar>(a.y) * static_cast<scalar>(b.y);
  sum.x += static_cast<scalar>(a.z) * static_cast<scalar>(b.z);
  sum.x += static_cast<scalar>(a.w) * static_cast<scalar>(b.w);
  sum.y += static_cast<scalar>(a.x) * static_cast<scalar>(b.y);
  sum.y -= static_cast<scalar>(a.y) * static_cast<scalar>(b.x);
  sum.y += static_cast<scalar>(a.z) * static_cast<scalar>(b.w);
  sum.y -= static_cast<scalar>(a.w) * static_cast<scalar>(b.z);
}
255 
// Functor computing the complex dot product of x and y via cdot_ above.
// The coefficient arrays are unused for this reduction.
template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
struct Cdot : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
  typedef typename scalar<Float2>::type real;
  const int NYW;
  Cdot(const reduce::coeff_array<Complex> &a, const reduce::coeff_array<Complex> &b,
       const reduce::coeff_array<Complex> &c, int NYW) : NYW(NYW) {}
  __device__ __host__ inline void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w,
                                             const int i, const int j)
  {
    cdot_<ReduceType>(sum, x, y);
  }
  static int streams() { return 2; } // total number of input and output streams
  static int flops() { return 4; }
};
266 
// Like Cdot, but on diagonal entries (i == j) it additionally copies the
// y field into w.  Used so that a dot-product pass can double as a copy.
template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
struct CdotCopy : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
  typedef typename scalar<Float2>::type real;
  const int NYW;
  CdotCopy(const reduce::coeff_array<Complex> &a, const reduce::coeff_array<Complex> &b,
           const reduce::coeff_array<Complex> &c, int NYW) : NYW(NYW) {}
  __device__ __host__ inline void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w,
                                             const int i, const int j)
  {
    cdot_<ReduceType>(sum, x, y);
    if (i == j) w = y; // diagonal entries also perform the copy
  }
  static int streams() { return 2; } // total number of input and output streams
  static int flops() { return 4; }
};
277 
// This function does the outer product of dot products... in column major.
// There's a function below called 'cDotProduct' that flips it to row major.
//
// The problem is recursively halved in y (and w) and then in x (and z) until
// a tile of at most tile_size vectors remains, at which point a single fused
// kernel is launched.  (i_idx, j_idx) track the tile's position in the (x, y)
// index space so that Hermitian problems can skip strictly lower-left tiles
// (those entries are reconstructed by conjugation in the callers).
//
// FIX: the per-tile kernel buffer `cdot` was previously allocated *before*
// the Hermitian lower-left early return, leaking it on every skipped tile;
// the allocation now happens after that check.
template <template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal, typename writeDiagonal,
          template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal, typename writeOffDiagonal>
void multiReduce_recurse(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
                         std::vector<ColorSpinorField*>& z, std::vector<ColorSpinorField*>& w, int i_idx, int j_idx, bool hermitian, unsigned int tile_size) {

  if (y.size() > tile_size) // if greater than max single-kernel size, split and recurse
  {
    // Do the recurse first.
    Complex* result0 = &result[0];
    Complex* result1 = &result[x.size()*(y.size()/2)];
    std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
    std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
    std::vector<ColorSpinorField*> w0(w.begin(), w.begin() + w.size()/2);
    std::vector<ColorSpinorField*> w1(w.begin() + w.size()/2, w.end());
    multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x, y0, z, w0, i_idx, 2*j_idx+0, hermitian, tile_size);
    multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x, y1, z, w1, i_idx, 2*j_idx+1, hermitian, tile_size);
  }
  else
  {
    // if at bottom of recursion, return if on lower left (the callers fill
    // these entries by conjugating the upper-right ones)
    if (x.size() <= tile_size && hermitian) {
      if (j_idx < i_idx) { return; }
    }

    // per-tile kernel output buffer, in column-major order; allocated only
    // after the early return above so it cannot leak
    double2* cdot = new double2[x.size()*y.size()];

    reduce::coeff_array<Complex> a, b, c;

    if (x.size() <= tile_size) {
      switch(x.size()){ // COMMENT HERE FOR COMPILE TIME
      case 1:
        reduce::multiReduceCuda<1,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 2
      case 2:
        reduce::multiReduceCuda<2,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 3
      case 3:
        reduce::multiReduceCuda<3,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 4
      case 4:
        reduce::multiReduceCuda<4,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 5
      case 5:
        reduce::multiReduceCuda<5,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 6
      case 6:
        reduce::multiReduceCuda<6,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 7
      case 7:
        reduce::multiReduceCuda<7,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 8
      case 8:
        reduce::multiReduceCuda<8,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 9
      case 9:
        reduce::multiReduceCuda<9,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 10
      case 10:
        reduce::multiReduceCuda<10,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 11
      case 11:
        reduce::multiReduceCuda<11,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 12
      case 12:
        reduce::multiReduceCuda<12,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 13
      case 13:
        reduce::multiReduceCuda<13,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 14
      case 14:
        reduce::multiReduceCuda<14,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 15
      case 15:
        reduce::multiReduceCuda<15,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#if MAX_MULTI_BLAS_N >= 16
      case 16:
        reduce::multiReduceCuda<16,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
          (cdot, a, b, c, x, y, z, w, i_idx, j_idx );
        break;
#endif //16
#endif //15
#endif //14
#endif //13
#endif //12
#endif //11
#endif //10
#endif // 9
#endif // 8
#endif // 7
#endif // 6
#endif // 5
#endif // 4
#endif // 3
#endif // 2
      }
    } else {
      // split the problem and recurse. Splitting in x requires
      // memory reshuffling (unless y = 1).
      // Use a few temporary variables.

      Complex* tmpmajor = new Complex[x.size()*y.size()];
      Complex* result0 = &tmpmajor[0];
      Complex* result1 = &tmpmajor[(x.size()/2)*y.size()];
      std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
      std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
      std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
      std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());

      multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x0, y, z0, w, 2*i_idx+0, j_idx, hermitian, tile_size);
      multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x1, y, z1, w, 2*i_idx+1, j_idx, hermitian, tile_size);

      const unsigned int xlen0 = x.size()/2;
      const unsigned int xlen1 = x.size() - xlen0;
      const unsigned int ylen = y.size();

      // Copy back into result, interleaving the two x-halves column by column.
      int count = 0, count0 = 0, count1 = 0;
      for (unsigned int i = 0; i < ylen; i++)
      {
        for (unsigned int j = 0; j < xlen0; j++)
          result[count++] = result0[count0++];
        for (unsigned int j = 0; j < xlen1; j++)
          result[count++] = result1[count1++];
      }

      delete[] tmpmajor;
    }

    // we are at the leaf of the binary tree (e.g., we ran the kernel): perform the row-to-column-major transpose here.
    if (x.size() <= tile_size)
    {
      const unsigned int xlen = x.size();
      const unsigned int ylen = y.size();
      for (unsigned int j = 0; j < xlen; j++)
        for (unsigned int i = 0; i < ylen; i++)
          result[i*xlen+j] = Complex(cdot[j*ylen + i].x, cdot[j*ylen+i].y);
    }
    delete[] cdot;
  }
}
450 
451 
// Policy tuner that selects the tile size used by multiReduce_recurse.
// It piggy-backs on the QUDA Tunable machinery: the quantity being tuned is
// tp.aux.x (the tile size), not block/grid dimensions.
template <template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal,
          typename writeDiagonal,
          template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal,
          typename writeOffDiagonal>
class TileSizeTune : public Tunable {
  typedef std::vector<ColorSpinorField*> vec;
  // NOTE(review): a data-member line is missing from this extraction here —
  // apply() below uses a `result` member, so it is presumably
  // `Complex *result;`. Confirm against upstream.
  vec &x, &y, &z, &w;   // the field sets this policy operates on (by reference)
  bool hermitian;       // true when the dot-product matrix is Hermitian
  bool Anorm;           // true for (p, Ap)-style A-norm products

  // policy tuning uses no shared memory
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  unsigned int max_tile_size;   // largest tile size the tuner will try

public:
  TileSizeTune(Complex *result, vec &x, vec &y, vec &z, vec &w, bool hermitian, bool Anorm = false)
  // NOTE(review): the constructor's member-initializer list is missing from
  // this extraction — the members above are presumably bound here. Confirm
  // against upstream.
  {
    // build the tune-key aux string: policy tag, field aux strings, flags,
    // and the problem dimensions n = x.size(), m = y.size()
    strcpy(aux, "policy,");
    strcat(aux, x[0]->AuxString());
    strcat(aux, ",");
    strcat(aux, y[0]->AuxString());
    if (hermitian) strcat(aux, ",hermitian");
    if (Anorm) strcat(aux, ",Anorm");
    strcat(aux,",n=");
    char size[8];
    u64toa(size, x.size());
    strcat(aux,size);
    strcat(aux,",m=");
    u64toa(size, y.size());
    strcat(aux,size);

    // before we do policy tuning we must ensure the kernel
    // constituents have been tuned since we can't do nested tuning
    // FIXME this will break if the kernels are destructive - which they aren't here
    if (getTuning() && getTuneCache().find(tuneKey()) == getTuneCache().end()) {
      disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.

      if ( x.size()==1 || y.size()==1 ) { // 1-d reduction

        max_tile_size = std::min(MAX_MULTI_BLAS_N, (int)std::max(x.size(), y.size()));

        // Make sure constituents are tuned.
        for ( unsigned int tile_size=1; tile_size <= max_tile_size; tile_size++) {
          multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
            (result, x, y, z, w, 0, 0, hermitian, tile_size);
        }

      } else { // 2-d reduction

        // max_tile_size should be set to the largest power of 2 less than
        // MAX_MULTI_BLAS_N, since we have a requirement that the
        // tile size is a power of 2.
        unsigned int max_count = 0;
        unsigned int tile_size_tmp = MAX_MULTI_BLAS_N;
        while (tile_size_tmp != 1) { tile_size_tmp = tile_size_tmp >> 1; max_count++; }
        tile_size_tmp = 1;
        for (unsigned int i = 0; i < max_count; i++) { tile_size_tmp = tile_size_tmp << 1; }
        max_tile_size = tile_size_tmp;

        // Make sure constituents are tuned.
        for ( unsigned int tile_size=1; tile_size <= max_tile_size && tile_size <= x.size() &&
              (tile_size <= y.size() || y.size()==1) ; tile_size*=2) {
          multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
            (result, x, y, z, w, 0, 0, hermitian, tile_size);
        }

      }

      // NOTE(review): a line is missing from this extraction here — likely
      // `enableProfileCount();` to balance the disableProfileCount() above.
      // Confirm against upstream.
      setPolicyTuning(true);
    }
  }

  virtual ~TileSizeTune() { setPolicyTuning(false); }

  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

    // tp.aux.x is where the tile size is stored. "tp" is the tuning struct.
    // it contains blocksize, grid size, etc. Since we're only tuning
    // a policy, we don't care about those sizes. That's why we only
    // tune "aux.x", which is the tile size.
    multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
      (result, x, y, z, w, 0, 0, hermitian, tp.aux.x);
  }

  // aux.x is the tile size
  // NOTE(review): the signature line is missing from this extraction — from
  // the body and the advanceTuneParam() call below it is
  // `bool advanceAux(TuneParam &param) const`.
  {

    if ( x.size()==1 || y.size()==1 ) { // 1-d reduction

      param.aux.x++;
      if ( (unsigned int)param.aux.x <= max_tile_size ) {
        return true;
      } else {
        param.aux.x = 1;
        return false;
      }

    } else { // 2-d reduction

      param.aux.x *= 2; // only tune powers of two (FIXME)

      if ( (unsigned int)param.aux.x <= max_tile_size && param.aux.x <= (int)x.size() &&
           param.aux.x <= (int)y.size() ) {
        return true;
      } else {
        param.aux.x = 1; // reset to the beginning (which we'd need for multi-dimensional tuning)
        return false;
      }

    }
  }

  bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }

  // NOTE(review): the opening of `void initTuneParam(TuneParam &param) const`
  // (including its call into Tunable::initTuneParam) is missing from this
  // extraction — only the aux reset below survived.
  param.aux.x = 1; param.aux.y = 0; param.aux.z = 0; param.aux.w = 0;
  }

  // NOTE(review): the `void defaultTuneParam(TuneParam &param) const {`
  // opening line is missing from this extraction.
  Tunable::defaultTuneParam(param); // default is max tile size
  // max_tile_size is MAX_MULTI_BLAS_N rounded down to the nearest power of 2.
  param.aux.x = max_tile_size; param.aux.y = 0; param.aux.z = 0; param.aux.w = 0;
  }

  TuneKey tuneKey() const {
    return TuneKey(x[0]->VolString(), typeid(*this).name(), aux);
  }

  long long flops() const { return 0; } // FIXME
  long long bytes() const { return 0; } // FIXME

  void preTune() { } // FIXME - use write to determine what needs to be saved
  void postTune() { } // FIXME - use write to determine what needs to be saved
};
593 
// Computes the row-major matrix of complex inner products between the vector
// sets x and y: result[j*y.size()+i] = <x_j, y_i>.  The tile size for the
// underlying fused kernels is chosen by the TileSizeTune policy tuner, and a
// single multi-node reduction is performed at the end.
void cDotProduct(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");

  // zero-initialized column-major accumulation buffer; std::vector gives
  // RAII cleanup (the previous raw new[] leaked if errorQuda unwound)
  std::vector<Complex> result_tmp(x.size()*y.size(), Complex(0.0));

  // multiReduce_recurse returns a column-major matrix.
  // To be consistent with the multi-blas functions, we should
  // switch this to row-major (done below).
  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(&result_tmp[0], x, y, x, y, false);
  tile.apply(0);

  // do a single multi-node reduction only once we have computed all local dot products
  const int Nreduce = 2*x.size()*y.size();
  reduceDoubleArray((double*)&result_tmp[0], Nreduce);

  // Switch from col-major to row-major
  const unsigned int xlen = x.size();
  const unsigned int ylen = y.size();
  for (unsigned int j = 0; j < xlen; j++)
    for (unsigned int i = 0; i < ylen; i++)
      result[j*ylen+i] = result_tmp[i*xlen + j];
}
618 
// Computes the Hermitian matrix of complex inner products between the equal-
// sized vector sets x and y.  Only the upper triangle is computed by the
// kernels (hermitian=true); the lower triangle is filled by conjugation.
void hDotProduct(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block dot product on non-square inputs");

  // zero-initialized column-major accumulation buffer; std::vector gives
  // RAII cleanup (the previous raw new[] leaked if errorQuda unwound)
  std::vector<Complex> result_tmp(x.size()*y.size(), Complex(0.0));

  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(&result_tmp[0], x, y, x, y, true, false); // last false is b/c L2 norm
  tile.apply(0);

  // do a single multi-node reduction only once we have computed all local dot products
  const int Nreduce = 2*x.size()*y.size();
  reduceDoubleArray((double*)&result_tmp[0], Nreduce); // FIXME - could optimize this for Hermiticity as well

  // Switch from col-major to row-major, filling the lower triangle by
  // conjugating the computed upper-triangle entries
  const unsigned int xlen = x.size();
  const unsigned int ylen = y.size();
  for (unsigned int j = 0; j < xlen; j++)
    for (unsigned int i = j; i < ylen; i++) {
      result[j*ylen+i] = result_tmp[i*xlen + j];
      result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
    }
}
644 
// for (p, Ap) norms in CG which are Hermitian.
// Identical structure to hDotProduct, but constructed with Anorm=true so the
// policy tuner distinguishes the two cases in its tune key.
void hDotProduct_Anorm(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y){
  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
  if (x.size() != y.size()) errorQuda("Cannot call Hermitian block A-norm dot product on non-square inputs");

  // zero-initialized column-major accumulation buffer; std::vector gives
  // RAII cleanup (the previous raw new[] leaked if errorQuda unwound)
  std::vector<Complex> result_tmp(x.size()*y.size(), Complex(0.0));

  TileSizeTune<Cdot,write<0,0,0,0>,Cdot,write<0,0,0,0> > tile(&result_tmp[0], x, y, x, y, true, true); // last true is b/c A norm
  tile.apply(0);

  // do a single multi-node reduction only once we have computed all local dot products
  const int Nreduce = 2*x.size()*y.size();
  reduceDoubleArray((double*)&result_tmp[0], Nreduce); // FIXME - could optimize this for Hermiticity as well

  // Switch from col-major to row-major, filling the lower triangle by
  // conjugating the computed upper-triangle entries
  const unsigned int xlen = x.size();
  const unsigned int ylen = y.size();
  for (unsigned int j = 0; j < xlen; j++)
    for (unsigned int i = j; i < ylen; i++) {
      result[j*ylen+i] = result_tmp[i*xlen + j];
      result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
    }
}
671 
// takes the outer product of inner products between and y and copies y into z
// NOTE(review): the implementation is currently compiled out (#if 0) — the
// enabled path simply aborts via errorQuda.
void cDotProductCopy(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
                     std::vector<ColorSpinorField*>&z){

#if 0
  if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
  if (y.size() != z.size()) errorQuda("Cannot copy input y of size %lu into z of size %lu\n", y.size(), z.size());

  Complex* result_tmp = new Complex[x.size()*y.size()];
  for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;

  // When recursing, only the diagonal tiles will do the copy, the rest just do the outer product
  TileSizeTune<CdotCopy,write<0,0,0,1>,Cdot,write<0,0,0,0> > tile(result_tmp, x, y, x, y, true);
  tile.apply(0);

  // do a single multi-node reduction only once we have computed all local dot products
  const int Nreduce = 2*x.size()*y.size();
  reduceDoubleArray((double*)result_tmp, Nreduce);

  // Switch from col-major to row-major.
  const unsigned int xlen = x.size();
  const unsigned int ylen = y.size();
  for (unsigned int j = 0; j < xlen; j++)
    for (unsigned int i = 0; i < ylen; i++)
      result[j*ylen+i] = result_tmp[i*xlen + j];

  delete[] result_tmp;
#else
  errorQuda("cDotProductCopy not enabled");
#endif
}
703 
704  } // namespace blas
705 
706 } // namespace quda
static int flops()
total number of input and output streams
static void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
void disableProfileCount()
Definition: tune.cpp:107
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
unsigned int sharedBytesPerBlock(const TuneParam &param) const
void end(void)
Definition: blas_quda.cu:70
#define errorQuda(...)
Definition: util_quda.h:90
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
static int streams()
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
Definition: reduce_quda.cu:500
void cDotProductCopy(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b, std::vector< ColorSpinorField *> &c)
Computes the matrix of inner products between the vector set a and the vector set b...
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
void reduceDoubleArray(double *, const int len)
char * strcpy(char *__dst, const char *__src)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
Definition: reduce_quda.cu:277
double y0(double) __attribute__((availability(macosx
char * strcat(char *__s1, const char *__s2)
void multiReduce_recurse(Complex *result, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int i_idx, int j_idx, bool hermitian, unsigned int tile_size)
void initTuneParam(TuneParam &param) const
static constexpr int Y
Dot(const reduce::coeff_array< Complex > &a, const reduce::coeff_array< Complex > &b, const reduce::coeff_array< Complex > &c, int NYW)
void defaultTuneParam(TuneParam &param) const
void enableProfileCount()
Definition: tune.cpp:108
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
cudaStream_t * getStream()
Definition: blas_quda.cu:75
CdotCopy(const reduce::coeff_array< Complex > &a, const reduce::coeff_array< Complex > &b, const reduce::coeff_array< Complex > &c, int NYW)
char aux_tmp[quda::TuneKey::aux_n]
double y1(double) __attribute__((availability(macosx
virtual __device__ __host__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)=0
where the reduction is usually computed and any auxiliary operations
__host__ __device__ void sum(double &a, double &b)
static constexpr int Z
int int int w
unsigned int sharedBytesPerThread() const
bool advanceTuneParam(TuneParam &param) const
std::vector< ColorSpinorField * > vec
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
static constexpr int W
void hDotProduct_Anorm(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
cudaEvent_t * getReduceEvent()
Definition: reduce_quda.cu:76
static constexpr int X
const char * aux_str
void setPolicyTuning(bool)
Definition: tune.cpp:457
static struct @7 blasStrings
scalar< Float2 >::type real
Cdot(const reduce::coeff_array< Complex > &a, const reduce::coeff_array< Complex > &b, const reduce::coeff_array< Complex > &c, int NYW)
void apply(const cudaStream_t &stream)
std::map< TuneKey, TuneParam > map
static int flops()
total number of input and output streams
TileSizeTune(Complex *result, vec &x, vec &y, vec &z, vec &w, bool hermitian, bool Anorm=false)
void hDotProduct(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:163
static const int aux_n
Definition: tune_key.h:12
bool advanceAux(TuneParam &param) const
static int flops()
total number of input and output streams
const void * c
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
const map & getTuneCache()
Definition: tune.cpp:110
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
void u64toa(char *buffer, uint64_t value)
Definition: uint_to_char.h:127
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
#define a
scalar< Float2 >::type real
const char * vol_str
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
#define MAX_MULTI_BLAS_N
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254
scalar< Float2 >::type real