3 #include <color_spinor_field_order.h>
4 #include <uint_to_char.h>
6 #include <launch_kernel.cuh>
7 #include <jitify_helper.cuh>
8 #include <kernels/multi_reduce_core.cuh>
14 qudaStream_t* getStream();
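// Compile-time recursive launcher: walks the candidate block sizes from
// max_block_size() down to 32 in steps of 32 and launches the multiReduceKernel
// instantiation whose block size matches the tuned tp.block.x.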
16 template <int block_size, typename real, int len, int NXZ, typename Arg>
17 typename std::enable_if<block_size!=32, qudaError_t>::type launch(Arg &arg, const TuneParam &tp, const qudaStream_t &stream)
19 if (tp.block.x == block_size)
20 return qudaLaunchKernel(multiReduceKernel<block_size, real, len, NXZ, Arg>, tp, stream, arg);
22 return launch<block_size - 32, real, len, NXZ>(arg, tp, stream);
25 template <int block_size, typename real, int len, int NXZ, typename Arg>
26 typename std::enable_if<block_size==32, qudaError_t>::type launch(Arg &arg, const TuneParam &tp, const qudaStream_t &stream)
28 if (block_size != tp.block.x) errorQuda("Unexpected block size %d\n", tp.block.x);
29 return qudaLaunchKernel(multiReduceKernel<block_size, real, len, NXZ, Arg>, tp, stream, arg);
32 #ifdef QUDA_FAST_COMPILE_REDUCE
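// fast-compile mode: restrict the launcher to a single 32-thread block size so
// only one kernel instantiation per reducer is compiled (shorter compile time
// at the cost of block-size tuning range)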
33 constexpr static unsigned int max_block_size() { return 32; }
35 constexpr static unsigned int max_block_size() { return 128; }
38 template <typename real, int len, int NXZ, typename Arg, typename T>
39 void multiReduceLaunch(T result[], Arg &arg, const TuneParam &tp, const qudaStream_t &stream, Tunable &tunable)
41 using reduce_t = typename Arg::Reducer::reduce_t;
42 if (tp.grid.x > (unsigned int)deviceProp.maxGridSize[0])
43 errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, deviceProp.maxGridSize[0]);
46 using namespace jitify::reflection;
47 tunable.jitifyError() = program->kernel("quda::blas::multiReduceKernel")
48 .instantiate((int)tp.block.x, Type<real>(), len, NXZ, Type<Arg>())
49 .configure(tp.grid, tp.block, tp.shared_bytes, stream)
51 arg.launch_error = tunable.jitifyError() == CUDA_SUCCESS ? QUDA_SUCCESS : QUDA_ERROR;
53 arg.launch_error = launch<max_block_size(), real, len, NXZ>(arg, tp, stream);
56 std::vector<T> result_(NXZ * arg.NYW);
57 if (!commAsyncReduction()) arg.complete(result_, stream);
// need to transpose so the ordering matches the vector-thread reduction
60 for (int i = 0; i < NXZ; i++) {
61 for (int j = 0; j < arg.NYW; j++) {
62 result[i * arg.NYW + j] = result_[j * NXZ + i];
68 template <template <typename ...> class Reducer, typename store_t, typename y_store_t, int nSpin, typename T>
69 class MultiReduce : public Tunable
71 using real = typename mapper<y_store_t>::type;
72 using host_reduce_t = typename Reducer<double, real>::reduce_t;
75 Reducer<device_reduce_t, real> r;
78 std::vector<ColorSpinorField *> &x, &y, &z, &w;
79 host_reduce_t *result;
80 QudaFieldLocation location;
82 unsigned int sharedBytesPerThread() const { return 0; }
unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
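// advanceSharedBytes: recompute shared_bytes from the thread count of the block
// dimension that advanceBlockDim would select next, rather than advancing the
// shared-memory size independently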
virtual bool advanceSharedBytes(TuneParam &param) const
87 TuneParam next(param);
88 advanceBlockDim(next); // to get next blockDim
89 int nthreads = next.block.x * next.block.y * next.block.z;
90 param.shared_bytes = sharedBytesPerThread() * nthreads > sharedBytesPerBlock(param) ?
91 sharedBytesPerThread() * nthreads : sharedBytesPerBlock(param);
unsigned int maxBlockSize(const TuneParam &param) const { return max_block_size(); }
98 MultiReduce(const T &a, const T &b, const T &c, const ColorSpinorField &x_meta, const ColorSpinorField &y_meta,
99 std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y,
100 std::vector<ColorSpinorField *> &z, std::vector<ColorSpinorField *> &w,
101 host_reduce_t *result) :
105 nParity(x[0]->SiteSubset()),
114 location(checkLocation(*x[0], *y[0], *z[0], *w[0]))
116 checkLength(*x[0], *y[0], *z[0], *w[0]);
117 auto x_prec = checkPrecision(*x[0], *z[0], *w[0]);
118 auto y_prec = y[0]->Precision();
119 auto x_order = checkOrder(*x[0], *z[0], *w[0]);
120 auto y_order = y[0]->FieldOrder();
121 if (sizeof(store_t) != x_prec) errorQuda("Expected precision %lu but received %d", sizeof(store_t), x_prec);
122 if (sizeof(y_store_t) != y_prec) errorQuda("Expected precision %lu but received %d", sizeof(y_store_t), y_prec);
123 if (x_prec == y_prec && x_order != y_order) errorQuda("Orders %d %d do not match", x_order, y_order);
125 strcpy(aux, "policy_kernel,");
126 strcat(aux, x[0]->AuxString());
127 if (x_prec != y_prec) {
129 strcat(aux, y[0]->AuxString());
131 strcat(aux, nParity == 2 ? ",nParity=2" : ",nParity=1");
133 // since block dot product and block norm use the same functors, we need to distinguish them
134 bool is_norm = false;
137 for (int i = 0; i < NXZ; i++) {
138 if (x[i]->V() != y[i]->V() || x[i]->V() != z[i]->V() || x[i]->V() != w[i]->V()) {
144 if (is_norm) strcat(aux, ",norm");
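// the assembled aux string looks something like (illustrative):
//   "policy_kernel,<x aux string>[,<y aux string>],nParity=2[,norm]"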
147 ::quda::create_jitify_program("kernels/multi_reduce_core.cuh");
150 apply(*blas::getStream());
152 blas::bytes += bytes();
153 blas::flops += flops();
156 TuneKey tuneKey() const
158 char name[TuneKey::name_n];
161 u32toa(NXZ_str, NXZ);
162 u32toa(NYW_str, NYW);
164 strcat(name, NXZ_str);
166 strcat(name, NYW_str);
167 strcat(name, typeid(r).name());
168 return TuneKey(x[0]->VolString(), name, aux);
171 template <typename buffer_t>
172 void set_param(buffer_t &d, const T &h, const qudaStream_t &stream)
174 using coeff_t = typename decltype(r)::coeff_t;
175 constexpr size_t n_coeff = MAX_MATRIX_SIZE / sizeof(coeff_t);
177 coeff_t tmp[n_coeff];
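// convert the host coefficient matrix (NXZ x NYW, row-major) to the device
// coefficient type in a staging buffer before the asynchronous copy to the
// constant-memory symbol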
178 for (int i = 0; i < NXZ; i++)
179 for (int j = 0; j < NYW; j++) tmp[NYW * i + j] = coeff_t(h.data[NYW * i + j]);
180 cudaMemcpyToSymbolAsync(d, tmp, NXZ * NYW * sizeof(decltype(tmp[0])), 0, cudaMemcpyHostToDevice, stream);
181 //cuMemcpyHtoDAsync(d, tmp, NXZ * NYW * sizeof(decltype(tmp[0])), stream);
184 template <int NXZ> void compute(const qudaStream_t &stream)
186 staticCheck<NXZ, store_t, y_store_t, decltype(r)>(r, x, y);
188 constexpr bool site_unroll_check = !std::is_same<store_t, y_store_t>::value || isFixed<store_t>::value;
189 if (site_unroll_check && (x[0]->Ncolor() != 3 || x[0]->Nspin() == 2))
190 errorQuda("site unroll not supported for nSpin = %d nColor = %d", x[0]->Nspin(), x[0]->Ncolor());
192 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
194 if (location == QUDA_CUDA_FIELD_LOCATION) {
195 if (site_unroll_check) checkNative(*x[0], *y[0], *z[0], *w[0]); // require native order when using site_unroll
196 using device_store_t = typename device_type_mapper<store_t>::type;
197 using device_y_store_t = typename device_type_mapper<y_store_t>::type;
198 using device_real_t = typename mapper<device_y_store_t>::type;
199 Reducer<device_reduce_t, device_real_t> r_(NXZ, NYW);
201 // redefine site_unroll with device_store types to ensure we have correct N/Ny/M values
202 constexpr bool site_unroll = !std::is_same<device_store_t, device_y_store_t>::value || isFixed<device_store_t>::value;
203 constexpr int N = n_vector<device_store_t, true, nSpin, site_unroll>();
204 constexpr int Ny = n_vector<device_y_store_t, true, nSpin, site_unroll>();
205 constexpr int M = site_unroll ? (nSpin == 4 ? 24 : 6) : N; // real numbers per thread
206 const int length = x[0]->Length() / (nParity * M);
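// e.g. a site-unrolled nSpin = 4 field carries 24 reals per site, so with M = 24
// each thread handles one full site and length counts the sites per parity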
208 MultiReduceArg<NXZ, device_store_t, N, device_y_store_t, Ny, decltype(r_)> arg(x, y, z, w, r_, NYW, length, nParity, tp);
211 // need to get constants pointer from jitify instance
212 if (a.data || b.data || c.data) errorQuda("Constant memory buffer support not enabled with jitify yet");
214 if (a.data) set_param(Amatrix_d, a, stream);
215 if (b.data) set_param(Bmatrix_d, b, stream);
216 if (c.data) set_param(Cmatrix_d, c, stream);
218 multiReduceLaunch<device_real_t, M, NXZ>(result, arg, tp, stream, *this);
220 errorQuda("Only implemented for GPU fields");
224 template <int n> typename std::enable_if<n!=1, void>::type instantiateLinear(const qudaStream_t &stream)
226 if (NXZ == n) compute<n>(stream);
227 else instantiateLinear<n-1>(stream);
230 template <int n> typename std::enable_if<n==1, void>::type instantiateLinear(const qudaStream_t &stream)
235 template <int n> typename std::enable_if<n!=1, void>::type instantiatePow2(const qudaStream_t &stream)
237 if (NXZ == n) compute<n>(stream);
238 else instantiatePow2<n/2>(stream);
241 template <int n> typename std::enable_if<n==1, void>::type instantiatePow2(const qudaStream_t &stream)
246 void apply(const qudaStream_t &stream)
248 constexpr int pow2_max = max_NXZ_power2<true, isFixed<store_t>::value>();
249 if (NXZ <= pow2_max && is_power2(NXZ)) instantiatePow2<pow2_max>(stream);
250 else if (NXZ <= MAX_MULTI_BLAS_N) instantiateLinear<MAX_MULTI_BLAS_N>(stream);
251 else errorQuda("x.size %lu greater than MAX_MULTI_BLAS_N %d", x.size(), MAX_MULTI_BLAS_N);
bool advanceGridDim(TuneParam &param) const
256 bool rtn = Tunable::advanceGridDim(param);
if (NYW > deviceProp.maxGridSize[1]) errorQuda("N=%d is greater than the maximum supported grid size", NYW);
void initTuneParam(TuneParam &param) const
263 Tunable::initTuneParam(param);
void defaultTuneParam(TuneParam &param) const
270 Tunable::defaultTuneParam(param);
277 for (int i = 0; i < NYW; ++i) {
278 if (r.write.X) x[i]->backup();
279 if (r.write.Y) y[i]->backup();
280 if (r.write.Z) z[i]->backup();
281 if (r.write.W) w[i]->backup();
287 for (int i = 0; i < NYW; ++i) {
288 if (r.write.X) x[i]->restore();
289 if (r.write.Y) y[i]->restore();
290 if (r.write.Z) z[i]->restore();
291 if (r.write.W) w[i]->restore();
295 long long flops() const
297 return NYW * NXZ * r.flops() * x[0]->Length();
300 long long bytes() const
302 // X and Z reads are repeated (and hopefully cached) across NYW
303 // each Y and W read/write is done once
304 return NYW * NXZ * (r.read.X + r.write.X) * x[0]->Bytes() +
305 NYW * (r.read.Y + r.write.Y) * y[0]->Bytes() +
306 NYW * NXZ * (r.read.Z + r.write.Z) * z[0]->Bytes() +
307 NYW * (r.read.W + r.write.W) * w[0]->Bytes();
310 int tuningIter() const { return 3; }
313 template <template <typename ...> class ReducerDiagonal, template <typename ...> class ReducerOffDiagonal, typename T>
314 void multiReduce(T result[], const coeff_array<T> &a, const coeff_array<T> &b, const coeff_array<T> &c,
315 CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z,
316 CompositeColorSpinorField &w, int i, int j)
318 if (i == j) { // we are on the diagonal so invoke the diagonal reducer
319 using host_reduce_t = typename ReducerDiagonal<double, double>::reduce_t;
320 instantiate<ReducerDiagonal, MultiReduce, true>(a, b, c, *x[0], *y[0], x, y, z, w, (host_reduce_t*)result);
} else { // we are off the diagonal so invoke the off-diagonal reducer
322 using host_reduce_t = typename ReducerOffDiagonal<double, double>::reduce_t;
323 instantiate<ReducerOffDiagonal, MultiReduce, true>(a, b, c, *x[0], *y[0], x, y, z, w, (host_reduce_t*)result);
327 // This function does the outer product of dot products... in column major.
328 // There's a function below called 'cDotProduct' that flips it to row major.
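// Layout sketch (derived from the transpose at the leaf below): the reduction of
// x[j] against y[i] is written to result[i * x.size() + j], i.e. an
// x.size() by y.size() matrix stored column-major.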
329 template <template <typename ...> class ReducerDiagonal,
330 template <typename ...> class ReducerOffDiagonal, typename T>
331 void multiReduce_recurse(T *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y,
332 std::vector<ColorSpinorField *> &z, std::vector<ColorSpinorField *> &w, int i_idx,
333 int j_idx, bool hermitian, uint2 tile_size)
335 if (y.size() > tile_size.y) { // if greater than max single-kernel size, split and recurse
// Do the recursion first.
337 T* result0 = &result[0];
338 T* result1 = &result[x.size()*(y.size()/2)];
339 std::vector<ColorSpinorField*> y0(y.begin(), y.begin() + y.size()/2);
340 std::vector<ColorSpinorField*> y1(y.begin() + y.size()/2, y.end());
341 multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result0, x, y0, z, w, i_idx, 2*j_idx+0, hermitian, tile_size);
342 multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result1, x, y1, z, w, i_idx, 2*j_idx+1, hermitian, tile_size);
344 T* tmp_dot = new T[x.size()*y.size()];
346 // if at bottom of recursion, return if on lower left
347 if (x.size() <= tile_size.x && is_valid_NXZ(x.size(), true) && hermitian) {
348 if (j_idx < i_idx) { return; }
351 coeff_array<T> a, b, c;
353 if (x.size() <= tile_size.x && is_valid_NXZ(x.size(), true) && x.size() * y.size() <= (unsigned int)max_n_reduce()) {
354 // problem will fit, so do the computation
355 multiReduce<ReducerDiagonal, ReducerOffDiagonal>(tmp_dot, a, b, c, x, y, z, w, i_idx, j_idx);
357 // split the problem and recurse. Splitting in x requires
358 // memory reshuffling (unless y = 1).
359 // Use a few temporary variables.
361 T* tmpmajor = new T[x.size()*y.size()];
362 T* result0 = &tmpmajor[0];
363 T* result1 = &tmpmajor[(x.size()/2)*y.size()];
364 std::vector<ColorSpinorField*> x0(x.begin(), x.begin() + x.size()/2);
365 std::vector<ColorSpinorField*> x1(x.begin() + x.size()/2, x.end());
366 std::vector<ColorSpinorField*> z0(z.begin(), z.begin() + z.size()/2);
367 std::vector<ColorSpinorField*> z1(z.begin() + z.size()/2, z.end());
368 std::vector<ColorSpinorField*> w0(w.begin(), w.begin() + w.size()/2);
369 std::vector<ColorSpinorField*> w1(w.begin() + w.size()/2, w.end());
371 multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result0, x0, y, z0, w0, 2*i_idx+0, j_idx, hermitian, tile_size);
372 multiReduce_recurse<ReducerDiagonal,ReducerOffDiagonal>(result1, x1, y, z1, w1, 2*i_idx+1, j_idx, hermitian, tile_size);
374 const unsigned int xlen0 = x.size()/2;
375 const unsigned int xlen1 = x.size() - xlen0;
376 const unsigned int ylen = y.size();
378 // Copy back into result.
379 int count = 0, count0 = 0, count1 = 0;
380 for (unsigned int i = 0; i < ylen; i++)
382 for (unsigned int j = 0; j < xlen0; j++)
383 result[count++] = result0[count0++];
384 for (unsigned int j = 0; j < xlen1; j++)
385 result[count++] = result1[count1++];
// we are at the leaf of the binary tree (i.e., we ran the kernel): perform the row-to-column-major transpose here.
392 if (x.size() <= tile_size.x && is_valid_NXZ(x.size(), true) && x.size() * y.size() <= (unsigned int)max_n_reduce()) {
393 const unsigned int xlen = x.size();
394 const unsigned int ylen = y.size();
395 for (unsigned int j = 0; j < xlen; j++)
396 for (unsigned int i = 0; i < ylen; i++)
397 result[i*xlen+j] = tmp_dot[j*ylen + i];
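// TileSizeTune is a policy tuner: instead of launch geometry it tunes the
// (aux.x, aux.y) tile size that multiReduce_recurse uses to decompose the full
// x-by-y problem into single-kernel pieces.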
403 template <template <typename ...> class ReducerDiagonal,
404 template <typename ...> class ReducerOffDiagonal, typename T>
405 class TileSizeTune : public Tunable
407 typedef std::vector<ColorSpinorField*> vec;
413 unsigned int sharedBytesPerThread() const { return 0; }
unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
420 TileSizeTune(T *result, vec &x, vec &y, vec &z, vec &w, int coeff_width, bool hermitian, bool Anorm = false,
421 bool nested_policy = false) :
427 hermitian(hermitian),
430 NYW_max = max_YW_size(x.size(), x[0]->Precision(), y[0]->Precision(), false, false, coeff_width, true);
431 max_tile_size = make_uint2(1, 1);
433 strcpy(aux, nested_policy ? "nested_policy," : "policy,");
434 strcat(aux, x[0]->AuxString());
436 strcat(aux, y[0]->AuxString());
437 if (hermitian) strcat(aux, ",hermitian");
438 if (Anorm) strcat(aux, ",Anorm");
441 u64toa(size, x.size());
444 u64toa(size, y.size());
446 u64toa(size, MAX_MULTI_BLAS_N);
447 strcat(aux, ",multi-blas-n=");
450 // before we do policy tuning we must ensure the kernel
451 // constituents have been tuned since we can't do nested tuning
452 // FIXME this will break if the kernels are destructive - which they aren't here
454 if (!nested_policy) disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.
// note the 1-d tuning is all redundant now that we call
// multiReduce_recurse directly for 1-d multi
// reductions, but we keep this code here for now
459 if (x.size() == 1) { // 1-d reduction
461 max_tile_size = make_uint2(1, std::min(NYW_max, (int)y.size()));
462 multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, max_tile_size);
464 } else if (y.size() == 1) { // 1-d reduction
466 max_tile_size = make_uint2(std::min((size_t)max_NXZ_power2(true), x.size()), 1);
467 multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, max_tile_size);
469 } else { // 2-d reduction
// max_tile_size should be set to the largest power of 2,
// since we have a requirement that the tile size is a power of 2.
474 // FIXME - we only do simple square tiling here
475 max_tile_size = make_uint2(max_NXZ_power2(true), max_NXZ_power2(true));
477 // Make sure constituents are tuned.
478 for (unsigned int tile_size = 1;
479 tile_size <= max_tile_size.x && tile_size <= x.size() && (tile_size <= y.size() || y.size() == 1);
481 multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, make_uint2(tile_size, tile_size));
484 // also test case using a single kernel if both dimensions are less than max
485 if (is_valid_NXZ(x.size(), true) && y.size() <= (unsigned int)NYW_max) {
486 multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, make_uint2(x.size(), y.size()));
490 if (!nested_policy) enableProfileCount();
491 setPolicyTuning(true);
495 virtual ~TileSizeTune() { setPolicyTuning(false); }
497 void apply(const qudaStream_t &stream) {
498 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
// tp.aux.x and tp.aux.y are where the tile size is stored. "tp" is the tuning
// struct: it contains block size, grid size, etc. Since we're only tuning
// a policy, we don't care about those sizes; we only tune the aux
// dimensions, which encode the tile size.
504 multiReduce_recurse<ReducerDiagonal, ReducerOffDiagonal>(result, x, y, z, w, 0, 0, hermitian, make_uint2(tp.aux.x, tp.aux.y));
// aux.x and aux.y are the tile dimensions
bool advanceAux(TuneParam &param) const
510 // for 1-d reductions we don't do any tuning and just use the largest tile
511 if (x.size() == 1 || y.size() == 1) {
513 } else { // 2-d reduction
515 if ((unsigned int)(2 * param.aux.x) <= max_tile_size.x && (unsigned int)(2 * param.aux.y) <= max_tile_size.y
516 && (unsigned int)(2 * param.aux.x) <= x.size() && (unsigned int)(2 * param.aux.y) <= y.size()) {
517 // only tune powers of two
521 } else if (is_valid_NXZ(x.size(), true) && y.size() <= (size_t)NYW_max
522 && ((size_t)param.aux.x != x.size() || (size_t)param.aux.y != y.size())) {
523 // we've run out of power of two tiles to try, but before
524 // we finish, try a single kernel if it fits
525 param.aux.x = x.size();
526 param.aux.y = y.size();
529 // reset to the beginning (which we'd need for multi-dimensional tuning)
bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
void initTuneParam(TuneParam &param) const {
540 Tunable::initTuneParam(param);
541 if (x.size() == 1 || y.size() == 1) {
542 param.aux.x = max_tile_size.x;
543 param.aux.y = max_tile_size.y;
544 } else { // only do non-trivial tuning for 2-d reductions
void defaultTuneParam(TuneParam &param) const {
553 Tunable::defaultTuneParam(param); // default is max tile size
554 param.aux.x = max_tile_size.x;
555 param.aux.y = max_tile_size.y;
560 TuneKey tuneKey() const {
561 return TuneKey(x[0]->VolString(), typeid(*this).name(), aux);
564 long long flops() const { return 0; } // FIXME
565 long long bytes() const { return 0; } // FIXME
567 void preTune() { } // FIXME - use write to determine what needs to be saved
568 void postTune() { } // FIXME - use write to determine what needs to be saved
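// TransposeTune is a policy that uses aux.x to choose between computing the
// dot-product matrix directly or computing the transposed problem (x and y
// swapped) and conjugate-transposing the result back; the tile size itself is
// still tuned by the nested TileSizeTune.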
571 template <template <typename ...> class ReducerDiagonal,
572 template <typename ...> class ReducerOffDiagonal, typename T>
573 class TransposeTune : public Tunable
575 using TileTuner = TileSizeTune<ReducerDiagonal, ReducerOffDiagonal, T>;
576 using vec = std::vector<ColorSpinorField *>;
583 unsigned int sharedBytesPerThread() const { return 0; }
unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
587 TransposeTune(T *result, vec &x, vec &y, int coeff_width, bool hermitian, bool Anorm = false) :
591 coeff_width(coeff_width),
592 hermitian(hermitian),
595 strcpy(aux, "policy,");
596 strcat(aux, x[0]->AuxString());
598 strcat(aux, y[0]->AuxString());
599 if (hermitian) strcat(aux, ",hermitian");
600 if (Anorm) strcat(aux, ",Anorm");
603 u64toa(size, x.size());
606 u64toa(size, y.size());
608 u64toa(size, MAX_MULTI_BLAS_N);
609 strcat(aux, ",multi-blas-n=");
612 // before we do policy tuning we must ensure the kernel
613 // constituents have been tuned since we can't do nested tuning
615 disableProfileCount(); // purely for profiling reasons, don't want to profile tunings.
// note the 1-d tuning is all redundant now that we call
// multiReduce_recurse directly for 1-d multi
// reductions, but we keep this code here for now
621 TileTuner tile(result, x, y, x, x, coeff_width, hermitian, Anorm, true);
623 } else if (y.size() == 1) {
624 TileTuner tile(result, y, x, y, y, coeff_width, hermitian, Anorm, true);
628 { // tune regular inner product
629 TileTuner tile(result, x, y, x, x, coeff_width, hermitian, Anorm, true);
633 { // tune transpose inner product
634 TileTuner tile(result, y, x, y, y, coeff_width, hermitian, Anorm, true);
639 enableProfileCount();
640 setPolicyTuning(true);
644 virtual ~TransposeTune() { setPolicyTuning(false); }
646 void apply(const qudaStream_t &stream)
648 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
651 TileTuner tile(result, x, y, x, x, coeff_width, hermitian, Anorm, true);
653 } else if (tp.aux.x == 1) {
654 T *result_trans = new T[x.size() * y.size()];
// swap (x <-> y and z <-> w) when doing the transpose calculation
657 TileTuner tile(result_trans, y, x, y, y, coeff_width, hermitian, Anorm, true);
// transpose the result if we are doing the transpose calculation
661 const auto xlen = x.size();
662 const auto ylen = y.size();
663 for (unsigned int j = 0; j < xlen; j++)
664 for (unsigned int i = 0; i < ylen; i++) result[i * xlen + j] = conj(result_trans[j * ylen + i]);
666 delete[] result_trans;
668 errorQuda("Unexpected transpose parameter %d", tp.aux.x);
bool advanceAux(TuneParam &param) const
674 if (x.size() == 1 || y.size() == 1) {
677 if (param.aux.x == 0) {
bool advanceTuneParam(TuneParam &param) const { return advanceAux(param); }
void initTuneParam(TuneParam &param) const
691 Tunable::initTuneParam(param);
693 param.aux = make_int4(0, 0, 0, 0);
694 else if (y.size() == 1)
695 param.aux = make_int4(1, 0, 0, 0);
697 param.aux = make_int4(0, 0, 0, 0); // default is not to transpose
void defaultTuneParam(TuneParam &param) const { initTuneParam(param); }
702 TuneKey tuneKey() const { return TuneKey(x[0]->VolString(), typeid(*this).name(), aux); }
704 long long flops() const { return 0; } // FIXME
705 long long bytes() const { return 0; } // FIXME
707 void preTune() {} // FIXME - use write to determine what needs to be saved
708 void postTune() {} // FIXME - use write to determine what needs to be saved
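// Compute all pairwise real dot products between the vectors in x and y.
// 1-d problems call multiReduce_recurse directly; otherwise the TransposeTune /
// TileSizeTune policies pick the decomposition. The local column-major result is
// reduced across nodes and finally transposed to row-major.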
711 void reDotProduct(double *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
713 if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
714 double *result_tmp = new double[x.size() * y.size()];
715 for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
719 int NYW_max = max_YW_size(x.size(), x[0]->Precision(), y[0]->Precision(), false, false, coeff_width, true);
720 // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
721 uint2 max_tile_size = make_uint2(1, std::min( {NYW_max, (int)y.size(), x[0]->Ncolor() == 3 ? 32 : NYW_max} ));
722 multiReduce_recurse<multiDot, multiDot>(result_tmp, x, y, x, x, 0, 0, false, max_tile_size);
723 } else if (y.size() == 1 && x[0]->Precision() == y[0]->Precision()) {
725 double *result_trans = new double[x.size() * y.size()];
// swap (x <-> y and z <-> w) when doing the transpose calculation
728 int NXZ_max = max_YW_size(y.size(), y[0]->Precision(), x[0]->Precision(), false, false, coeff_width, true);
729 // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
730 uint2 max_tile_size = make_uint2(1, std::min( {NXZ_max, (int)x.size(), x[0]->Ncolor() == 3 ? 32 : NXZ_max} ));
731 multiReduce_recurse<multiDot, multiDot>(result_trans, y, x, y, y, 0, 0, false, max_tile_size);
733 // transpose the result if we are doing the transpose calculation
734 const auto xlen = x.size();
735 const auto ylen = y.size();
736 for (unsigned int j = 0; j < xlen; j++)
737 for (unsigned int i = 0; i < ylen; i++) result_tmp[i * xlen + j] = result_trans[j * ylen + i];
739 delete[] result_trans;
741 } else if (x[0]->Precision() == y[0]->Precision()) {
742 TransposeTune<multiDot, multiDot, double> trans(result_tmp, x, y, coeff_width, false);
745 TileSizeTune<multiDot, multiDot, double> tile(result_tmp, x, y, x, x, coeff_width, false);
749 // do a single multi-node reduction only once we have computed all local dot products
750 const int Nreduce = x.size() * y.size();
751 reduceDoubleArray(result_tmp, Nreduce);
753 // multiReduce_recurse returns a column-major matrix.
754 // To be consistent with the multi-blas functions, we should
755 // switch this to row-major.
756 const unsigned int xlen = x.size();
757 const unsigned int ylen = y.size();
758 for (unsigned int j = 0; j < xlen; j++)
759 for (unsigned int i = 0; i < ylen; i++) result[j * ylen + i] = result_tmp[i * xlen + j];
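// Minimal usage sketch (illustrative only, assuming the caller already owns the
// vectors of field pointers): the result buffer must hold x.size() * y.size()
// doubles and is filled row-major, so the dot product of x[j] with y[i] lands in
// result[j * y.size() + i].
//
//   std::vector<double> dots(x.size() * y.size()); // hypothetical caller-side buffer
//   blas::reDotProduct(dots.data(), x, y);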
764 void cDotProduct(Complex *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
766 if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
767 Complex *result_tmp = new Complex[x.size() * y.size()];
768 for (unsigned int i = 0; i < x.size() * y.size(); i++) result_tmp[i] = 0.0;
772 int NYW_max = max_YW_size(x.size(), x[0]->Precision(), y[0]->Precision(), false, false, coeff_width, true);
773 // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
774 uint2 max_tile_size = make_uint2(1, std::min( {NYW_max, (int)y.size(), x[0]->Ncolor() == 3 ? 32 : NYW_max} ));
775 multiReduce_recurse<multiCdot, multiCdot>(result_tmp, x, y, x, x, 0, 0, false, max_tile_size);
776 } else if (y.size() == 1 && x[0]->Precision() == y[0]->Precision()) {
778 Complex *result_trans = new Complex[x.size() * y.size()];
// swap (x <-> y and z <-> w) when doing the transpose calculation
781 int NXZ_max = max_YW_size(y.size(), y[0]->Precision(), x[0]->Precision(), false, false, coeff_width, true);
782 // if fine-grid then we set max tile size to 32 to avoid unnecessary tuning
783 uint2 max_tile_size = make_uint2(1, std::min( {NXZ_max, (int)x.size(), x[0]->Ncolor() == 3 ? 32 : NXZ_max} ));
784 multiReduce_recurse<multiCdot, multiCdot>(result_trans, y, x, y, y, 0, 0, false, max_tile_size);
786 // transpose the result if we are doing the transpose calculation
787 const auto xlen = x.size();
788 const auto ylen = y.size();
789 for (unsigned int j = 0; j < xlen; j++)
790 for (unsigned int i = 0; i < ylen; i++) result_tmp[i * xlen + j] = conj(result_trans[j * ylen + i]);
792 delete[] result_trans;
794 } else if (x[0]->Precision() == y[0]->Precision()) {
795 TransposeTune<multiCdot, multiCdot, Complex> trans(result_tmp, x, y, coeff_width, false);
798 TileSizeTune<multiCdot, multiCdot, Complex> tile(result_tmp, x, y, x, x, coeff_width, false);
802 // do a single multi-node reduction only once we have computed all local dot products
803 const int Nreduce = 2*x.size()*y.size();
804 reduceDoubleArray((double*)result_tmp, Nreduce);
806 // multiReduce_recurse returns a column-major matrix.
807 // To be consistent with the multi-blas functions, we should
808 // switch this to row-major.
809 const unsigned int xlen = x.size();
810 const unsigned int ylen = y.size();
811 for (unsigned int j = 0; j < xlen; j++)
812 for (unsigned int i = 0; i < ylen; i++)
813 result[j*ylen+i] = result_tmp[i*xlen + j];
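// Hermitian block dot product: with hermitian = true the recursion skips the
// redundant (lower-left) tiles, and the full matrix is reconstructed below from
// conjugate symmetry.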
818 void hDotProduct(Complex *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
820 if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
821 if (x.size() != y.size()) errorQuda("Cannot call Hermitian block dot product on non-square inputs");
823 Complex* result_tmp = new Complex[x.size()*y.size()];
824 for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
827 TileSizeTune<multiCdot, multiCdot, Complex> tile(result_tmp, x, y, x, x, coeff_width, true, false); // last false is b/c L2 norm
830 // do a single multi-node reduction only once we have computed all local dot products
831 const int Nreduce = 2*x.size()*y.size();
832 reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
834 // Switch from col-major to row-major
835 const unsigned int xlen = x.size();
836 const unsigned int ylen = y.size();
837 for (unsigned int j = 0; j < xlen; j++)
838 for (unsigned int i = j; i < ylen; i++) {
839 result[j*ylen+i] = result_tmp[i*xlen + j];
840 result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
846 // for (p, Ap) norms in CG which are Hermitian.
847 void hDotProduct_Anorm(Complex *result, std::vector<ColorSpinorField *> &x, std::vector<ColorSpinorField *> &y)
849 if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
850 if (x.size() != y.size()) errorQuda("Cannot call Hermitian block A-norm dot product on non-square inputs");
852 Complex* result_tmp = new Complex[x.size()*y.size()];
853 for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
856 TileSizeTune<multiCdot, multiCdot, Complex> tile(result_tmp, x, y, x, x, coeff_width, true, true); // last true is b/c A norm
859 // do a single multi-node reduction only once we have computed all local dot products
860 const int Nreduce = 2*x.size()*y.size();
861 reduceDoubleArray((double*)result_tmp, Nreduce); // FIXME - could optimize this for Hermiticity as well
863 // Switch from col-major to row-major
864 const unsigned int xlen = x.size();
865 const unsigned int ylen = y.size();
866 for (unsigned int j = 0; j < xlen; j++)
867 for (unsigned int i = j; i < ylen; i++) {
868 result[j*ylen+i] = result_tmp[i*xlen + j];
869 result[i*ylen+j] = conj(result_tmp[i*xlen + j]);
// takes the outer product of inner products between x and y and copies y into z
876 void cDotProductCopy(Complex* result, std::vector<ColorSpinorField*>& x, std::vector<ColorSpinorField*>& y,
877 std::vector<ColorSpinorField*>&z){
880 // FIXME - if this is enabled we need to ensure that use_w is
881 // enabled above. Also, I think this might break if the diagonal
882 // write is different from the off-diagonal write
883 if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0");
884 if (y.size() != z.size()) errorQuda("Cannot copy input y of size %lu into z of size %lu\n", y.size(), z.size());
886 Complex* result_tmp = new Complex[x.size()*y.size()];
887 for (unsigned int i = 0; i < x.size()*y.size(); i++) result_tmp[i] = 0.0;
890 // When recursing, only the diagonal tiles will do the copy, the rest just do the outer product
TileSizeTune<multiCdotCopy, multiCdot, Complex> tile(result_tmp, x, y, x, y, coeff_width, true);
894 // do a single multi-node reduction only once we have computed all local dot products
895 const int Nreduce = 2*x.size()*y.size();
896 reduceDoubleArray((double*)result_tmp, Nreduce);
898 // Switch from col-major to row-major.
899 const unsigned int xlen = x.size();
900 const unsigned int ylen = y.size();
901 for (unsigned int j = 0; j < xlen; j++)
902 for (unsigned int i = 0; i < ylen; i++)
903 result[j*ylen+i] = result_tmp[i*xlen + j];
907 errorQuda("cDotProductCopy not enabled");