template <bool from_coarse, typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
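// Flop-count estimates for each compute type (the switch/case labels are elided in this extract).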
flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * coarseColor * fineColor * fineColor * (!from_coarse ? 1 : fineSpin);
flops_ = 2l * arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
flops_ = 4l * arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor / (!from_coarse ? coarseSpin : 1);
flops_ = 2l * arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor * fineColor / (!from_coarse ? coarseSpin : 1);
flops_ = 2l * arg.coarseVolumeCB*coarseSpin*coarseColor;
errorQuda("Undefined compute type %d", type);
bytes_ = arg.UV.Bytes() + arg.V.Bytes() + 2*arg.U.Bytes()*coarseColor;
bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 2*arg.C.Bytes()*coarseColor;
bytes_ = arg.AV.Bytes() + arg.V.Bytes();
#ifdef DYNAMIC_CLOVER
bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 2*arg.C.Bytes()*coarseColor;
bytes_ = arg.AV.Bytes() + arg.V.Bytes() + 4*arg.C.Bytes()*coarseColor;
bytes_ = 2*arg.C.Bytes();
bytes_ = 2*writes*arg.Y.Bytes() + (arg.bidirectional ? 1 : 2) * 2*writes*arg.X.Bytes() + coarseColor*(arg.UV.Bytes() + arg.V.Bytes());
bytes_ = 2*arg.X.Bytes() + 2*arg.C.Bytes() + arg.V.Bytes();
bytes_ = 4*2*2*arg.Y.Bytes();
bytes_ = 2*2*arg.X.Bytes();
bytes_ = dim == 4 ? 2*(arg.X.Bytes() + arg.X_atomic.Bytes()) : 2*(arg.Y.Bytes() + arg.Y_atomic.Bytes());
bytes_ = 2*2*arg.Y.Bytes();
errorQuda("Undefined compute type %d", type);
unsigned int threads = 0;
threads = arg.fineVolumeCB;
threads = arg.coarseVolumeCB;
errorQuda("Undefined compute type %d", type);
meta(meta), Y(Y), X(X), Y_atomic(Y_atomic), X_atomic(X_atomic), dim(0), dir(QUDA_BACKWARDS)
create_jitify_program("kernels/coarse_op_kernel.cuh");
if (dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
if (dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
errorQuda("Undefined direction %d", dir);
if (from_coarse) errorQuda("ComputeAV should only be called from the fine grid");
#if defined(GPU_CLOVER_DIRAC) && !defined(COARSECOARSE)
ComputeAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
errorQuda("Clover dslash has not been built");
if (from_coarse) errorQuda("ComputeTMAV should only be called from the fine grid");
#if defined(GPU_TWISTED_MASS_DIRAC) && !defined(COARSECOARSE)
ComputeTMAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
errorQuda("Twisted mass dslash has not been built");
if (from_coarse) errorQuda("ComputeTMCAV should only be called from the fine grid");
#if defined(GPU_TWISTED_CLOVER_DIRAC) && !defined(COARSECOARSE)
ComputeTMCAVCPU<Float,fineSpin,fineColor,coarseColor>(arg);
errorQuda("Twisted clover dslash has not been built");
if (from_coarse) errorQuda("ComputeInvCloverMax should only be called from the fine grid");
#if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
ComputeCloverInvMaxCPU<Float, false>(arg);
errorQuda("ComputeInvCloverMax only enabled with dynamic clover");
if (from_coarse) errorQuda("ComputeInvCloverMax should only be called from the fine grid");
#if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
ComputeCloverInvMaxCPU<Float, true>(arg);
errorQuda("ComputeInvCloverMax only enabled with dynamic clover");
if (dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
if (dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
else if (dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(arg);
errorQuda("Undefined direction %d", dir);
ComputeCoarseCloverCPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg);
ComputeYReverseCPU<Float,coarseSpin,coarseColor>(arg);
AddCoarseDiagonalCPU<Float,coarseSpin,coarseColor>(arg);
AddCoarseTmDiagonalCPU<Float,coarseSpin,coarseColor>(arg);
ConvertCPU<Float,coarseSpin,coarseColor>(arg);
RescaleYCPU<Float,coarseSpin,coarseColor>(arg);
errorQuda("Undefined compute type %d", type);
using namespace jitify::reflection;
  .instantiate(from_coarse,Type<Float>(),dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Type<Arg>())
if (dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
if (dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
if (from_coarse) errorQuda("ComputeAV should only be called from the fine grid");
using namespace jitify::reflection;
  .instantiate(Type<Float>(),fineSpin,fineColor,coarseColor,Type<Arg>())
#if defined(GPU_CLOVER_DIRAC) && !defined(COARSECOARSE)
errorQuda("Clover dslash has not been built");
if (from_coarse) errorQuda("ComputeTMAV should only be called from the fine grid");
using namespace jitify::reflection;
  .instantiate(Type<Float>(),fineSpin,fineColor,coarseColor,Type<Arg>())
#if defined(GPU_TWISTED_MASS_DIRAC) && !defined(COARSECOARSE)
errorQuda("Twisted mass dslash has not been built");
if (from_coarse) errorQuda("ComputeTMCAV should only be called from the fine grid");
using namespace jitify::reflection;
  .instantiate(Type<Float>(),fineSpin,fineColor,coarseColor,Type<Arg>())
#if defined(GPU_TWISTED_CLOVER_DIRAC) && !defined(COARSECOARSE)
errorQuda("Twisted clover dslash has not been built");
if (from_coarse) errorQuda("ComputeCloverInvMax should only be called from the fine grid");
arg.max_d = static_cast<Float*>(pool_device_malloc(2 * arg.fineVolumeCB * sizeof(Float)));
using namespace jitify::reflection;
jitify_error = program->kernel("quda::ComputeCloverInvMaxGPU")
  .instantiate(Type<Float>(), false, Type<Arg>())
#if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
errorQuda("ComputeCloverInvMax only enabled with dynamic clover");
thrust::device_ptr<Float> ptr(arg.max_d);
arg.max_h = thrust::reduce(thrust::cuda::par(alloc), ptr, ptr + 2 * arg.fineVolumeCB,
                           static_cast<Float>(0.0), thrust::maximum<Float>());
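// The kernel writes one local maximum per fine site and parity into max_d; the
// thrust::reduce above then collapses that device array into a single host-side
// maximum, arg.max_h, which is used later to set the dynamic-clover rescale factor.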
if (from_coarse) errorQuda("ComputeCloverInvMax should only be called from the fine grid");
arg.max_d = static_cast<Float *>(pool_device_malloc(2 * arg.fineVolumeCB * sizeof(Float)));
using namespace jitify::reflection;
jitify_error = program->kernel("quda::ComputeCloverInvMaxGPU")
  .instantiate(Type<Float>(), true, Type<Arg>())
#if defined(DYNAMIC_CLOVER) && !defined(COARSECOARSE)
errorQuda("ComputeCloverInvMax only enabled with dynamic clover");
thrust::device_ptr<Float> ptr(arg.max_d);
arg.max_h = thrust::reduce(thrust::cuda::par(alloc), ptr, ptr+2*arg.fineVolumeCB, static_cast<Float>(0.0), thrust::maximum<Float>());
arg.shared_atomic = tp.aux.y;
arg.parity_flip = tp.aux.z;
if (arg.shared_atomic) {
int block_size = arg.fineVolumeCB/arg.coarseVolumeCB;
if (block_size/2 < coarseSpin*coarseSpin)
  errorQuda("Block size %d not supported in shared-memory atomic coarsening", block_size);
arg.aggregates_per_block = tp.aux.x;
if (arg.coarse_color_wave) {
arg.grid_z = tp.grid.z;
arg.coarse_color_grid_z = coarseColor*tp.grid.z;
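// When coarse_color_wave is enabled, the coarse-color loop is folded into gridDim.z
// (hence coarse_color_grid_z = coarseColor * grid.z); grid_z records the original
// z-extent so it can be restored after the launch.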
using namespace jitify::reflection;
  .instantiate(arg.shared_atomic,arg.parity_flip,from_coarse,Type<Float>(),dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Type<Arg>())
if (arg.shared_atomic) {
if (arg.parity_flip != true) errorQuda("parity_flip = %d not instantiated", arg.parity_flip);
constexpr bool parity_flip = true;
if (dim==0) ComputeVUVGPU<true,parity_flip,from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==1) ComputeVUVGPU<true,parity_flip,from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==2) ComputeVUVGPU<true,parity_flip,from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==3) ComputeVUVGPU<true,parity_flip,from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
if (dim==0) ComputeVUVGPU<true,parity_flip,from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==1) ComputeVUVGPU<true,parity_flip,from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==2) ComputeVUVGPU<true,parity_flip,from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==3) ComputeVUVGPU<true,parity_flip,from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
errorQuda("Undefined direction %d", dir);
if (arg.parity_flip != false) errorQuda("parity_flip = %d not instantiated", arg.parity_flip);
constexpr bool parity_flip = false;
if (dim==0) ComputeVUVGPU<false,parity_flip,from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==1) ComputeVUVGPU<false,parity_flip,from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==2) ComputeVUVGPU<false,parity_flip,from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==3) ComputeVUVGPU<false,parity_flip,from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
if (dim==0) ComputeVUVGPU<false,parity_flip,from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==1) ComputeVUVGPU<false,parity_flip,from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==2) ComputeVUVGPU<false,parity_flip,from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
else if (dim==3) ComputeVUVGPU<false,parity_flip,from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
errorQuda("Undefined direction %d", dir);
if (arg.coarse_color_wave) {
tp.grid.z = arg.grid_z;
if (arg.shared_atomic) {
using namespace jitify::reflection;
jitify_error = program->kernel("quda::ComputeCoarseCloverGPU")
  .instantiate(from_coarse,Type<Float>(),fineSpin,coarseSpin,fineColor,coarseColor,Type<Arg>())
ComputeCoarseCloverGPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>
using namespace jitify::reflection;
jitify_error = program->kernel("quda::ComputeYReverseGPU")
  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
using namespace jitify::reflection;
jitify_error = program->kernel("quda::AddCoarseDiagonalGPU")
  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
using namespace jitify::reflection;
jitify_error = program->kernel("quda::AddCoarseTmDiagonalGPU")
  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
using namespace jitify::reflection;
  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
using namespace jitify::reflection;
  .instantiate(Type<Float>(),coarseSpin,coarseColor,Type<Arg>())
errorQuda("Undefined compute type %d", type);
arg.shared_atomic = false;
arg.parity_flip = false;
if (arg.shared_atomic) {
if (param.aux.y == 0) {
int block_size = arg.fineVolumeCB/arg.coarseVolumeCB;
if (block_size/2 < coarseSpin*coarseSpin) return false;
arg.shared_atomic = true;
arg.parity_flip = true;
if (param.aux.x < 4) {
arg.shared_atomic = false;
arg.parity_flip = false;
param.aux.y = arg.shared_atomic;
param.aux.z = arg.parity_flip;
param.block.x = arg.fineVolumeCB/(2*arg.coarseVolumeCB);
param.grid.x = 2*arg.coarseVolumeCB;
param.aux.y = arg.shared_atomic;
param.aux.z = arg.parity_flip;
param.block.x = arg.fineVolumeCB/(2*arg.coarseVolumeCB);
param.grid.x = 2*arg.coarseVolumeCB;
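// Tuning state is encoded in the aux parameters: aux.x holds the number of aggregates
// per block, aux.y the shared_atomic flag and aux.z the parity_flip flag. In the
// shared-atomic path a thread block covers one aggregate per parity
// (block.x = fineVolumeCB / (2*coarseVolumeCB)) and grid.x spans 2*coarseVolumeCB.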
if (type == COMPUTE_UV) strcat(Aux,",computeUV");
else if (type == COMPUTE_AV) strcat(Aux,",computeAV");
else if (type == COMPUTE_TMAV) strcat(Aux,",computeTmAV");
strcat(Aux,",computeCloverInverseMax");
strcat(Aux,",computeTwistedCloverInverseMax");
else if (type == COMPUTE_VUV) strcat(Aux,",computeVUV");
else errorQuda("Unknown type=%d\n", type);
#ifdef DYNAMIC_CLOVER
strcat(Aux,",Dynamic");
if (dim == 0) strcat(Aux,",dim=0");
else if (dim == 1) strcat(Aux,",dim=1");
else if (dim == 2) strcat(Aux,",dim=2");
else if (dim == 3) strcat(Aux,",dim=3");
if (arg.bidirectional && type == COMPUTE_VUV) strcat(Aux,",bidirectional");
strcat(Aux,"coarse_vol=");
return TuneKey(vol_str, typeid(*this).name(), Aux);
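// The compute type, dimension, bidirectionality and dynamic-clover variant are all
// folded into the Aux string, so the autotuner caches a separate launch configuration
// for each distinct compute of the coarsening.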
errorQuda("Undefined compute type %d", type);
errorQuda("Undefined compute type %d", type);
template <bool from_coarse, typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename F,
          typename Ftmp, typename Vt, typename coarseGauge, typename coarseGaugeAtomic, typename fineGauge, typename fineClover>
Ftmp &UV, F &AV, Vt &V, fineGauge &G, fineClover &C, fineClover &Cinv,
bool need_bidirectional, const int *fine_to_coarse, const int *coarse_to_fine) {
errorQuda("Unsupported coarsening of matpc = %d", matpc);
if (is_dirac_coarse && fineSpin != 2)
  errorQuda("Input Dirac operator %d should have nSpin=2, not nSpin=%d\n", dirac, fineSpin);
if (!is_dirac_coarse && fineSpin != 4)
  errorQuda("Input Dirac operator %d should have nSpin=4, not nSpin=%d\n", dirac, fineSpin);
if (!is_dirac_coarse && fineColor != 3)
  errorQuda("Input Dirac operator %d should have nColor=3, not nColor=%d\n", dirac, fineColor);
if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
for (int i=0; i<4; i++) x_size[i] = v.X(i);
for (int i=0; i<4; i++) xc_size[i] = X_.X()[i];
for (int d = 0; d < nDim; d++) geo_bs[d] = x_size[d]/xc_size[d];
int spin_bs = V.Nspin()/Y.NspinCoarse();
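// Example: coarsening a 16^4 fine lattice to a 4^4 coarse lattice gives geo_bs = {4,4,4,4};
// a 4-spin fine operator paired with a 2-spin coarse operator gives spin_bs = 2.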
if (bidirectional_links) printfQuda("Doing bi-directional link coarsening\n");
else printfQuda("Doing uni-directional link coarsening\n");
typedef CalculateYArg<Float,fineSpin,coarseSpin,fineColor,coarseColor,coarseGauge,coarseGaugeAtomic,fineGauge,F,Ftmp,Vt,fineClover> Arg;
Arg arg(Y, X, Y_atomic, X_atomic, UV, AV, G, V, C, Cinv, kappa,
        mu, mu_factor, x_size, xc_size, geo_bs, spin_bs, fine_to_coarse, coarse_to_fine, bidirectional_links);
CalculateY<from_coarse, Float, fineSpin, fineColor, coarseSpin, coarseColor, Arg> y(arg, v, Y_, X_, Y_atomic_, X_atomic_);
if (&v == &av) arg.AV.resetGhost(av, av.Ghost());
#ifdef DYNAMIC_CLOVER
double max = 6 * arg.max_h;
double max = 6*arg.Cinv.abs_max(0);
arg.AV.resetScale(max);
complex<Float> fp(1./(1.+arg.mu*arg.mu),-arg.mu/(1.+arg.mu*arg.mu));
complex<Float> fm(1./(1.+arg.mu*arg.mu),+arg.mu/(1.+arg.mu*arg.mu));
double max = std::max(abs(fp), abs(fm));
arg.AV.resetScale(max);
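// Both coefficients have magnitude |fp| = |fm| = 1/sqrt(1 + mu^2), so that value becomes
// the fixed-point rescale factor applied to AV in the twisted-mass case.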
#ifdef DYNAMIC_CLOVER
double max = 6*sqrt(arg.max_h);
double max = 6*sqrt(arg.Cinv.abs_max(0));
arg.AV.resetScale(max);
if (coarseGaugeAtomic::fixedPoint()) {
arg.Y_atomic.resetScale(max);
arg.X_atomic.resetScale(max);
bool set_scale = false;
if (bidirectional_links) {
for (int d = 0; d < nDim; d++) {
double U_max = 3.0*arg.U.abs_max(from_coarse ? d+4 : d);
double uv_max = U_max * v.Scale();
arg.UV.resetScale(uv_max);
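// For fixed-point UV storage, the rescale factor is taken as 3*max|U| for the given
// direction times the scale of the input vectors, i.e. a bound on the size of the
// product U*V.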
if (Y_atomic.Geometry() == 1) Y_atomic_.zero();
if (coarseGaugeAtomic::fixedPoint() || coarseGauge::fixedPoint()) {
if (coarseGauge::fixedPoint()) {
double y_max = arg.Y_atomic.abs_max( (4+d) % arg.Y_atomic.geometry );
Y_.Scale(1.1*y_max);
arg.Y.resetScale(Y_.Scale());
} else if (y_max > Y_.Scale()) {
arg.rescale = Y_.Scale() / y_max;
for (int d_=0; d_<d; d_++) {
arg.Y.resetScale(Y_.Scale());
arg.AV.resetGhost(av, av.Ghost());
for (int d = 0; d < nDim; d++) {
double U_max = 3.0*arg.U.abs_max(d);
double uv_max = U_max * av.Scale();
arg.UV.resetScale(uv_max);
if (Y_atomic.Geometry() == 1) Y_atomic_.zero();
if (coarseGaugeAtomic::fixedPoint() || coarseGauge::fixedPoint()) {
if (coarseGauge::fixedPoint()) {
double y_max = arg.Y_atomic.abs_max( d % arg.Y_atomic.geometry );
Y_.Scale(1.1*y_max);
arg.Y.resetScale(Y_.Scale());
} else if (y_max > Y_.Scale()) {
arg.rescale = Y_.Scale() / y_max;
if (bidirectional_links) {
for (int d_=0; d_<4; d_++) {
for (int d_=0; d_<d; d_++) {
arg.Y.resetScale(Y_.Scale());
if ( !bidirectional_links ) {
arg.mu_factor += 1.;
if (coarseGaugeAtomic::fixedPoint() || coarseGauge::fixedPoint()) {
if (coarseGauge::fixedPoint()) {
double x_max = arg.X_atomic.abs_max(0);
arg.X.resetScale(x_max);