5 #if __COMPUTE_CAPABILITY__ >= 300 6 #include <generics/shfl.h> 19 template <
typename Float,
int coarseSpin,
int coarseColor, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder>
20 struct DslashCoarseArg {
21 typedef typename colorspinor::FieldOrderCB<Float,coarseSpin,coarseColor,1,csOrder> F;
22 typedef typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder> G;
38 inline DslashCoarseArg(ColorSpinorField &
out,
const ColorSpinorField &inA,
const ColorSpinorField &inB,
39 const GaugeField &Y,
const GaugeField &
X, Float
kappa,
int parity)
40 :
out(const_cast<ColorSpinorField&>(
out)), inA(const_cast<ColorSpinorField&>(inA)),
41 inB(const_cast<ColorSpinorField&>(inB)), Y(const_cast<GaugeField&>(Y)),
43 nParity(
out.SiteSubset()), nFace(1), X0h( ((3-nParity) *
out.
X(0)) /2),
46 volumeCB(
out.VolumeCB()/
dim[4])
53 template <DslashType type>
54 static __host__ __device__
bool doHalo() {
67 template <DslashType type>
68 static __host__ __device__
bool doBulk() {
87 static __device__ __host__
inline void getCoordsCB(
int x[],
int cb_index,
const I
X[],
const I X0h,
int parity) {
93 int za = (cb_index / X0h);
97 x[2] = (
zb -
x[3] *
X[2]);
98 int x1odd = (
x[1] +
x[2] +
x[3] +
parity) & 1;
99 x[0] = (2 * cb_index + x1odd -
za *
X[0]);
113 extern __shared__
float s[];
114 template <
typename Float,
int nDim,
int Ns,
int Nc,
int Mc,
int color_str
ide,
int dim_str
ide,
int thread_dir,
int thread_dim,
bool dagger, DslashType type,
typename Arg>
115 __device__ __host__
inline void applyDslash(complex<Float>
out[], Arg &
arg,
int x_cb,
int src_idx,
int parity,
int s_row,
int color_block,
int color_offset) {
116 const int their_spinor_parity = (
arg.nParity == 2) ? 1-
parity : 0;
123 complex<Float> *shared_sum = (complex<Float>*)
s;
129 for(
int d = thread_dim;
d < nDim;
d+=dim_stride)
134 if (doHalo<type>()) {
135 int ghost_idx = ghostFaceIndex<1>(
coord,
arg.dim,
d,
arg.nFace);
138 for(
int color_local = 0; color_local < Mc; color_local++) {
139 int c_row = color_block + color_local;
140 int row = s_row*Nc + c_row;
142 for(
int s_col = 0; s_col < Ns; s_col++) {
144 for(
int c_col = 0; c_col < Nc; c_col+=color_stride) {
145 int col = s_col*Nc + c_col + color_offset;
148 *
arg.inA.Ghost(
d, 1, their_spinor_parity, ghost_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
151 *
arg.inA.Ghost(
d, 1, their_spinor_parity, ghost_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
156 }
else if (doBulk<type>()) {
158 for(
int color_local = 0; color_local < Mc; color_local++) {
159 int c_row = color_block + color_local;
160 int row = s_row*Nc + c_row;
162 for(
int s_col = 0; s_col < Ns; s_col++) {
164 for(
int c_col = 0; c_col < Nc; c_col+=color_stride) {
165 int col = s_col*Nc + c_col + color_offset;
168 *
arg.inA(their_spinor_parity, fwd_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
171 *
arg.inA(their_spinor_parity, fwd_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
179 #if defined(__CUDA_ARCH__) 180 if (thread_dim > 0) {
182 for (
int color_local=0; color_local < Mc; color_local++) {
194 for(
int d = thread_dim;
d < nDim;
d+=dim_stride)
197 const int gauge_idx = back_idx;
199 if (doHalo<type>()) {
200 const int ghost_idx = ghostFaceIndex<0>(
coord,
arg.dim,
d,
arg.nFace);
202 for (
int color_local=0; color_local<Mc; color_local++) {
203 int c_row = color_block + color_local;
204 int row = s_row*Nc + c_row;
206 for (
int s_col=0; s_col<Ns; s_col++)
208 for (
int c_col=0; c_col<Nc; c_col+=color_stride) {
209 int col = s_col*Nc + c_col + color_offset;
212 *
arg.inA.Ghost(
d, 0, their_spinor_parity, ghost_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
215 *
arg.inA.Ghost(
d, 0, their_spinor_parity, ghost_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
219 }
else if (doBulk<type>()) {
221 for(
int color_local = 0; color_local < Mc; color_local++) {
222 int c_row = color_block + color_local;
223 int row = s_row*Nc + c_row;
225 for(
int s_col = 0; s_col < Ns; s_col++)
227 for(
int c_col = 0; c_col < Nc; c_col+=color_stride) {
228 int col = s_col*Nc + c_col + color_offset;
231 *
arg.inA(their_spinor_parity, back_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
234 *
arg.inA(their_spinor_parity, back_idx +
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
241 #if defined(__CUDA_ARCH__) 244 for (
int color_local=0; color_local < Mc; color_local++) {
251 #ifdef __CUDA_ARCH__ // CUDA path has to recombine the foward and backward results 255 if (thread_dim == 0 && thread_dir == 0) {
259 for (
int d=1;
d<dim_stride;
d++) {
263 for (
int color_local=0; color_local < Mc; color_local++) {
265 shared_sum[(((color_local*
blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride +
d * 2 + 0)*
blockDim.y+threadIdx.y)*
blockDim.x+threadIdx.x];
270 for (
int d=0;
d<dim_stride;
d++) {
272 for (
int color_local=0; color_local < Mc; color_local++) {
274 shared_sum[(((color_local*
blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride +
d * 2 + 1)*
blockDim.y+threadIdx.y)*
blockDim.x+threadIdx.x];
280 for (
int color_local=0; color_local<Mc; color_local++)
out[color_local] *= -
arg.kappa;
284 #else // !__CUDA_ARCH__ 285 for (
int color_local=0; color_local<Mc; color_local++)
out[color_local] *= -
arg.kappa;
300 template <
typename Float,
int Ns,
int Nc,
int Mc,
int color_str
ide,
bool dagger,
typename Arg>
301 __device__ __host__
inline void applyClover(complex<Float>
out[], Arg &
arg,
int x_cb,
int src_idx,
int parity,
int s,
int color_block,
int color_offset) {
302 const int spinor_parity = (
arg.nParity == 2) ?
parity : 0;
306 for(
int color_local = 0; color_local < Mc; color_local++) {
307 int c = color_block + color_local;
310 for (
int s_col = 0; s_col < Ns; s_col++)
312 for (
int c_col = 0; c_col < Nc; c_col+=color_stride) {
314 int col = s_col*Nc + c_col + color_offset;
317 *
arg.inB(spinor_parity, x_cb+
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
320 *
arg.inB(spinor_parity, x_cb+
src_idx*
arg.volumeCB, s_col, c_col+color_offset);
328 template <
typename Float,
int nDim,
int Ns,
int Nc,
int Mc,
int color_stride,
int dim_thread_split,
330 __device__ __host__
inline void coarseDslash(Arg &
arg,
int x_cb,
int src_idx,
int parity,
int s,
int color_block,
int color_offset)
332 complex <Float>
out[Mc];
334 for (
int c=0;
c<Mc;
c++)
out[
c] = 0.0;
335 if (
dslash) applyDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dir,dim,dagger,type>(
out,
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
336 if (doBulk<type>() &&
clover && dir==0 &&
dim==0) applyClover<Float,Ns,Nc,Mc,color_stride,dagger>(
out,
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
338 if (dir==0 &&
dim==0) {
339 const int my_spinor_parity = (
arg.nParity == 2) ?
parity : 0;
341 for (
int color_local=0; color_local<Mc; color_local++) {
342 #if __CUDA_ARCH__ >= 300 344 constexpr
int warp_size = 32;
347 #
if (__CUDACC_VER_MAJOR__ >= 9)
348 out[color_local] += __shfl_down_sync(WARP_CONVERGED,
out[color_local],
offset);
350 out[color_local] += __shfl_down(
out[color_local],
offset);
354 int c = color_block + color_local;
355 if (color_offset == 0) {
357 if (doBulk<type>())
arg.out(my_spinor_parity, x_cb+
src_idx*
arg.volumeCB,
s,
c) =
out[color_local];
365 template <
typename Float,
int nDim,
int Ns,
int Nc,
int Mc,
bool dslash,
bool clover,
bool dagger, DslashType type,
typename Arg>
366 void coarseDslash(Arg
arg)
369 const int color_stride = 1;
370 const int color_offset = 0;
371 const int dim_thread_split = 1;
381 for(
int x_cb = 0; x_cb <
arg.volumeCB; x_cb++) {
382 for (
int s=0;
s<2;
s++) {
383 for (
int color_block=0; color_block<Nc; color_block+=Mc) {
384 coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,dir,dim>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
394 template <
typename Float,
int nDim,
int Ns,
int Nc,
int Mc,
int color_str
ide,
int dim_thread_split,
bool dslash,
bool clover,
bool dagger, DslashType type,
typename Arg>
395 __global__
void coarseDslashKernel(Arg
arg)
397 constexpr
int warp_size = 32;
398 const int lane_id = threadIdx.x % warp_size;
399 const int warp_id = threadIdx.x / warp_size;
400 const int vector_site_width = warp_size / color_stride;
402 int x_cb = blockIdx.x*(
blockDim.x/color_stride) + warp_id*(warp_size/color_stride) + lane_id % vector_site_width;
404 const int color_offset = lane_id / vector_site_width;
407 #if 0 // disable multi-src since this has a measurable impact on single src performance 408 int paritySrc =
blockDim.y*blockIdx.y + threadIdx.y;
409 if (paritySrc >=
arg.nParity *
arg.dim[4])
return;
410 const int src_idx = (
arg.nParity == 2) ? paritySrc / 2 : paritySrc;
411 const int parity = (
arg.nParity == 2) ? paritySrc % 2 :
arg.parity;
418 int sMd =
blockDim.z*blockIdx.z + threadIdx.z;
420 int sMdim = sMd >> 1;
421 int dim = sMdim % dim_thread_split;
422 int sM = sMdim / dim_thread_split;
423 int s = sM / (Nc/Mc);
424 int color_block = (sM % (Nc/Mc)) * Mc;
426 if (x_cb >=
arg.volumeCB)
return;
429 if (
dim == 0) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,0>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
430 else if (
dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,1>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
431 else if (
dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,2>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
432 else if (
dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,3>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
433 }
else if (dir == 1) {
434 if (
dim == 0) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,0>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
435 else if (
dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,1>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
436 else if (
dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,2>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
437 else if (
dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,3>(
arg, x_cb,
src_idx,
parity,
s, color_block, color_offset);
441 template <
typename Float,
int nDim,
int Ns,
int Nc,
int Mc,
bool dslash,
bool clover,
bool dagger, DslashType type>
442 class DslashCoarse :
public Tunable {
445 ColorSpinorField &
out;
446 const ColorSpinorField &inA;
447 const ColorSpinorField &inB;
455 #ifdef EIGHT_WAY_WARP_SPLIT 456 const int max_color_col_stride = 8;
458 const int max_color_col_stride = 4;
460 mutable int color_col_stride;
461 mutable int dim_threads;
464 long long flops()
const 466 return ((
dslash*2*nDim+
clover*1)*(8*Ns*Nc*Ns*Nc)-2*Ns*Nc)*nParity*(
long long)
out.VolumeCB();
468 long long bytes()
const 471 nSrc*nParity*(
dslash*Y.Bytes()*Y.VolumeCB()/(2*Y.Stride()) +
clover*
X.Bytes()/2);
473 unsigned int sharedBytesPerThread()
const {
return (
sizeof(complex<Float>) * Mc); }
474 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
475 bool tuneGridDim()
const {
return false; }
476 bool tuneAuxDim()
const {
return true; }
477 unsigned int minThreads()
const {
return color_col_stride *
X.VolumeCB(); }
478 unsigned int maxBlockSize()
const {
return deviceProp.maxThreadsPerBlock / (dim_threads * 2 * nParity); }
480 bool advanceBlockDim(TuneParam &
param)
const 483 dim3 grid =
param.grid;
492 if (
param.block.y < (
unsigned int)(nParity * nSrc)) {
499 param.grid.y = nParity * nSrc;
502 while(
param.block.z <= (
unsigned int)(dim_threads * 2 * 2 * (Nc/Mc))) {
503 param.block.z+=dim_threads * 2;
504 if ( (dim_threads*2*2*(Nc/Mc)) %
param.block.z == 0) {
505 param.grid.z = (dim_threads * 2 * 2 * (Nc/Mc)) /
param.block.z;
511 if (
param.block.z <= (
unsigned int)(dim_threads * 2 * 2 * (Nc/Mc)) &&
515 param.block.z = dim_threads * 2;
516 param.grid.z = 2 * (Nc/Mc);
528 bool advanceAux(TuneParam &
param)
const 531 #if __COMPUTE_CAPABILITY__ >= 300 533 if (2*
param.aux.x <= max_color_col_stride && Nc % (2*
param.aux.x) == 0 &&
540 color_col_stride =
param.aux.x;
552 color_col_stride =
param.aux.x;
559 dim_threads =
param.aux.y;
562 param.block.z = dim_threads * 2;
563 param.grid.z = 2* (Nc / Mc);
571 dim_threads =
param.aux.y;
577 param.block.z = dim_threads * 2;
578 param.grid.z = 2* (Nc / Mc);
587 virtual void initTuneParam(TuneParam &
param)
const 589 param.aux = make_int4(1,1,1,1);
590 color_col_stride =
param.aux.x;
591 dim_threads =
param.aux.y;
595 param.grid.y = nParity * nSrc;
596 param.block.z = dim_threads * 2;
597 param.grid.z = 2*(Nc/Mc);
603 virtual void defaultTuneParam(TuneParam &
param)
const 605 param.aux = make_int4(1,1,1,1);
606 color_col_stride =
param.aux.x;
607 dim_threads =
param.aux.y;
614 param.grid.y = nParity * nSrc;
615 param.block.z = dim_threads * 2;
616 param.grid.z = 2*(Nc/Mc);
622 inline DslashCoarse(ColorSpinorField &
out,
const ColorSpinorField &inA,
const ColorSpinorField &inB,
625 nParity(
out.SiteSubset()), nSrc(
out.Ndim()==5 ?
out.
X(4) : 1)
627 strcpy(aux,
"policy_kernel,");
637 case DSLASH_INTERIOR:
strcat(aux,
",interior");
break;
638 case DSLASH_EXTERIOR:
strcat(aux,
",exterior");
break;
639 case DSLASH_FULL:
strcat(aux,
",full");
break;
642 if (doHalo<type>()) {
643 char label[15] =
",halo=";
645 for (
int dir=0; dir<2; dir++) {
653 virtual ~DslashCoarse() { }
655 inline void apply(
const cudaStream_t &
stream) {
660 errorQuda(
"Unsupported field order colorspinor=%d gauge=%d combination\n", inA.FieldOrder(), Y.FieldOrder());
662 DslashCoarseArg<Float,Ns,Nc,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,QUDA_QDP_GAUGE_ORDER>
arg(
out, inA, inB, Y,
X, (Float)
kappa,
parity);
663 coarseDslash<Float,nDim,Ns,Nc,Mc,dslash,clover,dagger,type>(
arg);
669 errorQuda(
"Unsupported field order colorspinor=%d gauge=%d combination\n", inA.FieldOrder(), Y.FieldOrder());
671 DslashCoarseArg<Float,Ns,Nc,QUDA_FLOAT2_FIELD_ORDER,QUDA_FLOAT2_GAUGE_ORDER>
arg(
out, inA, inB, Y,
X, (Float)
kappa,
parity);
677 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
680 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
683 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
685 #ifdef EIGHT_WAY_WARP_SPLIT 687 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,1,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
691 errorQuda(
"Color column stride %d not valid", tp.aux.x);
697 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
700 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
703 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
705 #ifdef EIGHT_WAY_WARP_SPLIT 707 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,2,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
711 errorQuda(
"Color column stride %d not valid", tp.aux.x);
717 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,1,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
720 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,2,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
723 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,4,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
725 #ifdef EIGHT_WAY_WARP_SPLIT 727 coarseDslashKernel<Float,nDim,Ns,Nc,Mc,8,4,dslash,clover,dagger,type> <<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
731 errorQuda(
"Color column stride %d not valid", tp.aux.x);
735 errorQuda(
"Invalid dimension thread splitting %d", tp.aux.y);
740 TuneKey tuneKey()
const {
741 return TuneKey(
out.VolString(),
typeid(*this).name(), aux);
745 saveOut =
new char[
out.Bytes()];
746 cudaMemcpy(saveOut,
out.V(),
out.Bytes(), cudaMemcpyDeviceToHost);
751 cudaMemcpy(
out.V(), saveOut,
out.Bytes(), cudaMemcpyHostToDevice);
758 template <
typename Float,
int coarseColor,
int coarseSpin>
759 inline void ApplyCoarse(ColorSpinorField &
out,
const ColorSpinorField &inA,
const ColorSpinorField &inB,
763 const int colors_per_thread = 1;
769 if (type == DSLASH_FULL) {
770 DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,true,DSLASH_FULL>
dslash(
out, inA, inB, Y,
X,
kappa,
parity, halo_location);
772 }
else {
errorQuda(
"Dslash type %d not instantiated", type); }
774 if (type == DSLASH_FULL) {
775 DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,true,DSLASH_FULL>
dslash(
out, inA, inB, Y,
X,
kappa,
parity, halo_location);
777 }
else {
errorQuda(
"Dslash type %d not instantiated", type); }
780 if (type == DSLASH_EXTERIOR)
errorQuda(
"Cannot call halo on pure clover kernel");
782 DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,false,true,true,DSLASH_FULL>
dslash(
out, inA, inB, Y,
X,
kappa,
parity, halo_location);
785 errorQuda(
"Unsupported dslash=false clover=false");
791 if (type == DSLASH_FULL) {
792 DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,true,false,DSLASH_FULL>
dslash(
out, inA, inB, Y,
X,
kappa,
parity, halo_location);
794 }
else {
errorQuda(
"Dslash type %d not instantiated", type); }
796 if (type == DSLASH_FULL) {
797 DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,true,false,false,DSLASH_FULL>
dslash(
out, inA, inB, Y,
X,
kappa,
parity, halo_location);
799 }
else {
errorQuda(
"Dslash type %d not instantiated", type); }
802 if (type == DSLASH_EXTERIOR)
errorQuda(
"Cannot call halo on pure clover kernel");
804 DslashCoarse<Float,nDim,coarseSpin,coarseColor,colors_per_thread,false,true,false,DSLASH_FULL>
dslash(
out, inA, inB, Y,
X,
kappa,
parity, halo_location);
807 errorQuda(
"Unsupported dslash=false clover=false");
814 template <
typename Float>
815 inline void ApplyCoarse(ColorSpinorField &
out,
const ColorSpinorField &inA,
const ColorSpinorField &inB,
819 if (Y.FieldOrder() !=
X.FieldOrder())
820 errorQuda(
"Field order mismatch Y = %d, X = %d", Y.FieldOrder(),
X.FieldOrder());
825 if (inA.Nspin() != 2)
826 errorQuda(
"Unsupported number of coarse spins %d\n",inA.Nspin());
828 if (inA.Ncolor() == 2) {
829 ApplyCoarse<Float,2,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
831 }
else if (inA.Ncolor() == 4) {
832 ApplyCoarse<Float,4,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
833 }
else if (inA.Ncolor() == 8) {
834 ApplyCoarse<Float,8,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
835 }
else if (inA.Ncolor() == 12) {
836 ApplyCoarse<Float,12,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
837 }
else if (inA.Ncolor() == 16) {
838 ApplyCoarse<Float,16,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
839 }
else if (inA.Ncolor() == 20) {
840 ApplyCoarse<Float,20,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
842 }
else if (inA.Ncolor() == 24) {
843 ApplyCoarse<Float,24,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
845 }
else if (inA.Ncolor() == 28) {
846 ApplyCoarse<Float,28,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
848 }
else if (inA.Ncolor() == 32) {
849 ApplyCoarse<Float,32,2>(
out, inA, inB, Y,
X,
kappa,
parity,
dslash,
clover,
dagger, type, halo_location);
851 errorQuda(
"Unsupported number of coarse dof %d\n", Y.Ncolor());
861 #endif // GPU_MULTIGRID 926 #ifdef GPU_MULTIGRID_DOUBLE 927 ApplyCoarse<double>(
out,
inA,
inB,
Y,
X,
kappa,
parity,
dslash,
clover,
dagger, DSLASH_FULL, halo_location);
930 errorQuda(
"Double precision multigrid has not been enabled");
933 ApplyCoarse<float>(
out,
inA,
inB,
Y,
X,
kappa,
parity,
dslash,
clover,
dagger, DSLASH_FULL, halo_location);
941 errorQuda(
"Multigrid has not been built");
948 typedef std::map<TuneKey, TuneParam>
map;
956 static std::vector<DslashCoarsePolicy>
policy;
979 static char *dslash_policy_env =
getenv(
"QUDA_ENABLE_DSLASH_COARSE_POLICY");
981 if (dslash_policy_env) {
982 std::stringstream policy_list(dslash_policy_env);
985 while (policy_list >> policy_) {
992 errorQuda(
"Cannot select a GDR policy %d unless QUDA_ENABLE_GDR is set", dslash_policy);
995 policy.push_back(static_cast<DslashCoarsePolicy>(policy_));
996 if (policy_list.peek() ==
',') policy_list.ignore();
1033 errorQuda(
"Machine configuration (P2P/GDR=%d) changed since tunecache was created (P2P/GDR=%d). Please delete " 1034 "this file or set the QUDA_RESOURCE_PATH environment variable to point to a new path.",
1038 if (tp.
aux.x >= (
int)
policy.size())
errorQuda(
"Requested policy that is outside of range");
1073 int Ns =
dslash.inA.Nspin();
1074 int Nc =
dslash.inA.Ncolor();
1075 int nParity =
dslash.inA.SiteSubset();
1076 int volumeCB =
dslash.inA.VolumeCB();
1077 return ((
dslash.dslash*2*nDim+
dslash.clover*1)*(8*Ns*Nc*Ns*Nc)-2*Ns*Nc)*nParity*volumeCB;
1081 int nParity =
dslash.inA.SiteSubset();
virtual void apply(const cudaStream_t &stream)=0
void operator()(DslashCoarsePolicy policy)
Execute the coarse dslash using the given policy.
enum QudaPrecision_s QudaPrecision
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
cudaDeviceProp deviceProp
void disableProfileCount()
#define checkPrecision(...)
const ColorSpinorField & inB
int comm_partitioned()
Loop over comm_dim_partitioned(dim) for all comms dimensions.
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
unsigned int sharedBytesPerThread() const
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
char * strcat(char *__s1, const char *__s2)
DslashCoarseLaunch(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB, const GaugeField &Y, const GaugeField &X, double kappa, int parity, bool dslash, bool clover, bool dagger)
void enableProfileCount()
DslashCoarseLaunch & dslash
VOLATILE spinorFloat kappa
virtual void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const =0
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
static std::vector< DslashCoarsePolicy > policy
Main header file for host and device accessors to GaugeFields.
bool advanceAux(TuneParam ¶m) const
enum QudaParity_s QudaParity
void ApplyCoarse(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB, const GaugeField &Y, const GaugeField &X, double kappa, int parity=QUDA_INVALID_PARITY, bool dslash=true, bool clover=true, bool dagger=false)
void setPolicyTuning(bool)
std::map< TuneKey, TuneParam > map
void apply(const cudaStream_t &stream)
cpuColorSpinorField * out
virtual ~DslashCoarsePolicyTune()
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
const ColorSpinorField & inA
virtual void initTuneParam(TuneParam ¶m) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
void defaultTuneParam(TuneParam ¶m) const
const map & getTuneCache()
__host__ __device__ ValueType conj(ValueType x)
virtual bool advanceBlockDim(TuneParam ¶m) const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
bool advanceTuneParam(TuneParam ¶m) const
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
void initTuneParam(TuneParam ¶m) const
DslashCoarsePolicyTune(DslashCoarseLaunch &dslash)
int comm_peer2peer_enabled_global()
char * getenv(const char *)
QudaFieldOrder FieldOrder() const
int comm_dim_partitioned(int dim)
cudaEvent_t cudaEvent_t end
virtual void defaultTuneParam(TuneParam ¶m) const