#if (__COMPUTE_CAPABILITY__ >= 700)

// On Volta and later we let the dslash kernels use the maximum shared-memory
// carveout, and we launch through qudaLaunchKernel so that the attribute set
// here is honored.
#define SET_CACHE(f) qudaFuncSetAttribute( (const void*)f, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared)

#define LAUNCH_KERNEL(f, grid, block, shared, stream, param) \
  void *args[] = { &param }; \
  void (*func)( const DslashParam ) = &(f); \
  qudaLaunchKernel( (const void*)func, grid, block, args, shared, stream);

#else

#define LAUNCH_KERNEL(f, grid, block, shared, stream, param) f<<<grid, block, shared, stream>>>(param)
#define SET_CACHE(f)

#endif

// Select the kernel by gauge reconstruction type; the xpay variants carry an
// extra X suffix in the kernel name and are used when the accumulator field
// x is non-zero.
#define EVEN_MORE_GENERIC_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }

// Select the kernel by spinor precision: D = double, S = single, H = half.
#define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4) || typeid(sFloat) == typeid(float2)) { \
    EVEN_MORE_GENERIC_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short4) || typeid(sFloat) == typeid(short2)) { \
    EVEN_MORE_GENERIC_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

// The improved-staggered kernel names encode both gauge fields: the first
// index is the fat link (always 18, i.e. no reconstruction), the second the
// long-link reconstruction.
#define EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_13) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 13 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_9) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 9 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_13) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 13 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_9) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 9 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }

#define MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float2)) { \
    EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short2)) { \
    EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#ifndef MULTI_GPU

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#define GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#define GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#endif // MULTI_GPU

#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
  }

#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param) \
  GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param)

#define IMPROVED_STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param) \
  GENERIC_STAGGERED_DSLASH(improvedStaggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param)

// The asymmetric variants are only instantiated with the xpay term, so only
// the X kernels are launched here.
#define EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (reconstruct == QUDA_RECONSTRUCT_NO) { \
    SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
    LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
  } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
    SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
    LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
  } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
    SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
    LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
  }

#define MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short4)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#ifndef MULTI_GPU

#define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else

#define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#endif // MULTI_GPU

#define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_ASYM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_ASYM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
  }

// Launches for the non-degenerate twisted-mass operator: the four cases on
// (x, d) select between the twist-included (Twist) and xpay (X) kernel
// variants.
#define EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x == 0 && d == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Twist ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Twist ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Twist ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else if (x != 0 && d == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else if (x == 0 && d != 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }

#define MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4)) { \
    EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short4)) { \
    EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#ifndef MULTI_GPU

#define GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else

#define GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch(param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#endif // MULTI_GPU

#define NDEG_TM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_NDEG_TM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_NDEG_TM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
  }
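/*
  Illustrative expansion of the macro stack above (the wilsonDslash token is a
  stand-in for the FUNC arguments actually used at the call sites, which live
  elsewhere in the file): a call such as

    DSLASH(wilsonDslash, gridDim, blockDim, shared, stream, param);

  with dagger set, sFloat = float4, reconstruct = QUDA_RECONSTRUCT_12,
  param.kernel_type = INTERIOR_KERNEL and a non-zero accumulator x would
  resolve, via token pasting, to

    SET_CACHE( wilsonDslashS12DaggerXpayKernel<INTERIOR_KERNEL> );
    LAUNCH_KERNEL( wilsonDslashS12DaggerXpayKernel<INTERIOR_KERNEL>, gridDim, blockDim, shared, stream, param);

  i.e. precision (D/S/H), reconstruct (18/12/8), dagger and xpay are all
  encoded in the kernel name.
*/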
// ...

class DslashCuda : public Tunable {

protected:
  cudaColorSpinorField *out;
  const cudaColorSpinorField *in;
  const cudaColorSpinorField *x;

  // ...

  char aux[8][TuneKey::aux_n]; // auxiliary tuning string, one entry per kernel type
  // ...

  for (int dir=0; dir<2; dir++) { // backwards and forwards directions
    // ...
  }

  // ...

#ifdef USE_TEXTURE_OBJECTS
  // ...
#endif // USE_TEXTURE_OBJECTS

  DslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
             const cudaColorSpinorField *x, const GaugeField &gauge,
             const int parity, const int dagger, const int *commOverride)
  // ...
  {
    if (in->Precision() != gauge.Precision())
      errorQuda("Mixing gauge %d and spinor %d precision not supported",
                gauge.Precision(), in->Precision());

    // ...

    constexpr int nDimComms = 4;
    for (int i=0; i<nDimComms; i++){
      // ...
    }

    // ...

#ifdef USE_TEXTURE_OBJECTS
    // ...
#endif // USE_TEXTURE_OBJECTS
  }

  // ...

  virtual int Nface() const { return 2; }
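/*
  Operators with a deeper halo override Nface(); e.g. the improved staggered
  operator reads three hops in each direction because of the long links, so
  its subclass would return 6 (a sketch of the idea, not the verbatim
  subclass):

    int Nface() const { return 6; } // 3 hops, forwards and backwards
*/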
#if defined(DSLASH_TUNE_TILE)

  // Experimental tuning of the dslash 4-d tile, stored in param.aux.
  bool advanceAux(TuneParam &param) const
  {
    if (in->Nspin()==1 || in->Ndim()==5) return false; // tile tuning disabled for staggered and 5-d fields

    const int *X = in->X();

    // ...

    // advance the t-tile to the next divisor of X[3]
    do { param.aux.w++; } while( (X[3]) % param.aux.w != 0);
    if (param.aux.w <= X[3]) return true;

    // ...

    // advance the z-tile to the next divisor of X[2]
    do { param.aux.z++; } while( (X[2]) % param.aux.z != 0);
    if (param.aux.z <= X[2]) return true;

    // ...

    // advance the y-tile to the next divisor of X[1]
    do { param.aux.y++; } while( X[1] % param.aux.y != 0);
    if (param.aux.y <= X[1]) return true;

    // ...

    // advance the x-tile; the factor of 2 undoes the checkerboarding of X[0]
    do { param.aux.x++; } while( (2*X[0]) % param.aux.x != 0);
    if (param.aux.x <= (2*X[0]) ) return true;

    // ...

    // every tile has been visited: reset to the default
    param.aux = make_int4(2,1,1,1);
    return false;
  }
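/*
  Example of the tile walk above (illustrative): with a local lattice of
  X = {8,8,8,16}, successive calls advance param.aux.w through the divisors
  2, 4, 8, 16 of X[3]; once a dimension is exhausted it is reset and the next
  one (z, then y, then x) advances, until every divisor combination has been
  offered to the autotuner and aux falls back to (2,1,1,1).
*/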
  void initTuneParam(TuneParam &param) const
  {
    Tunable::initTuneParam(param);
    param.aux = make_int4(2,1,1,1);
  }

  // ...

  void defaultTuneParam(TuneParam &param) const
  {
    Tunable::defaultTuneParam(param);
    param.aux = make_int4(2,1,1,1);
  }

#endif // DSLASH_TUNE_TILE
  // ...

  virtual long long flops() const
  {
    int mv_flops = (8 * in->Ncolor() - 2) * in->Ncolor(); // SU(N) matrix-vector multiply
    int num_mv_multiply = in->Nspin() == 4 ? 2 : 1;
    int ghost_flops = (num_mv_multiply * mv_flops + 2*in->Ncolor()*in->Nspin());
    int xpay_flops = 2 * 2 * in->Ncolor() * in->Nspin(); // multiply and add per real component
    int num_dir = 2 * 4; // forwards and backwards in each of the four dimensions

    long long flops_ = 0;

    // ...

    // exterior (halo-update) kernels touch only the ghost sites
    {
      long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
      flops_ = (ghost_flops + (x ? xpay_flops : 0)) * ghost_sites;
    }

    // ...

    // interior kernel covers the full checkerboarded volume
    {
      long long sites = in->VolumeCB();
      flops_ = (num_dir*(in->Nspin()/4)*in->Ncolor()*in->Nspin() +   // spin projection (absent for staggered)
                num_dir*num_mv_multiply*mv_flops +                   // matrix-vector multiplies
                ((num_dir-1)*2*in->Ncolor()*in->Nspin())) * sites;   // accumulation
      if (x) flops_ += xpay_flops * sites;

      // ... then subtract the work the exterior kernels will do
      long long ghost_sites = 0;
      // ... (accumulate the faces of the partitioned dimensions)
      flops_ -= (ghost_flops + (x ? xpay_flops : 0)) * ghost_sites;
    }

    // ...
  }
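/*
  Sanity check of the counting above for the Wilson operator (Ncolor = 3,
  Nspin = 4): mv_flops = (8*3 - 2)*3 = 66 and num_mv_multiply = 2, so
  ghost_flops = 2*66 + 2*3*4 = 156 and xpay_flops = 2*2*3*4 = 48 per site.
*/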
  // ...

  virtual long long bytes() const
  {
    bool isHalf = in->Precision() == sizeof(short) ? true : false;
    int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0); // half precision carries an extra norm float
    int proj_spinor_bytes = (in->Nspin()==4 ? 1 : 2) * in->Ncolor() * in->Nspin() * in->Precision() + (isHalf ? sizeof(float) : 0);
    int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + spinor_bytes; // gauge_bytes is defined in elided lines

    // ...

    long long bytes_ = 0;

    // exterior (halo-update) kernels
    {
      long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
      bytes_ = (ghost_bytes + (x ? spinor_bytes : 0)) * ghost_sites;
    }

    // ...

    // interior kernel
    {
      long long sites = in->VolumeCB();
      bytes_ = (num_dir*gauge_bytes + ((num_dir-2)*spinor_bytes + 2*proj_spinor_bytes) + spinor_bytes)*sites;
      if (x) bytes_ += spinor_bytes * sites;

      // ... then subtract the traffic of the exterior kernels
      long long ghost_sites = 0;
      // ...
      bytes_ -= (ghost_bytes + (x ? spinor_bytes : 0)) * ghost_sites;
    }

    // ...
  }
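/*
  For example, a single-precision Wilson spinor (Ncolor = 3, Nspin = 4,
  Precision = 4 bytes) weighs spinor_bytes = 2*3*4*4 = 96 bytes per site,
  while its spin-projected form is proj_spinor_bytes = 1*3*4*4 = 48 bytes;
  half precision adds sizeof(float) to each for the norm.
*/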
#ifdef SHARED_WILSON_DSLASH

  // ...

  bool advanceSharedBytes(TuneParam &param) const {
    // ...
  }

  // ...

  // shared bytes required for a 3-d block, with block.x*block.y padded up to
  // a whole number of warps
  int sharedBytes(const dim3 &block) const {
    int warpSize = 32;
    int block_xy = block.x*block.y;
    if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
    return block_xy*block.z*sharedBytesPerThread();
  }
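/*
  Example of the warp padding above: a block of (16,7,2) gives block_xy = 112,
  which is rounded up to 128 (four warps of 32), so the result is
  128 * 2 * sharedBytesPerThread(); a block of (16,8,2) is already
  warp-aligned and needs no padding.
*/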
  // ...

  // construct the grid that covers the local volume for a given 3-d block
  dim3 createGrid(const dim3 &block) const {
    // ...
    return dim3(gx, gy, gz);
  }

  // advance the 3-d block size during autotuning
  bool advanceBlockDim(TuneParam &param) const {
    // ...
    const unsigned int min_threads = 2;
    const unsigned int max_threads = 512;
    const unsigned int max_shared = 16384*3;

    // ...
    dim3 blockInit = param.block;
    // ...
    for (unsigned bx=blockInit.x; bx<=in->X(0); bx++) {
      // ...
      for (unsigned by=blockInit.y; by<=in->X(1); by++) {
        unsigned int gy = (in->X(1) + by - 1 ) / by;

        if (by > 1 && (by%2) != 0) continue; // only by=1 or even by is supported

        for (unsigned bz=blockInit.z; bz<=in->X(2); bz++) {
          unsigned int gz = (in->X(2) + bz - 1) / bz;

          if (bz > 1 && (bz%2) != 0) continue; // only bz=1 or even bz is supported
          if (bx*by*bz > max_threads) continue;
          if (bx*by*bz < min_threads) continue;
          // ...
          if (by*gy != in->X(1)) continue; // the blocks must tile Y exactly
          if (bz*gz != in->X(2)) continue; // the blocks must tile Z exactly
          if (sharedBytes(dim3(bx, by, bz)) > max_shared) continue;
          // ...
          param.block = dim3(bx, by, bz);
          // ...
        }
      }
    }

    // ...
    param.block = dim3(in->X(0), 1, 1);
    // ...
    param.shared_bytes = sharedBytes(param.block);
    // ...
  }
  // ...

  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
                   const cudaColorSpinorField *x, const GaugeField &gauge,
                   // ... (remaining arguments and constructor body elided)

  virtual void initTuneParam(TuneParam &param) const
  {
    // ...
    param.block = dim3(in->X(0), 1, 1);
    // ...
    param.shared_bytes = sharedBytes(param.block);
    // ...
  }

  virtual void defaultTuneParam(TuneParam &param) const
  {
    // ...
    else initTuneParam(param);
    // ...
  }
  // ...
};

#else // fall-back SharedDslashCuda that simply maps onto DslashCuda

class SharedDslashCuda : public DslashCuda {
public:
  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
                   const cudaColorSpinorField *x, const GaugeField &gauge,
                   int parity, int dagger, const int *commOverride)
  // ...