#if (__COMPUTE_CAPABILITY__ >= 700)
// ... (elided)
#define SET_CACHE(f) qudaFuncSetAttribute( (const void*)f, cudaFuncAttributePreferredSharedMemoryCarveout, (int)cudaSharedmemCarveoutMaxShared)
// ... (elided: remainder of this conditional; SET_CACHE is presumably a no-op on older architectures)

// ... (elided: conditional that selects exactly one of the two LAUNCH_KERNEL definitions below)
#define LAUNCH_KERNEL(f, grid, block, shared, stream, param) \
  void *args[] = { &param }; \
  void (*func)( const DslashParam ) = &(f); \
  qudaLaunchKernel( (const void*)func, grid, block, args, shared, stream);
// ... (elided)
#define LAUNCH_KERNEL(f, grid, block, shared, stream, param) f<<<grid, block, shared, stream>>>(param)
// ... (elided)

#define EVEN_MORE_GENERIC_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }

#define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4) || typeid(sFloat) == typeid(float2)) { \
    EVEN_MORE_GENERIC_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short4) || typeid(sFloat) == typeid(short2)) { \
    EVEN_MORE_GENERIC_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#define EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_13) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 13 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_9) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 9 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_13) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 13 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_9) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 9 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }

#define MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float2)) { \
    EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short2)) { \
    EVEN_MORE_GENERIC_STAGGERED_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#ifndef MULTI_GPU

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#define GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else // MULTI_GPU

#define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#define GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_STAGGERED_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#endif // MULTI_GPU

// dispatch on the dagger flag, appending the Dagger token to the kernel name
#define DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
  }

#define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_DSLASH(staggeredDslash, Dagger, Axpy, gridDim, blockDim, shared, stream, param) \
  }

#define STAGGERED_DSLASH_TIFR(gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_DSLASH(staggeredDslashTIFR, , Axpy, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_DSLASH(staggeredDslashTIFR, Dagger, Axpy, gridDim, blockDim, shared, stream, param) \
  }

#define IMPROVED_STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_STAGGERED_DSLASH(improvedStaggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_STAGGERED_DSLASH(improvedStaggeredDslash, Dagger, Axpy, gridDim, blockDim, shared, stream, param) \
  }

#define EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (reconstruct == QUDA_RECONSTRUCT_NO) { \
    SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
    LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
  } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
    SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
    LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
  } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
    SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
    LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
  }

#define MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short4)) { \
    EVEN_MORE_GENERIC_ASYM_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#ifndef MULTI_GPU

#define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else // MULTI_GPU

#define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#endif // MULTI_GPU

#define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_ASYM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_ASYM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
  }

// non-degenerate twisted mass dispatch, which additionally selects on the twist (d)
// and accumulate (x) arguments
#define EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, FLOAT, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (x == 0 && d == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Twist ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Twist ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Twist ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Twist ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else if (x != 0 && d == 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Twist ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Twist ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else if (x == 0 && d != 0) { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  } else { \
    if (reconstruct == QUDA_RECONSTRUCT_NO) { \
      SET_CACHE( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 18 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
      SET_CACHE( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 12 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
      SET_CACHE( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type> ); \
      LAUNCH_KERNEL( FUNC ## FLOAT ## 8 ## DAG ## X ## Kernel<kernel_type>, gridDim, blockDim, shared, stream, param); \
    } \
  }

#define MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  if (typeid(sFloat) == typeid(double2)) { \
    EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, D, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(float4)) { \
    EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, S, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else if (typeid(sFloat) == typeid(short4)) { \
    EVEN_MORE_GENERIC_NDEG_TM_DSLASH(FUNC, H, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param) \
  } else { \
    errorQuda("Undefined precision type"); \
  }

#ifndef MULTI_GPU

#define GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  default: \
    errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
  }

#else // MULTI_GPU

#define GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param) \
  switch (param.kernel_type) { \
  case INTERIOR_KERNEL: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_X: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Y: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_Z: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_T: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param) \
    break; \
  case EXTERIOR_KERNEL_ALL: \
    MORE_GENERIC_NDEG_TM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_ALL, gridDim, blockDim, shared, stream, param) \
    break; \
  }

#endif // MULTI_GPU

#define NDEG_TM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param) \
  if (!dagger) { \
    GENERIC_NDEG_TM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param) \
  } else { \
    GENERIC_NDEG_TM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param) \
  }
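// Illustrative only (the concrete kernel family name is hypothetical): the token
// pasting above turns a high-level dispatch into one concrete kernel launch.  For a
// kernel family named wilsonDslash, double-precision spinors (sFloat == double2),
// 12-real gauge reconstruction, no dagger and a non-null accumulator x, the
// INTERIOR_KERNEL case of
//   DSLASH(wilsonDslash, gridDim, blockDim, shared, stream, param);
// ultimately expands to
//   SET_CACHE( wilsonDslashD12XpayKernel<INTERIOR_KERNEL> );
//   LAUNCH_KERNEL( wilsonDslashD12XpayKernel<INTERIOR_KERNEL>, gridDim, blockDim, shared, stream, param);
// i.e. with the direct-launch form of LAUNCH_KERNEL,
//   wilsonDslashD12XpayKernel<INTERIOR_KERNEL><<<gridDim, blockDim, shared, stream>>>(param);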
class DslashCuda : public Tunable
{
protected:
  cudaColorSpinorField *out;
  const cudaColorSpinorField *in;
  const cudaColorSpinorField *x;
  // ... (elided: further members, among them the gauge field reference, the
  //      QudaReconstructType reconstruct, aux_base[] and the static ghost_str[])
  char aux[8][TuneKey::aux_n];

  /** Set the base strings used by the different dslash kernel types for autotuning */
  void fillAuxBase()
  {
    // ... (elided: build a local comm[] string with one '1'/'0' character per partitioned dimension)
    strcpy(aux_base, ",comm=");
    strcat(aux_base, comm);

    switch (reconstruct) {
      // ... (elided: append the gauge reconstruction type to aux_base)
    }

    if (x) strcat(aux_base, ",Xpay");
    if (dagger) strcat(aux_base, ",dagger");
  }

  /** Specialize the auxiliary strings for each kernel type */
  void fillAux(KernelType kernel_type, const char *kernel_str)
  {
    strcpy(aux[kernel_type], kernel_str);
    if (kernel_type == INTERIOR_KERNEL) strcat(aux[kernel_type], ghost_str);
    strcat(aux[kernel_type], aux_base);
  }
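  // Illustrative only: after fillAuxBase()/fillAux() run, aux[INTERIOR_KERNEL] is a
  // comma-separated tag along the lines of "interior,ghost=0000,comm=0000,...,Xpay,dagger"
  // (exact field names and ordering depend on the elided code above); this string is
  // what distinguishes otherwise identical kernels in the autotuning cache.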

  /** Set the dslashParam for the current multi-GPU parameters */
  void setParam()
  {
    // ... (elided)
    for (int dim = 0; dim < 4; dim++) {
      // ... (elided)
      for (int dir = 0; dir < 2; dir++) {
        // ... (elided: set the ghost buffer pointers for this dimension and direction)
#ifdef USE_TEXTURE_OBJECTS
        // ... (elided)
        dslashParam.ghostTexNorm[2*dim+dir] = in->GhostTexNorm();
#endif // USE_TEXTURE_OBJECTS
      }
    }
    // ... (elided)
  }

  DslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
             const cudaColorSpinorField *x, const GaugeField &gauge,
             const int parity, const int dagger, const int *commOverride)
    : out(out), in(in), x(x), gauge(gauge), reconstruct(gauge.Reconstruct()),
      dagger(dagger), saveOut(0), saveOutNorm(0)
  {
    // ... (elided)
    if (in->Precision() != gauge.Precision())
      errorQuda("Mixing gauge %d and spinor %d precision not supported", gauge.Precision(), in->Precision());

    constexpr int nDimComms = 4;
    for (int i = 0; i < nDimComms; i++) {
      dslashParam.ghostOffset[i][0] = in->GhostOffset(i,0) / in->FieldOrder();
      dslashParam.ghostOffset[i][1] = in->GhostOffset(i,1) / in->FieldOrder();
      // ... (elided: ghostNormOffset, ghostDim and commDim initialization)
    }

    // ... (elided)
    dslashParam.out = (void*)out->V();
    dslashParam.outNorm = (float*)out->Norm();
    dslashParam.in = (void*)in->V();
    dslashParam.inNorm = (float*)in->Norm();
    dslashParam.x = x ? (void*)x->V() : nullptr;
    dslashParam.xNorm = x ? (float*)x->Norm() : nullptr;

#ifdef USE_TEXTURE_OBJECTS
    dslashParam.inTex = in->Tex();
    dslashParam.inTexNorm = in->TexNorm();
    if (out) dslashParam.outTex = out->Tex();
    if (out) dslashParam.outTexNorm = out->TexNorm();
    if (x) dslashParam.xTex = x->Tex();
    if (x) dslashParam.xTexNorm = x->TexNorm();
#endif // USE_TEXTURE_OBJECTS

    // ... (elided)
    bindGaugeTex(static_cast<const cudaGaugeField&>(gauge), parity, dslashParam);

    // ... (elided)
    dslashParam.dc = in->getDslashConstant();

    // ... (elided)
    dslashParam.An2 = make_float2(gauge.Anisotropy(), 1.0 / (gauge.Anisotropy()*gauge.Anisotropy()));
    // ... (elided)
    dslashParam.coeff = 1.0;
    // ... (elided)
    dslashParam.No2 = make_float2(1.0f, 1.0f);
    // ... (elided)

    // build the ",ghost=" string recording which dimensions have a ghost zone
    // ... (elided: declaration of the local ghost[] character string)
    for (int dim = 0; dim < nDimComms; dim++) ghost[dim] = (dslashParam.ghostDim[dim] ? '1' : '0');
    // ... (elided)
    strcpy(ghost_str, ",ghost=");
    strcat(ghost_str, ghost);

    // ... (elided: fill the aux strings for each kernel type)
  }
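  // Illustrative only: with communication partitioned in X and Y but not Z or T, the
  // loop in the constructor yields ghost_str == ",ghost=1100", so tuned launch
  // parameters are never reused across different communication topologies.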

  virtual ~DslashCuda() { unbindGaugeTex(static_cast<const cudaGaugeField&>(gauge)); }

  virtual TuneKey tuneKey() const
  {
    return TuneKey(in->VolString(), typeid(*this).name(), aux[dslashParam.kernel_type]);
  }

  void setAux(KernelType type, const char *aux_) { strcpy(aux[type], aux_); }

  void augmentAux(KernelType type, const char *extra) { strcat(aux[type], extra); }

  virtual int Nface() const { return 2; }
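  // Illustrative only: the TuneKey returned above is the (volume, name, aux) triple the
  // autotuner caches launch parameters under, e.g. a volume string such as "16x16x16x32",
  // the typeid name of the concrete dslash class, and the aux string assembled by
  // fillAux()/augmentAux().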

#if defined(DSLASH_TUNE_TILE)
  // Experimental tuning of the spatial tile assigned to each thread block,
  // encoded in param.aux as (x, y, z, t) sub-block sizes.
  bool advanceAux(TuneParam &param) const
  {
    if (in->Nspin() == 1 || in->Ndim() == 5) return false;
    const int *X = in->X();

    // advance the t-dimension tile (at most two time slices per block)
    if (param.aux.w < X[3] && param.aux.x > 1 && param.aux.w < 2) {
      do { param.aux.w++; } while ((X[3]) % param.aux.w != 0);
      if (param.aux.w <= X[3]) return true;
    }
    // ... (elided)
    // advance the z-dimension tile
    if (param.aux.z < X[2] && param.aux.x > 1 && param.aux.z < 8) {
      do { param.aux.z++; } while ((X[2]) % param.aux.z != 0);
      if (param.aux.z <= X[2]) return true;
    }
    // ... (elided)
    // advance the y-dimension tile
    if (param.aux.y < X[1] && param.aux.x > 1 && param.aux.y < 32) {
      do { param.aux.y++; } while (X[1] % param.aux.y != 0);
      if (param.aux.y <= X[1]) return true;
    }
    // ... (elided)
    // advance the x-dimension tile (the x dimension is checkerboarded, hence 2*X[0])
    if (param.aux.x < (2*X[0]) && param.aux.x < 32) {
      do { param.aux.x++; } while ((2*X[0]) % param.aux.x != 0);
      if (param.aux.x <= (2*X[0])) return true;
    }
    // ... (elided)

    // exhausted all tiles: reset to the default and signal the end of the sweep
    param.aux = make_int4(2, 1, 1, 1);
    return false;
  }

  void initTuneParam(TuneParam &param) const
  {
    Tunable::initTuneParam(param);
    param.aux = make_int4(2, 1, 1, 1);
  }

  void defaultTuneParam(TuneParam &param) const
  {
    Tunable::defaultTuneParam(param);
    param.aux = make_int4(2, 1, 1, 1);
  }
#endif // DSLASH_TUNE_TILE
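  // Illustrative only: with the default tile (2,1,1,1) and a local lattice where
  // 2*X[0] = 16, the x-tile loop above steps param.aux.x through 4, 8 and 16 (values
  // that do not divide 2*X[0] are skipped by the do/while), so only tilings that
  // evenly divide the local volume are ever benchmarked.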

  // ... (elided)
  virtual long long flops() const
  {
    int mv_flops = (8 * in->Ncolor() - 2) * in->Ncolor(); // SU(3) matrix-vector multiply
    int num_mv_multiply = in->Nspin() == 4 ? 2 : 1;
    int ghost_flops = (num_mv_multiply * mv_flops + 2*in->Ncolor()*in->Nspin());
    int xpay_flops = 2 * 2 * in->Ncolor() * in->Nspin(); // multiply and add per real component
    // ... (elided: num_dir, the number of directions gathered per site)

    long long flops_ = 0;
    switch (dslashParam.kernel_type) {
    case EXTERIOR_KERNEL_X:
    case EXTERIOR_KERNEL_Y:
    case EXTERIOR_KERNEL_Z:
    case EXTERIOR_KERNEL_T:
      flops_ = (ghost_flops + (x ? xpay_flops : 0)) * 2 * in->GhostFace()[dslashParam.kernel_type];
      break;
    case EXTERIOR_KERNEL_ALL: {
      long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
      flops_ = (ghost_flops + (x ? xpay_flops : 0)) * ghost_sites;
      break;
    }
    case INTERIOR_KERNEL: {
      long long sites = in->VolumeCB();
      flops_ = (num_dir*(in->Nspin()/4)*in->Ncolor()*in->Nspin() + // spin projection (=0 for staggered)
                num_dir*num_mv_multiply*mv_flops +                 // SU(3) matrix-vector multiplies
                ((num_dir-1)*2*in->Ncolor()*in->Nspin())) * sites; // accumulation
      if (x) flops_ += xpay_flops * sites;

      // subtract the work that is instead done by the exterior kernels
      long long ghost_sites = 0;
      for (int d = 0; d < 4; d++)
        if (dslashParam.commDim[d]) ghost_sites += 2 * in->GhostFace()[d];
      flops_ -= (ghost_flops + (x ? xpay_flops : 0)) * ghost_sites;
      break;
    }
    }
    return flops_;
  }
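  // Illustrative only: for a Wilson-like operator (Nspin = 4, Ncolor = 3) the constants
  // above evaluate to mv_flops = (8*3-2)*3 = 66, num_mv_multiply = 2,
  // ghost_flops = 2*66 + 2*3*4 = 156 and xpay_flops = 2*2*3*4 = 48 per site.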

  virtual long long bytes() const
  {
    int gauge_bytes = reconstruct * in->Precision();
    bool isFixed = (in->Precision() == sizeof(short) || in->Precision() == sizeof(char)) ? true : false;
    int spinor_bytes = 2 * in->Ncolor() * in->Nspin() * in->Precision() + (isFixed ? sizeof(float) : 0);
    int proj_spinor_bytes = (in->Nspin()==4 ? 1 : 2) * in->Ncolor() * in->Nspin() * in->Precision() + (isFixed ? sizeof(float) : 0);
    int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + spinor_bytes;
    // ... (elided: num_dir, as in flops())

    long long bytes_ = 0;
    switch (dslashParam.kernel_type) {
    case EXTERIOR_KERNEL_X:
    case EXTERIOR_KERNEL_Y:
    case EXTERIOR_KERNEL_Z:
    case EXTERIOR_KERNEL_T:
      bytes_ = (ghost_bytes + (x ? spinor_bytes : 0)) * 2 * in->GhostFace()[dslashParam.kernel_type];
      break;
    case EXTERIOR_KERNEL_ALL: {
      long long ghost_sites = 2 * (in->GhostFace()[0]+in->GhostFace()[1]+in->GhostFace()[2]+in->GhostFace()[3]);
      bytes_ = (ghost_bytes + (x ? spinor_bytes : 0)) * ghost_sites;
      break;
    }
    case INTERIOR_KERNEL: {
      long long sites = in->VolumeCB();
      bytes_ = (num_dir*gauge_bytes + ((num_dir-2)*spinor_bytes + 2*proj_spinor_bytes) + spinor_bytes) * sites;
      if (x) bytes_ += spinor_bytes;

      // subtract the traffic that is instead generated by the exterior kernels
      long long ghost_sites = 0;
      for (int d = 0; d < 4; d++)
        if (dslashParam.commDim[d]) ghost_sites += 2*in->GhostFace()[d];
      bytes_ -= (ghost_bytes + (x ? spinor_bytes : 0)) * ghost_sites;
      break;
    }
    }
    return bytes_;
  }
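  // Illustrative only: in double precision (in->Precision() = 8) with 12-real gauge
  // reconstruction, Nspin = 4 and Ncolor = 3, the sizes above are gauge_bytes = 12*8 = 96,
  // spinor_bytes = 2*3*4*8 = 192, proj_spinor_bytes = 1*3*4*8 = 96 and
  // ghost_bytes = 96 + 96 + 192 = 384.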

  // ... (elided: remaining DslashCuda members, including the save/restore buffers
  //      referenced by the saveOut/saveOutNorm constructor initializers)
};

#ifdef SHARED_WILSON_DSLASH
// Helper class for dslash kernels that block the interior kernel through shared memory
class SharedDslashCuda : public DslashCuda
{
protected:
  bool advanceSharedBytes(TuneParam &param) const
  {
    // ... (elided)
  }

  /** Round the block up to a whole number of warps and convert to bytes of shared memory */
  int sharedBytes(const dim3 &block) const
  {
    int block_xy = block.x*block.y;
    if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
    return block_xy*block.z*sharedBytesPerThread();
  }

  /** Grid dimensions that tile the local lattice for a given block */
  dim3 createGrid(const dim3 &block) const
  {
    unsigned int gx = (in->X(0)*in->X(3) + block.x - 1) / block.x;
    unsigned int gy = (in->X(1) + block.y - 1) / block.y;
    unsigned int gz = (in->X(2) + block.z - 1) / block.z;
    return dim3(gx, gy, gz);
  }

  /** Advance the block dimension, keeping the block commensurate with the lattice */
  bool advanceBlockDim(TuneParam &param) const
  {
    // ... (elided)
    const unsigned int min_threads = 2;
    const unsigned int max_threads = 512;
    const unsigned int max_shared = 16384*3;
    // ... (elided: a bool 'set' flag recording whether a valid block was found)

    dim3 blockInit = param.block;
    // ... (elided)
    for (unsigned bx = blockInit.x; bx <= in->X(0); bx++) {
      // ... (elided)
      for (unsigned by = blockInit.y; by <= in->X(1); by++) {
        unsigned int gy = (in->X(1) + by - 1) / by;

        if (by > 1 && (by%2) != 0) continue; // only even y blocks (or by == 1) are supported

        for (unsigned bz = blockInit.z; bz <= in->X(2); bz++) {
          unsigned int gz = (in->X(2) + bz - 1) / bz;

          if (bz > 1 && (bz%2) != 0) continue; // only even z blocks (or bz == 1) are supported
          if (bx*by*bz > max_threads) continue;
          if (bx*by*bz < min_threads) continue;
          // blocks must tile the y and z dimensions exactly
          if (by*gy != in->X(1)) continue;
          if (bz*gz != in->X(2)) continue;
          if (sharedBytes(dim3(bx, by, bz)) > max_shared) continue;

          param.block = dim3(bx, by, bz);
          // ... (elided: record that a valid block was found and exit the search)
        }
        // ... (elided)
      }
      // ... (elided)
    }

    if ((param.block.x > in->X(0) && param.block.y > in->X(1) && param.block.z > in->X(2)) || !set) {
      // ... (elided)
      param.block = dim3(in->X(0), 1, 1);
      // ... (elided)
      return false;
    } else {
      param.grid = createGrid(param.block);
      param.shared_bytes = sharedBytes(param.block);
      return true;
    }
  }
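  // Illustrative only: for a local lattice with X = (16,16,16,32) and a block of
  // (16,4,4), createGrid() above returns gx = (16*32 + 15)/16 = 32, gy = (16+3)/4 = 4
  // and gz = (16+3)/4 = 4, i.e. a 32x4x4 grid covering the full local volume.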

public:
  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
                   const cudaColorSpinorField *x, const GaugeField &gauge,
                   int parity, int dagger, const int *commOverride)
    : DslashCuda(out, in, x, gauge, parity, dagger, commOverride) { ; }

  virtual ~SharedDslashCuda() { ; }

  virtual void initTuneParam(TuneParam &param) const
  {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) return DslashCuda::initTuneParam(param);

    param.block = dim3(in->X(0), 1, 1);
    param.grid = createGrid(param.block);
    param.shared_bytes = sharedBytes(param.block);
  }

  virtual void defaultTuneParam(TuneParam &param) const
  {
    if (dslashParam.kernel_type != INTERIOR_KERNEL) DslashCuda::defaultTuneParam(param);
    else initTuneParam(param);
  }
};
#else // !SHARED_WILSON_DSLASH
// Without the shared-memory dslash this class is a trivial pass-through to DslashCuda
class SharedDslashCuda : public DslashCuda
{
public:
  SharedDslashCuda(cudaColorSpinorField *out, const cudaColorSpinorField *in,
                   const cudaColorSpinorField *x, const GaugeField &gauge,
                   int parity, int dagger, const int *commOverride)
    : DslashCuda(out, in, x, gauge, parity, dagger, commOverride) { }
  // ... (elided)
};
#endif // SHARED_WILSON_DSLASH