QUDA v0.4.0
A library for QCD on GPUs
|
00001 // wilson_dslash_def.h - Dslash kernel definitions 00002 00003 // There are currently 72 different variants of the Wilson Dslash 00004 // kernel, each one characterized by a set of 5 options, where each 00005 // option can take one of several values (2*3*2*2*3 = 72). This file 00006 // is structured so that the C preprocessor loops through all 72 00007 // variants (in a manner resembling a counter), sets the appropriate 00008 // macros, and defines the corresponding functions. 00009 // 00010 // As an example of the function naming conventions, consider 00011 // 00012 // cloverDslash12DaggerXpayKernel(float4* out, ...). 00013 // 00014 // This is a clover Dslash^dagger kernel where the result is 00015 // multiplied by "a" and summed with an input vector (Xpay), and the 00016 // gauge matrix is reconstructed from 12 real numbers. More 00017 // generally, each function name is given by the concatenation of the 00018 // following 4 fields, with "Kernel" at the end: 00019 // 00020 // DD_NAME_F = dslash, cloverDslash 00021 // DD_RECON_F = 8, 12, 18 00022 // DD_DAG_F = Dagger, [blank] 00023 // DD_XPAY_F = Xpay, [blank] 00024 // 00025 // In addition, the kernels are templated on the precision of the 00026 // fields (double, single, or half). 00027 00028 // initialize on first iteration 00029 00030 #ifndef DD_LOOP 00031 #define DD_LOOP 00032 #define DD_DAG 0 00033 #define DD_XPAY 0 00034 #define DD_RECON 0 00035 #define DD_PREC 0 00036 #define DD_CLOVER 0 00037 #endif 00038 00039 // set options for current iteration 00040 00041 #if (DD_CLOVER==0) // no clover 00042 #define DD_NAME_F dslash 00043 #else // clover 00044 #define DSLASH_CLOVER 00045 #define DD_NAME_F cloverDslash 00046 #endif 00047 00048 #if (DD_DAG==0) // no dagger 00049 #define DD_DAG_F 00050 #else // dagger 00051 #define DD_DAG_F Dagger 00052 #endif 00053 00054 #if (DD_XPAY==0) // no xpay 00055 #define DD_XPAY_F 00056 #else // xpay 00057 #define DD_XPAY_F Xpay 00058 #define DSLASH_XPAY 00059 #endif 00060 00061 #if (DD_PREC == 0) 00062 #define DD_PARAM_XPAY const double2 *x, const float *xNorm, const double a, 00063 #elif (DD_PREC == 1) 00064 #define DD_PARAM_XPAY const float4 *x, const float *xNorm, const float a, 00065 #else 00066 #define DD_PARAM_XPAY const short4 *x, const float *xNorm, const float a, 00067 #endif 00068 00069 #if (DD_RECON==0) // reconstruct from 8 reals 00070 #define DD_RECON_F 8 00071 00072 #if (DD_PREC==0) 00073 #define DD_PARAM_GAUGE const double2 *gauge0, const double2 *gauge1, 00074 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_8_DOUBLE 00075 #ifdef DIRECT_ACCESS_LINK 00076 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_DOUBLE2 00077 #else 00078 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_DOUBLE2_TEX 00079 #endif // DIRECT_ACCESS_LINK 00080 00081 #elif (DD_PREC==1) 00082 #define DD_PARAM_GAUGE const float4 *gauge0, const float4 *gauge1, 00083 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_8_SINGLE 00084 #ifdef DIRECT_ACCESS_LINK 00085 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_FLOAT4 00086 #else 00087 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_FLOAT4_TEX 00088 #endif // DIRECT_ACCESS_LINK 00089 00090 #else 00091 #define DD_PARAM_GAUGE const short4 *gauge0, const short4* gauge1, 00092 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_8_SINGLE 00093 #ifdef DIRECT_ACCESS_LINK 00094 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_SHORT4 00095 #else 00096 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_8_SHORT4_TEX 00097 #endif // DIRECT_ACCESS_LINK 00098 #endif // DD_PREC 00099 #elif (DD_RECON==1) // reconstruct from 12 reals 00100 #define DD_RECON_F 12 00101 00102 #if (DD_PREC==0) 00103 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_12_DOUBLE 00104 #ifdef DIRECT_ACCESS_LINK 00105 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12_DOUBLE2 00106 #else 00107 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12_DOUBLE2_TEX 00108 #endif // DIRECT_ACCESS_LINK 00109 #define DD_PARAM_GAUGE const double2 *gauge0, const double2 *gauge1, 00110 00111 #elif (DD_PREC==1) 00112 #define DD_PARAM_GAUGE const float4 *gauge0, const float4 *gauge1, 00113 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_12_SINGLE 00114 #ifdef DIRECT_ACCESS_LINK 00115 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12_FLOAT4 00116 #else 00117 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12_FLOAT4_TEX 00118 #endif // DIRECT_ACCESS_LINK 00119 00120 #else 00121 #define DD_PARAM_GAUGE const short4 *gauge0, const short4 *gauge1, 00122 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_12_SINGLE 00123 #ifdef DIRECT_ACCESS_LINK 00124 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12_SHORT4 00125 #else 00126 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_12_SHORT4_TEX 00127 #endif // DIRECT_ACCESS_LINK 00128 #endif // DD_PREC 00129 #else // no reconstruct, load all components 00130 #define DD_RECON_F 18 00131 #define GAUGE_FLOAT2 00132 #if (DD_PREC==0) 00133 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_18_DOUBLE 00134 #ifdef DIRECT_ACCESS_LINK 00135 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_18_DOUBLE2 00136 #else 00137 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_18_DOUBLE2_TEX 00138 #endif // DIRECT_ACCESS_LINK 00139 #define DD_PARAM_GAUGE const double2 *gauge0, const double2 *gauge1, 00140 00141 #elif (DD_PREC==1) 00142 #define DD_PARAM_GAUGE const float4 *gauge0, const float4 *gauge1, // FIXME for direct reading, really float2 00143 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_18_SINGLE 00144 #ifdef DIRECT_ACCESS_LINK 00145 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_18_FLOAT2 00146 #else 00147 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_18_FLOAT2_TEX 00148 #endif // DIRECT_ACCESS_LINK 00149 00150 #else 00151 #define DD_PARAM_GAUGE const short4 *gauge0, const short4 *gauge1, // FIXME for direct reading, really short2 00152 #define RECONSTRUCT_GAUGE_MATRIX RECONSTRUCT_MATRIX_18_SINGLE 00153 #ifdef DIRECT_ACCESS_LINK 00154 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_18_SHORT2 00155 #else 00156 #define READ_GAUGE_MATRIX READ_GAUGE_MATRIX_18_SHORT2_TEX 00157 #endif //DIRECT_ACCESS_LINK 00158 #endif 00159 #endif 00160 00161 #if (DD_PREC==0) // double-precision fields 00162 00163 #define TPROJSCALE tProjScale 00164 00165 // double-precision gauge field 00166 #if (defined DIRECT_ACCESS_WILSON_GAUGE) || (defined FERMI_NO_DBLE_TEX) 00167 #define GAUGE0TEX gauge0 00168 #define GAUGE1TEX gauge1 00169 #else 00170 #define GAUGE0TEX gauge0TexDouble2 00171 #define GAUGE1TEX gauge1TexDouble2 00172 #endif 00173 00174 #define GAUGE_FLOAT2 00175 00176 // double-precision spinor fields 00177 #define DD_PARAM_OUT double2* out, float *null1, 00178 #define DD_PARAM_IN const double2* in, const float *null4, 00179 00180 #if (defined DIRECT_ACCESS_WILSON_SPINOR) || (defined FERMI_NO_DBLE_TEX) 00181 #define READ_SPINOR READ_SPINOR_DOUBLE 00182 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP 00183 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN 00184 #define SPINORTEX in 00185 #else 00186 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX 00187 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX 00188 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX 00189 #define SPINORTEX spinorTexDouble 00190 #endif 00191 #if (defined DIRECT_ACCESS_WILSON_INTER) || (defined FERMI_NO_DBLE_TEX) 00192 #define READ_INTERMEDIATE_SPINOR READ_SPINOR_DOUBLE 00193 #define INTERTEX out 00194 #else 00195 #define READ_INTERMEDIATE_SPINOR READ_SPINOR_DOUBLE_TEX 00196 #define INTERTEX interTexDouble 00197 #endif 00198 #define WRITE_SPINOR WRITE_SPINOR_DOUBLE2_STR 00199 #define SPINOR_DOUBLE 00200 #if (DD_XPAY==1) 00201 #if (defined DIRECT_ACCESS_WILSON_ACCUM) || (defined FERMI_NO_DBLE_TEX) 00202 #define ACCUMTEX x 00203 #define READ_ACCUM READ_ACCUM_DOUBLE 00204 #else 00205 #define ACCUMTEX accumTexDouble 00206 #define READ_ACCUM READ_ACCUM_DOUBLE_TEX 00207 #endif 00208 00209 #endif 00210 00211 #define SPINOR_HOP 12 00212 00213 // double-precision clover field 00214 #if (DD_CLOVER==0) 00215 #define DD_PARAM_CLOVER 00216 #else 00217 #define DD_PARAM_CLOVER const double2 *clover, const float *null3, 00218 #endif 00219 #if (defined DIRECT_ACCESS_CLOVER) || (defined FERMI_NO_DBLE_TEX) 00220 #define CLOVERTEX clover 00221 #define READ_CLOVER READ_CLOVER_DOUBLE 00222 #else 00223 #define CLOVERTEX cloverTexDouble 00224 #define READ_CLOVER READ_CLOVER_DOUBLE_TEX 00225 #endif 00226 #define CLOVER_DOUBLE 00227 00228 #elif (DD_PREC==1) // single-precision fields 00229 00230 #define TPROJSCALE tProjScale_f 00231 00232 // single-precision gauge field 00233 #ifdef DIRECT_ACCESS_LINK 00234 #define GAUGE0TEX gauge0 00235 #define GAUGE1TEX gauge1 00236 #else 00237 #if (DD_RECON_F == 18) 00238 #define GAUGE0TEX gauge0TexSingle2 00239 #define GAUGE1TEX gauge1TexSingle2 00240 #else 00241 #define GAUGE0TEX gauge0TexSingle4 00242 #define GAUGE1TEX gauge1TexSingle4 00243 #endif 00244 #endif 00245 00246 00247 // single-precision spinor fields 00248 #define DD_PARAM_OUT float4* out, float *null1, 00249 #define DD_PARAM_IN const float4* in, const float *null4, 00250 #ifdef DIRECT_ACCESS_WILSON_SPINOR 00251 #define READ_SPINOR READ_SPINOR_SINGLE 00252 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP 00253 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN 00254 #define SPINORTEX in 00255 #else 00256 #define READ_SPINOR READ_SPINOR_SINGLE_TEX 00257 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX 00258 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX 00259 #define SPINORTEX spinorTexSingle 00260 #endif 00261 #ifdef DIRECT_ACCESS_WILSON_INTER 00262 #define READ_INTERMEDIATE_SPINOR READ_SPINOR_SINGLE 00263 #define INTERTEX out 00264 #else 00265 #define READ_INTERMEDIATE_SPINOR READ_SPINOR_SINGLE_TEX 00266 #define INTERTEX interTexSingle 00267 #endif 00268 #define WRITE_SPINOR WRITE_SPINOR_FLOAT4_STR 00269 #if (DD_XPAY==1) 00270 #ifdef DIRECT_ACCESS_WILSON_ACCUM 00271 #define ACCUMTEX x 00272 #define READ_ACCUM READ_ACCUM_SINGLE 00273 #else 00274 #define ACCUMTEX accumTexSingle 00275 #define READ_ACCUM READ_ACCUM_SINGLE_TEX 00276 #endif 00277 #endif 00278 00279 #define SPINOR_HOP 6 00280 00281 // single-precision clover field 00282 #if (DD_CLOVER==0) 00283 #define DD_PARAM_CLOVER 00284 #else 00285 #define DD_PARAM_CLOVER const float4 *clover, const float *null3, 00286 #endif 00287 #ifdef DIRECT_ACCESS_CLOVER 00288 #define CLOVERTEX clover 00289 #define READ_CLOVER READ_CLOVER_SINGLE 00290 #else 00291 #define CLOVERTEX cloverTexSingle 00292 #define READ_CLOVER READ_CLOVER_SINGLE_TEX 00293 #endif 00294 00295 #else // half-precision fields 00296 00297 #define TPROJSCALE tProjScale_f 00298 00299 // half-precision gauge field 00300 #ifdef DIRECT_ACCESS_LINK 00301 #define GAUGE0TEX gauge0 00302 #define GAUGE1TEX gauge1 00303 #else 00304 #if (DD_RECON_F == 18) 00305 #define GAUGE0TEX gauge0TexHalf2 00306 #define GAUGE1TEX gauge1TexHalf2 00307 #else 00308 #define GAUGE0TEX gauge0TexHalf4 00309 #define GAUGE1TEX gauge1TexHalf4 00310 #endif 00311 #endif 00312 00313 00314 // half-precision spinor fields 00315 #ifdef DIRECT_ACCESS_WILSON_SPINOR 00316 #define READ_SPINOR READ_SPINOR_HALF 00317 #define READ_SPINOR_UP READ_SPINOR_HALF_UP 00318 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN 00319 #define SPINORTEX in 00320 #else 00321 #define READ_SPINOR READ_SPINOR_HALF_TEX 00322 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX 00323 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX 00324 #define SPINORTEX spinorTexHalf 00325 #endif 00326 #ifdef DIRECT_ACCESS_WILSON_INTER 00327 #define READ_INTERMEDIATE_SPINOR READ_SPINOR_HALF 00328 #define INTERTEX out 00329 #else 00330 #define READ_INTERMEDIATE_SPINOR READ_SPINOR_HALF_TEX 00331 #define INTERTEX interTexHalf 00332 #endif 00333 #define DD_PARAM_OUT short4* out, float *outNorm, 00334 #define DD_PARAM_IN const short4* in, const float *inNorm, 00335 #define WRITE_SPINOR WRITE_SPINOR_SHORT4_STR 00336 #if (DD_XPAY==1) 00337 #ifdef DIRECT_ACCESS_WILSON_ACCUM 00338 #define ACCUMTEX x 00339 #define READ_ACCUM READ_ACCUM_HALF 00340 #else 00341 #define ACCUMTEX accumTexHalf 00342 #define READ_ACCUM READ_ACCUM_HALF_TEX 00343 #endif 00344 #endif 00345 00346 #define SPINOR_HOP 6 00347 00348 // half-precision clover field 00349 #if (DD_CLOVER==0) 00350 #define DD_PARAM_CLOVER 00351 #else 00352 #define DD_PARAM_CLOVER const short4 *clover, const float *cloverNorm, 00353 #endif 00354 #ifdef DIRECT_ACCESS_CLOVER 00355 #define CLOVERTEX clover 00356 #define READ_CLOVER READ_CLOVER_HALF 00357 #else 00358 #define CLOVERTEX cloverTexHalf 00359 #define READ_CLOVER READ_CLOVER_HALF_TEX 00360 #endif 00361 00362 #endif 00363 00364 // only build double precision if supported 00365 #if !(__COMPUTE_CAPABILITY__ < 130 && DD_PREC == 0) 00366 00367 #define DD_CONCAT(n,r,d,x) n ## r ## d ## x ## Kernel 00368 #define DD_FUNC(n,r,d,x) DD_CONCAT(n,r,d,x) 00369 00370 #ifdef GPU_WILSON_DIRAC 00371 #define BUILD_WILSON 1 00372 #else 00373 #define BUILD_WILSON 0 00374 #endif 00375 00376 #ifdef GPU_CLOVER_DIRAC 00377 #define BUILD_CLOVER 1 00378 #else 00379 #define BUILD_CLOVER 0 00380 #endif 00381 00382 // define the kernel 00383 00384 template <KernelType kernel_type> 00385 __global__ void DD_FUNC(DD_NAME_F, DD_RECON_F, DD_DAG_F, DD_XPAY_F) 00386 (DD_PARAM_OUT DD_PARAM_GAUGE DD_PARAM_CLOVER DD_PARAM_IN DD_PARAM_XPAY const DslashParam param) { 00387 00388 // build Wilson or clover as appropriate 00389 #if ((DD_CLOVER==0 && BUILD_WILSON) || (DD_CLOVER==1 && BUILD_CLOVER)) 00390 00391 #if (__COMPUTE_CAPABILITY__ >= 200 && defined(SHARED_WILSON_DSLASH)) // Fermi optimal code 00392 00393 #if DD_DAG 00394 #include "wilson_dslash_dagger_fermi_core.h" 00395 #else 00396 #include "wilson_dslash_fermi_core.h" 00397 #endif 00398 00399 #elif (__COMPUTE_CAPABILITY__ >= 120) // GT200 optimal code 00400 00401 #if DD_DAG 00402 #include "wilson_dslash_dagger_gt200_core.h" 00403 #else 00404 #include "wilson_dslash_gt200_core.h" 00405 #endif 00406 00407 #else // fall-back is original G80 00408 00409 #if DD_DAG 00410 #include "wilson_dslash_dagger_g80_core.h" 00411 #else 00412 #include "wilson_dslash_g80_core.h" 00413 #endif 00414 00415 #endif 00416 00417 00418 #endif 00419 00420 } 00421 00422 #endif 00423 00424 // clean up 00425 00426 #undef DD_NAME_F 00427 #undef DD_RECON_F 00428 #undef DD_DAG_F 00429 #undef DD_XPAY_F 00430 #undef DD_PARAM_OUT 00431 #undef DD_PARAM_GAUGE 00432 #undef DD_PARAM_CLOVER 00433 #undef DD_PARAM_IN 00434 #undef DD_PARAM_XPAY 00435 #undef DD_CONCAT 00436 #undef DD_FUNC 00437 00438 #undef DSLASH_XPAY 00439 #undef READ_GAUGE_MATRIX 00440 #undef RECONSTRUCT_GAUGE_MATRIX 00441 #undef GAUGE0TEX 00442 #undef GAUGE1TEX 00443 #undef READ_SPINOR 00444 #undef READ_SPINOR_UP 00445 #undef READ_SPINOR_DOWN 00446 #undef SPINORTEX 00447 #undef READ_INTERMEDIATE_SPINOR 00448 #undef INTERTEX 00449 #undef WRITE_SPINOR 00450 #undef READ_ACCUM 00451 #undef ACCUMTEX 00452 #undef READ_CLOVER 00453 #undef CLOVERTEX 00454 #undef DSLASH_CLOVER 00455 #undef GAUGE_FLOAT2 00456 #undef SPINOR_DOUBLE 00457 #undef CLOVER_DOUBLE 00458 #undef SPINOR_HOP 00459 00460 #undef TPROJSCALE 00461 00462 // prepare next set of options, or clean up after final iteration 00463 00464 #if (DD_DAG==0) 00465 #undef DD_DAG 00466 #define DD_DAG 1 00467 #else 00468 #undef DD_DAG 00469 #define DD_DAG 0 00470 00471 #if (DD_XPAY==0) 00472 #undef DD_XPAY 00473 #define DD_XPAY 1 00474 #else 00475 #undef DD_XPAY 00476 #define DD_XPAY 0 00477 00478 #if (DD_RECON==0) 00479 #undef DD_RECON 00480 #define DD_RECON 1 00481 #elif (DD_RECON==1) 00482 #undef DD_RECON 00483 #define DD_RECON 2 00484 #else 00485 #undef DD_RECON 00486 #define DD_RECON 0 00487 00488 #if (DD_PREC==0) 00489 #undef DD_PREC 00490 #define DD_PREC 1 00491 #elif (DD_PREC==1) 00492 #undef DD_PREC 00493 #define DD_PREC 2 00494 #else 00495 #undef DD_PREC 00496 #define DD_PREC 0 00497 00498 #if (DD_CLOVER==0) 00499 #undef DD_CLOVER 00500 #define DD_CLOVER 1 00501 00502 #else 00503 00504 #undef DD_LOOP 00505 #undef DD_DAG 00506 #undef DD_XPAY 00507 #undef DD_RECON 00508 #undef DD_PREC 00509 #undef DD_CLOVER 00510 00511 #endif // DD_CLOVER 00512 #endif // DD_PREC 00513 #endif // DD_RECON 00514 #endif // DD_XPAY 00515 #endif // DD_DAG 00516 00517 #ifdef DD_LOOP 00518 #include "wilson_dslash_def.h" 00519 #endif