3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 8 #else // Open64 compiler 9 #define VOLATILE volatile 13 #define spinorFloat double 38 #define acc00_re accum0.x 39 #define acc00_im accum0.y 40 #define acc01_re accum1.x 41 #define acc01_im accum1.y 42 #define acc02_re accum2.x 43 #define acc02_im accum2.y 44 #define acc10_re accum3.x 45 #define acc10_im accum3.y 46 #define acc11_re accum4.x 47 #define acc11_im accum4.y 48 #define acc12_re accum5.x 49 #define acc12_im accum5.y 50 #define acc20_re accum6.x 51 #define acc20_im accum6.y 52 #define acc21_re accum7.x 53 #define acc21_im accum7.y 54 #define acc22_re accum8.x 55 #define acc22_im accum8.y 56 #define acc30_re accum9.x 57 #define acc30_im accum9.y 58 #define acc31_re accum10.x 59 #define acc31_im accum10.y 60 #define acc32_re accum11.x 61 #define acc32_im accum11.y 63 #define spinorFloat float 88 #define acc00_re accum0.x 89 #define acc00_im accum0.y 90 #define acc01_re accum0.z 91 #define acc01_im accum0.w 92 #define acc02_re accum1.x 93 #define acc02_im accum1.y 94 #define acc10_re accum1.z 95 #define acc10_im accum1.w 96 #define acc11_re accum2.x 97 #define acc11_im accum2.y 98 #define acc12_re accum2.z 99 #define acc12_im accum2.w 100 #define acc20_re accum3.x 101 #define acc20_im accum3.y 102 #define acc21_re accum3.z 103 #define acc21_im accum3.w 104 #define acc22_re accum4.x 105 #define acc22_im accum4.y 106 #define acc30_re accum4.z 107 #define acc30_im accum4.w 108 #define acc31_re accum5.x 109 #define acc31_im accum5.y 110 #define acc32_re accum5.z 111 #define acc32_im accum5.w 112 #endif // SPINOR_DOUBLE 155 #endif // GAUGE_DOUBLE 158 #define gT00_re (+g00_re) 159 #define gT00_im (-g00_im) 160 #define gT01_re (+g10_re) 161 #define gT01_im (-g10_im) 162 #define gT02_re (+g20_re) 163 #define gT02_im (-g20_im) 164 #define gT10_re (+g01_re) 165 #define gT10_im (-g01_im) 166 #define gT11_re (+g11_re) 167 #define gT11_im (-g11_im) 168 #define gT12_re (+g21_re) 169 #define gT12_im (-g21_im) 170 #define gT20_re (+g02_re) 171 #define gT20_im (-g02_im) 172 #define gT21_re (+g12_re) 173 #define gT21_im (-g12_im) 174 #define gT22_re (+g22_re) 175 #define gT22_im (-g22_im) 179 #define c00_00_re C0.x 180 #define c01_01_re C0.y 181 #define c02_02_re C1.x 182 #define c10_10_re C1.y 183 #define c11_11_re C2.x 184 #define c12_12_re C2.y 185 #define c01_00_re C3.x 186 #define c01_00_im C3.y 187 #define c02_00_re C4.x 188 #define c02_00_im C4.y 189 #define c10_00_re C5.x 190 #define c10_00_im C5.y 191 #define c11_00_re C6.x 192 #define c11_00_im C6.y 193 #define c12_00_re C7.x 194 #define c12_00_im C7.y 195 #define c02_01_re C8.x 196 #define c02_01_im C8.y 197 #define c10_01_re C9.x 198 #define c10_01_im C9.y 199 #define c11_01_re C10.x 200 #define c11_01_im C10.y 201 #define c12_01_re C11.x 202 #define c12_01_im C11.y 203 #define c10_02_re C12.x 204 #define c10_02_im C12.y 205 #define c11_02_re C13.x 206 #define c11_02_im C13.y 207 #define c12_02_re C14.x 208 #define c12_02_im C14.y 209 #define c11_10_re C15.x 210 #define c11_10_im C15.y 211 #define c12_10_re C16.x 212 #define c12_10_im C16.y 213 #define c12_11_re C17.x 214 #define c12_11_im C17.y 216 #define c00_00_re C0.x 217 #define c01_01_re C0.y 218 #define c02_02_re C0.z 219 #define c10_10_re C0.w 220 #define c11_11_re C1.x 221 #define c12_12_re C1.y 222 #define c01_00_re C1.z 223 #define c01_00_im C1.w 224 #define c02_00_re C2.x 225 #define c02_00_im C2.y 226 #define c10_00_re C2.z 227 #define c10_00_im C2.w 228 #define c11_00_re C3.x 229 #define c11_00_im C3.y 230 #define c12_00_re C3.z 231 #define c12_00_im C3.w 232 #define c02_01_re C4.x 233 #define c02_01_im C4.y 234 #define c10_01_re C4.z 235 #define c10_01_im C4.w 236 #define c11_01_re C5.x 237 #define c11_01_im C5.y 238 #define c12_01_re C5.z 239 #define c12_01_im C5.w 240 #define c10_02_re C6.x 241 #define c10_02_im C6.y 242 #define c11_02_re C6.z 243 #define c11_02_im C6.w 244 #define c12_02_re C7.x 245 #define c12_02_im C7.y 246 #define c11_10_re C7.z 247 #define c11_10_im C7.w 248 #define c12_10_re C8.x 249 #define c12_10_im C8.y 250 #define c12_11_re C8.z 251 #define c12_11_im C8.w 252 #endif // CLOVER_DOUBLE 254 #define c00_01_re (+c01_00_re) 255 #define c00_01_im (-c01_00_im) 256 #define c00_02_re (+c02_00_re) 257 #define c00_02_im (-c02_00_im) 258 #define c01_02_re (+c02_01_re) 259 #define c01_02_im (-c02_01_im) 260 #define c00_10_re (+c10_00_re) 261 #define c00_10_im (-c10_00_im) 262 #define c01_10_re (+c10_01_re) 263 #define c01_10_im (-c10_01_im) 264 #define c02_10_re (+c10_02_re) 265 #define c02_10_im (-c10_02_im) 266 #define c00_11_re (+c11_00_re) 267 #define c00_11_im (-c11_00_im) 268 #define c01_11_re (+c11_01_re) 269 #define c01_11_im (-c11_01_im) 270 #define c02_11_re (+c11_02_re) 271 #define c02_11_im (-c11_02_im) 272 #define c10_11_re (+c11_10_re) 273 #define c10_11_im (-c11_10_im) 274 #define c00_12_re (+c12_00_re) 275 #define c00_12_im (-c12_00_im) 276 #define c01_12_re (+c12_01_re) 277 #define c01_12_im (-c12_01_im) 278 #define c02_12_re (+c12_02_re) 279 #define c02_12_im (-c12_02_im) 280 #define c10_12_re (+c12_10_re) 281 #define c10_12_im (-c12_10_im) 282 #define c11_12_re (+c12_11_re) 283 #define c11_12_im (-c12_11_im) 286 #define c20_20_re c00_00_re 287 #define c21_20_re c01_00_re 288 #define c21_20_im c01_00_im 289 #define c22_20_re c02_00_re 290 #define c22_20_im c02_00_im 291 #define c30_20_re c10_00_re 292 #define c30_20_im c10_00_im 293 #define c31_20_re c11_00_re 294 #define c31_20_im c11_00_im 295 #define c32_20_re c12_00_re 296 #define c32_20_im c12_00_im 297 #define c20_21_re c00_01_re 298 #define c20_21_im c00_01_im 299 #define c21_21_re c01_01_re 300 #define c22_21_re c02_01_re 301 #define c22_21_im c02_01_im 302 #define c30_21_re c10_01_re 303 #define c30_21_im c10_01_im 304 #define c31_21_re c11_01_re 305 #define c31_21_im c11_01_im 306 #define c32_21_re c12_01_re 307 #define c32_21_im c12_01_im 308 #define c20_22_re c00_02_re 309 #define c20_22_im c00_02_im 310 #define c21_22_re c01_02_re 311 #define c21_22_im c01_02_im 312 #define c22_22_re c02_02_re 313 #define c30_22_re c10_02_re 314 #define c30_22_im c10_02_im 315 #define c31_22_re c11_02_re 316 #define c31_22_im c11_02_im 317 #define c32_22_re c12_02_re 318 #define c32_22_im c12_02_im 319 #define c20_30_re c00_10_re 320 #define c20_30_im c00_10_im 321 #define c21_30_re c01_10_re 322 #define c21_30_im c01_10_im 323 #define c22_30_re c02_10_re 324 #define c22_30_im c02_10_im 325 #define c30_30_re c10_10_re 326 #define c31_30_re c11_10_re 327 #define c31_30_im c11_10_im 328 #define c32_30_re c12_10_re 329 #define c32_30_im c12_10_im 330 #define c20_31_re c00_11_re 331 #define c20_31_im c00_11_im 332 #define c21_31_re c01_11_re 333 #define c21_31_im c01_11_im 334 #define c22_31_re c02_11_re 335 #define c22_31_im c02_11_im 336 #define c30_31_re c10_11_re 337 #define c30_31_im c10_11_im 338 #define c31_31_re c11_11_re 339 #define c32_31_re c12_11_re 340 #define c32_31_im c12_11_im 341 #define c20_32_re c00_12_re 342 #define c20_32_im c00_12_im 343 #define c21_32_re c01_12_re 344 #define c21_32_im c01_12_im 345 #define c22_32_re c02_12_re 346 #define c22_32_im c02_12_im 347 #define c30_32_re c10_12_re 348 #define c30_32_im c10_12_im 349 #define c31_32_re c11_12_re 350 #define c31_32_im c11_12_im 351 #define c32_32_re c12_12_re 356 #define cinv00_00_re C0.x 357 #define cinv01_01_re C0.y 358 #define cinv02_02_re C1.x 359 #define cinv10_10_re C1.y 360 #define cinv11_11_re C2.x 361 #define cinv12_12_re C2.y 362 #define cinv01_00_re C3.x 363 #define cinv01_00_im C3.y 364 #define cinv02_00_re C4.x 365 #define cinv02_00_im C4.y 366 #define cinv10_00_re C5.x 367 #define cinv10_00_im C5.y 368 #define cinv11_00_re C6.x 369 #define cinv11_00_im C6.y 370 #define cinv12_00_re C7.x 371 #define cinv12_00_im C7.y 372 #define cinv02_01_re C8.x 373 #define cinv02_01_im C8.y 374 #define cinv10_01_re C9.x 375 #define cinv10_01_im C9.y 376 #define cinv11_01_re C10.x 377 #define cinv11_01_im C10.y 378 #define cinv12_01_re C11.x 379 #define cinv12_01_im C11.y 380 #define cinv10_02_re C12.x 381 #define cinv10_02_im C12.y 382 #define cinv11_02_re C13.x 383 #define cinv11_02_im C13.y 384 #define cinv12_02_re C14.x 385 #define cinv12_02_im C14.y 386 #define cinv11_10_re C15.x 387 #define cinv11_10_im C15.y 388 #define cinv12_10_re C16.x 389 #define cinv12_10_im C16.y 390 #define cinv12_11_re C17.x 391 #define cinv12_11_im C17.y 393 #define cinv00_00_re C0.x 394 #define cinv01_01_re C0.y 395 #define cinv02_02_re C0.z 396 #define cinv10_10_re C0.w 397 #define cinv11_11_re C1.x 398 #define cinv12_12_re C1.y 399 #define cinv01_00_re C1.z 400 #define cinv01_00_im C1.w 401 #define cinv02_00_re C2.x 402 #define cinv02_00_im C2.y 403 #define cinv10_00_re C2.z 404 #define cinv10_00_im C2.w 405 #define cinv11_00_re C3.x 406 #define cinv11_00_im C3.y 407 #define cinv12_00_re C3.z 408 #define cinv12_00_im C3.w 409 #define cinv02_01_re C4.x 410 #define cinv02_01_im C4.y 411 #define cinv10_01_re C4.z 412 #define cinv10_01_im C4.w 413 #define cinv11_01_re C5.x 414 #define cinv11_01_im C5.y 415 #define cinv12_01_re C5.z 416 #define cinv12_01_im C5.w 417 #define cinv10_02_re C6.x 418 #define cinv10_02_im C6.y 419 #define cinv11_02_re C6.z 420 #define cinv11_02_im C6.w 421 #define cinv12_02_re C7.x 422 #define cinv12_02_im C7.y 423 #define cinv11_10_re C7.z 424 #define cinv11_10_im C7.w 425 #define cinv12_10_re C8.x 426 #define cinv12_10_im C8.y 427 #define cinv12_11_re C8.z 428 #define cinv12_11_im C8.w 429 #endif // CLOVER_DOUBLE 431 #define cinv00_01_re (+cinv01_00_re) 432 #define cinv00_01_im (-cinv01_00_im) 433 #define cinv00_02_re (+cinv02_00_re) 434 #define cinv00_02_im (-cinv02_00_im) 435 #define cinv01_02_re (+cinv02_01_re) 436 #define cinv01_02_im (-cinv02_01_im) 437 #define cinv00_10_re (+cinv10_00_re) 438 #define cinv00_10_im (-cinv10_00_im) 439 #define cinv01_10_re (+cinv10_01_re) 440 #define cinv01_10_im (-cinv10_01_im) 441 #define cinv02_10_re (+cinv10_02_re) 442 #define cinv02_10_im (-cinv10_02_im) 443 #define cinv00_11_re (+cinv11_00_re) 444 #define cinv00_11_im (-cinv11_00_im) 445 #define cinv01_11_re (+cinv11_01_re) 446 #define cinv01_11_im (-cinv11_01_im) 447 #define cinv02_11_re (+cinv11_02_re) 448 #define cinv02_11_im (-cinv11_02_im) 449 #define cinv10_11_re (+cinv11_10_re) 450 #define cinv10_11_im (-cinv11_10_im) 451 #define cinv00_12_re (+cinv12_00_re) 452 #define cinv00_12_im (-cinv12_00_im) 453 #define cinv01_12_re (+cinv12_01_re) 454 #define cinv01_12_im (-cinv12_01_im) 455 #define cinv02_12_re (+cinv12_02_re) 456 #define cinv02_12_im (-cinv12_02_im) 457 #define cinv10_12_re (+cinv12_10_re) 458 #define cinv10_12_im (-cinv12_10_im) 459 #define cinv11_12_re (+cinv12_11_re) 460 #define cinv11_12_im (-cinv12_11_im) 463 #define cinv20_20_re cinv00_00_re 464 #define cinv21_20_re cinv01_00_re 465 #define cinv21_20_im cinv01_00_im 466 #define cinv22_20_re cinv02_00_re 467 #define cinv22_20_im cinv02_00_im 468 #define cinv30_20_re cinv10_00_re 469 #define cinv30_20_im cinv10_00_im 470 #define cinv31_20_re cinv11_00_re 471 #define cinv31_20_im cinv11_00_im 472 #define cinv32_20_re cinv12_00_re 473 #define cinv32_20_im cinv12_00_im 474 #define cinv20_21_re cinv00_01_re 475 #define cinv20_21_im cinv00_01_im 476 #define cinv21_21_re cinv01_01_re 477 #define cinv22_21_re cinv02_01_re 478 #define cinv22_21_im cinv02_01_im 479 #define cinv30_21_re cinv10_01_re 480 #define cinv30_21_im cinv10_01_im 481 #define cinv31_21_re cinv11_01_re 482 #define cinv31_21_im cinv11_01_im 483 #define cinv32_21_re cinv12_01_re 484 #define cinv32_21_im cinv12_01_im 485 #define cinv20_22_re cinv00_02_re 486 #define cinv20_22_im cinv00_02_im 487 #define cinv21_22_re cinv01_02_re 488 #define cinv21_22_im cinv01_02_im 489 #define cinv22_22_re cinv02_02_re 490 #define cinv30_22_re cinv10_02_re 491 #define cinv30_22_im cinv10_02_im 492 #define cinv31_22_re cinv11_02_re 493 #define cinv31_22_im cinv11_02_im 494 #define cinv32_22_re cinv12_02_re 495 #define cinv32_22_im cinv12_02_im 496 #define cinv20_30_re cinv00_10_re 497 #define cinv20_30_im cinv00_10_im 498 #define cinv21_30_re cinv01_10_re 499 #define cinv21_30_im cinv01_10_im 500 #define cinv22_30_re cinv02_10_re 501 #define cinv22_30_im cinv02_10_im 502 #define cinv30_30_re cinv10_10_re 503 #define cinv31_30_re cinv11_10_re 504 #define cinv31_30_im cinv11_10_im 505 #define cinv32_30_re cinv12_10_re 506 #define cinv32_30_im cinv12_10_im 507 #define cinv20_31_re cinv00_11_re 508 #define cinv20_31_im cinv00_11_im 509 #define cinv21_31_re cinv01_11_re 510 #define cinv21_31_im cinv01_11_im 511 #define cinv22_31_re cinv02_11_re 512 #define cinv22_31_im cinv02_11_im 513 #define cinv30_31_re cinv10_11_re 514 #define cinv30_31_im cinv10_11_im 515 #define cinv31_31_re cinv11_11_re 516 #define cinv32_31_re cinv12_11_re 517 #define cinv32_31_im cinv12_11_im 518 #define cinv20_32_re cinv00_12_re 519 #define cinv20_32_im cinv00_12_im 520 #define cinv21_32_re cinv01_12_re 521 #define cinv21_32_im cinv01_12_im 522 #define cinv22_32_re cinv02_12_re 523 #define cinv22_32_im cinv02_12_im 524 #define cinv30_32_re cinv10_12_re 525 #define cinv30_32_im cinv10_12_im 526 #define cinv31_32_re cinv11_12_re 527 #define cinv31_32_im cinv11_12_im 528 #define cinv32_32_re cinv12_12_re 567 #endif // CLOVER_DOUBLE 675 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][1];
676 #if (DD_PREC==2) // half precision 677 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
716 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
869 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][0];
870 #if (DD_PREC==2) // half precision 871 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
914 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1067 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1068 #if (DD_PREC==2) // half precision 1069 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1108 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1261 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1262 #if (DD_PREC==2) // half precision 1263 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1306 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1459 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1460 #if (DD_PREC==2) // half precision 1461 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1500 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1653 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1654 #if (DD_PREC==2) // half precision 1655 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1698 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1851 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1852 #if (DD_PREC==2) // half precision 1853 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1894 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1965 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2108 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
2109 #if (DD_PREC==2) // half precision 2110 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
2155 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2226 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2360 switch(kernel_type) {
2375 #ifdef SPINOR_DOUBLE 2381 #ifdef SPINOR_DOUBLE 2386 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2388 #ifndef CLOVER_TWIST_XPAY 2390 #ifndef DYNAMIC_CLOVER 2445 #endif//CLOVER_TWIST_XPAY 2447 #ifndef DYNAMIC_CLOVER
VOLATILE spinorFloat o00_re
#define APPLY_CLOVER_TWIST(c, a, reg)
VOLATILE spinorFloat o10_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
RECONSTRUCT_GAUGE_MATRIX(0)
APPLY_CLOVER_TWIST_INV(c, cinv, a, o)
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o01_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define APPLY_CLOVER_TWIST_DYN_INV(c, a, reg)
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o20_re
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
VOLATILE spinorFloat o01_im
WRITE_SPINOR(param.sp_stride)
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o21_re
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o02_im
#define READ_SPINOR_GHOST
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o31_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o32_re