3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24 6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 8 #else // Open64 compiler 9 #define VOLATILE volatile 13 #define spinorFloat double 14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2 15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2 40 #define acc00_re accum0.x 41 #define acc00_im accum0.y 42 #define acc01_re accum1.x 43 #define acc01_im accum1.y 44 #define acc02_re accum2.x 45 #define acc02_im accum2.y 46 #define acc10_re accum3.x 47 #define acc10_im accum3.y 48 #define acc11_re accum4.x 49 #define acc11_im accum4.y 50 #define acc12_re accum5.x 51 #define acc12_im accum5.y 52 #define acc20_re accum6.x 53 #define acc20_im accum6.y 54 #define acc21_re accum7.x 55 #define acc21_im accum7.y 56 #define acc22_re accum8.x 57 #define acc22_im accum8.y 58 #define acc30_re accum9.x 59 #define acc30_im accum9.y 60 #define acc31_re accum10.x 61 #define acc31_im accum10.y 62 #define acc32_re accum11.x 63 #define acc32_im accum11.y 65 #define spinorFloat float 66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4 67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4 92 #define acc00_re accum0.x 93 #define acc00_im accum0.y 94 #define acc01_re accum0.z 95 #define acc01_im accum0.w 96 #define acc02_re accum1.x 97 #define acc02_im accum1.y 98 #define acc10_re accum1.z 99 #define acc10_im accum1.w 100 #define acc11_re accum2.x 101 #define acc11_im accum2.y 102 #define acc12_re accum2.z 103 #define acc12_im accum2.w 104 #define acc20_re accum3.x 105 #define acc20_im accum3.y 106 #define acc21_re accum3.z 107 #define acc21_im accum3.w 108 #define acc22_re accum4.x 109 #define acc22_im accum4.y 110 #define acc30_re accum4.z 111 #define acc30_im accum4.w 112 #define acc31_re accum5.x 113 #define acc31_im accum5.y 114 #define acc32_re accum5.z 115 #define acc32_im accum5.w 116 #endif // SPINOR_DOUBLE 159 #endif // GAUGE_DOUBLE 162 #define gT00_re (+g00_re) 163 #define gT00_im (-g00_im) 164 #define gT01_re (+g10_re) 165 #define gT01_im (-g10_im) 166 #define gT02_re (+g20_re) 167 #define gT02_im (-g20_im) 168 #define gT10_re (+g01_re) 169 #define gT10_im (-g01_im) 170 #define gT11_re (+g11_re) 171 #define gT11_im (-g11_im) 172 #define gT12_re (+g21_re) 173 #define gT12_im (-g21_im) 174 #define gT20_re (+g02_re) 175 #define gT20_im (-g02_im) 176 #define gT21_re (+g12_re) 177 #define gT21_im (-g12_im) 178 #define gT22_re (+g22_re) 179 #define gT22_im (-g22_im) 183 #define c00_00_re C0.x 184 #define c01_01_re C0.y 185 #define c02_02_re C1.x 186 #define c10_10_re C1.y 187 #define c11_11_re C2.x 188 #define c12_12_re C2.y 189 #define c01_00_re C3.x 190 #define c01_00_im C3.y 191 #define c02_00_re C4.x 192 #define c02_00_im C4.y 193 #define c10_00_re C5.x 194 #define c10_00_im C5.y 195 #define c11_00_re C6.x 196 #define c11_00_im C6.y 197 #define c12_00_re C7.x 198 #define c12_00_im C7.y 199 #define c02_01_re C8.x 200 #define c02_01_im C8.y 201 #define c10_01_re C9.x 202 #define c10_01_im C9.y 203 #define c11_01_re C10.x 204 #define c11_01_im C10.y 205 #define c12_01_re C11.x 206 #define c12_01_im C11.y 207 #define c10_02_re C12.x 208 #define c10_02_im C12.y 209 #define c11_02_re C13.x 210 #define c11_02_im C13.y 211 #define c12_02_re C14.x 212 #define c12_02_im C14.y 213 #define c11_10_re C15.x 214 #define c11_10_im C15.y 215 #define c12_10_re C16.x 216 #define c12_10_im C16.y 217 #define c12_11_re C17.x 218 #define c12_11_im C17.y 220 #define c00_00_re C0.x 221 #define c01_01_re C0.y 222 #define c02_02_re C0.z 223 #define c10_10_re C0.w 224 #define c11_11_re C1.x 225 #define c12_12_re C1.y 226 #define c01_00_re C1.z 227 #define c01_00_im C1.w 228 #define c02_00_re C2.x 229 #define c02_00_im C2.y 230 #define c10_00_re C2.z 231 #define c10_00_im C2.w 232 #define c11_00_re C3.x 233 #define c11_00_im C3.y 234 #define c12_00_re C3.z 235 #define c12_00_im C3.w 236 #define c02_01_re C4.x 237 #define c02_01_im C4.y 238 #define c10_01_re C4.z 239 #define c10_01_im C4.w 240 #define c11_01_re C5.x 241 #define c11_01_im C5.y 242 #define c12_01_re C5.z 243 #define c12_01_im C5.w 244 #define c10_02_re C6.x 245 #define c10_02_im C6.y 246 #define c11_02_re C6.z 247 #define c11_02_im C6.w 248 #define c12_02_re C7.x 249 #define c12_02_im C7.y 250 #define c11_10_re C7.z 251 #define c11_10_im C7.w 252 #define c12_10_re C8.x 253 #define c12_10_im C8.y 254 #define c12_11_re C8.z 255 #define c12_11_im C8.w 256 #endif // CLOVER_DOUBLE 258 #define c00_01_re (+c01_00_re) 259 #define c00_01_im (-c01_00_im) 260 #define c00_02_re (+c02_00_re) 261 #define c00_02_im (-c02_00_im) 262 #define c01_02_re (+c02_01_re) 263 #define c01_02_im (-c02_01_im) 264 #define c00_10_re (+c10_00_re) 265 #define c00_10_im (-c10_00_im) 266 #define c01_10_re (+c10_01_re) 267 #define c01_10_im (-c10_01_im) 268 #define c02_10_re (+c10_02_re) 269 #define c02_10_im (-c10_02_im) 270 #define c00_11_re (+c11_00_re) 271 #define c00_11_im (-c11_00_im) 272 #define c01_11_re (+c11_01_re) 273 #define c01_11_im (-c11_01_im) 274 #define c02_11_re (+c11_02_re) 275 #define c02_11_im (-c11_02_im) 276 #define c10_11_re (+c11_10_re) 277 #define c10_11_im (-c11_10_im) 278 #define c00_12_re (+c12_00_re) 279 #define c00_12_im (-c12_00_im) 280 #define c01_12_re (+c12_01_re) 281 #define c01_12_im (-c12_01_im) 282 #define c02_12_re (+c12_02_re) 283 #define c02_12_im (-c12_02_im) 284 #define c10_12_re (+c12_10_re) 285 #define c10_12_im (-c12_10_im) 286 #define c11_12_re (+c12_11_re) 287 #define c11_12_im (-c12_11_im) 290 #define c20_20_re c00_00_re 291 #define c21_20_re c01_00_re 292 #define c21_20_im c01_00_im 293 #define c22_20_re c02_00_re 294 #define c22_20_im c02_00_im 295 #define c30_20_re c10_00_re 296 #define c30_20_im c10_00_im 297 #define c31_20_re c11_00_re 298 #define c31_20_im c11_00_im 299 #define c32_20_re c12_00_re 300 #define c32_20_im c12_00_im 301 #define c20_21_re c00_01_re 302 #define c20_21_im c00_01_im 303 #define c21_21_re c01_01_re 304 #define c22_21_re c02_01_re 305 #define c22_21_im c02_01_im 306 #define c30_21_re c10_01_re 307 #define c30_21_im c10_01_im 308 #define c31_21_re c11_01_re 309 #define c31_21_im c11_01_im 310 #define c32_21_re c12_01_re 311 #define c32_21_im c12_01_im 312 #define c20_22_re c00_02_re 313 #define c20_22_im c00_02_im 314 #define c21_22_re c01_02_re 315 #define c21_22_im c01_02_im 316 #define c22_22_re c02_02_re 317 #define c30_22_re c10_02_re 318 #define c30_22_im c10_02_im 319 #define c31_22_re c11_02_re 320 #define c31_22_im c11_02_im 321 #define c32_22_re c12_02_re 322 #define c32_22_im c12_02_im 323 #define c20_30_re c00_10_re 324 #define c20_30_im c00_10_im 325 #define c21_30_re c01_10_re 326 #define c21_30_im c01_10_im 327 #define c22_30_re c02_10_re 328 #define c22_30_im c02_10_im 329 #define c30_30_re c10_10_re 330 #define c31_30_re c11_10_re 331 #define c31_30_im c11_10_im 332 #define c32_30_re c12_10_re 333 #define c32_30_im c12_10_im 334 #define c20_31_re c00_11_re 335 #define c20_31_im c00_11_im 336 #define c21_31_re c01_11_re 337 #define c21_31_im c01_11_im 338 #define c22_31_re c02_11_re 339 #define c22_31_im c02_11_im 340 #define c30_31_re c10_11_re 341 #define c30_31_im c10_11_im 342 #define c31_31_re c11_11_re 343 #define c32_31_re c12_11_re 344 #define c32_31_im c12_11_im 345 #define c20_32_re c00_12_re 346 #define c20_32_im c00_12_im 347 #define c21_32_re c01_12_re 348 #define c21_32_im c01_12_im 349 #define c22_32_re c02_12_re 350 #define c22_32_im c02_12_im 351 #define c30_32_re c10_12_re 352 #define c30_32_im c10_12_im 353 #define c31_32_re c11_12_re 354 #define c31_32_im c11_12_im 355 #define c32_32_re c12_12_re 360 #define cinv00_00_re C0.x 361 #define cinv01_01_re C0.y 362 #define cinv02_02_re C1.x 363 #define cinv10_10_re C1.y 364 #define cinv11_11_re C2.x 365 #define cinv12_12_re C2.y 366 #define cinv01_00_re C3.x 367 #define cinv01_00_im C3.y 368 #define cinv02_00_re C4.x 369 #define cinv02_00_im C4.y 370 #define cinv10_00_re C5.x 371 #define cinv10_00_im C5.y 372 #define cinv11_00_re C6.x 373 #define cinv11_00_im C6.y 374 #define cinv12_00_re C7.x 375 #define cinv12_00_im C7.y 376 #define cinv02_01_re C8.x 377 #define cinv02_01_im C8.y 378 #define cinv10_01_re C9.x 379 #define cinv10_01_im C9.y 380 #define cinv11_01_re C10.x 381 #define cinv11_01_im C10.y 382 #define cinv12_01_re C11.x 383 #define cinv12_01_im C11.y 384 #define cinv10_02_re C12.x 385 #define cinv10_02_im C12.y 386 #define cinv11_02_re C13.x 387 #define cinv11_02_im C13.y 388 #define cinv12_02_re C14.x 389 #define cinv12_02_im C14.y 390 #define cinv11_10_re C15.x 391 #define cinv11_10_im C15.y 392 #define cinv12_10_re C16.x 393 #define cinv12_10_im C16.y 394 #define cinv12_11_re C17.x 395 #define cinv12_11_im C17.y 397 #define cinv00_00_re C0.x 398 #define cinv01_01_re C0.y 399 #define cinv02_02_re C0.z 400 #define cinv10_10_re C0.w 401 #define cinv11_11_re C1.x 402 #define cinv12_12_re C1.y 403 #define cinv01_00_re C1.z 404 #define cinv01_00_im C1.w 405 #define cinv02_00_re C2.x 406 #define cinv02_00_im C2.y 407 #define cinv10_00_re C2.z 408 #define cinv10_00_im C2.w 409 #define cinv11_00_re C3.x 410 #define cinv11_00_im C3.y 411 #define cinv12_00_re C3.z 412 #define cinv12_00_im C3.w 413 #define cinv02_01_re C4.x 414 #define cinv02_01_im C4.y 415 #define cinv10_01_re C4.z 416 #define cinv10_01_im C4.w 417 #define cinv11_01_re C5.x 418 #define cinv11_01_im C5.y 419 #define cinv12_01_re C5.z 420 #define cinv12_01_im C5.w 421 #define cinv10_02_re C6.x 422 #define cinv10_02_im C6.y 423 #define cinv11_02_re C6.z 424 #define cinv11_02_im C6.w 425 #define cinv12_02_re C7.x 426 #define cinv12_02_im C7.y 427 #define cinv11_10_re C7.z 428 #define cinv11_10_im C7.w 429 #define cinv12_10_re C8.x 430 #define cinv12_10_im C8.y 431 #define cinv12_11_re C8.z 432 #define cinv12_11_im C8.w 433 #endif // CLOVER_DOUBLE 435 #define cinv00_01_re (+cinv01_00_re) 436 #define cinv00_01_im (-cinv01_00_im) 437 #define cinv00_02_re (+cinv02_00_re) 438 #define cinv00_02_im (-cinv02_00_im) 439 #define cinv01_02_re (+cinv02_01_re) 440 #define cinv01_02_im (-cinv02_01_im) 441 #define cinv00_10_re (+cinv10_00_re) 442 #define cinv00_10_im (-cinv10_00_im) 443 #define cinv01_10_re (+cinv10_01_re) 444 #define cinv01_10_im (-cinv10_01_im) 445 #define cinv02_10_re (+cinv10_02_re) 446 #define cinv02_10_im (-cinv10_02_im) 447 #define cinv00_11_re (+cinv11_00_re) 448 #define cinv00_11_im (-cinv11_00_im) 449 #define cinv01_11_re (+cinv11_01_re) 450 #define cinv01_11_im (-cinv11_01_im) 451 #define cinv02_11_re (+cinv11_02_re) 452 #define cinv02_11_im (-cinv11_02_im) 453 #define cinv10_11_re (+cinv11_10_re) 454 #define cinv10_11_im (-cinv11_10_im) 455 #define cinv00_12_re (+cinv12_00_re) 456 #define cinv00_12_im (-cinv12_00_im) 457 #define cinv01_12_re (+cinv12_01_re) 458 #define cinv01_12_im (-cinv12_01_im) 459 #define cinv02_12_re (+cinv12_02_re) 460 #define cinv02_12_im (-cinv12_02_im) 461 #define cinv10_12_re (+cinv12_10_re) 462 #define cinv10_12_im (-cinv12_10_im) 463 #define cinv11_12_re (+cinv12_11_re) 464 #define cinv11_12_im (-cinv12_11_im) 467 #define cinv20_20_re cinv00_00_re 468 #define cinv21_20_re cinv01_00_re 469 #define cinv21_20_im cinv01_00_im 470 #define cinv22_20_re cinv02_00_re 471 #define cinv22_20_im cinv02_00_im 472 #define cinv30_20_re cinv10_00_re 473 #define cinv30_20_im cinv10_00_im 474 #define cinv31_20_re cinv11_00_re 475 #define cinv31_20_im cinv11_00_im 476 #define cinv32_20_re cinv12_00_re 477 #define cinv32_20_im cinv12_00_im 478 #define cinv20_21_re cinv00_01_re 479 #define cinv20_21_im cinv00_01_im 480 #define cinv21_21_re cinv01_01_re 481 #define cinv22_21_re cinv02_01_re 482 #define cinv22_21_im cinv02_01_im 483 #define cinv30_21_re cinv10_01_re 484 #define cinv30_21_im cinv10_01_im 485 #define cinv31_21_re cinv11_01_re 486 #define cinv31_21_im cinv11_01_im 487 #define cinv32_21_re cinv12_01_re 488 #define cinv32_21_im cinv12_01_im 489 #define cinv20_22_re cinv00_02_re 490 #define cinv20_22_im cinv00_02_im 491 #define cinv21_22_re cinv01_02_re 492 #define cinv21_22_im cinv01_02_im 493 #define cinv22_22_re cinv02_02_re 494 #define cinv30_22_re cinv10_02_re 495 #define cinv30_22_im cinv10_02_im 496 #define cinv31_22_re cinv11_02_re 497 #define cinv31_22_im cinv11_02_im 498 #define cinv32_22_re cinv12_02_re 499 #define cinv32_22_im cinv12_02_im 500 #define cinv20_30_re cinv00_10_re 501 #define cinv20_30_im cinv00_10_im 502 #define cinv21_30_re cinv01_10_re 503 #define cinv21_30_im cinv01_10_im 504 #define cinv22_30_re cinv02_10_re 505 #define cinv22_30_im cinv02_10_im 506 #define cinv30_30_re cinv10_10_re 507 #define cinv31_30_re cinv11_10_re 508 #define cinv31_30_im cinv11_10_im 509 #define cinv32_30_re cinv12_10_re 510 #define cinv32_30_im cinv12_10_im 511 #define cinv20_31_re cinv00_11_re 512 #define cinv20_31_im cinv00_11_im 513 #define cinv21_31_re cinv01_11_re 514 #define cinv21_31_im cinv01_11_im 515 #define cinv22_31_re cinv02_11_re 516 #define cinv22_31_im cinv02_11_im 517 #define cinv30_31_re cinv10_11_re 518 #define cinv30_31_im cinv10_11_im 519 #define cinv31_31_re cinv11_11_re 520 #define cinv32_31_re cinv12_11_re 521 #define cinv32_31_im cinv12_11_im 522 #define cinv20_32_re cinv00_12_re 523 #define cinv20_32_im cinv00_12_im 524 #define cinv21_32_re cinv01_12_re 525 #define cinv21_32_im cinv01_12_im 526 #define cinv22_32_re cinv02_12_re 527 #define cinv22_32_im cinv02_12_im 528 #define cinv30_32_re cinv10_12_re 529 #define cinv30_32_im cinv10_12_im 530 #define cinv31_32_re cinv11_12_re 531 #define cinv31_32_im cinv11_12_im 532 #define cinv32_32_re cinv12_12_re 571 #endif // CLOVER_DOUBLE 599 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 601 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 686 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][1];
687 #if (DD_PREC==2) // half precision 688 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
730 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
883 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][0];
884 #if (DD_PREC==2) // half precision 885 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
930 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1083 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1084 #if (DD_PREC==2) // half precision 1085 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1145 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1298 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1299 #if (DD_PREC==2) // half precision 1300 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1343 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 :
blockDim.y - 1;
1364 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1517 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1518 #if (DD_PREC==2) // half precision 1519 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1579 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1732 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1733 #if (DD_PREC==2) // half precision 1734 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1757 if (threadIdx.z == 0 &&
blockDim.z < X3) {
1777 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 :
blockDim.z - 1;
1798 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1951 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1952 #if (DD_PREC==2) // half precision 1953 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1994 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2065 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2208 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
2209 #if (DD_PREC==2) // half precision 2210 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
2255 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2326 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2460 switch(kernel_type) {
2475 #ifdef SPINOR_DOUBLE 2481 #ifdef SPINOR_DOUBLE 2486 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2488 #ifndef CLOVER_TWIST_XPAY 2490 #ifndef DYNAMIC_CLOVER 2545 #endif//CLOVER_TWIST_XPAY 2547 #ifndef DYNAMIC_CLOVER 2560 #undef WRITE_SPINOR_SHARED 2561 #undef READ_SPINOR_SHARED 2562 #undef SHARED_STRIDE
#define WRITE_SPINOR_SHARED
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
VOLATILE spinorFloat o12_re
#define APPLY_CLOVER_TWIST(c, a, reg)
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o32_im
#define APPLY_CLOVER_TWIST_DYN_INV(c, a, reg)
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o20_re
WRITE_SPINOR(param.sp_stride)
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o20_im
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o00_im
#define READ_SPINOR_SHARED
#define READ_SPINOR_GHOST
RECONSTRUCT_GAUGE_MATRIX(0)
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
APPLY_CLOVER_TWIST_INV(c, cinv, a, o)
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o01_im
coordsFromIndex3D< EVEN_X >(X, coord, sid, param)