3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24 6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 8 #else // Open64 compiler 9 #define VOLATILE volatile 13 #define spinorFloat double 14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2 15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2 40 #define acc00_re accum0.x 41 #define acc00_im accum0.y 42 #define acc01_re accum1.x 43 #define acc01_im accum1.y 44 #define acc02_re accum2.x 45 #define acc02_im accum2.y 46 #define acc10_re accum3.x 47 #define acc10_im accum3.y 48 #define acc11_re accum4.x 49 #define acc11_im accum4.y 50 #define acc12_re accum5.x 51 #define acc12_im accum5.y 52 #define acc20_re accum6.x 53 #define acc20_im accum6.y 54 #define acc21_re accum7.x 55 #define acc21_im accum7.y 56 #define acc22_re accum8.x 57 #define acc22_im accum8.y 58 #define acc30_re accum9.x 59 #define acc30_im accum9.y 60 #define acc31_re accum10.x 61 #define acc31_im accum10.y 62 #define acc32_re accum11.x 63 #define acc32_im accum11.y 65 #define spinorFloat float 66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4 67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4 92 #define acc00_re accum0.x 93 #define acc00_im accum0.y 94 #define acc01_re accum0.z 95 #define acc01_im accum0.w 96 #define acc02_re accum1.x 97 #define acc02_im accum1.y 98 #define acc10_re accum1.z 99 #define acc10_im accum1.w 100 #define acc11_re accum2.x 101 #define acc11_im accum2.y 102 #define acc12_re accum2.z 103 #define acc12_im accum2.w 104 #define acc20_re accum3.x 105 #define acc20_im accum3.y 106 #define acc21_re accum3.z 107 #define acc21_im accum3.w 108 #define acc22_re accum4.x 109 #define acc22_im accum4.y 110 #define acc30_re accum4.z 111 #define acc30_im accum4.w 112 #define acc31_re accum5.x 113 #define acc31_im accum5.y 114 #define acc32_re accum5.z 115 #define acc32_im accum5.w 116 #endif // SPINOR_DOUBLE 159 #endif // GAUGE_DOUBLE 162 #define gT00_re (+g00_re) 163 #define gT00_im (-g00_im) 164 #define gT01_re (+g10_re) 165 #define gT01_im (-g10_im) 166 #define gT02_re (+g20_re) 167 #define gT02_im (-g20_im) 168 #define gT10_re (+g01_re) 169 #define gT10_im (-g01_im) 170 #define gT11_re (+g11_re) 171 #define gT11_im (-g11_im) 172 #define gT12_re (+g21_re) 173 #define gT12_im (-g21_im) 174 #define gT20_re (+g02_re) 175 #define gT20_im (-g02_im) 176 #define gT21_re (+g12_re) 177 #define gT21_im (-g12_im) 178 #define gT22_re (+g22_re) 179 #define gT22_im (-g22_im) 183 #define c00_00_re C0.x 184 #define c01_01_re C0.y 185 #define c02_02_re C1.x 186 #define c10_10_re C1.y 187 #define c11_11_re C2.x 188 #define c12_12_re C2.y 189 #define c01_00_re C3.x 190 #define c01_00_im C3.y 191 #define c02_00_re C4.x 192 #define c02_00_im C4.y 193 #define c10_00_re C5.x 194 #define c10_00_im C5.y 195 #define c11_00_re C6.x 196 #define c11_00_im C6.y 197 #define c12_00_re C7.x 198 #define c12_00_im C7.y 199 #define c02_01_re C8.x 200 #define c02_01_im C8.y 201 #define c10_01_re C9.x 202 #define c10_01_im C9.y 203 #define c11_01_re C10.x 204 #define c11_01_im C10.y 205 #define c12_01_re C11.x 206 #define c12_01_im C11.y 207 #define c10_02_re C12.x 208 #define c10_02_im C12.y 209 #define c11_02_re C13.x 210 #define c11_02_im C13.y 211 #define c12_02_re C14.x 212 #define c12_02_im C14.y 213 #define c11_10_re C15.x 214 #define c11_10_im C15.y 215 #define c12_10_re C16.x 216 #define c12_10_im C16.y 217 #define c12_11_re C17.x 218 #define c12_11_im C17.y 220 #define c00_00_re C0.x 221 #define c01_01_re C0.y 222 #define c02_02_re C0.z 223 #define c10_10_re C0.w 224 #define c11_11_re C1.x 225 #define c12_12_re C1.y 226 #define c01_00_re C1.z 227 #define c01_00_im C1.w 228 #define c02_00_re C2.x 229 #define c02_00_im C2.y 230 #define c10_00_re C2.z 231 #define c10_00_im C2.w 232 #define c11_00_re C3.x 233 #define c11_00_im C3.y 234 #define c12_00_re C3.z 235 #define c12_00_im C3.w 236 #define c02_01_re C4.x 237 #define c02_01_im C4.y 238 #define c10_01_re C4.z 239 #define c10_01_im C4.w 240 #define c11_01_re C5.x 241 #define c11_01_im C5.y 242 #define c12_01_re C5.z 243 #define c12_01_im C5.w 244 #define c10_02_re C6.x 245 #define c10_02_im C6.y 246 #define c11_02_re C6.z 247 #define c11_02_im C6.w 248 #define c12_02_re C7.x 249 #define c12_02_im C7.y 250 #define c11_10_re C7.z 251 #define c11_10_im C7.w 252 #define c12_10_re C8.x 253 #define c12_10_im C8.y 254 #define c12_11_re C8.z 255 #define c12_11_im C8.w 256 #endif // CLOVER_DOUBLE 258 #define c00_01_re (+c01_00_re) 259 #define c00_01_im (-c01_00_im) 260 #define c00_02_re (+c02_00_re) 261 #define c00_02_im (-c02_00_im) 262 #define c01_02_re (+c02_01_re) 263 #define c01_02_im (-c02_01_im) 264 #define c00_10_re (+c10_00_re) 265 #define c00_10_im (-c10_00_im) 266 #define c01_10_re (+c10_01_re) 267 #define c01_10_im (-c10_01_im) 268 #define c02_10_re (+c10_02_re) 269 #define c02_10_im (-c10_02_im) 270 #define c00_11_re (+c11_00_re) 271 #define c00_11_im (-c11_00_im) 272 #define c01_11_re (+c11_01_re) 273 #define c01_11_im (-c11_01_im) 274 #define c02_11_re (+c11_02_re) 275 #define c02_11_im (-c11_02_im) 276 #define c10_11_re (+c11_10_re) 277 #define c10_11_im (-c11_10_im) 278 #define c00_12_re (+c12_00_re) 279 #define c00_12_im (-c12_00_im) 280 #define c01_12_re (+c12_01_re) 281 #define c01_12_im (-c12_01_im) 282 #define c02_12_re (+c12_02_re) 283 #define c02_12_im (-c12_02_im) 284 #define c10_12_re (+c12_10_re) 285 #define c10_12_im (-c12_10_im) 286 #define c11_12_re (+c12_11_re) 287 #define c11_12_im (-c12_11_im) 290 #define c20_20_re c00_00_re 291 #define c21_20_re c01_00_re 292 #define c21_20_im c01_00_im 293 #define c22_20_re c02_00_re 294 #define c22_20_im c02_00_im 295 #define c30_20_re c10_00_re 296 #define c30_20_im c10_00_im 297 #define c31_20_re c11_00_re 298 #define c31_20_im c11_00_im 299 #define c32_20_re c12_00_re 300 #define c32_20_im c12_00_im 301 #define c20_21_re c00_01_re 302 #define c20_21_im c00_01_im 303 #define c21_21_re c01_01_re 304 #define c22_21_re c02_01_re 305 #define c22_21_im c02_01_im 306 #define c30_21_re c10_01_re 307 #define c30_21_im c10_01_im 308 #define c31_21_re c11_01_re 309 #define c31_21_im c11_01_im 310 #define c32_21_re c12_01_re 311 #define c32_21_im c12_01_im 312 #define c20_22_re c00_02_re 313 #define c20_22_im c00_02_im 314 #define c21_22_re c01_02_re 315 #define c21_22_im c01_02_im 316 #define c22_22_re c02_02_re 317 #define c30_22_re c10_02_re 318 #define c30_22_im c10_02_im 319 #define c31_22_re c11_02_re 320 #define c31_22_im c11_02_im 321 #define c32_22_re c12_02_re 322 #define c32_22_im c12_02_im 323 #define c20_30_re c00_10_re 324 #define c20_30_im c00_10_im 325 #define c21_30_re c01_10_re 326 #define c21_30_im c01_10_im 327 #define c22_30_re c02_10_re 328 #define c22_30_im c02_10_im 329 #define c30_30_re c10_10_re 330 #define c31_30_re c11_10_re 331 #define c31_30_im c11_10_im 332 #define c32_30_re c12_10_re 333 #define c32_30_im c12_10_im 334 #define c20_31_re c00_11_re 335 #define c20_31_im c00_11_im 336 #define c21_31_re c01_11_re 337 #define c21_31_im c01_11_im 338 #define c22_31_re c02_11_re 339 #define c22_31_im c02_11_im 340 #define c30_31_re c10_11_re 341 #define c30_31_im c10_11_im 342 #define c31_31_re c11_11_re 343 #define c32_31_re c12_11_re 344 #define c32_31_im c12_11_im 345 #define c20_32_re c00_12_re 346 #define c20_32_im c00_12_im 347 #define c21_32_re c01_12_re 348 #define c21_32_im c01_12_im 349 #define c22_32_re c02_12_re 350 #define c22_32_im c02_12_im 351 #define c30_32_re c10_12_re 352 #define c30_32_im c10_12_im 353 #define c31_32_re c11_12_re 354 #define c31_32_im c11_12_im 355 #define c32_32_re c12_12_re 384 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi 386 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi 470 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][1];
471 #if (DD_PREC==2) // half precision 472 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
514 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
667 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][0];
668 #if (DD_PREC==2) // half precision 669 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
714 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
867 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
868 #if (DD_PREC==2) // half precision 869 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
908 int ty = (threadIdx.y <
blockDim.y - 1) ? threadIdx.y + 1 : 0;
929 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1082 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1083 #if (DD_PREC==2) // half precision 1084 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1127 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 :
blockDim.y - 1;
1148 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1301 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1302 #if (DD_PREC==2) // half precision 1303 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1363 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1516 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1517 #if (DD_PREC==2) // half precision 1518 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1561 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 :
blockDim.z - 1;
1582 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1735 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1736 #if (DD_PREC==2) // half precision 1737 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1778 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1849 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1992 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1993 #if (DD_PREC==2) // half precision 1994 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
2039 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2110 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2240 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER) 2244 switch(kernel_type) {
2258 #ifdef DSLASH_CLOVER 2675 #endif // DSLASH_CLOVER 2679 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2681 #ifdef SPINOR_DOUBLE 2710 #endif // DSLASH_XPAY 2718 #undef WRITE_SPINOR_SHARED 2719 #undef READ_SPINOR_SHARED 2720 #undef SHARED_STRIDE
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o11_re
coordsFromIndex3D< EVEN_X >(X, x, sid, param.parity, param.dc.X)
VOLATILE spinorFloat o00_im
#define READ_INTERMEDIATE_SPINOR
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o30_re
#define READ_SPINOR_GHOST
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o21_re
#define READ_SPINOR_SHARED
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o12_im
#define WRITE_SPINOR_SHARED
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o22_re
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o32_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)