3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0 6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler 8 #else // Open64 compiler 9 #define VOLATILE volatile 13 #define spinorFloat double 38 #define acc00_re accum0.x 39 #define acc00_im accum0.y 40 #define acc01_re accum1.x 41 #define acc01_im accum1.y 42 #define acc02_re accum2.x 43 #define acc02_im accum2.y 44 #define acc10_re accum3.x 45 #define acc10_im accum3.y 46 #define acc11_re accum4.x 47 #define acc11_im accum4.y 48 #define acc12_re accum5.x 49 #define acc12_im accum5.y 50 #define acc20_re accum6.x 51 #define acc20_im accum6.y 52 #define acc21_re accum7.x 53 #define acc21_im accum7.y 54 #define acc22_re accum8.x 55 #define acc22_im accum8.y 56 #define acc30_re accum9.x 57 #define acc30_im accum9.y 58 #define acc31_re accum10.x 59 #define acc31_im accum10.y 60 #define acc32_re accum11.x 61 #define acc32_im accum11.y 63 #define spinorFloat float 88 #define acc00_re accum0.x 89 #define acc00_im accum0.y 90 #define acc01_re accum0.z 91 #define acc01_im accum0.w 92 #define acc02_re accum1.x 93 #define acc02_im accum1.y 94 #define acc10_re accum1.z 95 #define acc10_im accum1.w 96 #define acc11_re accum2.x 97 #define acc11_im accum2.y 98 #define acc12_re accum2.z 99 #define acc12_im accum2.w 100 #define acc20_re accum3.x 101 #define acc20_im accum3.y 102 #define acc21_re accum3.z 103 #define acc21_im accum3.w 104 #define acc22_re accum4.x 105 #define acc22_im accum4.y 106 #define acc30_re accum4.z 107 #define acc30_im accum4.w 108 #define acc31_re accum5.x 109 #define acc31_im accum5.y 110 #define acc32_re accum5.z 111 #define acc32_im accum5.w 112 #endif // SPINOR_DOUBLE 155 #endif // GAUGE_DOUBLE 158 #define gT00_re (+g00_re) 159 #define gT00_im (-g00_im) 160 #define gT01_re (+g10_re) 161 #define gT01_im (-g10_im) 162 #define gT02_re (+g20_re) 163 #define gT02_im (-g20_im) 164 #define gT10_re (+g01_re) 165 #define gT10_im (-g01_im) 166 #define gT11_re (+g11_re) 167 #define gT11_im (-g11_im) 168 #define gT12_re (+g21_re) 169 #define gT12_im (-g21_im) 170 #define gT20_re (+g02_re) 171 #define gT20_im (-g02_im) 172 #define gT21_re (+g12_re) 173 #define gT21_im (-g12_im) 174 #define gT22_re (+g22_re) 175 #define gT22_im (-g22_im) 179 #define c00_00_re C0.x 180 #define c01_01_re C0.y 181 #define c02_02_re C1.x 182 #define c10_10_re C1.y 183 #define c11_11_re C2.x 184 #define c12_12_re C2.y 185 #define c01_00_re C3.x 186 #define c01_00_im C3.y 187 #define c02_00_re C4.x 188 #define c02_00_im C4.y 189 #define c10_00_re C5.x 190 #define c10_00_im C5.y 191 #define c11_00_re C6.x 192 #define c11_00_im C6.y 193 #define c12_00_re C7.x 194 #define c12_00_im C7.y 195 #define c02_01_re C8.x 196 #define c02_01_im C8.y 197 #define c10_01_re C9.x 198 #define c10_01_im C9.y 199 #define c11_01_re C10.x 200 #define c11_01_im C10.y 201 #define c12_01_re C11.x 202 #define c12_01_im C11.y 203 #define c10_02_re C12.x 204 #define c10_02_im C12.y 205 #define c11_02_re C13.x 206 #define c11_02_im C13.y 207 #define c12_02_re C14.x 208 #define c12_02_im C14.y 209 #define c11_10_re C15.x 210 #define c11_10_im C15.y 211 #define c12_10_re C16.x 212 #define c12_10_im C16.y 213 #define c12_11_re C17.x 214 #define c12_11_im C17.y 216 #define c00_00_re C0.x 217 #define c01_01_re C0.y 218 #define c02_02_re C0.z 219 #define c10_10_re C0.w 220 #define c11_11_re C1.x 221 #define c12_12_re C1.y 222 #define c01_00_re C1.z 223 #define c01_00_im C1.w 224 #define c02_00_re C2.x 225 #define c02_00_im C2.y 226 #define c10_00_re C2.z 227 #define c10_00_im C2.w 228 #define c11_00_re C3.x 229 #define c11_00_im C3.y 230 #define c12_00_re C3.z 231 #define c12_00_im C3.w 232 #define c02_01_re C4.x 233 #define c02_01_im C4.y 234 #define c10_01_re C4.z 235 #define c10_01_im C4.w 236 #define c11_01_re C5.x 237 #define c11_01_im C5.y 238 #define c12_01_re C5.z 239 #define c12_01_im C5.w 240 #define c10_02_re C6.x 241 #define c10_02_im C6.y 242 #define c11_02_re C6.z 243 #define c11_02_im C6.w 244 #define c12_02_re C7.x 245 #define c12_02_im C7.y 246 #define c11_10_re C7.z 247 #define c11_10_im C7.w 248 #define c12_10_re C8.x 249 #define c12_10_im C8.y 250 #define c12_11_re C8.z 251 #define c12_11_im C8.w 252 #endif // CLOVER_DOUBLE 254 #define c00_01_re (+c01_00_re) 255 #define c00_01_im (-c01_00_im) 256 #define c00_02_re (+c02_00_re) 257 #define c00_02_im (-c02_00_im) 258 #define c01_02_re (+c02_01_re) 259 #define c01_02_im (-c02_01_im) 260 #define c00_10_re (+c10_00_re) 261 #define c00_10_im (-c10_00_im) 262 #define c01_10_re (+c10_01_re) 263 #define c01_10_im (-c10_01_im) 264 #define c02_10_re (+c10_02_re) 265 #define c02_10_im (-c10_02_im) 266 #define c00_11_re (+c11_00_re) 267 #define c00_11_im (-c11_00_im) 268 #define c01_11_re (+c11_01_re) 269 #define c01_11_im (-c11_01_im) 270 #define c02_11_re (+c11_02_re) 271 #define c02_11_im (-c11_02_im) 272 #define c10_11_re (+c11_10_re) 273 #define c10_11_im (-c11_10_im) 274 #define c00_12_re (+c12_00_re) 275 #define c00_12_im (-c12_00_im) 276 #define c01_12_re (+c12_01_re) 277 #define c01_12_im (-c12_01_im) 278 #define c02_12_re (+c12_02_re) 279 #define c02_12_im (-c12_02_im) 280 #define c10_12_re (+c12_10_re) 281 #define c10_12_im (-c12_10_im) 282 #define c11_12_re (+c12_11_re) 283 #define c11_12_im (-c12_11_im) 286 #define c20_20_re c00_00_re 287 #define c21_20_re c01_00_re 288 #define c21_20_im c01_00_im 289 #define c22_20_re c02_00_re 290 #define c22_20_im c02_00_im 291 #define c30_20_re c10_00_re 292 #define c30_20_im c10_00_im 293 #define c31_20_re c11_00_re 294 #define c31_20_im c11_00_im 295 #define c32_20_re c12_00_re 296 #define c32_20_im c12_00_im 297 #define c20_21_re c00_01_re 298 #define c20_21_im c00_01_im 299 #define c21_21_re c01_01_re 300 #define c22_21_re c02_01_re 301 #define c22_21_im c02_01_im 302 #define c30_21_re c10_01_re 303 #define c30_21_im c10_01_im 304 #define c31_21_re c11_01_re 305 #define c31_21_im c11_01_im 306 #define c32_21_re c12_01_re 307 #define c32_21_im c12_01_im 308 #define c20_22_re c00_02_re 309 #define c20_22_im c00_02_im 310 #define c21_22_re c01_02_re 311 #define c21_22_im c01_02_im 312 #define c22_22_re c02_02_re 313 #define c30_22_re c10_02_re 314 #define c30_22_im c10_02_im 315 #define c31_22_re c11_02_re 316 #define c31_22_im c11_02_im 317 #define c32_22_re c12_02_re 318 #define c32_22_im c12_02_im 319 #define c20_30_re c00_10_re 320 #define c20_30_im c00_10_im 321 #define c21_30_re c01_10_re 322 #define c21_30_im c01_10_im 323 #define c22_30_re c02_10_re 324 #define c22_30_im c02_10_im 325 #define c30_30_re c10_10_re 326 #define c31_30_re c11_10_re 327 #define c31_30_im c11_10_im 328 #define c32_30_re c12_10_re 329 #define c32_30_im c12_10_im 330 #define c20_31_re c00_11_re 331 #define c20_31_im c00_11_im 332 #define c21_31_re c01_11_re 333 #define c21_31_im c01_11_im 334 #define c22_31_re c02_11_re 335 #define c22_31_im c02_11_im 336 #define c30_31_re c10_11_re 337 #define c30_31_im c10_11_im 338 #define c31_31_re c11_11_re 339 #define c32_31_re c12_11_re 340 #define c32_31_im c12_11_im 341 #define c20_32_re c00_12_re 342 #define c20_32_im c00_12_im 343 #define c21_32_re c01_12_re 344 #define c21_32_im c01_12_im 345 #define c22_32_re c02_12_re 346 #define c22_32_im c02_12_im 347 #define c30_32_re c10_12_re 348 #define c30_32_im c10_12_im 349 #define c31_32_re c11_12_re 350 #define c31_32_im c11_12_im 351 #define c32_32_re c12_12_re 411 #ifdef DSLASH_CLOVER_XPAY 413 READ_ACCUM(ACCUMTEX,
param.sp_stride)
832 #endif // DSLASH_CLOVER 858 #endif // DSLASH_CLOVER_XPAY 907 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][1];
908 #if (DD_PREC==2) // half precision 909 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
948 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1059 #ifdef SPINOR_DOUBLE 1106 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][0];
1107 #if (DD_PREC==2) // half precision 1108 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1151 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1262 #ifdef SPINOR_DOUBLE 1309 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1310 #if (DD_PREC==2) // half precision 1311 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1350 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1461 #ifdef SPINOR_DOUBLE 1508 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1509 #if (DD_PREC==2) // half precision 1510 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1553 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1664 #ifdef SPINOR_DOUBLE 1711 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
1712 #if (DD_PREC==2) // half precision 1713 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
1752 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
1863 #ifdef SPINOR_DOUBLE 1910 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
1911 #if (DD_PREC==2) // half precision 1912 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
1955 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2066 #ifdef SPINOR_DOUBLE 2113 face_idx +
param.ghostOffset[static_cast<int>(kernel_type)][1];
2114 #if (DD_PREC==2) // half precision 2115 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][1];
2156 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2180 #ifdef SPINOR_DOUBLE 2232 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2344 #ifdef SPINOR_DOUBLE 2380 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)][0];
2381 #if (DD_PREC==2) // half precision 2382 const int sp_norm_idx =
face_idx +
param.ghostNormOffset[
static_cast<int>(kernel_type)][0];
2427 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2451 #ifdef SPINOR_DOUBLE 2503 const int sp_stride_pad =
param.dc.ghostFace[
static_cast<int>(kernel_type)];
2615 #ifdef SPINOR_DOUBLE
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o01_im
RECONSTRUCT_GAUGE_MATRIX(0)
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o00_im
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
WRITE_SPINOR(param.sp_stride)
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o31_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o00_re
#define READ_SPINOR_GHOST
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o31_im