5 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
10 #else // Open64 compiler
11 #define VOLATILE volatile
15 #define spinorFloat double
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
65 #define spinorFloat float
90 #define acc00_re accum0.x
91 #define acc00_im accum0.y
92 #define acc01_re accum0.z
93 #define acc01_im accum0.w
94 #define acc02_re accum1.x
95 #define acc02_im accum1.y
96 #define acc10_re accum1.z
97 #define acc10_im accum1.w
98 #define acc11_re accum2.x
99 #define acc11_im accum2.y
100 #define acc12_re accum2.z
101 #define acc12_im accum2.w
102 #define acc20_re accum3.x
103 #define acc20_im accum3.y
104 #define acc21_re accum3.z
105 #define acc21_im accum3.w
106 #define acc22_re accum4.x
107 #define acc22_im accum4.y
108 #define acc30_re accum4.z
109 #define acc30_im accum4.w
110 #define acc31_re accum5.x
111 #define acc31_im accum5.y
112 #define acc32_re accum5.z
113 #define acc32_im accum5.w
114 #endif // SPINOR_DOUBLE
157 #endif // GAUGE_DOUBLE
160 #define gT00_re (+g00_re)
161 #define gT00_im (-g00_im)
162 #define gT01_re (+g10_re)
163 #define gT01_im (-g10_im)
164 #define gT02_re (+g20_re)
165 #define gT02_im (-g20_im)
166 #define gT10_re (+g01_re)
167 #define gT10_im (-g01_im)
168 #define gT11_re (+g11_re)
169 #define gT11_im (-g11_im)
170 #define gT12_re (+g21_re)
171 #define gT12_im (-g21_im)
172 #define gT20_re (+g02_re)
173 #define gT20_im (-g02_im)
174 #define gT21_re (+g12_re)
175 #define gT21_im (-g12_im)
176 #define gT22_re (+g22_re)
177 #define gT22_im (-g22_im)
181 #define c00_00_re C0.x
182 #define c01_01_re C0.y
183 #define c02_02_re C1.x
184 #define c10_10_re C1.y
185 #define c11_11_re C2.x
186 #define c12_12_re C2.y
187 #define c01_00_re C3.x
188 #define c01_00_im C3.y
189 #define c02_00_re C4.x
190 #define c02_00_im C4.y
191 #define c10_00_re C5.x
192 #define c10_00_im C5.y
193 #define c11_00_re C6.x
194 #define c11_00_im C6.y
195 #define c12_00_re C7.x
196 #define c12_00_im C7.y
197 #define c02_01_re C8.x
198 #define c02_01_im C8.y
199 #define c10_01_re C9.x
200 #define c10_01_im C9.y
201 #define c11_01_re C10.x
202 #define c11_01_im C10.y
203 #define c12_01_re C11.x
204 #define c12_01_im C11.y
205 #define c10_02_re C12.x
206 #define c10_02_im C12.y
207 #define c11_02_re C13.x
208 #define c11_02_im C13.y
209 #define c12_02_re C14.x
210 #define c12_02_im C14.y
211 #define c11_10_re C15.x
212 #define c11_10_im C15.y
213 #define c12_10_re C16.x
214 #define c12_10_im C16.y
215 #define c12_11_re C17.x
216 #define c12_11_im C17.y
218 #define c00_00_re C0.x
219 #define c01_01_re C0.y
220 #define c02_02_re C0.z
221 #define c10_10_re C0.w
222 #define c11_11_re C1.x
223 #define c12_12_re C1.y
224 #define c01_00_re C1.z
225 #define c01_00_im C1.w
226 #define c02_00_re C2.x
227 #define c02_00_im C2.y
228 #define c10_00_re C2.z
229 #define c10_00_im C2.w
230 #define c11_00_re C3.x
231 #define c11_00_im C3.y
232 #define c12_00_re C3.z
233 #define c12_00_im C3.w
234 #define c02_01_re C4.x
235 #define c02_01_im C4.y
236 #define c10_01_re C4.z
237 #define c10_01_im C4.w
238 #define c11_01_re C5.x
239 #define c11_01_im C5.y
240 #define c12_01_re C5.z
241 #define c12_01_im C5.w
242 #define c10_02_re C6.x
243 #define c10_02_im C6.y
244 #define c11_02_re C6.z
245 #define c11_02_im C6.w
246 #define c12_02_re C7.x
247 #define c12_02_im C7.y
248 #define c11_10_re C7.z
249 #define c11_10_im C7.w
250 #define c12_10_re C8.x
251 #define c12_10_im C8.y
252 #define c12_11_re C8.z
253 #define c12_11_im C8.w
254 #endif // CLOVER_DOUBLE
256 #define c00_01_re (+c01_00_re)
257 #define c00_01_im (-c01_00_im)
258 #define c00_02_re (+c02_00_re)
259 #define c00_02_im (-c02_00_im)
260 #define c01_02_re (+c02_01_re)
261 #define c01_02_im (-c02_01_im)
262 #define c00_10_re (+c10_00_re)
263 #define c00_10_im (-c10_00_im)
264 #define c01_10_re (+c10_01_re)
265 #define c01_10_im (-c10_01_im)
266 #define c02_10_re (+c10_02_re)
267 #define c02_10_im (-c10_02_im)
268 #define c00_11_re (+c11_00_re)
269 #define c00_11_im (-c11_00_im)
270 #define c01_11_re (+c11_01_re)
271 #define c01_11_im (-c11_01_im)
272 #define c02_11_re (+c11_02_re)
273 #define c02_11_im (-c11_02_im)
274 #define c10_11_re (+c11_10_re)
275 #define c10_11_im (-c11_10_im)
276 #define c00_12_re (+c12_00_re)
277 #define c00_12_im (-c12_00_im)
278 #define c01_12_re (+c12_01_re)
279 #define c01_12_im (-c12_01_im)
280 #define c02_12_re (+c12_02_re)
281 #define c02_12_im (-c12_02_im)
282 #define c10_12_re (+c12_10_re)
283 #define c10_12_im (-c12_10_im)
284 #define c11_12_re (+c12_11_re)
285 #define c11_12_im (-c12_11_im)
288 #define c20_20_re c00_00_re
289 #define c21_20_re c01_00_re
290 #define c21_20_im c01_00_im
291 #define c22_20_re c02_00_re
292 #define c22_20_im c02_00_im
293 #define c30_20_re c10_00_re
294 #define c30_20_im c10_00_im
295 #define c31_20_re c11_00_re
296 #define c31_20_im c11_00_im
297 #define c32_20_re c12_00_re
298 #define c32_20_im c12_00_im
299 #define c20_21_re c00_01_re
300 #define c20_21_im c00_01_im
301 #define c21_21_re c01_01_re
302 #define c22_21_re c02_01_re
303 #define c22_21_im c02_01_im
304 #define c30_21_re c10_01_re
305 #define c30_21_im c10_01_im
306 #define c31_21_re c11_01_re
307 #define c31_21_im c11_01_im
308 #define c32_21_re c12_01_re
309 #define c32_21_im c12_01_im
310 #define c20_22_re c00_02_re
311 #define c20_22_im c00_02_im
312 #define c21_22_re c01_02_re
313 #define c21_22_im c01_02_im
314 #define c22_22_re c02_02_re
315 #define c30_22_re c10_02_re
316 #define c30_22_im c10_02_im
317 #define c31_22_re c11_02_re
318 #define c31_22_im c11_02_im
319 #define c32_22_re c12_02_re
320 #define c32_22_im c12_02_im
321 #define c20_30_re c00_10_re
322 #define c20_30_im c00_10_im
323 #define c21_30_re c01_10_re
324 #define c21_30_im c01_10_im
325 #define c22_30_re c02_10_re
326 #define c22_30_im c02_10_im
327 #define c30_30_re c10_10_re
328 #define c31_30_re c11_10_re
329 #define c31_30_im c11_10_im
330 #define c32_30_re c12_10_re
331 #define c32_30_im c12_10_im
332 #define c20_31_re c00_11_re
333 #define c20_31_im c00_11_im
334 #define c21_31_re c01_11_re
335 #define c21_31_im c01_11_im
336 #define c22_31_re c02_11_re
337 #define c22_31_im c02_11_im
338 #define c30_31_re c10_11_re
339 #define c30_31_im c10_11_im
340 #define c31_31_re c11_11_re
341 #define c32_31_re c12_11_re
342 #define c32_31_im c12_11_im
343 #define c20_32_re c00_12_re
344 #define c20_32_im c00_12_im
345 #define c21_32_re c01_12_re
346 #define c21_32_im c01_12_im
347 #define c22_32_re c02_12_re
348 #define c22_32_im c02_12_im
349 #define c30_32_re c10_12_re
350 #define c30_32_im c10_12_im
351 #define c31_32_re c11_12_re
352 #define c31_32_im c11_12_im
353 #define c32_32_re c12_12_re
358 #define cinv00_00_re C0.x
359 #define cinv01_01_re C0.y
360 #define cinv02_02_re C1.x
361 #define cinv10_10_re C1.y
362 #define cinv11_11_re C2.x
363 #define cinv12_12_re C2.y
364 #define cinv01_00_re C3.x
365 #define cinv01_00_im C3.y
366 #define cinv02_00_re C4.x
367 #define cinv02_00_im C4.y
368 #define cinv10_00_re C5.x
369 #define cinv10_00_im C5.y
370 #define cinv11_00_re C6.x
371 #define cinv11_00_im C6.y
372 #define cinv12_00_re C7.x
373 #define cinv12_00_im C7.y
374 #define cinv02_01_re C8.x
375 #define cinv02_01_im C8.y
376 #define cinv10_01_re C9.x
377 #define cinv10_01_im C9.y
378 #define cinv11_01_re C10.x
379 #define cinv11_01_im C10.y
380 #define cinv12_01_re C11.x
381 #define cinv12_01_im C11.y
382 #define cinv10_02_re C12.x
383 #define cinv10_02_im C12.y
384 #define cinv11_02_re C13.x
385 #define cinv11_02_im C13.y
386 #define cinv12_02_re C14.x
387 #define cinv12_02_im C14.y
388 #define cinv11_10_re C15.x
389 #define cinv11_10_im C15.y
390 #define cinv12_10_re C16.x
391 #define cinv12_10_im C16.y
392 #define cinv12_11_re C17.x
393 #define cinv12_11_im C17.y
395 #define cinv00_00_re C0.x
396 #define cinv01_01_re C0.y
397 #define cinv02_02_re C0.z
398 #define cinv10_10_re C0.w
399 #define cinv11_11_re C1.x
400 #define cinv12_12_re C1.y
401 #define cinv01_00_re C1.z
402 #define cinv01_00_im C1.w
403 #define cinv02_00_re C2.x
404 #define cinv02_00_im C2.y
405 #define cinv10_00_re C2.z
406 #define cinv10_00_im C2.w
407 #define cinv11_00_re C3.x
408 #define cinv11_00_im C3.y
409 #define cinv12_00_re C3.z
410 #define cinv12_00_im C3.w
411 #define cinv02_01_re C4.x
412 #define cinv02_01_im C4.y
413 #define cinv10_01_re C4.z
414 #define cinv10_01_im C4.w
415 #define cinv11_01_re C5.x
416 #define cinv11_01_im C5.y
417 #define cinv12_01_re C5.z
418 #define cinv12_01_im C5.w
419 #define cinv10_02_re C6.x
420 #define cinv10_02_im C6.y
421 #define cinv11_02_re C6.z
422 #define cinv11_02_im C6.w
423 #define cinv12_02_re C7.x
424 #define cinv12_02_im C7.y
425 #define cinv11_10_re C7.z
426 #define cinv11_10_im C7.w
427 #define cinv12_10_re C8.x
428 #define cinv12_10_im C8.y
429 #define cinv12_11_re C8.z
430 #define cinv12_11_im C8.w
431 #endif // CLOVER_DOUBLE
433 #define cinv00_01_re (+cinv01_00_re)
434 #define cinv00_01_im (-cinv01_00_im)
435 #define cinv00_02_re (+cinv02_00_re)
436 #define cinv00_02_im (-cinv02_00_im)
437 #define cinv01_02_re (+cinv02_01_re)
438 #define cinv01_02_im (-cinv02_01_im)
439 #define cinv00_10_re (+cinv10_00_re)
440 #define cinv00_10_im (-cinv10_00_im)
441 #define cinv01_10_re (+cinv10_01_re)
442 #define cinv01_10_im (-cinv10_01_im)
443 #define cinv02_10_re (+cinv10_02_re)
444 #define cinv02_10_im (-cinv10_02_im)
445 #define cinv00_11_re (+cinv11_00_re)
446 #define cinv00_11_im (-cinv11_00_im)
447 #define cinv01_11_re (+cinv11_01_re)
448 #define cinv01_11_im (-cinv11_01_im)
449 #define cinv02_11_re (+cinv11_02_re)
450 #define cinv02_11_im (-cinv11_02_im)
451 #define cinv10_11_re (+cinv11_10_re)
452 #define cinv10_11_im (-cinv11_10_im)
453 #define cinv00_12_re (+cinv12_00_re)
454 #define cinv00_12_im (-cinv12_00_im)
455 #define cinv01_12_re (+cinv12_01_re)
456 #define cinv01_12_im (-cinv12_01_im)
457 #define cinv02_12_re (+cinv12_02_re)
458 #define cinv02_12_im (-cinv12_02_im)
459 #define cinv10_12_re (+cinv12_10_re)
460 #define cinv10_12_im (-cinv12_10_im)
461 #define cinv11_12_re (+cinv12_11_re)
462 #define cinv11_12_im (-cinv12_11_im)
465 #define cinv20_20_re cinv00_00_re
466 #define cinv21_20_re cinv01_00_re
467 #define cinv21_20_im cinv01_00_im
468 #define cinv22_20_re cinv02_00_re
469 #define cinv22_20_im cinv02_00_im
470 #define cinv30_20_re cinv10_00_re
471 #define cinv30_20_im cinv10_00_im
472 #define cinv31_20_re cinv11_00_re
473 #define cinv31_20_im cinv11_00_im
474 #define cinv32_20_re cinv12_00_re
475 #define cinv32_20_im cinv12_00_im
476 #define cinv20_21_re cinv00_01_re
477 #define cinv20_21_im cinv00_01_im
478 #define cinv21_21_re cinv01_01_re
479 #define cinv22_21_re cinv02_01_re
480 #define cinv22_21_im cinv02_01_im
481 #define cinv30_21_re cinv10_01_re
482 #define cinv30_21_im cinv10_01_im
483 #define cinv31_21_re cinv11_01_re
484 #define cinv31_21_im cinv11_01_im
485 #define cinv32_21_re cinv12_01_re
486 #define cinv32_21_im cinv12_01_im
487 #define cinv20_22_re cinv00_02_re
488 #define cinv20_22_im cinv00_02_im
489 #define cinv21_22_re cinv01_02_re
490 #define cinv21_22_im cinv01_02_im
491 #define cinv22_22_re cinv02_02_re
492 #define cinv30_22_re cinv10_02_re
493 #define cinv30_22_im cinv10_02_im
494 #define cinv31_22_re cinv11_02_re
495 #define cinv31_22_im cinv11_02_im
496 #define cinv32_22_re cinv12_02_re
497 #define cinv32_22_im cinv12_02_im
498 #define cinv20_30_re cinv00_10_re
499 #define cinv20_30_im cinv00_10_im
500 #define cinv21_30_re cinv01_10_re
501 #define cinv21_30_im cinv01_10_im
502 #define cinv22_30_re cinv02_10_re
503 #define cinv22_30_im cinv02_10_im
504 #define cinv30_30_re cinv10_10_re
505 #define cinv31_30_re cinv11_10_re
506 #define cinv31_30_im cinv11_10_im
507 #define cinv32_30_re cinv12_10_re
508 #define cinv32_30_im cinv12_10_im
509 #define cinv20_31_re cinv00_11_re
510 #define cinv20_31_im cinv00_11_im
511 #define cinv21_31_re cinv01_11_re
512 #define cinv21_31_im cinv01_11_im
513 #define cinv22_31_re cinv02_11_re
514 #define cinv22_31_im cinv02_11_im
515 #define cinv30_31_re cinv10_11_re
516 #define cinv30_31_im cinv10_11_im
517 #define cinv31_31_re cinv11_11_re
518 #define cinv32_31_re cinv12_11_re
519 #define cinv32_31_im cinv12_11_im
520 #define cinv20_32_re cinv00_12_re
521 #define cinv20_32_im cinv00_12_im
522 #define cinv21_32_re cinv01_12_re
523 #define cinv21_32_im cinv01_12_im
524 #define cinv22_32_re cinv02_12_re
525 #define cinv22_32_im cinv02_12_im
526 #define cinv30_32_re cinv10_12_re
527 #define cinv30_32_im cinv10_12_im
528 #define cinv31_32_re cinv11_12_re
529 #define cinv31_32_im cinv11_12_im
530 #define cinv32_32_re cinv12_12_re
569 #endif // CLOVER_DOUBLE
604 #if (DD_PREC==2) // half precision
606 #endif // half precision
619 sid = blockIdx.x*blockDim.x + threadIdx.x;
625 const int face_volume = ((
param.threadDimMapUpper[
dim] -
param.threadDimMapLower[
dim]) >> 1);
626 const int face_num = (
sid >= face_volume);
631 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4,
face_idx, face_volume,
dim, face_num,
param.parity,
dims);
634 for(
int dir=0; dir<4; ++dir){
635 active = active || isActive(dim,dir,+1,x1,x2,x3,x4,
param.commDim,
param.
X);
663 const int sp_idx = face_idx +
param.ghostOffset[0];
815 if (isActive(dim,0,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x1==0 )
824 const int sp_idx = face_idx +
param.ghostOffset[0];
826 sp_norm_idx = face_idx +
param.ghostNormOffset[0];
985 const int sp_idx = face_idx +
param.ghostOffset[1];
990 const int ga_idx =
sid;
1137 if (isActive(dim,1,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x2==0 )
1146 const int sp_idx = face_idx +
param.ghostOffset[1];
1148 sp_norm_idx = face_idx +
param.ghostNormOffset[1];
1298 if (isActive(dim,2,+1,x1,x2,x3,x4,
param.commDim,
param.
X) && x3==
X3m1 )
1307 const int sp_idx = face_idx +
param.ghostOffset[2];
1312 const int ga_idx =
sid;
1459 if (isActive(dim,2,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x3==0 )
1468 const int sp_idx = face_idx +
param.ghostOffset[2];
1470 sp_norm_idx = face_idx +
param.ghostNormOffset[2];
1620 if (isActive(dim,3,+1,x1,x2,x3,x4,
param.commDim,
param.
X) && x4==
X4m1 )
1629 const int sp_idx = face_idx +
param.ghostOffset[3];
1634 const int ga_idx =
sid;
1652 #ifdef CLOVER_TWIST_INV_DSLASH
1706 #ifdef CLOVER_TWIST_INV_DSLASH
1836 if (isActive(dim,3,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x4==0 )
1845 const int sp_idx = face_idx +
param.ghostOffset[3];
1847 sp_norm_idx = face_idx +
param.ghostNormOffset[3];
1868 #ifdef CLOVER_TWIST_INV_DSLASH
1922 #ifdef CLOVER_TWIST_INV_DSLASH
2054 READ_ACCUM(ACCUMTEX,
param.sp_stride)
2056 #ifndef CLOVER_TWIST_XPAY
2109 #endif//CLOVER_TWIST_XPAY
#define APPLY_CLOVER_TWIST(c, a, reg)
#define READ_INTERMEDIATE_SPINOR
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
#define APPLY_CLOVER_TWIST_INV(c, cinv, a, reg)
__constant__ int gauge_fixed
__constant__ int ga_stride
#define READ_GAUGE_MATRIX
__constant__ int X4X3X2X1hmX3X2X1h