#define DSLASH_SHARED_FLOATS_PER_THREAD 0

#if (CUDA_VERSION >= 4010)
#define VOLATILE
#else
#define VOLATILE volatile
#endif

// input spinor precision
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#else
#define spinorFloat float
#endif // SPINOR_DOUBLE

// ...
#endif // GAUGE_DOUBLE
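// gT is the Hermitian conjugate of the gauge link g: for an SU(3)
// matrix the inverse equals the conjugate transpose, so
// gT[i][j] = conj(g[j][i]). These macros are used when the hop comes
// from the negative direction and the link must act in reverse.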
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)
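// Note on the strides below: shared memory is padded so the threads of
// a warp hit distinct banks. Fermi (sm_20) has 32 four-byte banks
// versus 16 on G80/GT200, and a double spans two banks, which is why
// the stride doubles per architecture and halves for double precision.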
#ifdef SPINOR_DOUBLE
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#endif
#else
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif
#endif // SPINOR_DOUBLE
#if (defined MULTI_GPU) && (DD_PREC==2) // half precision
// ...
#endif // MULTI_GPU half precision
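// sid: this thread's linear index over one checkerboard (even/odd)
// sublattice, flattened from the two-dimensional grid of thread blocks.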
int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
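// Recover the full lexicographic site index from the checkerboard
// index: each parity holds every second site, so X = 2*sid plus a 0/1
// offset that depends on the sublattice parity and on how many row,
// plane, and volume boundaries precede the site (boundaryCrossing),
// since the starting parity alternates from row to row.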
X = 2*sid + (boundaryCrossing + param.parity) % 2;
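// Ghost-zone bookkeeping: the thread range spans both faces of the
// partitioned dimension; face_num selects the lower (0) or upper (1)
// face and face_idx is the thread's offset within that face.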
const int face_volume = (param.threads*param.Ls >> 1);
const int face_num = (sid >= face_volume);
face_idx = sid - face_num*face_volume;
#if (DD_PREC==2) // half precision
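// Reconstruct the five coordinates (x1,x2,x3,x4,xs) of this site from
// its position within the face.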
coordsFromDWFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
X = 2*sid + (boundaryCrossing + param.parity) % 2;
#if (DD_PREC==0) // temporal hack
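// Forward neighbor in x with periodic wrap: at the upper boundary
// (x1 == X1m1) step back by X1m1 instead of forward by 1; the final
// >> 1 converts the full-lattice index to a checkerboard index.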
const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
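// Gauge links are stored on the 4-D lattice only, so the 5-D site
// index is reduced modulo the 4-D checkerboard volume Vh.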
const int ga_idx = sp_idx % Vh;
const int ga_idx = sid % Vh;
const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
const int ga_idx = sp_idx % Vh;
const int ga_idx = sid % Vh;
const int ga_idx = sp_idx % Vh;
const int ga_idx = sid % Vh;
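// Temporal face under MULTI_GPU: the ghost spinor was spin-projected
// on the sending device, so the projected components are only rescaled
// by t_proj_scale (the projector normalization) rather than projected
// again.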
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
const int ga_idx = sp_idx % Vh;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
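// Forward hop in the fifth dimension: adjacent s-slices sit 2*Vh
// full-lattice sites apart, wrapping from s = Ls-1 back to s = 0;
// the final /2 converts to a checkerboard index.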
int sp_idx = ( xs == param.Ls-1 ? X-(param.Ls-1)*2*Vh : X+2*Vh ) / 2;
// ...
if ( xs != param.Ls-1 )
int sp_idx = ( xs == 0 ? X+(param.Ls-1)*2*Vh : X-2*Vh ) / 2;
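// XPAY step: fold the accumulator spinor, scaled by a_inv, into the
// output; the double- and single-precision branches below perform the
// same update on their respective accumulator types.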
#if defined MULTI_GPU && defined DSLASH_XPAY

READ_ACCUM(ACCUMTEX, param.sp_stride)

#ifdef SPINOR_DOUBLE
o00_re = o00_re + a_inv*accum0.x;
// ... (remaining components)
#else
o00_re = o00_re + a_inv*accum0.x;
// ... (remaining components)
#endif // SPINOR_DOUBLE

#endif // DSLASH_XPAY
#if defined MULTI_GPU && defined DSLASH_XPAY
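// A site adjacent to a partitioned boundary cannot be completed by the
// interior kernel alone; flag it so the xpay update is deferred until
// the exterior (ghost) contributions have been applied.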
incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
#ifdef SPINOR_DOUBLE
// ...
#endif // SPINOR_DOUBLE

#endif // DSLASH_XPAY

#undef SHARED_STRIDE
// Output spinor accumulators, one register pair per color-spin component
VOLATILE spinorFloat o00_re; VOLATILE spinorFloat o00_im;
VOLATILE spinorFloat o01_re; VOLATILE spinorFloat o01_im;
VOLATILE spinorFloat o02_re; VOLATILE spinorFloat o02_im;
VOLATILE spinorFloat o10_re; VOLATILE spinorFloat o10_im;
VOLATILE spinorFloat o11_re; VOLATILE spinorFloat o11_im;
VOLATILE spinorFloat o12_re; VOLATILE spinorFloat o12_im;
VOLATILE spinorFloat o20_re; VOLATILE spinorFloat o20_im;
VOLATILE spinorFloat o21_re; VOLATILE spinorFloat o21_im;
VOLATILE spinorFloat o22_re; VOLATILE spinorFloat o22_im;
VOLATILE spinorFloat o30_re; VOLATILE spinorFloat o30_im;
VOLATILE spinorFloat o31_re; VOLATILE spinorFloat o31_im;
VOLATILE spinorFloat o32_re; VOLATILE spinorFloat o32_im;

// Constant-memory lattice geometry and gauge-field parameters
__constant__ int X3X2X1mX2X1;
__constant__ int X4X3X2X1mX3X2X1;
__constant__ int X4X3X2X1hmX3X2X1h;
__constant__ int ghostFace[QUDA_MAX_DIM+1];
__constant__ int gauge_fixed;
__constant__ int ga_stride;

// Spinor- and gauge-access macros used by the kernel body
#define READ_INTERMEDIATE_SPINOR
#define ASSN_GAUGE_MATRIX
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
RECONSTRUCT_GAUGE_MATRIX(0)