27 for (
int i=0; i<4; i++) y[i] = (x[i] + dx[i] + param.
X[i]-1) % param.
X[i];
28 int idx = (((y[3]*param.
X[2] + y[2])*param.
X[1] + y[1])*param.
X[0] + y[0]) >> 1;
40 template <
typename Cmplx>
44 int X = blockIdx.x * blockDim.x + threadIdx.x;
55 int aux1 = X / (param.
X[0]/2);
57 x[0] = X - aux1 * (param.
X[0]/2);
58 int aux2 = aux1 / param.
X[1];
59 x[1] = aux1 - aux2 * param.
X[1];
60 int aux3 = aux2 / param.
X[2];
61 x[2] = aux2 - aux3 * param.
X[2];
63 x[3] = aux3 - parity * param.
X[3];
66 for (
int mu=0;
mu<4;
mu++) {
67 for (
int nu=0; nu<
mu; nu++) {
75 int dx[4] = {0, 0, 0, 0};
95 Ftmp =
conj(U3) * Ftmp;
103 Ftmp =
conj(U4) * Ftmp;
106 F += Ftmp +
conj(Ftmp);
113 int dx[4] = {0, 0, 0, 0};
135 Ftmp =
conj(U3) * Ftmp;
148 F += Ftmp +
conj(Ftmp);
156 int dx[4] = {0, 0, 0, 0};
188 Ftmp =
conj(U4) * Ftmp;
191 F += Ftmp +
conj(Ftmp);
198 int dx[4] = {0, 0, 0, 0};
237 F += Ftmp +
conj(Ftmp);
241 int munu_idx = (mu*(mu-1))/2 + nu;
250 #ifdef GPU_CLOVER_DIRAC
251 using namespace quda;
261 for (
int i=0; i<4; i++)
param.
X[i] = gauge.
X()[i];
264 param.FmunuLengthCB = Fmunu.Length() / 2;
265 param.FmunuStride = Fmunu.Stride();
267 dim3 blockDim(32, 1, 1);
268 dim3 gridDim((
param.threads + blockDim.x - 1) / blockDim.x, 1, 1);
271 computeFmunuKernel<<<gridDim,blockDim>>>((double2*)Fmunu.Gauge_p(), (double2*)gauge.
Gauge_p(),
param);
273 computeFmunuKernel<<<gridDim,blockDim>>>((float2*)Fmunu.Gauge_p(), (float2*)gauge.
Gauge_p(),
param);
280 errorQuda(
"Clover dslash has not been built");