5 #if (CUDA_VERSION < 8000) 10 #define SHARED_ACCUMULATOR 19 #endif // DYNAMIC_MU_NU 24 #ifdef SHARED_ACCUMULATOR 26 #define DECLARE_LINK(U) \ 27 extern __shared__ int s[]; \ 28 real *U = (real *)s; \ 30 const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; \ 31 const int block = blockDim.x * blockDim.y * blockDim.z; \ 32 for (int i = 0; i < 18; i++) force[i * block + tid] = 0.0; \ 37 template <
typename real,
typename Link> __device__
inline void axpy(real a,
const real *x, Link &y)
39 const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
40 const int block = blockDim.x * blockDim.y * blockDim.z;
42 for (
int i = 0; i < 9; i++) {
43 y.data[i] += a * complex<real>(x[(2 * i + 0) * block + tid], x[(2 * i + 1) * block + tid]);
47 template <
typename real,
typename Link> __device__
inline void operator+=(real *y,
const Link &x)
49 const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
50 const int block = blockDim.x * blockDim.y * blockDim.z;
52 for (
int i = 0; i < 9; i++) {
53 y[(2 * i + 0) * block + tid] += x.data[i].real();
54 y[(2 * i + 1) * block + tid] += x.data[i].imag();
58 template <
typename real,
typename Link> __device__
inline void operator-=(real *y,
const Link &x)
60 const int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
61 const int block = blockDim.x * blockDim.y * blockDim.z;
63 for (
int i = 0; i < 9; i++) {
64 y[(2 * i + 0) * block + tid] -= x.data[i].real();
65 y[(2 * i + 1) * block + tid] -= x.data[i].imag();
71 #define DECLARE_LINK(U) Link U; 75 template <
typename real,
typename Link> __device__
inline void axpy(real a,
const Link &x, Link &y) { y += a * x; }
79 #if defined(SHARED_ARRAY) && defined(SHARED_ACCUMULATOR) 81 #define DECLARE_ARRAY(d, idx) \ 84 extern __shared__ int s[]; \ 85 int tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x; \ 86 int block = blockDim.x * blockDim.y * blockDim.z; \ 87 int offset = 18 * block * sizeof(real) / sizeof(int) + idx * block + tid; \ 89 d = (unsigned char *)&s[offset]; \ 91 #elif defined(SHARED_ARRAY) 92 #error Cannot use SHARED_ARRAY with SHARED_ACCUMULATOR 94 #define DECLARE_ARRAY(d, idx) int d[4] = {0, 0, 0, 0}; 97 template <
class Float,
typename Force,
typename Gauge,
typename Oprod>
struct CloverDerivArg {
109 CloverDerivArg(
const Force &force,
const Gauge &gauge,
const Oprod &oprod,
const int *X_,
const int *E_,
110 double coeff,
int parity) :
113 volumeCB(force.volumeCB),
118 for (
int dir = 0; dir < 4; ++dir) {
119 this->X[dir] = X_[dir];
120 this->E[dir] = E_[dir];
121 this->border[dir] = (E_[dir] - X_[dir]) / 2;
127 template <
typename real,
typename Arg,
typename Link>
131 template <
typename real,
typename Arg,
int mu,
int nu,
typename Link>
136 int otherparity = (1 - arg.parity);
138 const int tidx = mu > nu ? (mu - 1) * mu / 2 + nu : (nu - 1) * nu / 2 +
mu;
154 Link U2 = arg.gauge(nu,
linkIndexShift(x, d, arg.E), otherparity);
159 Link U3 = arg.gauge(mu,
linkIndexShift(x, d, arg.E), otherparity);
166 Link Oprod1 = arg.oprod(tidx,
linkIndexShift(x, d, arg.E), arg.parity);
169 force -= U1 * U2 *
conj(U3) *
conj(U4) * Oprod1;
171 force += U1 * U2 *
conj(U3) *
conj(U4) * Oprod1;
175 Link Oprod2 = arg.oprod(tidx,
linkIndexShift(x, d, arg.E), arg.parity);
180 force -= U1 * U2 * Oprod2 *
conj(U3) *
conj(U4);
182 force += U1 * U2 * Oprod2 *
conj(U3) *
conj(U4);
190 Link U1 = arg.gauge(nu,
linkIndexShift(x, d, arg.E), otherparity);
195 Link U2 = arg.gauge(mu,
linkIndexShift(x, d, arg.E), otherparity);
210 Link Oprod1 = arg.oprod(tidx,
linkIndexShift(x, d, arg.E), arg.parity);
215 force +=
conj(U1) * U2 * Oprod1 * U3 *
conj(U4);
217 force -=
conj(U1) * U2 * Oprod1 * U3 *
conj(U4);
219 Link Oprod4 = arg.oprod(tidx,
linkIndexShift(x, d, arg.E), arg.parity);
222 force += Oprod4 *
conj(U1) * U2 * U3 *
conj(U4);
224 force -= Oprod4 *
conj(U1) * U2 * U3 *
conj(U4);
236 Link U1 = arg.gauge(mu,
linkIndexShift(y, d, arg.E), otherparity);
249 Link U4 = arg.gauge(nu,
linkIndexShift(y, d, arg.E), otherparity);
253 Link Oprod3 = arg.oprod(tidx,
linkIndexShift(y, d, arg.E), arg.parity);
257 force -= U1 * U2 *
conj(U3) * Oprod3 *
conj(U4);
259 force += U1 * U2 *
conj(U3) * Oprod3 *
conj(U4);
263 Link Oprod4 = arg.oprod(tidx,
linkIndexShift(y, d, arg.E), arg.parity);
267 force -= U1 * Oprod4 * U2 *
conj(U3) *
conj(U4);
269 force += U1 * Oprod4 * U2 *
conj(U3) *
conj(U4);
290 Link U3 = arg.gauge(nu,
linkIndexShift(y, d, arg.E), otherparity);
295 Link U4 = arg.gauge(mu,
linkIndexShift(y, d, arg.E), otherparity);
299 Link Oprod1 = arg.oprod(tidx,
linkIndexShift(y, d, arg.E), arg.parity);
303 force +=
conj(U1) * U2 * U3 * Oprod1 *
conj(U4);
305 force -=
conj(U1) * U2 * U3 * Oprod1 *
conj(U4);
308 Link Oprod2 = arg.oprod(tidx,
linkIndexShift(y, d, arg.E), arg.parity);
312 force +=
conj(U1) * Oprod2 * U2 * U3 *
conj(U4);
314 force -=
conj(U1) * Oprod2 * U2 * U3 *
conj(U4);
322 int index = threadIdx.x + blockIdx.x * blockDim.x;
326 int yIndex = threadIdx.y + blockIdx.y * blockDim.y;
327 if (yIndex >= 2)
return;
330 int mu = threadIdx.z + blockIdx.z * blockDim.z;
339 for (
int nu = 0; nu < 4; nu++) {
340 if (mu == nu)
continue;
346 computeForce<real, Arg, 0, 1, Link>(
force,
arg,
index, yIndex);
347 computeForce<real, Arg, 0, 2, Link>(
force,
arg,
index, yIndex);
348 computeForce<real, Arg, 0, 3, Link>(
force,
arg,
index, yIndex);
351 computeForce<real, Arg, 1, 0, Link>(
force,
arg,
index, yIndex);
352 computeForce<real, Arg, 1, 3, Link>(
force,
arg,
index, yIndex);
353 computeForce<real, Arg, 1, 2, Link>(
force,
arg,
index, yIndex);
356 computeForce<real, Arg, 2, 3, Link>(
force,
arg,
index, yIndex);
357 computeForce<real, Arg, 2, 0, Link>(
force,
arg,
index, yIndex);
358 computeForce<real, Arg, 2, 1, Link>(
force,
arg,
index, yIndex);
361 computeForce<real, Arg, 3, 2, Link>(
force,
arg,
index, yIndex);
362 computeForce<real, Arg, 3, 1, Link>(
force,
arg,
index, yIndex);
363 computeForce<real, Arg, 3, 0, Link>(
force,
arg,
index, yIndex);
369 Link F = arg.force(mu, index, yIndex == 0 ? arg.parity : 1 - arg.parity);
370 axpy(arg.coeff, force, F);
371 arg.force(mu, index, yIndex == 0 ? arg.parity : 1 - arg.parity) = F;
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
static __device__ __host__ void getCoordsExtended(I x[], int cb_index, const J X[], int parity, const int R[])
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
#define DECLARE_ARRAY(d, idx)
Main header file for host and device accessors to GaugeFields.
std::complex< double > Complex
static int index(int ndim, const int *dims, const int *x)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
__device__ void axpy(real a, const real *x, Link &y)
__host__ __device__ ValueType conj(ValueType x)
CloverDerivArg(const Force &force, const Gauge &gauge, const Oprod &oprod, const int *X_, const int *E_, double coeff, int parity)
__global__ void cloverDerivativeKernel(Arg arg)
__device__ void computeForce(LINK force, Arg &arg, int xIndex, int yIndex, int mu, int nu)