12 #ifdef GPU_CLOVER_DIRAC
21 typename RealTypeId<Cmplx>::Type
coeff;
39 CloverDerivArg(cudaGaugeField& force, cudaGaugeField&
gauge, cudaGaugeField& oprod,
int mu,
int nu,
double coeff,
int parity,
bool conjugate) :
40 mu(mu), nu(nu), coeff(coeff), parity(parity), volumeCB(force.VolumeCB()),
41 force(reinterpret_cast<Cmplx*>(force.Gauge_p())), gauge(reinterpret_cast<Cmplx*>(gauge.Gauge_p())), oprod(reinterpret_cast<Cmplx*>(oprod.Gauge_p())),
42 forceStride(force.Stride()), gaugeStride(gauge.Stride()), oprodStride(oprod.Stride()),
43 forceOffset(force.Bytes()/(2*sizeof(Cmplx))), gaugeOffset(gauge.Bytes()/(2*sizeof(Cmplx))), oprodOffset(oprod.Bytes()/(2*sizeof(Cmplx)))
45 for(
int dir=0; dir<4; ++dir)
X[dir] = force.X()[dir];
47 for(
int dir=0; dir<4; ++dir) border[dir] = 2;
51 __device__
void getCoords(
int x[4],
int cb_index,
const int X[4],
int parity)
53 x[3] = cb_index/(X[2]*X[1]*X[0]/2);
54 x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
55 x[1] = (cb_index/(X[0]/2)) % X[1];
56 x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+
parity)&1);
61 __device__
int linkIndex(
const int x[4],
const int dx[4],
const int X[4])
64 for (
int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
65 return (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0])/2;
69 template<
typename Cmplx,
bool isConjugate>
71 cloverDerivativeKernel(
const CloverDerivArg<Cmplx>
arg)
73 int index = threadIdx.x + blockIdx.x*blockDim.x;
75 if(index >= arg.volumeCB)
return;
80 int otherparity = (1-arg.parity);
84 for(
int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
86 for(
int dir=0; dir<4; ++dir){
87 x[dir] += arg.border[dir];
88 y[dir] += arg.border[dir];
89 X[dir] += 2*arg.border[dir];
92 Cmplx* thisGauge = arg.gauge + arg.parity*arg.gaugeOffset;
93 Cmplx* otherGauge = arg.gauge + (otherparity)*arg.gaugeOffset;
95 Cmplx* thisOprod = arg.oprod + arg.parity*arg.oprodOffset;
97 const int& mu = arg.mu;
98 const int& nu = arg.nu;
105 int d[4] = {0, 0, 0, 0};
110 arg.gaugeStride, &U1);
117 arg.gaugeStride, &U2);
125 arg.gaugeStride, &U3);
131 arg.gaugeStride, &U4);
137 if(isConjugate) Oprod1 -=
conj(Oprod1);
138 thisForce = U1*U2*
conj(U3)*
conj(U4)*Oprod1;
145 if(isConjugate) Oprod2 -=
conj(Oprod2);
147 thisForce += U1*U2*Oprod2*
conj(U3)*
conj(U4);
152 int d[4] = {0, 0, 0, 0};
156 arg.gaugeStride, &U1);
162 arg.gaugeStride, &U2);
169 arg.gaugeStride, &U3);
175 arg.gaugeStride, &U4);
183 if(isConjugate) Oprod3 -=
conj(Oprod3);
184 otherForce = U1*U2*
conj(U3)*Oprod3*
conj(U4);
192 if(isConjugate) Oprod4 -=
conj(Oprod4);
194 otherForce += U1*Oprod4*U2*
conj(U3)*
conj(U4);
201 int d[4] = {0, 0, 0, 0};
206 arg.gaugeStride, &U1);
213 arg.gaugeStride, &U2);
220 arg.gaugeStride, &U3);
226 arg.gaugeStride, &U4);
234 if(isConjugate) Oprod1 -=
conj(Oprod1);
236 otherForce -=
conj(U1)*U2*U3*Oprod1*
conj(U4);
243 if(isConjugate) Oprod2 -=
conj(Oprod2);
244 otherForce -=
conj(U1)*Oprod2*U2*U3*
conj(U4);
248 int d[4] = {0, 0, 0, 0};
253 arg.gaugeStride, &U1);
260 arg.gaugeStride, &U2);
267 arg.gaugeStride, &U3);
273 arg.gaugeStride, &U4);
281 if(isConjugate) Oprod1 -=
conj(Oprod1);
282 thisForce -=
conj(U1)*U2*Oprod1*U3*
conj(U4);
287 if(isConjugate) Oprod4 -=
conj(Oprod4);
288 thisForce -= Oprod4*
conj(U1)*U2*U3*
conj(U4);
291 thisForce *= arg.coeff;
292 otherForce *= arg.coeff;
297 appendMatrixToArray(thisForce, index, arg.forceStride, arg.force + arg.parity*arg.forceOffset);
298 appendMatrixToArray(otherForce, index, arg.forceStride, arg.force + otherparity*arg.forceOffset);
304 template<
typename Complex>
305 class CloverDerivative :
public Tunable {
308 CloverDerivArg<Complex>
arg;
309 const GaugeField &meta;
311 unsigned int sharedBytesPerThread()
const {
return 0; }
312 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
314 unsigned int minThreads()
const {
return arg.volumeCB; }
315 bool tuneGridDim()
const {
return false; }
318 CloverDerivative(
const CloverDerivArg<Complex> &arg,
const GaugeField &meta)
319 : arg(arg), meta(meta) {
320 writeAuxString(
"threads=%d,prec=%lu,stride=%d,geometery=%d",arg.volumeCB,
sizeof(
Complex)/2,arg.forceOffset);
322 virtual ~CloverDerivative() {}
324 void apply(
const cudaStream_t &
stream){
327 cloverDerivativeKernel<Complex,true><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
329 cloverDerivativeKernel<Complex,false><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
336 long long flops()
const {
340 long long bytes()
const {
return 0; }
342 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
347 template<
typename Float>
349 cudaGaugeField& gauge,
350 cudaGaugeField& oprod,
351 int mu,
int nu,
double coeff,
int parity,
354 typedef typename ComplexTypeId<Float>::Type
Complex;
355 CloverDerivArg<Complex>
arg(out, gauge, oprod, mu, nu, coeff, parity, conjugate);
358 dim3 blockDim(128, 1, 1);
359 dim3 gridDim((arg.volumeCB + blockDim.x-1)/blockDim.x, 1, 1);
361 cloverDerivativeKernel<Complex,true><<<gridDim,blockDim,0>>>(
arg);
363 cloverDerivativeKernel<Complex,false><<<gridDim,blockDim,0>>>(
arg);
372 int mu,
int nu,
double coeff,
QudaParity parity,
int conjugate)
374 #ifdef GPU_CLOVER_DIRAC
381 cloverDerivative<double>(
out,
gauge, oprod,
mu, nu,
coeff, device_parity, conjugate);
383 cloverDerivative<float>(
out,
gauge, oprod,
mu, nu,
coeff, device_parity, conjugate);
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
QudaVerbosity getVerbosity()
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
std::complex< double > Complex
void cloverDerivative(cudaGaugeField &out, cudaGaugeField &gauge, cudaGaugeField &oprod, int mu, int nu, double coeff, QudaParity parity, int conjugate)
QudaPrecision Precision() const
__device__ __host__ int index(int i, int j)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__constant__ double coeff
enum QudaParity_s QudaParity
__device__ void loadMatrixFromArray(const T *const array, const int idx, const int stride, Matrix< T, N > *mat)
cpuColorSpinorField * out
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ void appendMatrixToArray(const Matrix< double2, 3 > &mat, const int idx, const int stride, double2 *const array)
QudaFieldGeometry Geometry() const
__host__ __device__ ValueType conj(ValueType x)
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)