42 #if (CUDA_VERSION < 8000) 47 #define SHARED_ACCUMULATOR 57 #endif // DYNAMIC_MU_NU 62 #ifdef SHARED_ACCUMULATOR 64 #define DECLARE_LINK(U) \ 65 extern __shared__ int s[]; \ 68 const int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x; \ 69 const int block = blockDim.x * blockDim.y * blockDim.z; \ 70 for (int i=0; i<18; i++) force[i*block + tid] = 0.0; \ 75 template <
typename real,
typename Link>
76 __device__
inline void axpy(real
a,
const real *
x, Link &
y) {
77 const int tid = (threadIdx.z*
blockDim.y + threadIdx.y)*
blockDim.x + threadIdx.x;
85 template <
typename real,
typename Link>
87 const int tid = (threadIdx.z*
blockDim.y + threadIdx.y)*
blockDim.x + threadIdx.x;
91 y[(2*
i+0)*
block + tid] +=
x.data[
i].real();
92 y[(2*
i+1)*
block + tid] +=
x.data[
i].imag();
96 template <
typename real,
typename Link>
98 const int tid = (threadIdx.z*
blockDim.y + threadIdx.y)*
blockDim.x + threadIdx.x;
102 y[(2*
i+0)*
block + tid] -=
x.data[
i].real();
103 y[(2*
i+1)*
block + tid] -=
x.data[
i].imag();
/**
 * Fallback (non-shared-memory) accumulator: the force accumulator is an
 * ordinary local Link object rather than a shared-memory array.
 */
#define DECLARE_LINK(U) Link U;

/**
 * Accumulate y += a * x for a local Link accumulator.
 *
 * @param a  scalar multiplier
 * @param x  link matrix to scale and accumulate
 * @param y  accumulator, updated in place
 */
template <typename real, typename Link>
__device__ inline void axpy(real a, const Link &x, Link &y)
{
  y += a * x;
}
// NOTE(review): the #elif branch below is reached only when SHARED_ARRAY is
// defined but SHARED_ACCUMULATOR is not, so the diagnostic must read
// "without SHARED_ACCUMULATOR" (the previous message stated the opposite).
118 #if defined(SHARED_ARRAY) && defined(SHARED_ACCUMULATOR) 120 #define DECLARE_ARRAY(d, idx) \ 123 extern __shared__ int s[]; \ 124 int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x; \ 125 int block = blockDim.x*blockDim.y*blockDim.z; \ 126 int offset = 18*block*sizeof(real)/sizeof(int) + idx*block + tid; \ 128 d = (unsigned char*)&s[offset]; \ 130 #elif defined(SHARED_ARRAY) 131 #error Cannot use SHARED_ARRAY without SHARED_ACCUMULATOR 133 #define DECLARE_ARRAY(d, idx) \ 134 int d[4] = {0, 0, 0, 0}; 138 #ifdef GPU_CLOVER_DIRAC 140 template<
class Float,
typename Force,
typename Gauge,
typename Oprod>
141 struct CloverDerivArg
154 CloverDerivArg(
const Force& force,
const Gauge& gauge,
const Oprod& oprod,
155 const int *X_,
const int *E_,
158 force(force), gauge(gauge), oprod(oprod)
160 for(
int dir=0; dir<4; ++dir) {
161 this->X[dir] = X_[dir];
162 this->E[dir] = E_[dir];
163 this->border[dir] = (E_[dir] - X_[dir])/2;
170 template <
typename real,
typename Arg,
typename Link>
171 __device__
void computeForce(
LINK force, Arg &
arg,
int xIndex,
int yIndex,
int mu,
int nu) {
173 template <
typename real,
typename Arg,
int mu,
int nu,
typename Link>
174 __device__ __forceinline__
void computeForce(
LINK force, Arg &
arg,
int xIndex,
int yIndex) {
177 int otherparity = (1-
arg.parity);
179 const int tidx =
mu > nu ? (
mu-1)*
mu/2 + nu : (nu-1)*nu/2 +
mu;
209 if (nu <
mu) force -= U1*U2*
conj(U3)*
conj(U4)*Oprod1;
210 else force += U1*U2*
conj(U3)*
conj(U4)*Oprod1;
216 if (nu <
mu) force -= U1*U2*Oprod2*
conj(U3)*
conj(U4);
217 else force += U1*U2*Oprod2*
conj(U3)*
conj(U4);
245 if (nu <
mu) force +=
conj(U1)*U2*Oprod1*U3*
conj(U4);
246 else force -=
conj(U1)*U2*Oprod1*U3*
conj(U4);
250 if (nu <
mu) force += Oprod4*
conj(U1)*U2*U3*
conj(U4);
251 else force -= Oprod4*
conj(U1)*U2*U3*
conj(U4);
283 if (nu <
mu) force -= U1*U2*
conj(U3)*Oprod3*
conj(U4);
284 else force += U1*U2*
conj(U3)*Oprod3*
conj(U4);
291 if (nu <
mu) force -= U1*Oprod4*U2*
conj(U3)*
conj(U4);
292 else force += U1*Oprod4*U2*
conj(U3)*
conj(U4);
323 if (nu <
mu) force +=
conj(U1)*U2*U3*Oprod1*
conj(U4);
324 else force -=
conj(U1)*U2*U3*Oprod1*
conj(U4);
330 if (nu <
mu) force +=
conj(U1)*Oprod2*U2*U3*
conj(U4);
331 else force -=
conj(U1)*Oprod2*U2*U3*
conj(U4);
338 template<
typename real,
typename Arg>
339 __global__
void cloverDerivativeKernel(Arg
arg)
345 int yIndex = threadIdx.y + blockIdx.y*
blockDim.y;
346 if (yIndex >= 2)
return;
358 for (
int nu=0; nu<4; nu++) {
359 if (
mu==nu)
continue;
360 computeForce<real,Arg,Link>(force,
arg,
index, yIndex,
mu, nu);
365 computeForce<real,Arg,0,1,Link>(force,
arg,
index, yIndex);
366 computeForce<real,Arg,0,2,Link>(force,
arg,
index, yIndex);
367 computeForce<real,Arg,0,3,Link>(force,
arg,
index, yIndex);
370 computeForce<real,Arg,1,0,Link>(force,
arg,
index, yIndex);
371 computeForce<real,Arg,1,3,Link>(force,
arg,
index, yIndex);
372 computeForce<real,Arg,1,2,Link>(force,
arg,
index, yIndex);
375 computeForce<real,Arg,2,3,Link>(force,
arg,
index, yIndex);
376 computeForce<real,Arg,2,0,Link>(force,
arg,
index, yIndex);
377 computeForce<real,Arg,2,1,Link>(force,
arg,
index, yIndex);
380 computeForce<real,Arg,3,2,Link>(force,
arg,
index, yIndex);
381 computeForce<real,Arg,3,1,Link>(force,
arg,
index, yIndex);
382 computeForce<real,Arg,3,0,Link>(force,
arg,
index, yIndex);
389 arg.force.load((real*)(F.data),
index,
mu, yIndex == 0 ?
arg.parity : 1-
arg.parity);
391 arg.force.save((real*)(F.data),
index,
mu, yIndex == 0 ?
arg.parity : 1-
arg.parity);
397 template<
typename Float,
typename Arg>
398 class CloverDerivative :
public TunableVectorY {
402 const GaugeField &meta;
404 #if defined(SHARED_ACCUMULATOR) && defined(SHARED_ARRAY) 405 unsigned int sharedBytesPerThread()
const {
return 18*
sizeof(Float) + 8; }
406 #elif defined(SHARED_ACCUMULATOR) 407 unsigned int sharedBytesPerThread()
const {
return 18*
sizeof(Float); }
409 unsigned int sharedBytesPerThread()
const {
return 0; }
411 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
413 unsigned int minThreads()
const {
return arg.volumeCB; }
414 bool tuneGridDim()
const {
return false; }
417 CloverDerivative(
const Arg &
arg,
const GaugeField &meta) : TunableVectorY(2),
arg(
arg), meta(meta) {
418 writeAuxString(
"threads=%d,prec=%lu,fstride=%d,gstride=%d,ostride=%d",
419 arg.volumeCB,
sizeof(Float),
arg.force.stride,
420 arg.gauge.stride,
arg.oprod.stride);
422 virtual ~CloverDerivative() {}
// NOTE(review): apply() receives a stream but the kernel was launched with
// only <<<grid, block, shared_bytes>>>, i.e. on the default stream, silently
// ignoring the caller's stream and breaking any intended async overlap.
// Pass the stream as the fourth launch-configuration argument.
424 void apply(
const cudaStream_t &
stream){
426 cloverDerivativeKernel<Float><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(
arg);
429 bool advanceBlockDim(TuneParam &
param)
const {
431 dim3 grid =
param.grid;
434 param.grid.z = grid.z;
437 if (
param.block.z < 4) {
450 void initTuneParam(TuneParam &
param)
const {
458 void defaultTuneParam(TuneParam &
param)
const { initTuneParam(
param); }
461 void preTune() {
arg.force.save(); }
462 void postTune(){
arg.force.load(); }
464 long long flops()
const {
return 16 * 198 * 3 * 4 * 2 * (
long long)
arg.volumeCB; }
465 long long bytes()
const {
return ((8*
arg.gauge.Bytes() + 4*
arg.oprod.Bytes())*3 + 2*
arg.force.Bytes()) * 4 * 2 *
arg.volumeCB; }
467 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
471 template<
typename Float>
473 cudaGaugeField &gauge,
474 cudaGaugeField &oprod,
478 errorQuda(
"Force field does not support reconstruction");
480 if (force.Order() != oprod.Order())
481 errorQuda(
"Force and Oprod orders must match");
484 errorQuda(
"Force field does not support reconstruction");
487 typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
488 typedef gauge::FloatNOrder<Float, 18, 2, 18> O;
490 if (gauge.isNative()) {
492 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
493 typedef CloverDerivArg<Float,F,G,O> Arg;
494 Arg
arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(),
coeff,
parity);
495 CloverDerivative<Float, Arg> deriv(
arg, gauge);
499 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
500 typedef CloverDerivArg<Float,F,G,O> Arg;
501 Arg
arg(F(force), G(gauge), O(oprod), force.X(), oprod.X(),
coeff,
parity);
502 CloverDerivative<Float, Arg> deriv(
arg, gauge);
506 errorQuda(
"Reconstruction type %d not supported",gauge.Reconstruct());
509 errorQuda(
"Gauge order %d not supported", gauge.Order());
512 errorQuda(
"Force order %d not supported", force.Order());
524 #ifdef GPU_CLOVER_DIRAC 528 for (
int d=0;
d<4;
d++) {
529 if (oprod.
X()[
d] != gauge.
X()[
d])
530 errorQuda(
"Incompatible extended dimensions d=%d gauge=%d oprod=%d",
d, gauge.
X()[
d], oprod.
X()[
d]);
536 cloverDerivative<double>(force, gauge, oprod,
coeff, device_parity);
539 cloverDerivative<float>(force, gauge, oprod,
coeff, device_parity);
__host__ __device__ float4 operator-=(float4 &x, const float4 y)
void cloverDerivative(cudaGaugeField &force, cudaGaugeField &gauge, cudaGaugeField &oprod, double coeff, QudaParity parity)
Compute the derivative of the clover matrix in the direction mu,nu and compute the resulting force gi...
static __device__ __host__ void getCoordsExtended(I x[], int cb_index, const J X[], int parity, const int R[])
#define DECLARE_ARRAY(d, idx)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
std::complex< double > Complex
void initTuneParam(TuneParam ¶m) const
QudaFieldGeometry Geometry() const
bool advanceBlockDim(TuneParam ¶m) const
char * index(const char *, int)
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
enum QudaParity_s QudaParity
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
__device__ void axpy(real a, const real *x, Link &y)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const