11 #ifdef GPU_CLOVER_DIRAC 13 template<
typename Float,
typename Clover,
typename Fmunu>
22 CloverArg(Clover &clover, Fmunu& f,
const GaugeField &meta,
double cloverCoeff)
23 : threads(meta.VolumeCB()), cloverCoeff(cloverCoeff), clover(clover), f(f)
25 for(
int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
55 template<
typename Float,
typename Arg>
56 __device__ __host__
void cloverComputeCore(Arg &
arg,
int x_cb,
int parity){
59 constexpr
int nSpin = 4;
60 constexpr
int N = nColor*nSpin/2;
67 for (
int i=0; i<6; ++i) F[i] = arg.f(i, x_cb, parity);
70 Complex coeff(0.0,arg.cloverCoeff);
71 Link block1[2], block2[2];
72 block1[0] = coeff*(F[0]-F[5]);
73 block1[1] = coeff*(F[0]+F[5]);
74 block2[0] = arg.cloverCoeff*(F[1]+F[4] - I*(F[2]-F[3]));
75 block2[1] = arg.cloverCoeff*(F[1]-F[4] - I*(F[2]+F[3]));
79 for (
int ch=0; ch<2; ++ch) {
84 for(
int i=0; i<N/2; ++i){
85 A(i+0,i+0) = 1.0 - block1[ch](i,i).real();
86 A(i+3,i+3) = 1.0 + block1[ch](i,i).real();
91 A(1,0) = -block1[ch](1,0);
93 A(2,0) = -block1[ch](2,0);
94 A(2,1) = -block1[ch](2,1);
96 A(3,0) = block2[ch](0,0);
97 A(3,1) = block2[ch](0,1);
98 A(3,2) = block2[ch](0,2);
100 A(4,0) = block2[ch](1,0);
101 A(4,1) = block2[ch](1,1);
102 A(4,2) = block2[ch](1,2);
103 A(4,3) = block1[ch](1,0);
105 A(5,0) = block2[ch](2,0);
106 A(5,1) = block2[ch](2,1);
107 A(5,2) = block2[ch](2,2);
108 A(5,3) = block1[ch](2,0);
109 A(5,4) = block1[ch](2,1);
110 A *=
static_cast<Float
>(0.5);
112 arg.clover(x_cb, parity, ch) = A;
120 template<
typename Float,
typename Clover,
typename Fmunu>
121 __global__
void cloverComputeKernel(CloverArg<Float,Clover,Fmunu> arg){
122 int x_cb = threadIdx.x + blockIdx.x*blockDim.x;
123 int parity = threadIdx.y + blockIdx.y*blockDim.y;
124 if (x_cb >= arg.threads)
return;
125 cloverComputeCore<Float>(
arg, x_cb,
parity);
128 template<
typename Float,
typename Clover,
typename Fmunu>
129 void cloverComputeCPU(CloverArg<Float,Clover,Fmunu> arg){
130 for (
int parity = 0; parity<2; parity++) {
131 for (
int x_cb=0; x_cb<arg.threads; x_cb++){
132 cloverComputeCore<Float>(
arg, x_cb,
parity);
138 template<
typename Float,
typename Clover,
typename Fmunu>
139 class CloverCompute : TunableVectorY {
140 CloverArg<Float,Clover,Fmunu>
arg;
141 const GaugeField &meta;
145 unsigned int sharedBytesPerThread()
const {
return 0; }
146 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
148 bool tuneSharedBytes()
const {
return false; }
149 bool tuneGridDim()
const {
return false; }
150 unsigned int minThreads()
const {
return arg.threads; }
153 CloverCompute(CloverArg<Float,Clover,Fmunu> &arg,
const GaugeField &meta,
QudaFieldLocation location)
154 : TunableVectorY(2), arg(arg), meta(meta), location(location) {
155 writeAuxString(
"threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,
sizeof(Float));
158 virtual ~CloverCompute() {}
160 void apply(
const cudaStream_t &
stream) {
163 cloverComputeKernel<<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
165 cloverComputeCPU(arg);
169 TuneKey tuneKey()
const {
170 return TuneKey(meta.VolString(),
typeid(*this).name(), aux);
173 long long flops()
const {
return 2*arg.threads*480ll; }
174 long long bytes()
const {
return 2*arg.threads*(6*arg.f.Bytes() + arg.clover.Bytes()); }
179 template<
typename Float,
typename Clover,
typename Fmunu>
181 CloverArg<Float,Clover,Fmunu>
arg(clover, f, meta, cloverCoeff);
182 CloverCompute<Float,Clover,Fmunu> cloverCompute(arg, meta, location);
183 cloverCompute.apply(0);
188 template<
typename Float>
191 if (clover.isNative()) {
192 typedef typename clover_mapper<Float>::type
C;
193 computeClover(
C(clover,0), gauge::FloatNOrder<Float,18,2,18>(f), f, cloverCoeff, location);
195 errorQuda(
"Clover field order %d not supported", clover.Order());
198 errorQuda(
"Fmunu field order %d not supported", f.Precision());
206 #ifdef GPU_CLOVER_DIRAC 212 computeClover<double>(
clover, f, cloverCoeff, location);
214 computeClover<float>(
clover, f, cloverCoeff, location);
QudaVerbosity getVerbosity()
Main header file for host and device accessors to CloverFields.
#define qudaDeviceSynchronize()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
clover_mapper< Float, length >::type C
Main header file for host and device accessors to GaugeFields.
std::complex< double > Complex
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
enum QudaFieldLocation_s QudaFieldLocation
colorspinor_mapper< Float, nSpin, nColor >::type F
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)