// NOTE(review): this chunk is an extraction with interior lines omitted — the
// leading integers on each line are the original file's line numbers and they
// skip ranges, so the fragments below are not independently compilable.
//
// FmunuArg constructor: kernel-argument struct for the field-strength
// computation. Records the checkerboarded volume as the thread count, the
// local lattice extents X[], and the halo ("border") depth per direction,
// derived from the difference between the extended and local field extents.
10 #ifdef GPU_GAUGE_TOOLS 12 template<
typename Float,
typename Fmunu,
typename Gauge>
20 FmunuArg(Fmunu &
f, Gauge &gauge,
const GaugeField &meta,
const GaugeField &meta_ex)
21 : threads(meta.VolumeCB()),
f(
f), gauge(gauge) {
22 for (
int dir=0; dir<4; ++dir) {
// local extent in this direction, taken from the non-extended field
23 X[dir] = meta.X()[dir];
// halo depth on each side: half of (extended extent - local extent)
24 border[dir] = (meta_ex.X()[dir] -
X[dir])/2;
// computeFmunuCore: device helper computing one (mu,nu) component of the
// field-strength tensor at checkerboard site `idx` with the given `parity`.
// The coordinates are shifted into the border-extended volume (x += border,
// X += 2*border) so neighbor accesses stay inside the halo. The four
// `int dx[4]` fragments below presumably begin the four plaquette/path
// accumulations entering the clover-leaf average — TODO confirm, the bodies
// are omitted from this extraction. The accumulated matrix F is scaled by
// 1/8 (0.125), and the result is stored at the antisymmetric pair index
// munu_idx = mu*(mu-1)/2 + nu (valid for mu > nu).
29 template <
int mu,
int nu,
typename Float,
typename Arg>
30 __device__ __forceinline__
void computeFmunuCore(Arg &
arg,
int idx,
int parity) {
38 for (
int dir=0; dir<4; ++dir) {
// shift site coordinate into the extended (haloed) coordinate system
39 x[dir] +=
arg.border[dir];
// extended extent includes a border on both sides
40 X[dir] += 2*
arg.border[dir];
47 int dx[4] = {0, 0, 0, 0};
70 int dx[4] = {0, 0, 0, 0};
97 int dx[4] = {0, 0, 0, 0};
124 int dx[4] = {0, 0, 0, 0};
// normalize the accumulated sum; Float-typed cast avoids a double literal
// leaking into single-precision instantiations
160 F *=
static_cast<Float
>(0.125);
// linear index for the antisymmetric (mu,nu) pair, mu > nu:
// (1,0)->0 (2,0)->1 (2,1)->2 (3,0)->3 (3,1)->4 (3,2)->5
164 constexpr
int munu_idx = (
mu*(
mu-1))/2 + nu;
// computeFmunuKernel: GPU entry point. Thread-block x dimension indexes the
// checkerboard site, z dimension indexes the six (mu,nu) component pairs;
// parity is presumably taken from the y dimension (TunableVectorYZ(2,6) in
// the tuning class below) — the line defining `parity` is omitted from this
// extraction, so confirm against the full file. Both indices are
// bounds-checked before dispatching to the statically-templated
// computeFmunuCore<mu,nu> via a switch on mu_nu (switch header omitted).
169 template<
typename Float,
typename Arg>
170 __global__
void computeFmunuKernel(Arg
arg){
171 int x_cb = threadIdx.x + blockIdx.x*
blockDim.x;
173 int mu_nu = threadIdx.z + blockIdx.z*
blockDim.z;
// guard the grid tail: grids rarely divide the site count evenly
174 if (x_cb >=
arg.threads)
return;
// only 6 independent antisymmetric (mu,nu) pairs exist in 4d
175 if (mu_nu >= 6)
return;
178 case 0: computeFmunuCore<1,0,Float>(
arg, x_cb,
parity);
break;
179 case 1: computeFmunuCore<2,0,Float>(
arg, x_cb,
parity);
break;
180 case 2: computeFmunuCore<2,1,Float>(
arg, x_cb,
parity);
break;
181 case 3: computeFmunuCore<3,0,Float>(
arg, x_cb,
parity);
break;
182 case 4: computeFmunuCore<3,1,Float>(
arg, x_cb,
parity);
break;
183 case 5: computeFmunuCore<3,2,Float>(
arg, x_cb,
parity);
break;
// computeFmunuCPU: host-side reference path mirroring the kernel. Loops over
// all checkerboard sites and all mu > nu pairs, maps (mu,nu) to the linear
// pair index mu*(mu-1)/2 + nu, and dispatches to the same templated
// computeFmunuCore as the GPU kernel (switch header and the parity loop are
// omitted from this extraction).
187 template<
typename Float,
typename Arg>
188 void computeFmunuCPU(Arg &
arg) {
190 for (
int x_cb=0; x_cb<
arg.threads; x_cb++) {
191 for (
int mu=0;
mu<4;
mu++) {
// nu < mu: only the 6 independent antisymmetric components
192 for (
int nu=0; nu<
mu; nu++) {
// same pair-index formula as the device path
193 int mu_nu = (
mu*(
mu-1))/2 + nu;
195 case 0: computeFmunuCore<1,0,Float>(
arg, x_cb,
parity);
break;
196 case 1: computeFmunuCore<2,0,Float>(
arg, x_cb,
parity);
break;
197 case 2: computeFmunuCore<2,1,Float>(
arg, x_cb,
parity);
break;
198 case 3: computeFmunuCore<3,0,Float>(
arg, x_cb,
parity);
break;
199 case 4: computeFmunuCore<3,1,Float>(
arg, x_cb,
parity);
break;
200 case 5: computeFmunuCore<3,2,Float>(
arg, x_cb,
parity);
break;
// FmunuCompute: autotuning wrapper (TunableVectorYZ with y=2 parities,
// z=6 component pairs). apply() launches the GPU kernel with the tuned
// launch parameters, or falls back to the CPU loop — the branch on
// `location` is omitted from this extraction. flops()/bytes() report
// per-site cost estimates scaled by the 6 pairs and 2 parities for the
// tuner's performance readout.
209 template<
typename Float,
typename Arg>
210 class FmunuCompute : TunableVectorYZ {
212 const GaugeField &meta;
// one thread per checkerboard site in the x dimension
216 unsigned int minThreads()
const {
return arg.threads; }
// grid dimension is derived from minThreads(), not tuned independently
217 bool tuneGridDim()
const {
return false; }
221 : TunableVectorYZ(2,6),
arg(
arg), meta(meta), location(location) {
// aux string keys the tuning cache on problem size/stride/precision
222 writeAuxString(
"threads=%d,stride=%d,prec=%lu",
arg.threads,meta.Stride(),
sizeof(Float));
224 virtual ~FmunuCompute() {}
226 void apply(
const cudaStream_t &
stream){
// NOTE(review): launch is not followed by a visible error check here —
// confirm the framework checks cudaGetLastError() elsewhere
229 computeFmunuKernel<Float><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
231 computeFmunuCPU<Float>(
arg);
235 TuneKey tuneKey()
const {
236 return TuneKey(meta.VolString(),
typeid(*this).name(), aux);
// estimated floating-point ops per site, times 6 pairs and 2 parities
239 long long flops()
const {
return (2430 + 36)*6*2*(
long long)
arg.threads; }
// estimated traffic: 16 gauge-link loads plus one Fmunu store per site
240 long long bytes()
const {
return ((16*
arg.gauge.Bytes() +
arg.f.Bytes())*6*2*
arg.threads); }
// computeFmunu (accessor-level overload): bundles the field accessors and
// metadata into an FmunuArg and runs the tuned computation. apply(0) uses
// the default stream.
245 template<
typename Float,
typename Fmunu,
typename Gauge>
247 FmunuArg<Float,Fmunu,Gauge>
arg(f_munu, gauge, meta, meta_ex);
248 FmunuCompute<Float,FmunuArg<Float,Fmunu,Gauge> > fmunuCompute(
arg, meta, location);
249 fmunuCompute.apply(0);
// computeFmunu (field-level dispatch): selects concrete accessor types from
// the runtime field properties — the Fmunu output uses a fixed
// FloatNOrder<Float,18,2,18> layout, while the gauge input is dispatched on
// its reconstruction type (NO/12/8). Unsupported reconstruct/order
// combinations abort via errorQuda. The outer precision dispatch
// (double/float) follows after the #endif.
254 template<
typename Float>
257 if (gauge.isNative()) {
// 18 reals = full 3x3 complex matrix, no reconstruction, for the output
258 typedef gauge::FloatNOrder<Float, 18, 2, 18> F;
261 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
262 computeFmunu<Float>(F(Fmunu), G(gauge), Fmunu, gauge, location);
264 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
265 computeFmunu<Float>(F(Fmunu), G(gauge), Fmunu, gauge, location);
267 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
268 computeFmunu<Float>(F(Fmunu), G(gauge), Fmunu, gauge, location);
270 errorQuda(
"Reconstruction type %d not supported", gauge.Reconstruct());
273 errorQuda(
"Gauge field order %d not supported", gauge.Order());
276 errorQuda(
"Fmunu field order %d not supported", Fmunu.Order());
281 #endif // GPU_GAUGE_TOOLS 285 #ifdef GPU_GAUGE_TOOLS 291 computeFmunu<double>(Fmunu, gauge, location);
293 computeFmunu<float>(Fmunu, gauge, location);
300 #endif // GPU_GAUGE_TOOLS
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
QudaVerbosity getVerbosity()
void computeFmunu(GaugeField &Fmunu, const GaugeField &gauge, QudaFieldLocation location)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
int int int enum cudaChannelFormatKind f
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)