#include <gauge_field.h>

#include <blas_lapack.h>

#include <staggered_kd_build_xinv.h>

#include <jitify_helper.cuh>
#include <kernels/staggered_kd_build_xinv_kernel.cuh>

namespace quda {
  template <typename Float, int fineColor, int coarseSpin, int coarseColor, typename Arg>
  class CalculateStaggeredKDBlock : public TunableVectorYZ {

    Arg &arg;
    const GaugeField &meta;
    GaugeField &X;
    long long flops() const {
      // only real work is multiplying the mass by two
      return arg.coarseVolumeCB*coarseSpin*coarseColor;
    }
    long long bytes() const
    {
      // 1. meta.Bytes() / 2, since the Kahler-Dirac blocking is a dual decomposition: only
      //    half of the gauge field needs to be loaded
      // 2. storing X, with an extra factor of two since it stores forwards and backwards hops
      // 3. storing the mass contribution
      return meta.Bytes() / 2 + (meta.Bytes() * X.Precision()) / meta.Precision() + 2 * coarseSpin * coarseColor * arg.coarseVolumeCB * X.Precision();
    }
    unsigned int minThreads() const { return arg.fineVolumeCB; }
    bool tuneSharedBytes() const { return false; } // don't tune the shared bytes
    bool tuneGridDim() const { return false; } // don't tune the grid dimension
    bool tuneAuxDim() const { return false; }
  public:
    CalculateStaggeredKDBlock(Arg &arg, const GaugeField &meta, GaugeField &X) :
      TunableVectorYZ(fineColor*fineColor, 2),
      arg(arg),
      meta(meta),
      X(X)
    {
#ifdef JITIFY
      create_jitify_program("kernels/staggered_kd_build_xinv_kernel.cuh");
#endif
      strcpy(aux, compile_type_str(meta));
      strcat(aux, meta.AuxString());
      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());
      strcat(aux,",computeStaggeredKDBlock");
      strcat(aux, (meta.Location()==QUDA_CUDA_FIELD_LOCATION && X.MemType() == QUDA_MEMORY_MAPPED) ? ",GPU-mapped," :
             meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU-device," : ",CPU,");
      strcat(aux,"coarse_vol=");
      strcat(aux,X.VolString());
    }
    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_VERBOSE);

      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
        ComputeStaggeredKDBlockCPU(arg);
      } else {
#ifdef JITIFY
        using namespace jitify::reflection;
        jitify_error = program->kernel("quda::ComputeStaggeredKDBlockGPU")
                         .instantiate(Type<Arg>())
                         .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
#else
        qudaLaunchKernel(ComputeStaggeredKDBlockGPU<Arg>, tp, stream, arg);
#endif
      }
    }
    bool advanceTuneParam(TuneParam &param) const {
      // only do autotuning if we have device fields
      if (X.MemType() == QUDA_MEMORY_DEVICE) return Tunable::advanceTuneParam(param);
      else return false;
    }
    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  };
  /**
     @brief Calculate the staggered Kahler-Dirac block (coarse clover)

     @param X[out] KD block (coarse clover field) accessor
     @param G[in] Fine grid link / gauge field accessor
     @param X_[out] KD block (coarse clover field)
     @param G_[in] Fine gauge field
     @param mass[in] Mass of the staggered fermion
  */
  template<typename Float, int fineColor, int coarseSpin, int coarseColor, typename xGauge, typename fineGauge>
  void calculateStaggeredKDBlock(xGauge &X, fineGauge &G, GaugeField &X_, const GaugeField &G_, double mass)
  {
    // sanity checks
    if (fineColor != 3)
      errorQuda("Input gauge field should have nColor=3, not nColor=%d\n", fineColor);

    if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
    const int nDim = 4;

    if (fineColor * 16 != coarseColor*coarseSpin)
      errorQuda("Fine nColor=%d is not consistent with KD dof %d", fineColor, coarseColor*coarseSpin);
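
    // e.g. for fineColor=3: a 2^4 KD block contains 16 fine sites, so it carries
    // 3*16 = 48 dof, which the coarse field splits as coarseColor=24 x coarseSpin=2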

    int x_size[QUDA_MAX_DIM] = { };
    int xc_size[QUDA_MAX_DIM] = { };
    for (int i = 0; i < nDim; i++) {
      x_size[i] = G_.X()[i];
      xc_size[i] = X_.X()[i];
      // check that local volumes are consistent
      if (2 * xc_size[i] != x_size[i]) {
        errorQuda("Inconsistent fine dimension %d and coarse KD dimension %d", x_size[i], xc_size[i]);
      }
    }
    x_size[4] = xc_size[4] = 1;
    // Calculate X (KD block), which is really just a permutation of the gauge fields w/in a KD block
    using Arg = CalculateStaggeredKDBlockArg<Float,coarseSpin,fineColor,coarseColor,xGauge,fineGauge>;
    Arg arg(X, G, mass, x_size, xc_size);
    CalculateStaggeredKDBlock<Float, fineColor, coarseSpin, coarseColor, Arg> y(arg, G_, X_);
    QudaFieldLocation location = checkLocation(X_, G_);
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Calculating the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
    // We know exactly what the scale should be: the max of all of the (fat) links.
    double max_scale = G_.abs_max();
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Global U_max = %e\n", max_scale);

    if (xGauge::fixedPoint()) {
      // the fixed-point scale must cover the 2*mass diagonal contribution as well as the links
      arg.X.resetScale(max_scale > 2.0*mass ? max_scale : 2.0*mass);
      X_.Scale(max_scale > 2.0*mass ? max_scale : 2.0*mass);
    }
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing KD block\n");
    y.apply(0);

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X_.norm2(0));
  }
  template <typename Float, typename vFloat, int fineColor, int coarseColor, int coarseSpin>
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
    QudaFieldLocation location = X.Location();

    if (location == QUDA_CPU_FIELD_LOCATION) {

      constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;

      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder>;
      using xCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;

      gFine gAccessor(const_cast<GaugeField&>(g));
      xCoarse xAccessor(const_cast<GaugeField&>(X));

      calculateStaggeredKDBlock<Float,fineColor,coarseSpin,coarseColor>(xAccessor, gAccessor, X, g, mass);

    } else {

      constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;

      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder,true,Float>;
      using xCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;

      gFine gAccessor(const_cast<GaugeField&>(g));
      xCoarse xAccessor(const_cast<GaugeField&>(X));

      calculateStaggeredKDBlock<Float,fineColor,coarseSpin,coarseColor>(xAccessor, gAccessor, X, g, mass);
    }
  }
  // template on the number of KD (coarse) degrees of freedom
  template <typename Float, typename vFloat, int fineColor>
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField& g, const double mass)
  {
    constexpr int coarseSpin = 2;
    const int coarseColor = X.Ncolor() / coarseSpin;

    if (coarseColor == 24) { // half the dof w/in a KD-block
      calculateStaggeredKDBlock<Float,vFloat,fineColor,24,coarseSpin>(X, g, mass);
    } else {
      errorQuda("Unsupported number of Kahler-Dirac dof %d\n", X.Ncolor());
    }
  }
  // template on fine colors
  template <typename Float, typename vFloat>
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
    if (g.Ncolor() == 3) {
      calculateStaggeredKDBlock<Float,vFloat,3>(X, g, mass);
    } else {
      errorQuda("Unsupported number of colors %d\n", g.Ncolor());
    }
  }
  // Does the heavy lifting of building X
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
#if defined(GPU_STAGGERED_DIRAC)
    // FIXME remove when done
    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Computing X for StaggeredKD...\n");

    // QUDA_PRECISION is a compile-time bitmask of enabled precisions:
    // 8 = double, 4 = single, 2 = half, 1 = quarter
#if QUDA_PRECISION & 8 && defined(GPU_MULTIGRID_DOUBLE)
    if (X.Precision() == QUDA_DOUBLE_PRECISION) {
      calculateStaggeredKDBlock<double,double>(X, g, mass);
    } else
#endif
#if QUDA_PRECISION & 4
    if (X.Precision() == QUDA_SINGLE_PRECISION) {
      calculateStaggeredKDBlock<float,float>(X, g, mass);
    } else
#endif
#if QUDA_PRECISION & 2
    if (X.Precision() == QUDA_HALF_PRECISION) {
      calculateStaggeredKDBlock<float,short>(X, g, mass);
    } else
#endif
    //#if QUDA_PRECISION & 1
    //if (X.Precision() == QUDA_QUARTER_PRECISION) {
    //  calculateStaggeredKDBlock<float,int8_t>(X, g, mass);
    //} else
    //#endif
    {
      errorQuda("Unsupported precision %d", X.Precision());
    }

    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("....done computing X for StaggeredKD\n");
#else
    errorQuda("Staggered fermion support has not been built");
#endif
  }
  // Calculates the inverse KD block and puts the result in Xinv. Assumes Xinv has been allocated, in MILC data order
  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass)
  {
    using namespace blas_lapack;
    auto invert = use_native() ? native::BatchInvertMatrix : generic::BatchInvertMatrix;
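
    // Note: BatchInvertMatrix inverts a batch of n x n complex matrices, reading from
    // its second argument and writing to its first; here each KD block on a coarse
    // site is one matrix of the batch (see the invert calls below)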

    // Xinv should have a MILC gauge order independent of being on the CPU or GPU
    if (Xinv.FieldOrder() != QUDA_MILC_GAUGE_ORDER)
      errorQuda("Unsupported field order %d", Xinv.FieldOrder());
    QudaPrecision precision = Xinv.Precision();
    QudaFieldLocation location = Xinv.Location();

    // Logic copied from `staggered_coarse_op.cu`
    GaugeField *U = location == QUDA_CUDA_FIELD_LOCATION ? const_cast<cudaGaugeField*>(&gauge) : nullptr;
    if (location == QUDA_CPU_FIELD_LOCATION) {

      // first make a cpu gauge field from the cuda gauge field
      int pad = 0;
      GaugeFieldParam gf_param(gauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, gauge.Geometry());
      gf_param.order = QUDA_QDP_GAUGE_ORDER;
      gf_param.fixed = gauge.GaugeFixed();
      gf_param.link_type = gauge.LinkType();
      gf_param.t_boundary = gauge.TBoundary();
      gf_param.anisotropy = gauge.Anisotropy();
      gf_param.gauge = NULL;
      gf_param.create = QUDA_NULL_FIELD_CREATE;
      gf_param.siteSubset = QUDA_FULL_SITE_SUBSET;
      gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;

      U = new cpuGaugeField(gf_param);

      // copy the cuda gauge field to the cpu
      gauge.saveCPUField(*static_cast<cpuGaugeField*>(U));
    } else if (location == QUDA_CUDA_FIELD_LOCATION) {

      // Reconstruct-none is not strictly necessary, but for now we use it for
      // simplicity, so we can take advantage of fine-grained accessors as in
      // "staggered_coarse_op.cu". Requiring matching precisions is likewise not
      // strictly needed, but they should generally be equal anyway.

      // FIXME: make this work for any gauge precision
      if (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO || gauge.Precision() != precision || gauge.Precision() < QUDA_SINGLE_PRECISION) {
        GaugeFieldParam gf_param(gauge);
        gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
        gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; // guaranteed for no recon
        gf_param.setPrecision( precision == QUDA_DOUBLE_PRECISION ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION );
        U = new cudaGaugeField(gf_param);

        // copy the original gauge field into the reordered field
        U->copy(gauge);
      }
    }
    // Create X based on Xinv, but switch to a native ordering
    GaugeField *X = nullptr;
    GaugeFieldParam x_param(Xinv);
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      x_param.order = QUDA_FLOAT2_GAUGE_ORDER;
      x_param.setPrecision(x_param.Precision());
      X = static_cast<GaugeField*>(new cudaGaugeField(x_param));
    } else {
      X = static_cast<GaugeField*>(new cpuGaugeField(x_param));
    }

    // compute the KD block
    calculateStaggeredKDBlock(*X, *U, mass);
    // Logic copied from `coarse_op_preconditioned.cu`
    const int n = Xinv.Ncolor();
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      // FIXME: add support for double precision inverse
      // Reorder to MILC order for inversion, based on "coarse_op_preconditioned.cu"
      GaugeFieldParam param(Xinv);
      param.order = QUDA_MILC_GAUGE_ORDER; // MILC order == QDP order for Xinv
      param.setPrecision(QUDA_SINGLE_PRECISION); // FIXME until double prec support is added
      cudaGaugeField X_(param);
      cudaGaugeField* Xinv_ = (Xinv.Precision() == QUDA_SINGLE_PRECISION) ? static_cast<cudaGaugeField*>(&Xinv) : new cudaGaugeField(param);

      // reorder X into MILC order for the batched inversion
      X_.copy(*X);

      blas::flops += invert((void*)Xinv_->Gauge_p(), (void*)X_.Gauge_p(), n, X_.Volume(), X_.Precision(), X->Location());

      if (Xinv_ != &Xinv) {
        // rescale to keep a fixed-point Xinv in range, then copy back
        if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale( Xinv_->abs_max() );
        Xinv.copy(*Xinv_);
        delete Xinv_;
      }
    } else if (location == QUDA_CPU_FIELD_LOCATION) {

      if (Xinv.Precision() == QUDA_DOUBLE_PRECISION)
        errorQuda("Unsupported precision %d", Xinv.Precision());

      const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(X);
      cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
      blas::flops += invert((void*)Xinv_h->Gauge_p(), (void*)X_h->Gauge_p(), n, X_h->Volume(), X->Precision(), X->Location());
    }
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Xinv = %e\n", Xinv.norm2(0));

    // cleanup
    delete X;
    if (U != &gauge) delete U;
  }
  // Allocates and calculates the inverse KD block, returning Xinv
  cudaGaugeField* AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, const QudaPrecision override_prec)
  {
    // the KD block lives on the coarse lattice: the fine lattice halved in each dimension
    const int ndim = 4;
    int xc[QUDA_MAX_DIM];
    for (int i = 0; i < ndim; i++) { xc[i] = gauge.X()[i]/2; }
    const int Nc_c = gauge.Ncolor() * 8; // 24
    const int Ns_c = 2; // staggered parity

    GaugeFieldParam gParam;
    memcpy(gParam.x, xc, QUDA_MAX_DIM*sizeof(int));
    gParam.nColor = Nc_c*Ns_c;
    gParam.reconstruct = QUDA_RECONSTRUCT_NO;
    gParam.order = QUDA_MILC_GAUGE_ORDER;
    gParam.link_type = QUDA_COARSE_LINKS;
    gParam.t_boundary = QUDA_PERIODIC_T;
    gParam.create = QUDA_ZERO_FIELD_CREATE;
    gParam.setPrecision( override_prec );
    gParam.nDim = ndim;
    gParam.siteSubset = QUDA_FULL_SITE_SUBSET;
    gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
    gParam.nFace = 0;
    gParam.geometry = QUDA_SCALAR_GEOMETRY;
    gParam.pad = 0;

    cudaGaugeField* Xinv = new cudaGaugeField(gParam);

    BuildStaggeredKahlerDiracInverse(*Xinv, gauge, mass);

    return Xinv;
  }

} // namespace quda
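
/* Example usage (a minimal sketch, not part of the library source; it assumes
   `gauge` is an already-loaded fat-link cudaGaugeField and `mass` is the
   staggered mass):

     cudaGaugeField *Xinv =
       quda::AllocateAndBuildStaggeredKahlerDiracInverse(gauge, mass, QUDA_HALF_PRECISION);
     // ... use Xinv in the Kahler-Dirac preconditioned staggered operator ...
     delete Xinv;
*/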