3 #include <color_spinor_field.h>
4 #include <gauge_field.h>
6 #include <jitify_helper.cuh>
8 // For naive Kahler-Dirac coarsening
9 #include <kernels/staggered_coarse_op_kernel.cuh>
11 // This define controls which kernels get compiled in `coarse_op.cuh`.
12 // This ensures only kernels relevant for coarsening the staggered
13 // operator get built, saving compile time.
14 #define STAGGEREDCOARSE
15 #include <coarse_op.cuh>
20 @brief dummyClover is a helper function to allow us to create an
21 empty clover object - this allows us to use the the externally
22 linked reduction kernels when we do have a clover field. Taken from
25 inline std::unique_ptr<cudaCloverField> dummyClover()
27 CloverFieldParam cf_param;
30 cf_param.setPrecision(QUDA_SINGLE_PRECISION);
32 for (int i = 0; i < cf_param.nDim; i++) cf_param.x[i] = 0;
34 cf_param.direct = true;
35 cf_param.inverse = true;
36 cf_param.clover = nullptr;
38 cf_param.cloverInv = nullptr;
40 cf_param.create = QUDA_NULL_FIELD_CREATE;
41 cf_param.siteSubset = QUDA_FULL_SITE_SUBSET;
43 // create a dummy cudaCloverField if one is not defined
44 cf_param.order = QUDA_INVALID_CLOVER_ORDER;
45 return std::make_unique<cudaCloverField>(cf_param);
48 template <typename Float, int fineColor, int coarseSpin, int coarseColor, typename Arg>
49 class CalculateStaggeredY : public TunableVectorYZ {
52 const GaugeField &meta;
56 long long flops() const { return arg.coarseVolumeCB*coarseSpin*coarseColor; }
58 long long bytes() const
60 // 2 from forwards / backwards contributions, Y and X are sparse - only needs to write non-zero elements, 2nd term is mass term
61 return meta.Bytes() + (2 * meta.Bytes() * Y.Precision()) / meta.Precision() + 2 * 2 * coarseSpin * coarseColor * arg.coarseVolumeCB * X.Precision();
64 unsigned int minThreads() const { return arg.fineVolumeCB; }
65 bool tuneSharedBytes() const { return false; } // don't tune the grid dimension
66 bool tuneGridDim() const { return false; } // don't tune the grid dimension
67 bool tuneAuxDim() const { return false; }
70 CalculateStaggeredY(Arg &arg, const GaugeField &meta, GaugeField &Y, GaugeField &X) :
71 TunableVectorYZ(fineColor*fineColor, 2),
77 if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
79 create_jitify_program("kernels/staggered_coarse_op_kernel.cuh");
82 strcpy(aux, compile_type_str(meta));
83 strcpy(aux, meta.AuxString());
84 strcat(aux,comm_dim_partitioned_string());
85 if (meta.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());
86 strcat(aux,",computeStaggeredVUV");
87 strcat(aux, (meta.Location()==QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_MAPPED) ? ",GPU-mapped," :
88 meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU-device," : ",CPU,");
89 strcat(aux,"coarse_vol=");
90 strcat(aux,X.VolString());
93 void apply(const qudaStream_t &stream)
95 TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_VERBOSE);
97 if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
98 ComputeStaggeredVUVCPU<Float,fineColor,coarseSpin,coarseColor>(arg);
101 using namespace jitify::reflection;
102 jitify_error = program->kernel("quda::ComputeStaggeredVUVGPU")
103 .instantiate(Type<Float>(),fineColor,coarseSpin,coarseColor,Type<Arg>())
104 .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
106 qudaLaunchKernel(ComputeStaggeredVUVGPU<Float,fineColor,coarseSpin,coarseColor,Arg>, tp, stream, arg);
111 bool advanceTuneParam(TuneParam ¶m) const {
112 // only do autotuning if we have device fields
113 if (meta.Location() == QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_DEVICE) return Tunable::advanceTuneParam(param);
117 TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
121 @brief Calculate the coarse-link field, including the coarse clover field.
123 @param Y[out] Coarse (fat-)link field accessor
124 @param X[out] Coarse clover field accessor
125 @param G[in] Fine grid link / gauge field accessor
126 @param Y_[out] Coarse link field
127 @param X_[out] Coarse clover field
128 @param X_[out] Coarse clover inverese field (used as temporary here)
129 @param v[in] Packed null-space vectors
130 @param G_[in] Fine gauge field
131 @param mass[in] Kappa parameter
132 @param matpc[in] The type of preconditioning of the source fine-grid operator
134 template<typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
135 typename coarseGauge, typename fineGauge>
136 void calculateStaggeredY(coarseGauge &Y, coarseGauge &X, fineGauge &G, GaugeField &Y_, GaugeField &X_,
137 const GaugeField &G_, double mass, QudaDiracType dirac, QudaMatPCType matpc)
140 if (matpc == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc == QUDA_MATPC_ODD_ODD_ASYMMETRIC)
141 errorQuda("Unsupported coarsening of matpc = %d", matpc);
143 // This is the last time we use fineSpin, since this file only coarsens
144 // staggered-type ops, not wilson-type AND coarse-type.
146 errorQuda("Input Dirac operator %d should have nSpin=1, not nSpin=%d\n", dirac, fineSpin);
148 errorQuda("Input Dirac operator %d should have nColor=3, not nColor=%d\n", dirac, fineColor);
150 if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
153 int x_size[QUDA_MAX_DIM] = { };
154 for (int i=0; i<4; i++) x_size[i] = G_.X()[i];
157 int xc_size[QUDA_MAX_DIM] = { };
158 for (int i=0; i<4; i++) xc_size[i] = X_.X()[i];
161 int geo_bs[QUDA_MAX_DIM] = { };
162 for(int d = 0; d < nDim; d++) geo_bs[d] = x_size[d]/xc_size[d];
163 int spin_bs = 0; // 0 -> spin-less types.
165 // Calculate VUV in one pass (due to KD-transform) for each dimension,
166 // accumulating directly into the coarse gauge field Y
168 using Arg = CalculateStaggeredYArg<Float,coarseSpin,fineColor,coarseColor,coarseGauge,fineGauge>;
169 Arg arg(Y, X, G, mass, x_size, xc_size, geo_bs, spin_bs);
170 CalculateStaggeredY<Float, fineColor, coarseSpin, coarseColor, Arg> y(arg, G_, Y_, X_);
172 QudaFieldLocation location = checkLocation(Y_, X_, G_);
173 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Running link coarsening on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
175 // We know exactly what the scale should be: the max of all of the (fat) links.
176 double max_scale = G_.abs_max();
177 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Global U_max = %e\n", max_scale);
179 if (coarseGauge::fixedPoint()) {
180 arg.Y.resetScale(max_scale);
181 arg.X.resetScale(max_scale > 2.0*mass ? max_scale : 2.0*mass); // To be safe
183 X_.Scale(max_scale > 2.0*mass ? max_scale : 2.0*mass); // To be safe
186 // We can technically do a uni-directional build, but becauase
187 // the coarse link builds are just permutations plus lots of zeros,
188 // it's faster to skip the flip!
190 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing VUV\n");
193 if (getVerbosity() >= QUDA_VERBOSE) {
194 for (int d = 0; d < nDim; d++) printfQuda("Y2[%d] = %e\n", 4+d, Y_.norm2( 4+d ));
195 for (int d = 0; d < nDim; d++) printfQuda("Y2[%d] = %e\n", d, Y_.norm2( d ));
198 if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X_.norm2(0));
201 template <typename Float, typename vFloat, int fineColor, int fineSpin, int coarseColor, int coarseSpin>
202 void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
203 double mass, QudaDiracType dirac, QudaMatPCType matpc)
205 QudaFieldLocation location = Y.Location();
207 if (location == QUDA_CPU_FIELD_LOCATION) {
209 constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
210 constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;
212 if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
213 errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
214 if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());
216 using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder>;
217 using gCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;
219 gFine gAccessor(const_cast<GaugeField&>(g));
220 gCoarse yAccessor(const_cast<GaugeField&>(Y));
221 gCoarse xAccessor(const_cast<GaugeField&>(X));
223 calculateStaggeredY<Float,fineSpin,fineColor,coarseSpin,coarseColor>
224 (yAccessor, xAccessor, gAccessor, Y, X, g, mass, dirac, matpc);
227 constexpr QudaFieldOrder csOrder = QUDA_FLOAT2_FIELD_ORDER;
228 constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;
230 if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
231 errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
232 if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());
234 using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder,true,Float>;
235 using gCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;
237 gFine gAccessor(const_cast<GaugeField&>(g));
238 gCoarse yAccessor(const_cast<GaugeField&>(Y));
239 gCoarse xAccessor(const_cast<GaugeField&>(X));
241 calculateStaggeredY<Float,fineSpin,fineColor,coarseSpin,coarseColor>
242 (yAccessor, xAccessor, gAccessor, Y, X, g, mass, dirac, matpc);
247 template <typename Float, typename vFloat, typename vFloatXinv, int fineColor, int fineSpin, int xinvColor, int xinvSpin, int coarseColor, int coarseSpin>
248 void aggregateStaggeredY(GaugeField &Y, GaugeField &X,
249 const Transfer &T, const GaugeField &g, const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
251 // Actually create the temporaries like UV, etc.
252 auto location = Y.Location();
254 //Create a field UV which holds U*V. Has the same structure as V,
255 // No need to double spin for the staggered operator, will need to double it for the K-D op.
256 ColorSpinorParam UVparam(T.Vectors(location));
257 UVparam.create = QUDA_ZERO_FIELD_CREATE;
258 UVparam.location = location;
259 UVparam.setPrecision(T.Vectors(location).Precision());
260 UVparam.mem_type = Y.MemType(); // allocate temporaries to match coarse-grid link field
262 ColorSpinorField *uv = ColorSpinorField::Create(UVparam);
264 ColorSpinorField *av = (dirac == QUDA_STAGGEREDKD_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) ? ColorSpinorField::Create(UVparam) : &const_cast<ColorSpinorField&>(T.Vectors(location));
266 GaugeField *Yatomic = &Y;
267 GaugeField *Xatomic = &X;
269 if (Y.Precision() < QUDA_SINGLE_PRECISION) {
270 // we need to coarsen into single precision fields (float or int), so we allocate temporaries for this purpose
271 // else we can just coarsen directly into the original fields
272 GaugeFieldParam param(X); // use X since we want scalar geometry
273 param.location = location;
274 param.setPrecision(QUDA_SINGLE_PRECISION, location == QUDA_CUDA_FIELD_LOCATION ? true : false);
276 Yatomic = GaugeField::Create(param);
277 Xatomic = GaugeField::Create(param);
280 // Moving along to the build
282 const double kappa = -1.; // cancels a minus sign factor for kappa w/in the dslash application
283 const double mu_dummy = 0.;
284 const double mu_factor_dummy = 0.;
286 bool need_bidirectional = false;
287 if (dirac == QUDA_STAGGEREDKD_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) need_bidirectional = true;
289 constexpr QudaGaugeFieldOrder xinvOrder = QUDA_MILC_GAUGE_ORDER;
290 if (XinvKD.FieldOrder() != xinvOrder && XinvKD.Bytes()) errorQuda("unsupported field order %d\n", XinvKD.FieldOrder());
292 if (Y.Location() == QUDA_CPU_FIELD_LOCATION) {
294 constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
295 constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;
297 if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
298 errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
299 if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());
301 using V = typename colorspinor::FieldOrderCB<Float,fineSpin,fineColor,coarseColor,csOrder,vFloat>;
302 using F = typename colorspinor::FieldOrderCB<Float,fineSpin,fineColor,coarseColor,csOrder,vFloat>; // will need 2x the spin components for the KD op
303 using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder>;
304 using xinvFine = typename gauge::FieldOrder<typename mapper<vFloatXinv>::type,xinvColor,xinvSpin,xinvOrder,true,vFloatXinv>;
305 using gCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;
306 using gCoarseAtomic = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,storeType>;
308 const ColorSpinorField &v = T.Vectors(Y.Location());
310 V vAccessor(const_cast<ColorSpinorField&>(v));
311 V uvAccessor(*uv); // will need 2x the spin components for the KD op
313 gFine gAccessor(const_cast<GaugeField&>(g));
314 xinvFine xinvAccessor(const_cast<GaugeField&>(XinvKD));
315 gCoarse yAccessor(const_cast<GaugeField&>(Y));
316 gCoarse xAccessor(const_cast<GaugeField&>(X));
317 gCoarseAtomic yAccessorAtomic(*Yatomic);
318 gCoarseAtomic xAccessorAtomic(*Xatomic);
320 // the repeated xinvAccessor is intentional
321 calculateY<QUDA_CPU_FIELD_LOCATION, false, Float, fineSpin, fineColor, coarseSpin, coarseColor>(
322 yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor, avAccessor, vAccessor, gAccessor, xinvAccessor, xinvAccessor,
323 Y, X, *Yatomic, *Xatomic, *uv, *av, v, g, *dummyClover(), kappa, mass, mu_dummy,
324 mu_factor_dummy, dirac, matpc, need_bidirectional, T.fineToCoarse(Y.Location()), T.coarseToFine(Y.Location()));
327 constexpr QudaFieldOrder csOrder = QUDA_FLOAT2_FIELD_ORDER;
328 constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;
330 if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
331 errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
332 if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());
334 using V = typename colorspinor::FieldOrderCB<Float, fineSpin, fineColor, coarseColor, csOrder, vFloat, vFloat, false, false>;
335 using F = typename colorspinor::FieldOrderCB<Float, fineSpin, fineColor, coarseColor, csOrder, vFloat, vFloat, false, false>; // will need 2x the spin components for the KD op
336 using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder,true,Float>;
337 using xinvFine = typename gauge::FieldOrder<typename mapper<vFloatXinv>::type,xinvColor,xinvSpin,xinvOrder,true,vFloatXinv>;
338 using gCoarse = typename gauge::FieldOrder<Float, coarseColor * coarseSpin, coarseSpin, gOrder, true, vFloat>;
339 using gCoarseAtomic = typename gauge::FieldOrder<Float, coarseColor * coarseSpin, coarseSpin, gOrder, true, storeType>;
341 const ColorSpinorField &v = T.Vectors(Y.Location());
343 V vAccessor(const_cast<ColorSpinorField &>(v));
344 V uvAccessor(*uv); // will need 2x the spin components for the KD op
346 gFine gAccessor(const_cast<GaugeField &>(g));
347 xinvFine xinvAccessor(const_cast<GaugeField&>(XinvKD));
348 gCoarse yAccessor(const_cast<GaugeField &>(Y));
349 gCoarse xAccessor(const_cast<GaugeField &>(X));
350 gCoarseAtomic yAccessorAtomic(*Yatomic);
351 gCoarseAtomic xAccessorAtomic(*Xatomic);
353 // create a dummy clover field to allow us to call the external clover reduction routines elsewhere
354 calculateY<QUDA_CUDA_FIELD_LOCATION, false, Float, fineSpin, fineColor, coarseSpin, coarseColor>(
355 yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor, avAccessor, vAccessor, gAccessor,
356 xinvAccessor, xinvAccessor, Y, X, *Yatomic, *Xatomic, *uv, *av, v, g, *dummyClover(),
357 kappa, mass, mu_dummy, mu_factor_dummy, dirac, matpc, need_bidirectional, T.fineToCoarse(Y.Location()),
358 T.coarseToFine(Y.Location()));
362 if (Yatomic != &Y) delete Yatomic;
363 if (Xatomic != &X) delete Xatomic;
365 if (av != nullptr && &T.Vectors(location) != av) delete av;
366 if (uv != nullptr) delete uv;
370 // template on precision of XinvKD
371 template <typename Float, typename vFloat, int fineColor, int fineSpin, int coarseColor, int coarseSpin>
372 void aggregateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
373 const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
375 if (XinvKD.Ncolor() != 16*fineColor)
376 errorQuda("Invalid Nc %d", XinvKD.Ncolor());
378 constexpr int xinvColor = 16 * fineColor;
379 constexpr int xinvSpin = 2;
381 if (XinvKD.Precision() == QUDA_SINGLE_PRECISION) {
382 aggregateStaggeredY<Float,vFloat,float,fineColor,fineSpin,xinvColor,xinvSpin,coarseColor,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
383 //} else if (XinvKD.Precision() == QUDA_HALF_PRECISION) { --- unnecessary until we add KD coarsening support
384 // aggregateStaggeredY<Float,vFloat,short,fineColor,fineSpin,xinvColor,xinvSpin,coarseColor,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
386 errorQuda("Unsupported precision %d", XinvKD.Precision());
391 // template on the number of coarse degrees of freedom, branch between naive K-D
392 // and actual aggregation
393 template <typename Float, typename vFloat, int fineColor, int fineSpin>
394 void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
395 const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
397 const int coarseSpin = 2;
398 const int coarseColor = Y.Ncolor() / coarseSpin;
400 if (coarseColor == 24) {
401 if (T.getTransferType() == QUDA_TRANSFER_COARSE_KD)
402 // the permutation routines don't need Yatomic, Xatomic, uv, av
403 calculateStaggeredY<Float,vFloat,fineColor,fineSpin,24,coarseSpin>(Y, X, T, g, mass, dirac, matpc);
405 // free field aggregation
406 aggregateStaggeredY<Float,vFloat,fineColor,fineSpin,24,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
408 } else if (coarseColor == 64) {
409 aggregateStaggeredY<Float,vFloat,fineColor,fineSpin,64,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
410 } else { // note --- may revisit 3 -> 96 in the future
411 errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor());
415 // template on fine spin
416 template <typename Float, typename vFloat, int fineColor>
417 void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
418 const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
420 if (T.Vectors().Nspin() == 1) {
421 calculateStaggeredY<Float,vFloat,fineColor,1>(Y, X, T, g, XinvKD, mass, dirac, matpc);
423 errorQuda("Unsupported number of spins %d\n", T.Vectors(X.Location()).Nspin());
427 // template on fine colors
428 template <typename Float, typename vFloat>
429 void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
430 const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
432 if (g.Ncolor() == 3) {
433 calculateStaggeredY<Float,vFloat,3>(Y, X, T, g, XinvKD, mass, dirac, matpc);
435 errorQuda("Unsupported number of colors %d\n", g.Ncolor());
439 //Does the heavy lifting of creating the coarse color matrices Y
440 void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
441 const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
443 #if defined(GPU_MULTIGRID) && defined(GPU_STAGGERED_DIRAC)
444 checkPrecision(T.Vectors(X.Location()), X, Y);
446 if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Computing Y field......\n");
448 if (Y.Precision() == QUDA_DOUBLE_PRECISION) {
449 #ifdef GPU_MULTIGRID_DOUBLE
450 if (T.Vectors(X.Location()).Precision() == QUDA_DOUBLE_PRECISION) {
451 calculateStaggeredY<double,double>(Y, X, T, g, XinvKD, mass, dirac, matpc);
453 errorQuda("Unsupported precision %d\n", Y.Precision());
456 errorQuda("Double precision multigrid has not been enabled");
459 #if QUDA_PRECISION & 4
460 if (Y.Precision() == QUDA_SINGLE_PRECISION) {
461 if (T.Vectors(X.Location()).Precision() == QUDA_SINGLE_PRECISION) {
462 calculateStaggeredY<float,float>(Y, X, T, g, XinvKD, mass, dirac, matpc);
464 errorQuda("Unsupported precision %d\n", T.Vectors(X.Location()).Precision());
468 #if QUDA_PRECISION & 2
469 if (Y.Precision() == QUDA_HALF_PRECISION) {
470 if (T.Vectors(X.Location()).Precision() == QUDA_HALF_PRECISION) {
471 calculateStaggeredY<float,short>(Y, X, T, g, XinvKD, mass, dirac, matpc);
473 errorQuda("Unsupported precision %d\n", T.Vectors(X.Location()).Precision());
478 errorQuda("Unsupported precision %d\n", Y.Precision());
480 if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("....done computing Y field\n");
482 errorQuda("Staggered multigrid has not been built");
486 //Calculates the coarse color matrix and puts the result in Y.
487 //N.B. Assumes Y, X have been allocated.
488 void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
489 const cudaGaugeField* XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
491 QudaPrecision precision = Y.Precision();
492 QudaFieldLocation location = checkLocation(Y, X);
494 GaugeField *U = location == QUDA_CUDA_FIELD_LOCATION ? const_cast<cudaGaugeField*>(&gauge) : nullptr;
495 GaugeField *Xinv = location == QUDA_CUDA_FIELD_LOCATION ? const_cast<cudaGaugeField*>(XinvKD) : nullptr;
497 if (location == QUDA_CPU_FIELD_LOCATION) {
498 //First make a cpu gauge field from the cuda gauge field
500 GaugeFieldParam gf_param(gauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, gauge.Geometry());
501 gf_param.order = QUDA_QDP_GAUGE_ORDER;
502 gf_param.fixed = gauge.GaugeFixed();
503 gf_param.link_type = gauge.LinkType();
504 gf_param.t_boundary = gauge.TBoundary();
505 gf_param.anisotropy = gauge.Anisotropy();
506 gf_param.gauge = NULL;
507 gf_param.create = QUDA_NULL_FIELD_CREATE;
508 gf_param.siteSubset = QUDA_FULL_SITE_SUBSET;
510 gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
512 U = new cpuGaugeField(gf_param);
514 //Copy the cuda gauge field to the cpu
515 gauge.saveCPUField(*static_cast<cpuGaugeField*>(U));
516 } else if (location == QUDA_CUDA_FIELD_LOCATION && gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) {
517 //Create a copy of the gauge field with no reconstruction, required for fine-grained access
518 GaugeFieldParam gf_param(gauge);
519 gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
520 gf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
521 gf_param.setPrecision(gf_param.Precision());
522 U = new cudaGaugeField(gf_param);
527 // Create an Xinv in the spirit of what's done in coarse_op.cu
529 GaugeFieldParam xinvParam;
531 auto precision_xinv = XinvKD ? XinvKD->Precision() : QUDA_SINGLE_PRECISION;
532 if (precision_xinv < QUDA_HALF_PRECISION) {
533 precision_xinv = QUDA_HALF_PRECISION;
534 } else if (precision_xinv > QUDA_SINGLE_PRECISION) {
535 precision_xinv = QUDA_SINGLE_PRECISION;
537 xinvParam.setPrecision( precision_xinv );
538 for (int i = 0; i < xinvParam.nDim; i++) xinvParam.x[i] = XinvKD ? XinvKD->X()[i] : 0;
539 xinvParam.nColor = XinvKD ? XinvKD->Ncolor() : 16 * U->Ncolor();
540 xinvParam.reconstruct = QUDA_RECONSTRUCT_NO;
541 xinvParam.order = QUDA_MILC_GAUGE_ORDER;
542 xinvParam.link_type = QUDA_COARSE_LINKS;
543 xinvParam.t_boundary = QUDA_PERIODIC_T;
544 xinvParam.create = QUDA_NULL_FIELD_CREATE;
545 xinvParam.siteSubset = QUDA_FULL_SITE_SUBSET;
546 xinvParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
548 xinvParam.geometry = QUDA_SCALAR_GEOMETRY;
551 if (location == QUDA_CUDA_FIELD_LOCATION && !XinvKD) {
552 // create a dummy GaugeField if one is not defined
553 xinvParam.order = QUDA_INVALID_GAUGE_ORDER;
554 Xinv = new cudaGaugeField(xinvParam);
557 } else if (location == QUDA_CPU_FIELD_LOCATION) {
558 // Create a cpuGaugeField from the cudaGaugeField
559 Xinv = new cpuGaugeField(xinvParam);
561 if (XinvKD) XinvKD->saveCPUField(*static_cast<cpuGaugeField*>(Xinv));
564 calculateStaggeredY(Y, X, T, *U, *Xinv, mass, dirac, matpc);
566 if (U != &gauge) delete U;
567 if (Xinv != XinvKD) delete Xinv;