// staggered_coarse_op.cu
#include <tune_quda.h>
#include <transfer.h>
#include <color_spinor_field.h>
#include <gauge_field.h>

#include <jitify_helper.cuh>

// For naive Kahler-Dirac coarsening
#include <kernels/staggered_coarse_op_kernel.cuh>

// This define controls which kernels get compiled in `coarse_op.cuh`.
// This ensures only kernels relevant for coarsening the staggered
// operator get built, saving compile time.
#define STAGGEREDCOARSE
#include <coarse_op.cuh>
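
// For orientation: a minimal sketch (assumed, not the verbatim contents of
// coarse_op.cuh) of the guard pattern the define above relies on. Kernel
// instantiations irrelevant to staggered coarsening are expected to be
// fenced off roughly like
//
//   #ifndef STAGGEREDCOARSE
//     // instantiate Wilson / twisted-mass / coarse-coarse VUV kernels ...
//   #endif
//
// so defining STAGGEREDCOARSE before the include skips those template builds.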

namespace quda {

  /**
     @brief dummyClover is a helper function to allow us to create an
     empty clover object - this allows us to use the externally
     linked reduction kernels even when we don't have a clover field.
     Taken from coarsecoarse_op.cu.
  */
  inline std::unique_ptr<cudaCloverField> dummyClover()
  {
    CloverFieldParam cf_param;
    cf_param.nDim = 4;
    cf_param.pad = 0;
    cf_param.setPrecision(QUDA_SINGLE_PRECISION);

    for (int i = 0; i < cf_param.nDim; i++) cf_param.x[i] = 0;

    cf_param.direct = true;
    cf_param.inverse = true;
    cf_param.clover = nullptr;
    cf_param.norm = 0;
    cf_param.cloverInv = nullptr;
    cf_param.invNorm = 0;
    cf_param.create = QUDA_NULL_FIELD_CREATE;
    cf_param.siteSubset = QUDA_FULL_SITE_SUBSET;

    // create a dummy cudaCloverField if one is not defined
    cf_param.order = QUDA_INVALID_CLOVER_ORDER;
    return std::make_unique<cudaCloverField>(cf_param);
  }
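
  // Usage note (descriptive, based on the calculateY calls below): passing
  // *dummyClover() hands calculateY a reference to a temporary empty clover
  // field that lives only for the duration of the call, which satisfies the
  // clover arguments of the externally linked reduction kernels without
  // allocating real clover data.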

  template <typename Float, int fineColor, int coarseSpin, int coarseColor, typename Arg>
  class CalculateStaggeredY : public TunableVectorYZ {

    Arg &arg;
    const GaugeField &meta;
    GaugeField &Y;
    GaugeField &X;

    long long flops() const { return arg.coarseVolumeCB*coarseSpin*coarseColor; }

    long long bytes() const
    {
      // 2 from forwards / backwards contributions, Y and X are sparse - only needs to write non-zero elements, 2nd term is mass term
      return meta.Bytes() + (2 * meta.Bytes() * Y.Precision()) / meta.Precision() + 2 * 2 * coarseSpin * coarseColor * arg.coarseVolumeCB * X.Precision();
    }
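
    // Term-by-term reading of the estimate above (descriptive, assuming
    // coarseVolumeCB is the checkerboard volume): one read of the fine links
    // (meta.Bytes()), forwards plus backwards writes of the sparse coarse
    // links rescaled to Y's precision, and the mass term on X: complex (2)
    // times both parities (2) times coarseSpin * coarseColor diagonal
    // elements per coarse site.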

    unsigned int minThreads() const { return arg.fineVolumeCB; }
    bool tuneSharedBytes() const { return false; } // don't tune shared memory usage
    bool tuneGridDim() const { return false; } // don't tune the grid dimension
    bool tuneAuxDim() const { return false; }

  public:
    CalculateStaggeredY(Arg &arg, const GaugeField &meta, GaugeField &Y, GaugeField &X) :
      TunableVectorYZ(fineColor*fineColor, 2),
      arg(arg),
      meta(meta),
      Y(Y),
      X(X)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
#ifdef JITIFY
        create_jitify_program("kernels/staggered_coarse_op_kernel.cuh");
#endif
      }
      strcpy(aux, compile_type_str(meta));
      strcat(aux, meta.AuxString());
      strcat(aux, comm_dim_partitioned_string());
      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());
      strcat(aux, ",computeStaggeredVUV");
      strcat(aux, (meta.Location()==QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_MAPPED) ? ",GPU-mapped," :
             meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU-device," : ",CPU,");
      strcat(aux, "coarse_vol=");
      strcat(aux, X.VolString());
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_VERBOSE);

      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
        ComputeStaggeredVUVCPU<Float,fineColor,coarseSpin,coarseColor>(arg);
      } else {
#ifdef JITIFY
        using namespace jitify::reflection;
        jitify_error = program->kernel("quda::ComputeStaggeredVUVGPU")
          .instantiate(Type<Float>(),fineColor,coarseSpin,coarseColor,Type<Arg>())
          .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
#else // not JITIFY
        qudaLaunchKernel(ComputeStaggeredVUVGPU<Float,fineColor,coarseSpin,coarseColor,Arg>, tp, stream, arg);
#endif // JITIFY
      }
    }

    bool advanceTuneParam(TuneParam &param) const {
      // only do autotuning if we have device fields
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION && Y.MemType() == QUDA_MEMORY_DEVICE) return Tunable::advanceTuneParam(param);
      else return false;
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  };

  /**
     @brief Calculate the coarse-link field, including the coarse clover field.

     @param Y[out] Coarse (fat-)link field accessor
     @param X[out] Coarse clover field accessor
     @param G[in] Fine grid link / gauge field accessor
     @param Y_[out] Coarse link field
     @param X_[out] Coarse clover field
     @param G_[in] Fine gauge field
     @param mass[in] Mass parameter
     @param dirac[in] Fine-grid Dirac operator type
     @param matpc[in] The type of preconditioning of the source fine-grid operator
  */
  template<typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
           typename coarseGauge, typename fineGauge>
  void calculateStaggeredY(coarseGauge &Y, coarseGauge &X, fineGauge &G, GaugeField &Y_, GaugeField &X_,
                           const GaugeField &G_, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    // sanity checks
    if (matpc == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpc == QUDA_MATPC_ODD_ODD_ASYMMETRIC)
      errorQuda("Unsupported coarsening of matpc = %d", matpc);

    // This is the last time we use fineSpin, since this file only coarsens
    // staggered-type ops, not Wilson-type or coarse-type ones.
    if (fineSpin != 1)
      errorQuda("Input Dirac operator %d should have nSpin=1, not nSpin=%d\n", dirac, fineSpin);
    if (fineColor != 3)
      errorQuda("Input Dirac operator %d should have nColor=3, not nColor=%d\n", dirac, fineColor);

    if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
    const int nDim = 4;

    int x_size[QUDA_MAX_DIM] = { };
    for (int i = 0; i < 4; i++) x_size[i] = G_.X()[i];
    x_size[4] = 1;

    int xc_size[QUDA_MAX_DIM] = { };
    for (int i = 0; i < 4; i++) xc_size[i] = X_.X()[i];
    xc_size[4] = 1;

    int geo_bs[QUDA_MAX_DIM] = { };
    for (int d = 0; d < nDim; d++) geo_bs[d] = x_size[d]/xc_size[d];
    int spin_bs = 0; // 0 -> spin-less types.
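
    // Worked example (illustrative numbers): coarsening a 16^4 fine lattice
    // onto a 4^4 coarse lattice gives geo_bs[d] = 16/4 = 4 in each dimension,
    // i.e. aggregates of 4^4 = 256 fine sites per coarse site.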

    // Calculate VUV in one pass (due to KD-transform) for each dimension,
    // accumulating directly into the coarse gauge field Y

    using Arg = CalculateStaggeredYArg<Float,coarseSpin,fineColor,coarseColor,coarseGauge,fineGauge>;
    Arg arg(Y, X, G, mass, x_size, xc_size, geo_bs, spin_bs);
    CalculateStaggeredY<Float, fineColor, coarseSpin, coarseColor, Arg> y(arg, G_, Y_, X_);

    QudaFieldLocation location = checkLocation(Y_, X_, G_);
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Running link coarsening on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");

    // We know exactly what the scale should be: the max of all of the (fat) links.
    double max_scale = G_.abs_max();
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Global U_max = %e\n", max_scale);

    if (coarseGauge::fixedPoint()) {
      arg.Y.resetScale(max_scale);
      arg.X.resetScale(max_scale > 2.0*mass ? max_scale : 2.0*mass); // To be safe
      Y_.Scale(max_scale);
      X_.Scale(max_scale > 2.0*mass ? max_scale : 2.0*mass); // To be safe
    }
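
    // Illustrative numbers for the fixed-point scales above: with U_max = 1.0
    // and mass = 0.1 the X scale stays at max(1.0, 0.2) = 1.0, while a heavy
    // mass = 1.0 would bump it to max(1.0, 2.0) = 2.0, so the 2*mass diagonal
    // entries of X cannot overflow the fixed-point range.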

    // We can technically do a uni-directional build, but because
    // the coarse link builds are just permutations plus lots of zeros,
    // it's faster to skip the flip!

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing VUV\n");
    y.apply(0);

    if (getVerbosity() >= QUDA_VERBOSE) {
      for (int d = 0; d < nDim; d++) printfQuda("Y2[%d] = %e\n", 4+d, Y_.norm2( 4+d ));
      for (int d = 0; d < nDim; d++) printfQuda("Y2[%d] = %e\n", d, Y_.norm2( d ));
    }

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X_.norm2(0));
  }

  template <typename Float, typename vFloat, int fineColor, int fineSpin, int coarseColor, int coarseSpin>
  void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
                           double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    QudaFieldLocation location = Y.Location();

    if (location == QUDA_CPU_FIELD_LOCATION) {

      constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
      constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;

      if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
        errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder>;
      using gCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;

      gFine gAccessor(const_cast<GaugeField&>(g));
      gCoarse yAccessor(const_cast<GaugeField&>(Y));
      gCoarse xAccessor(const_cast<GaugeField&>(X));

      calculateStaggeredY<Float,fineSpin,fineColor,coarseSpin,coarseColor>
        (yAccessor, xAccessor, gAccessor, Y, X, g, mass, dirac, matpc);
    } else {

      constexpr QudaFieldOrder csOrder = QUDA_FLOAT2_FIELD_ORDER;
      constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;

      if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
        errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder,true,Float>;
      using gCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;

      gFine gAccessor(const_cast<GaugeField&>(g));
      gCoarse yAccessor(const_cast<GaugeField&>(Y));
      gCoarse xAccessor(const_cast<GaugeField&>(X));

      calculateStaggeredY<Float,fineSpin,fineColor,coarseSpin,coarseColor>
        (yAccessor, xAccessor, gAccessor, Y, X, g, mass, dirac, matpc);
    }
  }

  template <typename Float, typename vFloat, typename vFloatXinv, int fineColor, int fineSpin, int xinvColor, int xinvSpin, int coarseColor, int coarseSpin>
  void aggregateStaggeredY(GaugeField &Y, GaugeField &X,
                           const Transfer &T, const GaugeField &g, const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    // Actually create the temporaries like UV, etc.
    auto location = Y.Location();

    // Create a field UV which holds U*V. It has the same structure as V;
    // no need to double the spin for the staggered operator, though we will
    // need to double it for the K-D op.
    ColorSpinorParam UVparam(T.Vectors(location));
    UVparam.create = QUDA_ZERO_FIELD_CREATE;
    UVparam.location = location;
    UVparam.setPrecision(T.Vectors(location).Precision());
    UVparam.mem_type = Y.MemType(); // allocate temporaries to match coarse-grid link field

    ColorSpinorField *uv = ColorSpinorField::Create(UVparam);

    ColorSpinorField *av = (dirac == QUDA_STAGGEREDKD_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) ? ColorSpinorField::Create(UVparam) : &const_cast<ColorSpinorField&>(T.Vectors(location));
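
    // Descriptive note: av is only a distinct temporary for the Kahler-Dirac
    // operators (where the null vectors get a separate transformed copy); for
    // plain staggered/asqtad it simply aliases T.Vectors(location), which is
    // why the cleanup below only deletes av when it differs from the transfer
    // operator's vectors.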

    GaugeField *Yatomic = &Y;
    GaugeField *Xatomic = &X;

    if (Y.Precision() < QUDA_SINGLE_PRECISION) {
      // we need to coarsen into single precision fields (float or int), so we allocate temporaries for this purpose
      // else we can just coarsen directly into the original fields
      GaugeFieldParam param(X); // use X since we want scalar geometry
      param.location = location;
      param.setPrecision(QUDA_SINGLE_PRECISION, location == QUDA_CUDA_FIELD_LOCATION);

      Yatomic = GaugeField::Create(param);
      Xatomic = GaugeField::Create(param);
    }
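
    // This presumably mirrors the gCoarseAtomic accessors below, whose
    // storeType-based accumulation needs at least single-precision (float or
    // int) storage; half-precision fields cannot be accumulated into directly.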

    // Moving along to the build

    const double kappa = -1.; // cancels a minus sign factor for kappa within the dslash application
    const double mu_dummy = 0.;
    const double mu_factor_dummy = 0.;

    bool need_bidirectional = false;
    if (dirac == QUDA_STAGGEREDKD_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) need_bidirectional = true;

    constexpr QudaGaugeFieldOrder xinvOrder = QUDA_MILC_GAUGE_ORDER;
    if (XinvKD.FieldOrder() != xinvOrder && XinvKD.Bytes()) errorQuda("unsupported field order %d\n", XinvKD.FieldOrder());

    if (Y.Location() == QUDA_CPU_FIELD_LOCATION) {

      constexpr QudaFieldOrder csOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
      constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;

      if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
        errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using V = typename colorspinor::FieldOrderCB<Float,fineSpin,fineColor,coarseColor,csOrder,vFloat>;
      using F = typename colorspinor::FieldOrderCB<Float,fineSpin,fineColor,coarseColor,csOrder,vFloat>; // will need 2x the spin components for the KD op
      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder>;
      using xinvFine = typename gauge::FieldOrder<typename mapper<vFloatXinv>::type,xinvColor,xinvSpin,xinvOrder,true,vFloatXinv>;
      using gCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;
      using gCoarseAtomic = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,storeType>;

      const ColorSpinorField &v = T.Vectors(Y.Location());

      V vAccessor(const_cast<ColorSpinorField&>(v));
      V uvAccessor(*uv); // will need 2x the spin components for the KD op
      F avAccessor(*av);
      gFine gAccessor(const_cast<GaugeField&>(g));
      xinvFine xinvAccessor(const_cast<GaugeField&>(XinvKD));
      gCoarse yAccessor(const_cast<GaugeField&>(Y));
      gCoarse xAccessor(const_cast<GaugeField&>(X));
      gCoarseAtomic yAccessorAtomic(*Yatomic);
      gCoarseAtomic xAccessorAtomic(*Xatomic);

      // the repeated xinvAccessor is intentional
      calculateY<QUDA_CPU_FIELD_LOCATION, false, Float, fineSpin, fineColor, coarseSpin, coarseColor>(
        yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor, avAccessor, vAccessor, gAccessor, xinvAccessor, xinvAccessor,
        Y, X, *Yatomic, *Xatomic, *uv, *av, v, g, *dummyClover(), kappa, mass, mu_dummy,
        mu_factor_dummy, dirac, matpc, need_bidirectional, T.fineToCoarse(Y.Location()), T.coarseToFine(Y.Location()));
    } else {

      constexpr QudaFieldOrder csOrder = QUDA_FLOAT2_FIELD_ORDER;
      constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;

      if (T.Vectors(Y.Location()).FieldOrder() != csOrder)
        errorQuda("Unsupported field order %d\n", T.Vectors(Y.Location()).FieldOrder());
      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using V = typename colorspinor::FieldOrderCB<Float, fineSpin, fineColor, coarseColor, csOrder, vFloat, vFloat, false, false>;
      using F = typename colorspinor::FieldOrderCB<Float, fineSpin, fineColor, coarseColor, csOrder, vFloat, vFloat, false, false>; // will need 2x the spin components for the KD op
      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder,true,Float>;
      using xinvFine = typename gauge::FieldOrder<typename mapper<vFloatXinv>::type,xinvColor,xinvSpin,xinvOrder,true,vFloatXinv>;
      using gCoarse = typename gauge::FieldOrder<Float, coarseColor * coarseSpin, coarseSpin, gOrder, true, vFloat>;
      using gCoarseAtomic = typename gauge::FieldOrder<Float, coarseColor * coarseSpin, coarseSpin, gOrder, true, storeType>;

      const ColorSpinorField &v = T.Vectors(Y.Location());

      V vAccessor(const_cast<ColorSpinorField &>(v));
      V uvAccessor(*uv); // will need 2x the spin components for the KD op
      F avAccessor(*av);
      gFine gAccessor(const_cast<GaugeField &>(g));
      xinvFine xinvAccessor(const_cast<GaugeField &>(XinvKD));
      gCoarse yAccessor(const_cast<GaugeField &>(Y));
      gCoarse xAccessor(const_cast<GaugeField &>(X));
      gCoarseAtomic yAccessorAtomic(*Yatomic);
      gCoarseAtomic xAccessorAtomic(*Xatomic);

      // create a dummy clover field to allow us to call the external clover reduction routines elsewhere
      calculateY<QUDA_CUDA_FIELD_LOCATION, false, Float, fineSpin, fineColor, coarseSpin, coarseColor>(
        yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor, avAccessor, vAccessor, gAccessor,
        xinvAccessor, xinvAccessor, Y, X, *Yatomic, *Xatomic, *uv, *av, v, g, *dummyClover(),
        kappa, mass, mu_dummy, mu_factor_dummy, dirac, matpc, need_bidirectional, T.fineToCoarse(Y.Location()),
        T.coarseToFine(Y.Location()));
    }

    // Clean up
    if (Yatomic != &Y) delete Yatomic;
    if (Xatomic != &X) delete Xatomic;

    if (av != nullptr && &T.Vectors(location) != av) delete av;
    if (uv != nullptr) delete uv;
  }

  // template on precision of XinvKD
  template <typename Float, typename vFloat, int fineColor, int fineSpin, int coarseColor, int coarseSpin>
  void aggregateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
                           const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    if (XinvKD.Ncolor() != 16*fineColor)
      errorQuda("Invalid Nc %d", XinvKD.Ncolor());

    constexpr int xinvColor = 16 * fineColor;
    constexpr int xinvSpin = 2;
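
    // Worked counting example (descriptive): for fineColor = 3 the
    // Kahler-Dirac block couples all 2^4 = 16 sites of a hypercube, giving
    // xinvColor = 16*3 = 48 total degrees of freedom; the xinvSpin = 2
    // accessor views these as two blocks of 48/2 = 24 each, the same 24 that
    // appears as coarseColor for the KD coarse operator below.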

    if (XinvKD.Precision() == QUDA_SINGLE_PRECISION) {
      aggregateStaggeredY<Float,vFloat,float,fineColor,fineSpin,xinvColor,xinvSpin,coarseColor,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
    //} else if (XinvKD.Precision() == QUDA_HALF_PRECISION) { --- unnecessary until we add KD coarsening support
    //  aggregateStaggeredY<Float,vFloat,short,fineColor,fineSpin,xinvColor,xinvSpin,coarseColor,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
    } else {
      errorQuda("Unsupported precision %d", XinvKD.Precision());
    }
  }

  // template on the number of coarse degrees of freedom, branch between naive K-D
  // and actual aggregation
  template <typename Float, typename vFloat, int fineColor, int fineSpin>
  void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
                           const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    const int coarseSpin = 2;
    const int coarseColor = Y.Ncolor() / coarseSpin;
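
    // e.g. Y.Ncolor() = 48 -> coarseColor = 24 (the Kahler-Dirac permutation
    // case, or aggregation with 24 null vectors), and Y.Ncolor() = 128 ->
    // coarseColor = 64 (aggregation with 64 null vectors).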

    if (coarseColor == 24) {
      if (T.getTransferType() == QUDA_TRANSFER_COARSE_KD) {
        // the permutation routines don't need Yatomic, Xatomic, uv, av
        calculateStaggeredY<Float,vFloat,fineColor,fineSpin,24,coarseSpin>(Y, X, T, g, mass, dirac, matpc);
      } else {
        // free field aggregation
        aggregateStaggeredY<Float,vFloat,fineColor,fineSpin,24,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
      }
    } else if (coarseColor == 64) {
      aggregateStaggeredY<Float,vFloat,fineColor,fineSpin,64,coarseSpin>(Y, X, T, g, XinvKD, mass, dirac, matpc);
    } else { // note --- may revisit 3 -> 96 in the future
      errorQuda("Unsupported number of coarse dof %d\n", Y.Ncolor());
    }
  }

  // template on fine spin
  template <typename Float, typename vFloat, int fineColor>
  void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
                           const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    if (T.Vectors().Nspin() == 1) {
      calculateStaggeredY<Float,vFloat,fineColor,1>(Y, X, T, g, XinvKD, mass, dirac, matpc);
    } else {
      errorQuda("Unsupported number of spins %d\n", T.Vectors(X.Location()).Nspin());
    }
  }

  // template on fine colors
  template <typename Float, typename vFloat>
  void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
                           const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    if (g.Ncolor() == 3) {
      calculateStaggeredY<Float,vFloat,3>(Y, X, T, g, XinvKD, mass, dirac, matpc);
    } else {
      errorQuda("Unsupported number of colors %d\n", g.Ncolor());
    }
  }

  // Does the heavy lifting of creating the coarse color matrices Y
  void calculateStaggeredY(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &g,
                           const GaugeField &XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
#if defined(GPU_MULTIGRID) && defined(GPU_STAGGERED_DIRAC)
    checkPrecision(T.Vectors(X.Location()), X, Y);

    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Computing Y field......\n");

    if (Y.Precision() == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
      if (T.Vectors(X.Location()).Precision() == QUDA_DOUBLE_PRECISION) {
        calculateStaggeredY<double,double>(Y, X, T, g, XinvKD, mass, dirac, matpc);
      } else {
        errorQuda("Unsupported precision %d\n", T.Vectors(X.Location()).Precision());
      }
#else
      errorQuda("Double precision multigrid has not been enabled");
#endif
    } else
#if QUDA_PRECISION & 4
    if (Y.Precision() == QUDA_SINGLE_PRECISION) {
      if (T.Vectors(X.Location()).Precision() == QUDA_SINGLE_PRECISION) {
        calculateStaggeredY<float,float>(Y, X, T, g, XinvKD, mass, dirac, matpc);
      } else {
        errorQuda("Unsupported precision %d\n", T.Vectors(X.Location()).Precision());
      }
    } else
#endif
#if QUDA_PRECISION & 2
    if (Y.Precision() == QUDA_HALF_PRECISION) {
      if (T.Vectors(X.Location()).Precision() == QUDA_HALF_PRECISION) {
        calculateStaggeredY<float,short>(Y, X, T, g, XinvKD, mass, dirac, matpc);
      } else {
        errorQuda("Unsupported precision %d\n", T.Vectors(X.Location()).Precision());
      }
    } else
#endif
    {
      errorQuda("Unsupported precision %d\n", Y.Precision());
    }
    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("....done computing Y field\n");
#else
    errorQuda("Staggered multigrid has not been built");
#endif
  }

  // Calculates the coarse color matrix and puts the result in Y.
  // N.B. Assumes Y, X have been allocated.
  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
                         const cudaGaugeField *XinvKD, double mass, QudaDiracType dirac, QudaMatPCType matpc)
  {
    QudaPrecision precision = Y.Precision();
    QudaFieldLocation location = checkLocation(Y, X);

    GaugeField *U = location == QUDA_CUDA_FIELD_LOCATION ? const_cast<cudaGaugeField*>(&gauge) : nullptr;
    GaugeField *Xinv = location == QUDA_CUDA_FIELD_LOCATION ? const_cast<cudaGaugeField*>(XinvKD) : nullptr;

    if (location == QUDA_CPU_FIELD_LOCATION) {
      // First make a cpu gauge field from the cuda gauge field
      int pad = 0;
      GaugeFieldParam gf_param(gauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, gauge.Geometry());
      gf_param.order = QUDA_QDP_GAUGE_ORDER;
      gf_param.fixed = gauge.GaugeFixed();
      gf_param.link_type = gauge.LinkType();
      gf_param.t_boundary = gauge.TBoundary();
      gf_param.anisotropy = gauge.Anisotropy();
      gf_param.gauge = NULL;
      gf_param.create = QUDA_NULL_FIELD_CREATE;
      gf_param.siteSubset = QUDA_FULL_SITE_SUBSET;
      gf_param.nFace = 1;
      gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;

      U = new cpuGaugeField(gf_param);

      // Copy the cuda gauge field to the cpu
      gauge.saveCPUField(*static_cast<cpuGaugeField*>(U));
    } else if (location == QUDA_CUDA_FIELD_LOCATION && gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) {
      // Create a copy of the gauge field with no reconstruction, required for fine-grained access
      GaugeFieldParam gf_param(gauge);
      gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
      gf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
      gf_param.setPrecision(gf_param.Precision());
      U = new cudaGaugeField(gf_param);

      U->copy(gauge);
    }

    // Create an Xinv in the spirit of what's done in coarse_op.cu

    GaugeFieldParam xinvParam;
    xinvParam.nDim = 4;
    auto precision_xinv = XinvKD ? XinvKD->Precision() : QUDA_SINGLE_PRECISION;
    if (precision_xinv < QUDA_HALF_PRECISION) {
      precision_xinv = QUDA_HALF_PRECISION;
    } else if (precision_xinv > QUDA_SINGLE_PRECISION) {
      precision_xinv = QUDA_SINGLE_PRECISION;
    }
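
    // Illustrative effect of the clamp above: a quarter-precision XinvKD gets
    // promoted to half, a double-precision one gets demoted to single, and
    // half or single inputs pass through unchanged.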
    xinvParam.setPrecision(precision_xinv);
    for (int i = 0; i < xinvParam.nDim; i++) xinvParam.x[i] = XinvKD ? XinvKD->X()[i] : 0;
    xinvParam.nColor = XinvKD ? XinvKD->Ncolor() : 16 * U->Ncolor();
    xinvParam.reconstruct = QUDA_RECONSTRUCT_NO;
    xinvParam.order = QUDA_MILC_GAUGE_ORDER;
    xinvParam.link_type = QUDA_COARSE_LINKS;
    xinvParam.t_boundary = QUDA_PERIODIC_T;
    xinvParam.create = QUDA_NULL_FIELD_CREATE;
    xinvParam.siteSubset = QUDA_FULL_SITE_SUBSET;
    xinvParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
    xinvParam.nFace = 0;
    xinvParam.geometry = QUDA_SCALAR_GEOMETRY;
    xinvParam.pad = 0;

    if (location == QUDA_CUDA_FIELD_LOCATION && !XinvKD) {
      // create a dummy GaugeField if one is not defined
      xinvParam.order = QUDA_INVALID_GAUGE_ORDER;
      Xinv = new cudaGaugeField(xinvParam);

    } else if (location == QUDA_CPU_FIELD_LOCATION) {
      // Create a cpuGaugeField from the cudaGaugeField
      Xinv = new cpuGaugeField(xinvParam);

      if (XinvKD) XinvKD->saveCPUField(*static_cast<cpuGaugeField*>(Xinv));
    }

    calculateStaggeredY(Y, X, T, *U, *Xinv, mass, dirac, matpc);

    if (U != &gauge) delete U;
    if (Xinv != XinvKD) delete Xinv;
  }
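
  // Illustrative call (hypothetical caller, for orientation only; the field
  // names below are not part of this file): given a Transfer T built from
  // staggered null vectors, a fine (fat-)link field, an optional Kahler-Dirac
  // inverse, and pre-allocated coarse fields Y and X, one would invoke, e.g.,
  //
  //   StaggeredCoarseOp(*Y_coarse, *X_coarse, T, *fat_links, Xinv_kd,
  //                     mass, QUDA_STAGGEREDKD_DIRAC, QUDA_MATPC_INVALID);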

} // namespace quda