staggered_kd_build_xinv.cu
#include <tune_quda.h>
#include <transfer.h>
#include <gauge_field.h>
#include <blas_quda.h>
#include <blas_lapack.h>

#include <staggered_kd_build_xinv.h>

#include <jitify_helper.cuh>
#include <kernels/staggered_kd_build_xinv_kernel.cuh>

namespace quda {

  template <typename Float, int fineColor, int coarseSpin, int coarseColor, typename Arg>
  class CalculateStaggeredKDBlock : public TunableVectorYZ {

    Arg &arg;
    const GaugeField &meta;
    GaugeField &X;

    long long flops() const {
      // only real work is multiplying the mass by two
      return arg.coarseVolumeCB*coarseSpin*coarseColor;
    }

    long long bytes() const
    {
      // 1. meta.Bytes() / 2 b/c the Kahler-Dirac blocking is a dual decomposition: only
      //    half of the gauge field needs to be loaded.
      // 2. Storing X, extra factor of two b/c it stores forwards and backwards.
      // 3. Storing mass contribution
      return meta.Bytes() / 2 + (meta.Bytes() * X.Precision()) / meta.Precision() + 2 * coarseSpin * coarseColor * arg.coarseVolumeCB * X.Precision();
    }

    unsigned int minThreads() const { return arg.fineVolumeCB; }
    bool tuneSharedBytes() const { return false; } // FIXME don't tune shared bytes
    bool tuneGridDim() const { return false; }     // FIXME don't tune the grid dimension
    bool tuneAuxDim() const { return false; }

  public:
    CalculateStaggeredKDBlock(Arg &arg, const GaugeField &meta, GaugeField &X) :
      TunableVectorYZ(fineColor*fineColor, 2),
      arg(arg),
      meta(meta),
      X(X)
    {
#ifdef JITIFY
      create_jitify_program("kernels/staggered_kd_build_xinv_kernel.cuh");
#endif
      strcpy(aux, compile_type_str(meta));
      strcat(aux, meta.AuxString()); // append rather than overwrite the compile-type prefix
      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());
      strcat(aux, ",computeStaggeredKDBlock");
      strcat(aux, (meta.Location() == QUDA_CUDA_FIELD_LOCATION && X.MemType() == QUDA_MEMORY_MAPPED) ? ",GPU-mapped," :
                  meta.Location() == QUDA_CUDA_FIELD_LOCATION ? ",GPU-device," : ",CPU,");
      strcat(aux, "coarse_vol=");
      strcat(aux, X.VolString());
    }
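
    // For reference, the assembled aux string looks schematically like the
    // following (illustrative; the exact contents depend on the build and fields):
    //   "<compile type><fine-field aux>,computeStaggeredKDBlock,GPU-device,coarse_vol=<X volume>"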

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), QUDA_VERBOSE);

      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
        ComputeStaggeredKDBlockCPU(arg);
      } else {
#ifdef JITIFY
        using namespace jitify::reflection;
        jitify_error = program->kernel("quda::ComputeStaggeredKDBlockGPU")
                         .instantiate(Type<Arg>())
                         .configure(tp.grid, tp.block, tp.shared_bytes, stream).launch(arg);
#else // not JITIFY
        qudaLaunchKernel(ComputeStaggeredKDBlockGPU<Arg>, tp, stream, arg);
#endif // JITIFY
      }
    }

    bool advanceTuneParam(TuneParam &param) const {
      // only do autotuning if we have device fields
      if (X.MemType() == QUDA_MEMORY_DEVICE) return Tunable::advanceTuneParam(param);
      else return false;
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  };

  /**
     @brief Calculate the staggered Kahler-Dirac block (coarse clover)

     @param[out] X KD block (coarse clover field) accessor
     @param[in] G Fine grid link / gauge field accessor
     @param[out] X_ KD block (coarse clover field)
     @param[in] G_ Fine gauge field
     @param[in] mass Mass
  */
  template<typename Float, int fineColor, int coarseSpin, int coarseColor, typename xGauge, typename fineGauge>
  void calculateStaggeredKDBlock(xGauge &X, fineGauge &G, GaugeField &X_, const GaugeField &G_, double mass)
  {
    // sanity checks
    if (fineColor != 3)
      errorQuda("Input gauge field should have nColor=3, not nColor=%d\n", fineColor);

    if (G.Ndim() != 4) errorQuda("Number of dimensions not supported");
    const int nDim = 4;

    if (fineColor * 16 != coarseColor * coarseSpin)
      errorQuda("Fine nColor=%d is not consistent with KD dof %d", fineColor, coarseColor * coarseSpin);

    int x_size[QUDA_MAX_DIM] = { };
    int xc_size[QUDA_MAX_DIM] = { };
    for (int i = 0; i < nDim; i++) {
      x_size[i] = G_.X()[i];
      xc_size[i] = X_.X()[i];
      // check that local volumes are consistent
      if (2 * xc_size[i] != x_size[i]) {
        errorQuda("Inconsistent fine dimension %d and coarse KD dimension %d", x_size[i], xc_size[i]);
      }
    }
    x_size[4] = xc_size[4] = 1;
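
    // Illustrative example: each Kahler-Dirac block is a 2^4 hypercube of fine
    // sites, so a local fine volume of 8x8x8x8 corresponds to a 4x4x4x4 array
    // of KD blocks, which is what the check above enforces.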

    // Calculate X (KD block), which is really just a permutation of the gauge fields within a KD block
    using Arg = CalculateStaggeredKDBlockArg<Float,coarseSpin,fineColor,coarseColor,xGauge,fineGauge>;
    Arg arg(X, G, mass, x_size, xc_size);
    CalculateStaggeredKDBlock<Float, fineColor, coarseSpin, coarseColor, Arg> y(arg, G_, X_);

    QudaFieldLocation location = checkLocation(X_, G_);
    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("Calculating the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");

    // We know exactly what the scale should be: the max of all of the (fat) links.
    double max_scale = G_.abs_max();
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Global U_max = %e\n", max_scale);

    if (xGauge::fixedPoint()) {
      arg.X.resetScale(max_scale > 2.0*mass ? max_scale : 2.0*mass); // to be safe
      X_.Scale(max_scale > 2.0*mass ? max_scale : 2.0*mass);         // to be safe
    }
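
    // Note (explanatory): the fixed-point scale must bound every element of X.
    // The off-diagonal entries are permuted fine links, bounded by U_max, while
    // the diagonal mass term contributes 2*mass, hence max(U_max, 2*mass) above.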

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing KD block\n");
    y.apply(0);

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X_.norm2(0));
  }

  template <typename Float, typename vFloat, int fineColor, int coarseColor, int coarseSpin>
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
    QudaFieldLocation location = X.Location();

    if (location == QUDA_CPU_FIELD_LOCATION) {

      constexpr QudaGaugeFieldOrder gOrder = QUDA_QDP_GAUGE_ORDER;

      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder>;
      using xCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;

      gFine gAccessor(const_cast<GaugeField&>(g));
      xCoarse xAccessor(const_cast<GaugeField&>(X));

      calculateStaggeredKDBlock<Float,fineColor,coarseSpin,coarseColor>(xAccessor, gAccessor, X, g, mass);

    } else {

      constexpr QudaGaugeFieldOrder gOrder = QUDA_FLOAT2_GAUGE_ORDER;

      if (g.FieldOrder() != gOrder) errorQuda("Unsupported field order %d\n", g.FieldOrder());

      using gFine = typename gauge::FieldOrder<Float,fineColor,1,gOrder,true,Float>;
      using xCoarse = typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,vFloat>;

      gFine gAccessor(const_cast<GaugeField&>(g));
      xCoarse xAccessor(const_cast<GaugeField&>(X));

      calculateStaggeredKDBlock<Float,fineColor,coarseSpin,coarseColor>(xAccessor, gAccessor, X, g, mass);
    }
  }
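
  // Note (explanatory): the two branches above differ only in the memory layout
  // assumed for the fields, QDP order for host fields and FLOAT2 order for
  // device fields; the same accessor-level routine then does the actual work.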

  // template on the number of KD (coarse) degrees of freedom
  template <typename Float, typename vFloat, int fineColor>
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
    constexpr int coarseSpin = 2;
    const int coarseColor = X.Ncolor() / coarseSpin;

    if (coarseColor == 24) { // half the dof within a KD block
      calculateStaggeredKDBlock<Float,vFloat,fineColor,24,coarseSpin>(X, g, mass);
    } else {
      errorQuda("Unsupported number of Kahler-Dirac dof %d\n", X.Ncolor());
    }
  }

  // template on fine colors
  template <typename Float, typename vFloat>
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
    if (g.Ncolor() == 3) {
      calculateStaggeredKDBlock<Float,vFloat,3>(X, g, mass);
    } else {
      errorQuda("Unsupported number of colors %d\n", g.Ncolor());
    }
  }

  // does the heavy lifting of building X
  void calculateStaggeredKDBlock(GaugeField &X, const GaugeField &g, const double mass)
  {
#if defined(GPU_STAGGERED_DIRAC)

    // FIXME remove when done
    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("Computing X for StaggeredKD...\n");

#if QUDA_PRECISION & 8 && defined(GPU_MULTIGRID_DOUBLE)
    if (X.Precision() == QUDA_DOUBLE_PRECISION) {
      calculateStaggeredKDBlock<double,double>(X, g, mass);
    } else
#endif
#if QUDA_PRECISION & 4
    if (X.Precision() == QUDA_SINGLE_PRECISION) {
      calculateStaggeredKDBlock<float,float>(X, g, mass);
    } else
#endif
#if QUDA_PRECISION & 2
    if (X.Precision() == QUDA_HALF_PRECISION) {
      calculateStaggeredKDBlock<float,short>(X, g, mass);
    } else
#endif
//#if QUDA_PRECISION & 1
//    if (X.Precision() == QUDA_QUARTER_PRECISION) {
//      calculateStaggeredKDBlock<float,int8_t>(X, g, mass);
//    } else
//#endif
    {
      errorQuda("Unsupported precision %d", X.Precision());
    }

    if (getVerbosity() >= QUDA_SUMMARIZE) printfQuda("....done computing X for StaggeredKD\n");
#else
    errorQuda("Staggered fermion support has not been built");
#endif
  }
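
  // Minimal usage sketch (illustrative only; `X` and `g` stand for fields the
  // caller already owns, with X.Ncolor() == 48 and g.Ncolor() == 3):
  //
  //   GaugeField &X = ...;        // coarse KD block, zero-initialized
  //   const GaugeField &g = ...;  // fine (fat-)link field
  //   calculateStaggeredKDBlock(X, g, mass);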

  // Calculates the inverse KD block and puts the result in Xinv. Assumes Xinv
  // has been allocated, in MILC data order
  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass)
  {
    using namespace blas_lapack;
    auto invert = use_native() ? native::BatchInvertMatrix : generic::BatchInvertMatrix;

    // Xinv should have a MILC gauge order independent of being
    // on the CPU or GPU
    if (Xinv.FieldOrder() != QUDA_MILC_GAUGE_ORDER)
      errorQuda("Unsupported field order %d", Xinv.FieldOrder());

    QudaPrecision precision = Xinv.Precision();
    QudaFieldLocation location = Xinv.Location();

    // Logic copied from `staggered_coarse_op.cu`
    GaugeField *U = location == QUDA_CUDA_FIELD_LOCATION ? const_cast<cudaGaugeField*>(&gauge) : nullptr;

    if (location == QUDA_CPU_FIELD_LOCATION) {

      // first make a cpu gauge field from the cuda gauge field
      int pad = 0;
      GaugeFieldParam gf_param(gauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, gauge.Geometry());
      gf_param.order = QUDA_QDP_GAUGE_ORDER;
      gf_param.fixed = gauge.GaugeFixed();
      gf_param.link_type = gauge.LinkType();
      gf_param.t_boundary = gauge.TBoundary();
      gf_param.anisotropy = gauge.Anisotropy();
      gf_param.gauge = NULL;
      gf_param.create = QUDA_NULL_FIELD_CREATE;
      gf_param.siteSubset = QUDA_FULL_SITE_SUBSET;
      gf_param.nFace = 1;
      gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;

      U = new cpuGaugeField(gf_param);

      // copy the cuda gauge field to the cpu
      gauge.saveCPUField(*static_cast<cpuGaugeField*>(U));

    } else if (location == QUDA_CUDA_FIELD_LOCATION) {

      // Using no reconstruction is not strictly necessary, but for now we do it
      // for simplicity so we can take advantage of fine-grained accessors, as in
      // "staggered_coarse_op.cu". Likewise, the precision check is not strictly
      // required, but the precisions should generally be equal anyway.

      // FIXME: make this work for any gauge precision
      if (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO || gauge.Precision() != precision || gauge.Precision() < QUDA_SINGLE_PRECISION) {
        GaugeFieldParam gf_param(gauge);
        gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
        gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; // guaranteed for no recon
        gf_param.setPrecision(precision == QUDA_DOUBLE_PRECISION ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION);
        U = new cudaGaugeField(gf_param);

        U->copy(gauge);
      }
    }
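
    // At this point U points at a field the fine-grained accessors can read:
    // the original device field, a freshly staged CPU copy, or the
    // no-reconstruct device copy made above.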

    // Create X based on Xinv, but switch to a native ordering
    // for a GPU setup.
    GaugeField *X = nullptr;
    GaugeFieldParam x_param(Xinv);
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      x_param.order = QUDA_FLOAT2_GAUGE_ORDER;
      x_param.setPrecision(x_param.Precision());
      X = static_cast<GaugeField*>(new cudaGaugeField(x_param));
    } else {
      X = static_cast<GaugeField*>(new cpuGaugeField(x_param));
    }

    // Calculate X
    calculateStaggeredKDBlock(*X, *U, mass);

    // Invert X
    // Logic copied from `coarse_op_preconditioned.cu`
    const int n = Xinv.Ncolor();
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      // FIXME: add support for double precision inverse
      // Reorder to MILC order for inversion, based on "coarse_op_preconditioned.cu"
      GaugeFieldParam param(Xinv);
      param.order = QUDA_MILC_GAUGE_ORDER; // MILC order == QDP order for Xinv
      param.setPrecision(QUDA_SINGLE_PRECISION); // FIXME until double prec support is added
      cudaGaugeField X_(param);
      cudaGaugeField *Xinv_ = (Xinv.Precision() == QUDA_SINGLE_PRECISION) ? static_cast<cudaGaugeField*>(&Xinv) : new cudaGaugeField(param);

      X_.copy(*X);

      blas::flops += invert((void*)Xinv_->Gauge_p(), (void*)X_.Gauge_p(), n, X_.Volume(), X_.Precision(), X->Location());

      if (Xinv_ != &Xinv) {
        if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale(Xinv_->abs_max());
        Xinv.copy(*Xinv_);
        delete Xinv_;
      }

    } else if (location == QUDA_CPU_FIELD_LOCATION) {

      if (Xinv.Precision() == QUDA_DOUBLE_PRECISION)
        errorQuda("Unsupported precision %d", Xinv.Precision());

      const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(X);
      cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
      blas::flops += invert((void*)Xinv_h->Gauge_p(), (void*)X_h->Gauge_p(), n, X_h->Volume(), X->Precision(), X->Location());
    }

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Xinv = %e\n", Xinv.norm2(0));

    // Clean up
    delete X;
    if (U != &gauge) delete U;
  }
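
  // For reference (assumed semantics, not defined in this file): the `invert`
  // call above treats its input as Volume() dense n x n complex matrices packed
  // contiguously, inverts each one, writes the results to the output pointer,
  // and returns a flop count. A hypothetical single-matrix host analogue, e.g.
  // with Eigen, would be:
  //
  //   Eigen::MatrixXcf A = ...;             // one n x n KD block of X
  //   Eigen::MatrixXcf Ainv = A.inverse();  // the corresponding block of Xinv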

  // Allocates and calculates the inverse KD block, returning Xinv
  cudaGaugeField* AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, const QudaPrecision override_prec)
  {
    const int ndim = 4;
    int xc[QUDA_MAX_DIM];
    for (int i = 0; i < ndim; i++) { xc[i] = gauge.X()[i]/2; }
    const int Nc_c = gauge.Ncolor() * 8; // 24
    const int Ns_c = 2; // staggered parity
    GaugeFieldParam gParam;
    memcpy(gParam.x, xc, QUDA_MAX_DIM*sizeof(int));
    gParam.nColor = Nc_c*Ns_c;
    gParam.reconstruct = QUDA_RECONSTRUCT_NO;
    gParam.order = QUDA_MILC_GAUGE_ORDER;
    gParam.link_type = QUDA_COARSE_LINKS;
    gParam.t_boundary = QUDA_PERIODIC_T;
    gParam.create = QUDA_ZERO_FIELD_CREATE;
    gParam.setPrecision(override_prec);
    gParam.nDim = ndim;
    gParam.siteSubset = QUDA_FULL_SITE_SUBSET;
    gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
    gParam.nFace = 0;
    gParam.geometry = QUDA_SCALAR_GEOMETRY;
    gParam.pad = 0;

    cudaGaugeField* Xinv = new cudaGaugeField(gParam);

    BuildStaggeredKahlerDiracInverse(*Xinv, gauge, mass);

    return Xinv;
  }
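
  // Example usage (hypothetical caller; `fat_links` stands for whatever fat-link
  // field the application owns):
  //
  //   cudaGaugeField *Xinv = AllocateAndBuildStaggeredKahlerDiracInverse(fat_links, mass, QUDA_HALF_PRECISION);
  //   // ... use Xinv in the Kahler-Dirac preconditioned staggered operator ...
  //   delete Xinv;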

} // namespace quda