// color_spinor_pack.cu (QUDA 1.0.0)
#include <color_spinor_field.h>
#include <tune_quda.h>

#include <jitify_helper.cuh>
#include <kernels/color_spinor_pack.cuh>
namespace quda {

  template <typename Float, bool block_float, int Ns, int Ms, int Nc, int Mc, typename Arg>
  class GenericPackGhostLauncher : public TunableVectorYZ {
    Arg &arg;
    const ColorSpinorField &meta;
    unsigned int minThreads() const { return arg.volumeCB; } // x dimension spans the checkerboard volume
    bool tuneGridDim() const { return false; }               // grid size is derived from the volume, not tuned
    bool tuneAuxDim() const { return true; }                 // enable tuning of the aux.x batching factor

  public:
    inline GenericPackGhostLauncher(Arg &arg, const ColorSpinorField &meta, MemoryLocation *destination)
      : TunableVectorYZ((Ns/Ms)*(Nc/Mc), 2*arg.nParity), arg(arg), meta(meta) {

      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
#ifdef JITIFY
        create_jitify_program("kernels/color_spinor_pack.cuh");
#endif
      }

      strcpy(aux, compile_type_str(meta));
      strcat(aux, meta.AuxString());
      strcat(aux, comm_dim_partitioned_string());
      strcat(aux, comm_dim_topology_string());

      // record the location of where each pack buffer is in [2*dim+dir] ordering
      // 0 - no packing
      // 1 - pack to local GPU memory
      // 2 - pack to local mapped CPU memory
      // 3 - pack to remote mapped GPU memory
      char label[15] = ",dest=";
      for (int dim=0; dim<4; dim++) {
        for (int dir=0; dir<2; dir++) {
          label[2*dim+dir+6] = !comm_dim_partitioned(dim) ? '0' :
            destination[2*dim+dir] == Device ? '1' :
            destination[2*dim+dir] == Host   ? '2' : '3';
        }
      }
      label[14] = '\0';
      strcat(aux, label);
    }

    virtual ~GenericPackGhostLauncher() { }

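    // Launch the ghost packing: CPU fields call the host packing routine directly,
    // while GPU fields go through the autotuner; tp.aux.x below is the z-grid
    // batching factor chosen by advanceAux and baked into the kernel template.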
    inline void apply(const cudaStream_t &stream) {
      if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
        if (arg.nDim == 5) GenericPackGhost<Float,block_float,Ns,Ms,Nc,Mc,5,Arg>(arg);
        else GenericPackGhost<Float,block_float,Ns,Ms,Nc,Mc,4,Arg>(arg);
      } else {
        const TuneParam &tp = tuneLaunch(*this, getTuning(), getVerbosity());
        arg.nParity2dim_threads = arg.nParity*2*tp.aux.x;
#ifdef JITIFY
        using namespace jitify::reflection;
        jitify_error = program->kernel("quda::GenericPackGhostKernel")
          .instantiate(Type<Float>(),block_float,Ns,Ms,Nc,Mc,arg.nDim,(int)tp.aux.x,Type<Arg>())
          .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
#else
        switch (tp.aux.x) {
        case 1:
          if (arg.nDim == 5) GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,5,1,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          else GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,4,1,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          break;
        case 2:
          if (arg.nDim == 5) GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,5,2,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          else GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,4,2,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          break;
        case 4:
          if (arg.nDim == 5) GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,5,4,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          else GenericPackGhostKernel<Float,block_float,Ns,Ms,Nc,Mc,4,4,Arg><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
          break;
        }
#endif
      }
    }

    // if doing block float then all spin-color components must be within the same block
    void setColorSpinBlock(TuneParam &param) const {
      param.block.y = (Ns/Ms)*(Nc/Mc);
      param.grid.y = 1;
      param.block.z = 1;
      param.grid.z = arg.nParity*2*param.aux.x;
    }

    bool advanceBlockDim(TuneParam &param) const {
      if (!block_float) {
        return TunableVectorYZ::advanceBlockDim(param);
      } else {
        bool advance = Tunable::advanceBlockDim(param);
        setColorSpinBlock(param); // if doing block float then all spin-color components must be within the same block
        return advance;
      }
    }

    int blockStep() const { return block_float ? 2 : TunableVectorYZ::blockStep(); }
    int blockMin() const { return block_float ? 2 : TunableVectorYZ::blockMin(); }

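    // cycle the aux.x batching factor through 1 -> 2 -> 4, resizing the z vector to
    // match; for block-float the full spin-color block is re-imposed after each step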
    bool advanceAux(TuneParam &param) const {
      if (param.aux.x < 4) {
        param.aux.x *= 2;
        const_cast<GenericPackGhostLauncher*>(this)->resizeVector((Ns/Ms)*(Nc/Mc), arg.nParity*2*param.aux.x);
        TunableVectorYZ::initTuneParam(param);
        if (block_float) setColorSpinBlock(param);
        return true;
      }
      param.aux.x = 1;
      const_cast<GenericPackGhostLauncher*>(this)->resizeVector((Ns/Ms)*(Nc/Mc), arg.nParity*2*param.aux.x);
      TunableVectorYZ::initTuneParam(param);
      if (block_float) setColorSpinBlock(param);
      return false;
    }

    TuneKey tuneKey() const {
      return TuneKey(meta.VolString(), typeid(*this).name(), aux);
    }

    virtual void initTuneParam(TuneParam &param) const {
      TunableVectorYZ::initTuneParam(param);
      param.aux = make_int4(1,1,1,1);
      if (block_float) setColorSpinBlock(param);
    }

    virtual void defaultTuneParam(TuneParam &param) const {
      TunableVectorYZ::defaultTuneParam(param);
      param.aux = make_int4(1,1,1,1);
      if (block_float) setColorSpinBlock(param);
    }

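    // packing performs no floating-point arithmetic; bytes counts one read at the
    // field precision plus one write at the ghost precision over all partitioned faces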
    long long flops() const { return 0; }
    long long bytes() const {
      size_t totalBytes = 0;
      for (int d=0; d<4; d++) {
        if (!comm_dim_partitioned(d)) continue;
        totalBytes += arg.nFace*2*Ns*Nc*meta.SurfaceCB(d)*(meta.Precision() + meta.GhostPrecision());
      }
      return totalBytes;
    }
  };

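  // bottom of the instantiation chain: bind a field-order accessor, choose the
  // per-thread spin/color blocking, and hand off to the tuned launcher above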
  template <typename Float, typename ghostFloat, QudaFieldOrder order, int Ns, int Nc>
  inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                               int nFace, int dagger, MemoryLocation *destination) {

    typedef typename mapper<Float>::type RegFloat;
    typedef typename colorspinor::FieldOrderCB<RegFloat,Ns,Nc,1,order,Float,ghostFloat> Q;
    Q field(a, nFace, 0, ghost);

    constexpr int spins_per_thread = Ns == 1 ? 1 : 2; // make this autotunable?
    constexpr int colors_per_thread = Nc%2 == 0 ? 2 : 1;
    PackGhostArg<Q> arg(field, a, parity, nFace, dagger);

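    // note: the QudaPrecision enum values coincide with sizeof() of the matching
    // storage types (double=8, float=4, short=2, char=1), so the sizeof-based
    // comparisons below are exact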
    // if we only have short precision for the ghost then this means we have block-float
    constexpr bool block_float = (sizeof(Float) == QUDA_SINGLE_PRECISION &&
                                  (sizeof(ghostFloat) == QUDA_HALF_PRECISION || sizeof(ghostFloat) == QUDA_QUARTER_PRECISION)
                                  && Nc <= MAX_BLOCK_FLOAT_NC) ? true : false;

    // ensure we only compile supported block-float kernels
    constexpr int Nc_ = (sizeof(Float) == QUDA_SINGLE_PRECISION &&
                         (sizeof(ghostFloat) == QUDA_HALF_PRECISION || sizeof(ghostFloat) == QUDA_QUARTER_PRECISION) &&
                         Nc > MAX_BLOCK_FLOAT_NC) ? MAX_BLOCK_FLOAT_NC : Nc;

    if (sizeof(Float) == QUDA_SINGLE_PRECISION &&
        (sizeof(ghostFloat) == QUDA_HALF_PRECISION || sizeof(ghostFloat) == QUDA_QUARTER_PRECISION) && Nc > MAX_BLOCK_FLOAT_NC)
      errorQuda("Block-float format not supported for Nc = %d", Nc);

    GenericPackGhostLauncher<RegFloat,block_float,Ns,spins_per_thread,Nc_,colors_per_thread,PackGhostArg<Q> >
      launch(arg, a, destination);

    launch.apply(0);
  }

  // traits used to ensure we only instantiate arbitrary colors for nSpin=2,4 fields, and only 3 colors otherwise
  template <typename T, typename G, int nSpin, int nColor_> struct precision_spin_color_mapper { static constexpr int nColor = nColor_; };
  template <typename T, typename G, int nColor_> struct precision_spin_color_mapper<T,G,1,nColor_> { static constexpr int nColor = 3; };

  // never need block-float format with nSpin=4 fields for arbitrary colors
  template <int nColor_> struct precision_spin_color_mapper<float,short,4,nColor_> { static constexpr int nColor = 3; };
  template <int nColor_> struct precision_spin_color_mapper<float,char,4,nColor_> { static constexpr int nColor = 3; };

#ifndef GPU_MULTIGRID_DOUBLE
  template <int nColor_> struct precision_spin_color_mapper<double,double,1,nColor_> { static constexpr int nColor = 3; };
  template <int nColor_> struct precision_spin_color_mapper<double,double,2,nColor_> { static constexpr int nColor = 3; };
  template <int nColor_> struct precision_spin_color_mapper<double,double,4,nColor_> { static constexpr int nColor = 3; };
#endif
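  // e.g. precision_spin_color_mapper<float,short,4,24>::nColor collapses to 3, so
  // block-float nSpin=4 kernels are only ever instantiated with three colors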

  template <typename Float, typename ghostFloat, QudaFieldOrder order, int Ns>
  inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                               int nFace, int dagger, MemoryLocation *destination) {

    if (a.Ncolor() != 3 && a.Nspin() == 1)
      errorQuda("Ncolor = %d not supported for Nspin = %d fields", a.Ncolor(), a.Nspin());
    if (a.Ncolor() != 3 && a.Nspin() == 4 && a.Precision() == QUDA_SINGLE_PRECISION &&
        (a.GhostPrecision() == QUDA_HALF_PRECISION || a.GhostPrecision() == QUDA_QUARTER_PRECISION))
      errorQuda("Ncolor = %d not supported for Nspin = %d fields with precision = %d and ghost_precision = %d",
                a.Ncolor(), a.Nspin(), a.Precision(), a.GhostPrecision());
#ifndef GPU_MULTIGRID_DOUBLE
    if (a.Ncolor() != 3 && a.Precision() == QUDA_DOUBLE_PRECISION)
      errorQuda("Ncolor = %d not supported for double precision fields", a.Ncolor());
#endif

    if (a.Ncolor() == 2) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,2>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 3) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,3>::nColor>(ghost, a, parity, nFace, dagger, destination);
#ifdef GPU_MULTIGRID
    } else if (a.Ncolor() == 4) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,4>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 6) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,6>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 8) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,8>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 12) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,12>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 16) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,16>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 18) { // Needed for two level free field Wilson
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,18>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 20) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,20>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 24) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,24>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 28) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,28>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 32) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,32>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 36) { // Needed for three level free field Wilson
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,36>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 48) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,48>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 72) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,72>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 96) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,96>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 256) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,256>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 576) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,576>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 768) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,768>::nColor>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Ncolor() == 1024) {
      genericPackGhost<Float,ghostFloat,order,Ns,precision_spin_color_mapper<Float,ghostFloat,Ns,1024>::nColor>(ghost, a, parity, nFace, dagger, destination);
#endif // GPU_MULTIGRID
    } else {
      errorQuda("Unsupported nColor = %d", a.Ncolor());
    }

  }

  // traits used to ensure we only instantiate float4 for spin=4 fields
  template <int nSpin, QudaFieldOrder order_> struct spin_order_mapper { static constexpr QudaFieldOrder order = order_; };
  template <> struct spin_order_mapper<2,QUDA_FLOAT4_FIELD_ORDER> { static constexpr QudaFieldOrder order = QUDA_FLOAT2_FIELD_ORDER; };
  template <> struct spin_order_mapper<1,QUDA_FLOAT4_FIELD_ORDER> { static constexpr QudaFieldOrder order = QUDA_FLOAT2_FIELD_ORDER; };
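  // e.g. spin_order_mapper<2,QUDA_FLOAT4_FIELD_ORDER>::order is QUDA_FLOAT2_FIELD_ORDER,
  // so a float4 accessor is never instantiated for nSpin=1,2 even though the
  // (runtime-rejected) branch below still has to compile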

  template <typename Float, typename ghostFloat, QudaFieldOrder order>
  inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                               int nFace, int dagger, MemoryLocation *destination) {

    if (a.Nspin() == 4) {
      genericPackGhost<Float,ghostFloat,order,4>(ghost, a, parity, nFace, dagger, destination);
    } else if (a.Nspin() == 2) {
      if (order == QUDA_FLOAT4_FIELD_ORDER) errorQuda("Field order %d with nSpin = %d not supported", order, a.Nspin());
      genericPackGhost<Float,ghostFloat,spin_order_mapper<2,order>::order,2>(ghost, a, parity, nFace, dagger, destination);
#ifdef GPU_STAGGERED_DIRAC
    } else if (a.Nspin() == 1) {
      if (order == QUDA_FLOAT4_FIELD_ORDER) errorQuda("Field order %d with nSpin = %d not supported", order, a.Nspin());
      genericPackGhost<Float,ghostFloat,spin_order_mapper<1,order>::order,1>(ghost, a, parity, nFace, dagger, destination);
#endif
    } else {
      errorQuda("Unsupported nSpin = %d", a.Nspin());
    }

  }

  // traits used to ensure we only instantiate double and float templates for non-native fields
  template <typename> struct non_native_precision_mapper { };
  template <> struct non_native_precision_mapper<double> { typedef double type; };
  template <> struct non_native_precision_mapper<float> { typedef float type; };
  template <> struct non_native_precision_mapper<short> { typedef float type; };
  template <> struct non_native_precision_mapper<char> { typedef float type; };

  // traits used to ensure we only instantiate float and lower precision for float4 fields
  template <typename T> struct float4_precision_mapper { typedef T type; };
  template <> struct float4_precision_mapper<double> { typedef float type; };
  template <> struct float4_precision_mapper<short> { typedef float type; };
  template <> struct float4_precision_mapper<char> { typedef float type; };
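  // these mappers redirect otherwise-unused precision combinations to float
  // instantiations; the runtime typeid checks below reject any actual misuse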

  template <typename Float, typename ghostFloat>
  inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                               int nFace, int dagger, MemoryLocation *destination) {

    if (a.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {

      // all precisions, color and spin can use this order
      genericPackGhost<Float,ghostFloat,QUDA_FLOAT2_FIELD_ORDER>(ghost, a, parity, nFace, dagger, destination);

    } else if (a.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER) {

      // never have double fields here
      if (typeid(Float) != typeid(typename float4_precision_mapper<Float>::type))
        errorQuda("Precision %d not supported for field type %d", a.Precision(), a.FieldOrder());
      if (typeid(ghostFloat) != typeid(typename float4_precision_mapper<ghostFloat>::type))
        errorQuda("Ghost precision %d not supported for field type %d", a.GhostPrecision(), a.FieldOrder());
      genericPackGhost<typename float4_precision_mapper<Float>::type,
                       typename float4_precision_mapper<ghostFloat>::type,
                       QUDA_FLOAT4_FIELD_ORDER>(ghost, a, parity, nFace, dagger, destination);

    } else if (a.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
      if (typeid(Float) != typeid(typename non_native_precision_mapper<Float>::type))
        errorQuda("Precision %d not supported for field type %d", a.Precision(), a.FieldOrder());
      if (typeid(ghostFloat) != typeid(typename non_native_precision_mapper<ghostFloat>::type))
        errorQuda("Ghost precision %d not supported for field type %d", a.GhostPrecision(), a.FieldOrder());
      genericPackGhost<typename non_native_precision_mapper<Float>::type,
                       typename non_native_precision_mapper<ghostFloat>::type,
                       QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>(ghost, a, parity, nFace, dagger, destination);
    } else {
      errorQuda("Unsupported field order = %d", a.FieldOrder());
    }

  }

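  // public entry point: fill in default pack destinations, early-out when no
  // dimension is partitioned, then dispatch on the (precision, ghost precision) pair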
  void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                        int nFace, int dagger, MemoryLocation *destination_) {

    if (a.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER) {
      errorQuda("Field order %d not supported", a.FieldOrder());
    }

    // set default location to match field type
    MemoryLocation destination[2*QUDA_MAX_DIM];
    for (int i=0; i<4*2; i++) {
      destination[i] = destination_ ? destination_[i] : a.Location() == QUDA_CUDA_FIELD_LOCATION ? Device : Host;
    }

    // only do packing if one of the dimensions is partitioned
    bool partitioned = false;
    for (int d=0; d<4; d++)
      if (comm_dim_partitioned(d)) partitioned = true;
    if (!partitioned) return;

    if (a.Precision() == QUDA_DOUBLE_PRECISION) {
      if (a.GhostPrecision() == QUDA_DOUBLE_PRECISION) {
        genericPackGhost<double,double>(ghost, a, parity, nFace, dagger, destination);
      } else {
        errorQuda("precision = %d and ghost precision = %d not supported", a.Precision(), a.GhostPrecision());
      }
    } else if (a.Precision() == QUDA_SINGLE_PRECISION) {
      if (a.GhostPrecision() == QUDA_SINGLE_PRECISION) {
        genericPackGhost<float,float>(ghost, a, parity, nFace, dagger, destination);
      } else if (a.GhostPrecision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
        genericPackGhost<float,short>(ghost, a, parity, nFace, dagger, destination);
#else
        errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
      } else if (a.GhostPrecision() == QUDA_QUARTER_PRECISION) {
#if QUDA_PRECISION & 1
        genericPackGhost<float,char>(ghost, a, parity, nFace, dagger, destination);
#else
        errorQuda("QUDA_PRECISION=%d does not enable quarter precision", QUDA_PRECISION);
#endif
      } else {
        errorQuda("precision = %d and ghost precision = %d not supported", a.Precision(), a.GhostPrecision());
      }
    } else if (a.Precision() == QUDA_HALF_PRECISION) {
      if (a.GhostPrecision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
        genericPackGhost<short,short>(ghost, a, parity, nFace, dagger, destination);
#else
        errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
      } else {
        errorQuda("precision = %d and ghost precision = %d not supported", a.Precision(), a.GhostPrecision());
      }
    } else {
      errorQuda("Unsupported precision %d", a.Precision());
    }

  }

} // namespace quda
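
// Illustrative call site (a sketch; ghost_buffers is a stand-in for per-face
// buffers obtained from the field, not an API defined in this file):
//
//   quda::genericPackGhost(ghost_buffers, field, QUDA_EVEN_PARITY, /*nFace*/1, /*dagger*/0);
//
// Leaving the destination argument at its nullptr default makes the entry point
// above pack to Device buffers for GPU fields and Host buffers for CPU fields.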