QUDA  0.9.0
color_spinor_pack.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
3 #include <index_helper.cuh>
4 #include <tune_quda.h>
5 #include <fast_intdiv.h>
6 
7 namespace quda {
8 
template <typename Field>
struct PackGhostArg {

  Field field;                 // accessor used to load body sites and store ghost sites
  void **ghost;                // ghost buffer pointers in [2*dim+dir] ordering
  const void *v;               // raw data pointer of the field being packed
  // NOTE(review): the following two member declarations were dropped by the doc
  // extraction of this file; restored from the symbol list it generated
  // (int_fastdiv X[QUDA_MAX_DIM], const QudaDWFPCType pc_type) -- confirm against
  // the original source.
  int_fastdiv X[QUDA_MAX_DIM]; // full lattice dimensions (fast integer division)
  const int volumeCB;          // checkerboarded (single-parity) volume
  const int nDim;              // number of space-time dimensions (4 or 5)
  const int nFace;             // depth of the ghost zone being packed
  const int parity;            // parity to pack when working on a single-parity field
  const int nParity;           // number of parities in the field (1 or 2)
  const int dagger;            // dagger flag (carried through, not used in packing)
  const QudaDWFPCType pc_type; // DWF preconditioning type (selects 5-d coordinate map)
  int commDim[4]; // whether a given dimension is partitioned or not

  PackGhostArg(Field field, void **ghost, const ColorSpinorField &a, int parity, int nFace, int dagger)
    : field(field),
      ghost(ghost),
      v(a.V()),
      volumeCB(a.VolumeCB()),
      nDim(a.Ndim()),
      nFace(nFace),
      parity(parity),
      nParity(a.SiteSubset()),
      dagger(dagger),
      pc_type(a.DWFPCtype())
  {
    X[0] = ((nParity == 1) ? 2 : 1) * a.X(0); // set to full lattice dimensions
    for (int d=1; d<nDim; d++) X[d] = a.X(d);
    X[4] = (nDim == 5) ? a.X(4) : 1; // set fifth dimension correctly
    for (int i=0; i<4; i++) {
      // NOTE(review): this loop body was dropped by the doc extraction; the comment
      // on commDim above and the comm_dim_partitioned symbol in this file imply it
      // records the partitioning of each dimension -- confirm against the original.
      commDim[i] = comm_dim_partitioned(i);
    }
  }
};
45 
template <typename Float, int Ns, int Ms, int Nc, int Mc, int nDim, typename Arg>
__device__ __host__ inline void packGhost(Arg &arg, int cb_idx, int parity, int spinor_parity, int spin_block, int color_block) {
  typedef typename mapper<Float>::type RegType;

  // recover the full lattice coordinates of this checkerboard site
  int x[5] = { };
  if (nDim == 5) getCoords5(x, cb_idx, arg.X, parity, arg.pc_type);
  else getCoords(x, cb_idx, arg.X, parity);

#pragma unroll
  for (int dim=0; dim<4; dim++) {
    if (!arg.commDim[dim]) continue; // dimension not partitioned: no ghost zone to fill

    // backward face (dir = 0): site lies within nFace of the lower boundary
    if (x[dim] < arg.nFace) {
      for (int sl=0; sl<Ms; sl++) {
        const int s = spin_block + sl;
        for (int cl=0; cl<Mc; cl++) {
          const int c = color_block + cl;
          arg.field.Ghost(dim, 0, spinor_parity, ghostFaceIndex<0>(x,arg.X,dim,arg.nFace), s, c)
            = arg.field(spinor_parity, cb_idx, s, c);
        }
      }
    }

    // forward face (dir = 1): site lies within nFace of the upper boundary
    if (x[dim] >= arg.X[dim] - arg.nFace) {
      for (int sl=0; sl<Ms; sl++) {
        const int s = spin_block + sl;
        for (int cl=0; cl<Mc; cl++) {
          const int c = color_block + cl;
          arg.field.Ghost(dim, 1, spinor_parity, ghostFaceIndex<1>(x,arg.X,dim,arg.nFace), s, c)
            = arg.field(spinor_parity, cb_idx, s, c);
        }
      }
    }
  }
}
79 
// CPU fallback: walk every active parity, every checkerboard site and every
// spin/color sub-block, packing each in turn.
template <typename Float, int Ns, int Ms, int Nc, int Mc, int nDim, typename Arg>
void GenericPackGhost(Arg &arg) {
  for (int parity_iter=0; parity_iter<arg.nParity; parity_iter++) {
    // for a single-parity field the one pass uses the parity stored in arg
    const int parity = (arg.nParity == 2) ? parity_iter : arg.parity;
    const int spinor_parity = (arg.nParity == 2) ? parity : 0;
    for (int i=0; i<arg.volumeCB; i++) {
      for (int spin_block=0; spin_block<Ns; spin_block+=Ms) {
        for (int color_block=0; color_block<Nc; color_block+=Mc) {
          packGhost<Float,Ns,Ms,Nc,Mc,nDim>(arg, i, parity, spinor_parity, spin_block, color_block);
        }
      }
    }
  }
}
91 
// GPU kernel: x-dimension indexes the checkerboard site, y-dimension the
// spin/color tile, z-dimension the parity (when both parities are present).
template <typename Float, int Ns, int Ms, int Nc, int Mc, int nDim, typename Arg>
__global__ void GenericPackGhostKernel(Arg arg) {
  const int site = blockIdx.x*blockDim.x + threadIdx.x;
  if (site >= arg.volumeCB) return; // guard the grid tail

  const int parity = (arg.nParity == 2) ? blockDim.z*blockIdx.z + threadIdx.z : arg.parity;
  const int spinor_parity = (arg.nParity == 2) ? parity : 0;

  const int tile = blockDim.y*blockIdx.y + threadIdx.y;
  if (tile >= (Ns/Ms)*(Nc/Mc)) return; // ensure only valid threads

  // decompose the flat tile index into a spin block and a color block
  const int spin_block  = (tile / (Nc / Mc)) * Ms;
  const int color_block = (tile % (Nc / Mc)) * Mc;
  packGhost<Float,Ns,Ms,Nc,Mc,nDim>(arg, site, parity, spinor_parity, spin_block, color_block);
}
105 
// Autotuned launcher for the ghost-packing kernel.  Vectorizes over the
// spin/color tiles (y) and parity (z) via TunableVectorYZ.
//
// NOTE(review): several declaration lines of this class were dropped by the doc
// extraction of this file (class header, meta member, constructor signature,
// destructor, and the CPU-location guard in apply).  They are restored below
// from the symbol list the extraction generated (GenericPackGhostLauncher(Arg&,
// const ColorSpinorField&, MemoryLocation*), const ColorSpinorField &meta,
// char aux[TuneKey::aux_n], QudaFieldLocation Location() const) -- confirm
// against the original source.
template <typename Float, int Ns, int Ms, int Nc, int Mc, typename Arg>
class GenericPackGhostLauncher : public TunableVectorYZ {
  Arg &arg;                     // kernel argument struct (held by reference)
  const ColorSpinorField &meta; // field metadata used for tuning strings and byte counts
  unsigned int minThreads() const { return arg.volumeCB; }
  bool tuneGridDim() const { return false; }

public:
  inline GenericPackGhostLauncher(Arg &arg, const ColorSpinorField &meta, MemoryLocation *destination)
    : TunableVectorYZ((Ns/Ms)*(Nc/Mc), arg.nParity), arg(arg), meta(meta) {
    strcpy(aux, meta.AuxString());

    // record the location of where each pack buffer is in [2*dim+dir] ordering
    // 0 - no packing
    // 1 - pack to local GPU memory
    // 2 - pack to local mapped CPU memory
    // 3 - pack to remote mapped GPU memory
    char label[15] = ",dest=";
    for (int dim=0; dim<4; dim++) {
      for (int dir=0; dir<2; dir++) {
        label[2*dim+dir+6] = !comm_dim_partitioned(dim) ? '0' : destination[2*dim+dir] == Device ? '1' : destination[2*dim+dir] == Host ? '2' : '3';
      }
    }
    label[14] = '\0'; // label is exactly ",dest=" + 8 digits + terminator
    strcat(aux,label);
  }

  virtual ~GenericPackGhostLauncher() { }

  // Launch the packing: CPU fields use the host loop, GPU fields the tuned kernel.
  inline void apply(const cudaStream_t &stream) {
    if (meta.Location() == QUDA_CPU_FIELD_LOCATION) { // NOTE(review): reconstructed guard -- confirm
      if (arg.nDim == 5) GenericPackGhost<Float,Ns,Ms,Nc,Mc,5,Arg>(arg);
      else GenericPackGhost<Float,Ns,Ms,Nc,Mc,4,Arg>(arg);
    } else {
      const TuneParam &tp = tuneLaunch(*this, getTuning(), getVerbosity());
      if (arg.nDim == 5) GenericPackGhostKernel<Float,Ns,Ms,Nc,Mc,5,Arg> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
      else GenericPackGhostKernel<Float,Ns,Ms,Nc,Mc,4,Arg> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
    }
  }

  TuneKey tuneKey() const {
    return TuneKey(meta.VolString(), typeid(*this).name(), aux);
  }

  long long flops() const { return 0; } // packing moves data, performs no flops
  long long bytes() const {
    // one read + one write per ghost site, over both directions of each partitioned dim
    size_t totalBytes = 0;
    for (int d=0; d<4; d++) {
      if (!comm_dim_partitioned(d)) continue;
      totalBytes += 2*arg.nFace*2*Ns*Nc*meta.SurfaceCB(d)*meta.Precision();
    }
    return totalBytes;
  }
};
161 
// Instantiate the field accessor and launch the ghost pack for a fixed
// precision / field order / spin / color combination.
template <typename Float, QudaFieldOrder order, int Ns, int Nc>
inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                             int nFace, int dagger, MemoryLocation *destination) {

  // NOTE(review): the line declaring the accessor type Q was dropped by the doc
  // extraction of this file -- presumably a colorspinor field-order accessor
  // typedef parameterized on <Float,Ns,Nc,order>; confirm against the original.
  Q field(a, nFace, 0, ghost);

  constexpr int spins_per_thread = 1; // make this autotunable
  constexpr int colors_per_thread = 1;
  PackGhostArg<Q> arg(field, ghost, a, parity, nFace, dagger);
  // NOTE(review): the declaration of `launch` was dropped here; from context it
  // is a GenericPackGhostLauncher<Float,Ns,spins_per_thread,Nc,colors_per_thread,...>
  // instance constructed from (arg, a, destination) -- confirm against the original.
  launch(arg, a, destination);
  launch.apply(0);
}
176 
// Dispatch on the number of colors, instantiating the templated pack routine
// for each supported value.
template <typename Float, QudaFieldOrder order, int Ns>
inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                             int nFace, int dagger, MemoryLocation *destination) {

  switch (a.Ncolor()) {
  case    2: genericPackGhost<Float,order,Ns,   2>(ghost, a, parity, nFace, dagger, destination); break;
  case    3: genericPackGhost<Float,order,Ns,   3>(ghost, a, parity, nFace, dagger, destination); break;
  case    4: genericPackGhost<Float,order,Ns,   4>(ghost, a, parity, nFace, dagger, destination); break;
  case    6: genericPackGhost<Float,order,Ns,   6>(ghost, a, parity, nFace, dagger, destination); break;
  case    8: genericPackGhost<Float,order,Ns,   8>(ghost, a, parity, nFace, dagger, destination); break;
  case   12: genericPackGhost<Float,order,Ns,  12>(ghost, a, parity, nFace, dagger, destination); break;
  case   16: genericPackGhost<Float,order,Ns,  16>(ghost, a, parity, nFace, dagger, destination); break;
  case   20: genericPackGhost<Float,order,Ns,  20>(ghost, a, parity, nFace, dagger, destination); break;
  case   24: genericPackGhost<Float,order,Ns,  24>(ghost, a, parity, nFace, dagger, destination); break;
  case   28: genericPackGhost<Float,order,Ns,  28>(ghost, a, parity, nFace, dagger, destination); break;
  case   32: genericPackGhost<Float,order,Ns,  32>(ghost, a, parity, nFace, dagger, destination); break;
  case   48: genericPackGhost<Float,order,Ns,  48>(ghost, a, parity, nFace, dagger, destination); break;
  case   72: genericPackGhost<Float,order,Ns,  72>(ghost, a, parity, nFace, dagger, destination); break;
  case   96: genericPackGhost<Float,order,Ns,  96>(ghost, a, parity, nFace, dagger, destination); break;
  case  256: genericPackGhost<Float,order,Ns, 256>(ghost, a, parity, nFace, dagger, destination); break;
  case  576: genericPackGhost<Float,order,Ns, 576>(ghost, a, parity, nFace, dagger, destination); break;
  case  768: genericPackGhost<Float,order,Ns, 768>(ghost, a, parity, nFace, dagger, destination); break;
  case 1024: genericPackGhost<Float,order,Ns,1024>(ghost, a, parity, nFace, dagger, destination); break;
  default:   errorQuda("Unsupported nColor = %d", a.Ncolor());
  }

}
222 
// Dispatch on the number of spins (staggered support is compile-time optional).
template <typename Float, QudaFieldOrder order>
inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                             int nFace, int dagger, MemoryLocation *destination) {

  switch (a.Nspin()) {
  case 4: genericPackGhost<Float,order,4>(ghost, a, parity, nFace, dagger, destination); break;
  case 2: genericPackGhost<Float,order,2>(ghost, a, parity, nFace, dagger, destination); break;
#ifdef GPU_STAGGERED_DIRAC
  case 1: genericPackGhost<Float,order,1>(ghost, a, parity, nFace, dagger, destination); break;
#endif
  default: errorQuda("Unsupported nSpin = %d", a.Nspin());
  }

}
240 
// Dispatch on the field's storage order.
template <typename Float>
inline void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                             int nFace, int dagger, MemoryLocation *destination) {

  switch (a.FieldOrder()) {
  case QUDA_FLOAT2_FIELD_ORDER:
    genericPackGhost<Float,QUDA_FLOAT2_FIELD_ORDER>(ghost, a, parity, nFace, dagger, destination);
    break;
  case QUDA_FLOAT4_FIELD_ORDER:
    genericPackGhost<Float,QUDA_FLOAT4_FIELD_ORDER>(ghost, a, parity, nFace, dagger, destination);
    break;
  case QUDA_SPACE_SPIN_COLOR_FIELD_ORDER:
    genericPackGhost<Float,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>(ghost, a, parity, nFace, dagger, destination);
    break;
  default:
    errorQuda("Unsupported field order = %d", a.FieldOrder());
  }

}
256 
// Public entry point: fill in default pack destinations, skip the work when no
// dimension is partitioned, then dispatch on precision.
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity,
                      int nFace, int dagger, MemoryLocation *destination_) {

  if (a.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER)
    errorQuda("Field order %d not supported", a.FieldOrder());

  // set default location to match field type
  const MemoryLocation default_dest = (a.Location() == QUDA_CUDA_FIELD_LOCATION) ? Device : Host;
  MemoryLocation destination[2*QUDA_MAX_DIM];
  for (int i=0; i<4*2; i++) destination[i] = destination_ ? destination_[i] : default_dest;

  // only do packing if one of the dimensions is partitioned
  bool partitioned = false;
  for (int d=0; d<4; d++)
    if (comm_dim_partitioned(d)) partitioned = true;
  if (!partitioned) return;

  switch (a.Precision()) {
  case QUDA_DOUBLE_PRECISION:
    genericPackGhost<double>(ghost, a, parity, nFace, dagger, destination);
    break;
  case QUDA_SINGLE_PRECISION:
    genericPackGhost<float>(ghost, a, parity, nFace, dagger, destination);
    break;
  default:
    errorQuda("Unsupported precision %d", a.Precision());
  }

}
285 
286 } // namespace quda
dim3 dim3 blockDim
__device__ __host__ void packGhost(Arg &arg, int cb_idx, int parity, int spinor_parity, int spin_block, int color_block)
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
Definition: comm_mpi.cpp:342
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const QudaDWFPCType pc_type
#define errorQuda(...)
Definition: util_quda.h:90
static __device__ __host__ void getCoords5(int x[5], int cb_index, const I X[5], int parity, QudaDWFPCType pc_type)
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
const char * VolString() const
char * strcat(char *__s1, const char *__s2)
const int * SurfaceCB() const
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity, int nFace, int dagger, MemoryLocation *destination=nullptr)
Generic ghost packing routine.
enum QudaDWFPCType_s QudaDWFPCType
int V
Definition: test_util.cpp:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
enum QudaParity_s QudaParity
__global__ void GenericPackGhostKernel(Arg arg)
QudaFieldLocation Location() const
void GenericPackGhost(Arg &arg)
int_fastdiv X[QUDA_MAX_DIM]
const ColorSpinorField & meta
PackGhostArg(Field field, void **ghost, const ColorSpinorField &a, int parity, int nFace, int dagger)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void apply(const cudaStream_t &stream)
const void * c
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
#define a
char aux[TuneKey::aux_n]
Definition: tune_quda.h:189
int comm_dim_partitioned(int dim)
GenericPackGhostLauncher(Arg &arg, const ColorSpinorField &meta, MemoryLocation *destination)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)