QUDA  0.9.0
restrictor.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
3 #include <tune_quda.h>
4 #include <cub/cub.cuh>
5 #include <typeinfo>
6 #include <multigrid_helper.cuh>
7 #include <fast_intdiv.h>
8 
9 // enabling CTA swizzling improves spatial locality of MG blocks reducing cache line wastage
10 #define SWIZZLE
11 
12 namespace quda {
13 
14 #ifdef GPU_MULTIGRID
15 
16  using namespace quda::colorspinor;
17 
  /**
     Kernel argument struct for the restriction operation, bundling the
     field accessors, the geometric fine<->coarse lookup tables and the
     launch parity.  Passed by value to RestrictKernel.
  */
  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, QudaFieldOrder order>
  struct RestrictArg {

  // NOTE(review): the accessor members for the output, input and rotation
  // fields (out, in, V) were declared here in the original source (lines
  // elided by the documentation export) — the constructors below
  // initialize them; verify against the repository.
  const int *fine_to_coarse;  // lookup table mapping a fine site index to its coarse (aggregate) site
  const int *coarse_to_fine;  // lookup table mapping (coarse block, fine point) to a fine site index
  const spin_mapper<fineSpin,coarseSpin> spin_map; // compile-time map from fine spin to coarse spin
  const int parity; // the parity of the input field (if single parity)
  const int nParity; // number of parities of input fine field
  int_fastdiv swizzle; // swizzle factor for transposing blockIdx.x mapping to coarse grid coordinate

  /** Construct from the fields, the geometry maps and the launch parity. */
  RestrictArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V,
	      const int *fine_to_coarse, const int *coarse_to_fine, int parity)
    : out(out), in(in), V(V), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
      spin_map(), parity(parity), nParity(in.SiteSubset()), swizzle(1)
  { }

  /** Copy constructor (used when the arg struct is passed by value to a kernel). */
  RestrictArg(const RestrictArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,order> &arg) :
    out(arg.out), in(arg.in), V(arg.V),
    fine_to_coarse(arg.fine_to_coarse), coarse_to_fine(arg.coarse_to_fine), spin_map(),
    parity(arg.parity), nParity(arg.nParity), swizzle(arg.swizzle)
  { }
  };
46 
  /**
     Rotate the fine color vector at site x_cb into the coarse color basis:
     for each coarse color i in the block of coarse_colors_per_thread colors
     starting at coarse_color_block, and each fine spin s, compute
     out[s,i] = sum_j conj(V(s,j,i)) * in(s,j) over the fine colors j.
     Runs on both host and device; the out array is fully overwritten.
  */
  template <typename Float, int fineSpin, int fineColor, int coarseColor, int coarse_colors_per_thread,
	    class FineColor, class Rotator>
  __device__ __host__ inline void rotateCoarseColor(complex<Float> out[fineSpin*coarse_colors_per_thread],
						    const FineColor &in, const Rotator &V,
						    int parity, int nParity, int x_cb, int coarse_color_block) {
  // single-parity fields are always indexed with parity 0
  const int spinor_parity = (nParity == 2) ? parity : 0;
  const int v_parity = (V.Nparity() == 2) ? parity : 0;

  // zero the output accumulators
#pragma unroll
  for (int s=0; s<fineSpin; s++)
#pragma unroll
    for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
      out[s*coarse_colors_per_thread+coarse_color_local] = 0.0;
    }

#pragma unroll
  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
    int i = coarse_color_block + coarse_color_local;
#pragma unroll
    for (int s=0; s<fineSpin; s++) {

      // split the fine-color dot product into color_unroll independent
      // partial sums to expose instruction-level parallelism; fineColor is
      // always divisible by the chosen unroll factor (3 for Nc=3, else 2)
      constexpr int color_unroll = fineColor == 3 ? 3 : 2;

      complex<Float> partial[color_unroll];
#pragma unroll
      for (int k=0; k<color_unroll; k++) partial[k] = 0.0;

#pragma unroll
      for (int j=0; j<fineColor; j+=color_unroll) {
#pragma unroll
	for (int k=0; k<color_unroll; k++)
	  partial[k] += conj(V(v_parity, x_cb, s, j+k, i)) * in(spinor_parity, x_cb, s, j+k);
      }

      // fold the independent partial sums into the output accumulator
#pragma unroll
      for (int k=0; k<color_unroll; k++) out[s*coarse_colors_per_thread + coarse_color_local] += partial[k];
    }
  }

  }
90 
91  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, int coarse_colors_per_thread, typename Arg>
92  void Restrict(Arg arg) {
93  for (int parity_coarse=0; parity_coarse<2; parity_coarse++)
94  for (int x_coarse_cb=0; x_coarse_cb<arg.out.VolumeCB(); x_coarse_cb++)
95  for (int s=0; s<coarseSpin; s++)
96  for (int c=0; c<coarseColor; c++)
97  arg.out(parity_coarse, x_coarse_cb, s, c) = 0.0;
98 
99  // loop over fine degrees of freedom
100  for (int parity=0; parity<arg.nParity; parity++) {
101  parity = (arg.nParity == 2) ? parity : arg.parity;
102 
103  for (int x_cb=0; x_cb<arg.in.VolumeCB(); x_cb++) {
104 
105  int x = parity*arg.in.VolumeCB() + x_cb;
106  int x_coarse = arg.fine_to_coarse[x];
107  int parity_coarse = (x_coarse >= arg.out.VolumeCB()) ? 1 : 0;
108  int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();
109 
110  for (int coarse_color_block=0; coarse_color_block<coarseColor; coarse_color_block+=coarse_colors_per_thread) {
111  complex<Float> tmp[fineSpin*coarse_colors_per_thread];
112  rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
113  (tmp, arg.in, arg.V, parity, arg.nParity, x_cb, coarse_color_block);
114 
115  for (int s=0; s<fineSpin; s++) {
116  for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
117  int c = coarse_color_block + coarse_color_local;
118  arg.out(parity_coarse,x_coarse_cb,arg.spin_map(s),c) += tmp[s*coarse_colors_per_thread+coarse_color_local];
119  }
120  }
121 
122  }
123  }
124  }
125 
126  }
127 
131  template <typename scalar, int n>
132  struct vector_type {
133  scalar data[n];
134  __device__ __host__ inline scalar& operator[](int i) { return data[i]; }
135  __device__ __host__ inline const scalar& operator[](int i) const { return data[i]; }
136  __device__ __host__ inline static constexpr int size() { return n; }
137  __device__ __host__ vector_type() { for (int i=0; i<n; i++) data[i] = 0.0; }
138  };
139 
143  template <typename T>
144  struct reduce {
145  __device__ __host__ inline T operator()(const T &a, const T &b) {
146  T sum;
147  for (int i=0; i<sum.size(); i++) sum[i] = a[i] + b[i];
148  return sum;
149  }
150  };
151 
  /**
     GPU kernel implementing the restriction operator.  Launch geometry
     (set up by RestrictLaunch below):
       threadIdx.x - fine checkerboard site within one aggregate
       threadIdx.y - fine parity (when both parities are present)
       threadIdx.z / blockIdx.z - coarse-color blocking
       blockIdx.x  - coarse site (optionally swizzled for cache locality)
     The block_size template parameter must match blockDim.x (the number of
     fine checkerboard sites per coarse site) for the cub reduction.
  */
  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, int coarse_colors_per_thread,
	    typename Arg, int block_size>
  __global__ void RestrictKernel(Arg arg) {

#ifdef SWIZZLE
  // the portion of the grid that is exactly divisible by the number of SMs
  const int gridp = gridDim.x - gridDim.x % arg.swizzle;

  int x_coarse = blockIdx.x;
  if (blockIdx.x < gridp) {
    // this is the portion of the block that we are going to transpose
    const int i = blockIdx.x % arg.swizzle;
    const int j = blockIdx.x / arg.swizzle;

    // transpose the coordinates
    x_coarse = i * (gridp / arg.swizzle) + j;
  }
#else
  int x_coarse = blockIdx.x;
#endif

  // decompose the coarse site index into (parity, checkerboard) coordinates
  int parity_coarse = x_coarse >= arg.out.VolumeCB() ? 1 : 0;
  int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();

  // obtain fine index from this look up table
  // since both parities map to the same block, each thread block must do both parities

  // threadIdx.x - fine checkerboard offset
  // threadIdx.y - fine parity offset
  // blockIdx.x - which coarse block are we working on (swizzled to improve cache efficiency)
  // assume that coarse_to_fine look up map is ordered as (coarse-block-id + fine-point-id)
  // and that fine-point-id is parity ordered
  int parity = arg.nParity == 2 ? threadIdx.y : arg.parity;
  int x_fine = arg.coarse_to_fine[ (x_coarse*2 + parity) * blockDim.x + threadIdx.x];
  int x_fine_cb = x_fine - parity*arg.in.VolumeCB();

  // which block of coarse colors this thread handles; note the early return
  // is uniform across the thread block only when blockDim.z == 1
  int coarse_color_block = (blockDim.z*blockIdx.z + threadIdx.z) * coarse_colors_per_thread;
  if (coarse_color_block >= coarseColor) return;

  // rotate this thread's fine site into the coarse color basis
  complex<Float> tmp[fineSpin*coarse_colors_per_thread];
  rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
    (tmp, arg.in, arg.V, parity, arg.nParity, x_fine_cb, coarse_color_block);

  typedef vector_type<complex<Float>, coarseSpin*coarse_colors_per_thread> vector;
  vector reduced; // zero-initialized by the vector_type constructor

  // first lets coarsen spin locally
  for (int s=0; s<fineSpin; s++) {
    for (int v=0; v<coarse_colors_per_thread; v++) {
      reduced[arg.spin_map(s)*coarse_colors_per_thread+v] += tmp[s*coarse_colors_per_thread+v];
    }
  }

  // now lets coarsen geometry across threads
  if (arg.nParity == 2) {
    // reduce over both the x (site) and y (parity) thread dimensions
    typedef cub::BlockReduce<vector, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    reduce<vector> reducer; // reduce functor

    // note this is not safe for blockDim.z > 1
    reduced = BlockReduce(temp_storage).Reduce(reduced, reducer);
  } else {
    typedef cub::BlockReduce<vector, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    reduce<vector> reducer; // reduce functor

    // note this is not safe for blockDim.z > 1
    reduced = BlockReduce(temp_storage).Reduce(reduced, reducer);
  }

  // only the first thread holds the full reduction result; it writes the
  // coarse site for this aggregate and coarse-color block
  if (threadIdx.x==0 && threadIdx.y == 0) {
    for (int s=0; s<coarseSpin; s++) {
      for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
	int v = coarse_color_block + coarse_color_local;
	arg.out(parity_coarse, x_coarse_cb, s, v) = reduced[s*coarse_colors_per_thread+coarse_color_local];
      }
    }
  }
  }
239 
  /**
     Tunable launcher for the restriction operation.  On the CPU it calls
     the host Restrict routine directly; on the GPU it instantiates
     RestrictKernel on the aggregate (block) size and autotunes the
     coarse-color grid split and the CTA swizzle factor (aux.x).
  */
  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor,
	    int coarse_colors_per_thread>
  class RestrictLaunch : public Tunable {

  protected:
    ColorSpinorField &out;           // coarse output field
    const ColorSpinorField &in;      // fine input field
    const ColorSpinorField &v;       // rotation (near-null vector) field
    const int *fine_to_coarse;       // fine site -> coarse site lookup table
    const int *coarse_to_fine;       // (coarse block, fine point) -> fine site lookup table
    const int parity;                // parity of the input field (if single parity)
    const QudaFieldLocation location; // whether the fields live on the CPU or GPU
    const int block_size;            // fine checkerboard sites per coarse site (= blockDim.x)
    char vol[TuneKey::volume_n];     // volume string used for the tune key

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions.
    unsigned int minThreads() const { return in.VolumeCB(); } // fine parity is the block y dimension

  public:
    RestrictLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
		   const int *fine_to_coarse, const int *coarse_to_fine, int parity)
      : out(out), in(in), v(v), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
	parity(parity), location(checkLocation(out,in,v)), block_size(in.VolumeCB()/(2*out.VolumeCB()))
    {
      // build the tune-key volume and aux strings from both fields
      strcpy(vol, out.VolString());
      strcat(vol, ",");
      strcat(vol, in.VolString());

      strcpy(aux, out.AuxString());
      strcat(aux, ",");
      strcat(aux, in.AuxString());
    } // block size is checkerboard fine length / full coarse length
    virtual ~RestrictLaunch() { }

    /**
       Execute the restriction, dispatching on field location, field order
       and (for the GPU) the aggregate block size.
    */
    void apply(const cudaStream_t &stream) {
      if (location == QUDA_CPU_FIELD_LOCATION) {
	if (out.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
	  RestrictArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>
	    arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
	  Restrict<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread>(arg);
	} else {
	  errorQuda("Unsupported field order %d", out.FieldOrder());
	}
      } else {
	TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

	if (out.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
	  typedef RestrictArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_FLOAT2_FIELD_ORDER> Arg;
	  Arg arg(out, in, v, fine_to_coarse, coarse_to_fine, parity);
	  arg.swizzle = tp.aux.x;

	  // the kernel is templated on the aggregate size, so each supported
	  // geometric blocking needs its own instantiation
	  if (block_size == 4) { // for 2x2x2x1 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,4>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 8) { // for 2x2x2x2 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,8>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 16) { // for 4x2x2x2 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,16>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 27) { // for 3x3x3x2 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,27>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 36) { // for 3x3x2x4 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,36>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 54) { // for 3x3x3x4 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,54>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 64) { // for 2x4x4x4 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,64>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 100) { // for 5x5x2x4 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,100>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 108) { // for 3x3x3x8 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,108>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 128) { // for 4x4x4x4 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,128>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#if __COMPUTE_CAPABILITY__ >= 300
	  // larger thread blocks are only instantiated on Kepler and newer
	  } else if (block_size == 200) { // for 5x5x2x8 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,200>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 256) { // for 4x4x4x8 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,256>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 432) { // for 6x6x6x4 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,432>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
	  } else if (block_size == 500) { // 5x5x5x8 aggregates
	    RestrictKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,coarse_colors_per_thread,Arg,500>
	      <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
#endif
	  } else {
	    errorQuda("Block size %d not instantiated", block_size);
	  }
	} else {
	  errorQuda("Unsupported field order %d", out.FieldOrder());
	}
      }
    }

    // This block tuning tunes for the optimal amount of color
    // splitting between blockDim.z and gridDim.z. However, enabling
    // blockDim.z > 1 gives incorrect results due to cub reductions
    // being unable to do independent sliced reductions along
    // blockDim.z. So for now we only split between colors per thread
    // and grid.z.
    bool advanceBlockDim(TuneParam &param) const
    {
      // let's try to advance spin/block-color
      while(param.block.z <= coarseColor/coarse_colors_per_thread) {
	param.block.z++;
	if ( (coarseColor/coarse_colors_per_thread) % param.block.z == 0) {
	  param.grid.z = (coarseColor/coarse_colors_per_thread) / param.block.z;
	  break;
	}
      }

      // we can advance spin/block-color since this is valid
      if (param.block.z <= (coarseColor/coarse_colors_per_thread) ) { //
	return true;
      } else { // we have run off the end so let's reset
	param.block.z = 1;
	param.grid.z = coarseColor/coarse_colors_per_thread;
	return false;
      }
    }

    int tuningIter() const { return 3; }

    /** Advance the CTA swizzle factor (aux.x), sweeping 1..2*#SMs. */
    bool advanceAux(TuneParam &param) const
    {
#ifdef SWIZZLE
      if (param.aux.x < 2*deviceProp.multiProcessorCount) {
	param.aux.x++;
	return true;
      } else {
	param.aux.x = 1;
	return false;
      }
#else
      return false;
#endif
    }

    // only tune shared memory per thread (disable tuning for block.z for now)
    bool advanceTuneParam(TuneParam &param) const { return advanceSharedBytes(param) || advanceAux(param); }

    TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }

    void initTuneParam(TuneParam &param) const { defaultTuneParam(param); }

    /** Default launch: x spans one aggregate, y spans parity, z/grid.z split the coarse colors. */
    void defaultTuneParam(TuneParam &param) const {
      param.block = dim3(block_size, in.SiteSubset(), 1);
      param.grid = dim3( (minThreads()+param.block.x-1) / param.block.x, 1, 1);
      param.shared_bytes = 0;

      param.block.z = 1;
      param.grid.z = coarseColor / coarse_colors_per_thread;
      param.aux.x = 1; // swizzle factor
    }

    long long flops() const { return 8 * fineSpin * fineColor * coarseColor * in.SiteSubset()*in.VolumeCB(); }

    long long bytes() const {
      // if v is single parity but in is full, only half of v is read
      size_t v_bytes = v.Bytes() / (v.SiteSubset() == in.SiteSubset() ? 1 : 2);
      return in.Bytes() + out.Bytes() + v_bytes + in.SiteSubset()*in.VolumeCB()*sizeof(int);
    }

  };
417 
  /**
     Instantiate the restrictor launcher for a fully-specified template
     configuration and run it on the default stream.
  */
  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor>
  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
		const int *fine_to_coarse, const int *coarse_to_fine, int parity) {

  // for fine grids (Nc=3) have more parallelism so can use more coarse strategy
  constexpr int coarse_colors_per_thread = fineColor != 3 ? 2 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2;
  //coarseColor >= 8 && coarseColor % 8 == 0 ? 8 : coarseColor >= 4 && coarseColor % 4 == 0 ? 4 : 2;

  RestrictLaunch<Float, fineSpin, fineColor, coarseSpin, coarseColor, coarse_colors_per_thread>
    restrictor(out, in, v, fine_to_coarse, coarse_to_fine, parity);
  restrictor.apply(0);

  // NOTE(review): the documentation export elided original lines 429-430
  // here (plausibly a checkCudaError() call) — verify against the repository.
  }
432 
433  template <typename Float, int fineSpin>
434  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
435  int nVec, const int *fine_to_coarse, const int *coarse_to_fine, const int *spin_map, int parity) {
436 
437  if (out.Nspin() != 2) errorQuda("Unsupported nSpin %d", out.Nspin());
438  const int coarseSpin = 2;
439 
440  // first check that the spin_map matches the spin_mapper
441  spin_mapper<fineSpin,coarseSpin> mapper;
442  for (int s=0; s<fineSpin; s++)
443  if (mapper(s) != spin_map[s]) errorQuda("Spin map does not match spin_mapper");
444 
445 
446  // Template over fine color
447  if (in.Ncolor() == 3) { // standard QCD
448  const int fineColor = 3;
449  if (nVec == 2) {
450  Restrict<Float,fineSpin,fineColor,coarseSpin,2>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
451  } else if (nVec == 4) {
452  Restrict<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
453  } else if (nVec == 24) {
454  Restrict<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
455  } else if (nVec == 32) {
456  Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
457  } else {
458  errorQuda("Unsupported nVec %d", nVec);
459  }
460  } else if (in.Ncolor() == 2) {
461  const int fineColor = 2;
462  if (nVec == 2) { // these are probably only for debugging only
463  Restrict<Float,fineSpin,fineColor,coarseSpin,2>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
464  } else if (nVec == 4) {
465  Restrict<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
466  } else {
467  errorQuda("Unsupported nVec %d", nVec);
468  }
469  } else if (in.Ncolor() == 24) { // to keep compilation under control coarse grids have same or more colors
470  const int fineColor = 24;
471  if (nVec == 24) {
472  Restrict<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
473  } else if (nVec == 32) {
474  Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
475  } else {
476  errorQuda("Unsupported nVec %d", nVec);
477  }
478  } else if (in.Ncolor() == 32) {
479  const int fineColor = 32;
480  if (nVec == 32) {
481  Restrict<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, coarse_to_fine, parity);
482  } else {
483  errorQuda("Unsupported nVec %d", nVec);
484  }
485  } else {
486  errorQuda("Unsupported nColor %d", in.Ncolor());
487  }
488  }
489 
490  template <typename Float>
491  void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
492  int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int *spin_map, int parity) {
493 
494  if (in.Nspin() == 4) {
495  Restrict<Float,4>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
496  } else if (in.Nspin() == 2) {
497  Restrict<Float,2>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
498 #if GPU_STAGGERED_DIRAC
499  } else if (in.Nspin() == 1) {
500  Restrict<Float,1>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
501 #endif
502  } else {
503  errorQuda("Unsupported nSpin %d", in.Nspin());
504  }
505  }
506 
507 #endif // GPU_MULTIGRID
508 
		int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int *spin_map, int parity) {
  // NOTE(review): the opening signature line of this function (original
  // line 509) was elided by the documentation export; this body is the
  // top-level entry point that dispatches on precision.

#ifdef GPU_MULTIGRID
  // all three fields must share a field order for the accessors to be compatible
  if (out.FieldOrder() != in.FieldOrder() || out.FieldOrder() != v.FieldOrder())
    errorQuda("Field orders do not match (out=%d, in=%d, v=%d)",
	      out.FieldOrder(), in.FieldOrder(), v.FieldOrder());

  // all three fields must also share a common precision
  QudaPrecision precision = checkPrecision(out, in, v);

  if (precision == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
    Restrict<double>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
#else
    errorQuda("Double precision multigrid has not been enabled");
#endif
  } else if (precision == QUDA_SINGLE_PRECISION) {
    Restrict<float>(out, in, v, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
  } else {
    errorQuda("Unsupported precision %d", out.Precision());
  }
#else
  errorQuda("Multigrid has not been built");
#endif
  }
534 
535 } // namespace quda
dim3 dim3 blockDim
enum QudaPrecision_s QudaPrecision
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
__host__ __device__ void sum(double &a, double &b)
cpuColorSpinorField * in
for(int s=0;s< param.dc.Ls;s++)
int V
Definition: test_util.cpp:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define checkLocation(...)
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int *spin_map, int parity=QUDA_INVALID_PARITY)
Apply the restriction operator.
Definition: restrictor.cu:509
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:163
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
static const int volume_n
Definition: tune_key.h:10
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
#define a
QudaFieldOrder FieldOrder() const
unsigned long long bytes
Definition: blas_quda.cu:43