QUDA  0.9.0
prolongator.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
3 #include <tune_quda.h>
4 #include <typeinfo>
5 
6 #include <multigrid_helper.cuh>
7 
8 namespace quda {
9 
10 #ifdef GPU_MULTIGRID
11  using namespace quda::colorspinor;
12 
16  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, QudaFieldOrder order>
17  struct ProlongateArg {
21  const int *geo_map; // need to make a device copy of this
22  const spin_mapper<fineSpin,coarseSpin> spin_map;
23  const int parity; // the parity of the output field (if single parity)
24  const int nParity; // number of parities of input fine field
25 
26  ProlongateArg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V,
27  const int *geo_map, const int parity)
28  : out(out), in(in), V(V), geo_map(geo_map), spin_map(), parity(parity), nParity(out.SiteSubset()) { }
29 
30  ProlongateArg(const ProlongateArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,order> &arg)
31  : out(arg.out), in(arg.in), V(arg.V), geo_map(arg.geo_map), spin_map(),
32  parity(arg.parity), nParity(arg.nParity) { }
33  };
34 
38  template <typename Float, int fineSpin, int coarseColor, class Coarse, typename S>
39  __device__ __host__ inline void prolongate(complex<Float> out[fineSpin*coarseColor], const Coarse &in,
40  int parity, int x_cb, const int *geo_map, const S& spin_map, int fineVolumeCB) {
41  int x = parity*fineVolumeCB + x_cb;
42  int x_coarse = geo_map[x];
43  int parity_coarse = (x_coarse >= in.VolumeCB()) ? 1 : 0;
44  int x_coarse_cb = x_coarse - parity_coarse*in.VolumeCB();
45 
46 #pragma unroll
47  for (int s=0; s<fineSpin; s++) {
48 #pragma unroll
49  for (int c=0; c<coarseColor; c++) {
50  out[s*coarseColor+c] = in(parity_coarse, x_coarse_cb, spin_map(s), c);
51  }
52  }
53  }
54 
59  template <typename Float, int fineSpin, int fineColor, int coarseColor, int fine_colors_per_thread,
60  class FineColor, class Rotator>
61  __device__ __host__ inline void rotateFineColor(FineColor &out, const complex<Float> in[fineSpin*coarseColor],
62  const Rotator &V, int parity, int nParity, int x_cb, int fine_color_block) {
63  const int spinor_parity = (nParity == 2) ? parity : 0;
64  const int v_parity = (V.Nparity() == 2) ? parity : 0;
65 
66  constexpr int color_unroll = 2;
67 
68 #pragma unroll
69  for (int s=0; s<fineSpin; s++)
70 #pragma unroll
71  for (int fine_color_local=0; fine_color_local<fine_colors_per_thread; fine_color_local++)
72  out(spinor_parity, x_cb, s, fine_color_block+fine_color_local) = 0.0; // global fine color index
73 
74 #pragma unroll
75  for (int s=0; s<fineSpin; s++) {
76 #pragma unroll
77  for (int fine_color_local=0; fine_color_local<fine_colors_per_thread; fine_color_local++) {
78  int i = fine_color_block + fine_color_local; // global fine color index
79 
80  complex<Float> partial[color_unroll];
81 #pragma unroll
82  for (int k=0; k<color_unroll; k++) partial[k] = 0.0;
83 
84 #pragma unroll
85  for (int j=0; j<coarseColor; j+=color_unroll) {
86  // V is a ColorMatrixField with internal dimensions Ns * Nc * Nvec
87 #pragma unroll
88  for (int k=0; k<color_unroll; k++)
89  partial[k] += V(v_parity, x_cb, s, i, j+k) * in[s*coarseColor + j + k];
90  }
91 
92 #pragma unroll
93  for (int k=0; k<color_unroll; k++) out(spinor_parity, x_cb, s, i) += partial[k];
94  }
95  }
96 
97  }
98 
99  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, int fine_colors_per_thread, typename Arg>
100  void Prolongate(Arg &arg) {
101  for (int parity=0; parity<arg.nParity; parity++) {
102  parity = (arg.nParity == 2) ? parity : arg.parity;
103 
104  for (int x_cb=0; x_cb<arg.out.VolumeCB(); x_cb++) {
105  complex<Float> tmp[fineSpin*coarseColor];
106  prolongate<Float,fineSpin,coarseColor>(tmp, arg.in, parity, x_cb, arg.geo_map, arg.spin_map, arg.out.VolumeCB());
107  for (int fine_color_block=0; fine_color_block<fineColor; fine_color_block+=fine_colors_per_thread) {
108  rotateFineColor<Float,fineSpin,fineColor,coarseColor,fine_colors_per_thread>
109  (arg.out, tmp, arg.V, parity, arg.nParity, x_cb, fine_color_block);
110  }
111  }
112  }
113  }
114 
115  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, int fine_colors_per_thread, typename Arg>
116  __global__ void ProlongateKernel(Arg arg) {
117  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
118  int parity = arg.nParity == 2 ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
119  if (x_cb >= arg.out.VolumeCB()) return;
120 
121  int fine_color_block = (blockDim.z*blockIdx.z + threadIdx.z) * fine_colors_per_thread;
122  if (fine_color_block >= fineColor) return;
123 
124  complex<Float> tmp[fineSpin*coarseColor];
125  prolongate<Float,fineSpin,coarseColor>(tmp, arg.in, parity, x_cb, arg.geo_map, arg.spin_map, arg.out.VolumeCB());
126  rotateFineColor<Float,fineSpin,fineColor,coarseColor,fine_colors_per_thread>
127  (arg.out, tmp, arg.V, parity, arg.nParity, x_cb, fine_color_block);
128  }
129 
130  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor, int fine_colors_per_thread>
131  class ProlongateLaunch : public TunableVectorYZ {
132 
133  protected:
134  ColorSpinorField &out;
135  const ColorSpinorField &in;
136  const ColorSpinorField &V;
137  const int *fine_to_coarse;
138  int parity;
139  QudaFieldLocation location;
140  char vol[TuneKey::volume_n];
141 
142  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
143  unsigned int minThreads() const { return out.VolumeCB(); } // fine parity is the block y dimension
144 
145  public:
146  ProlongateLaunch(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &V,
147  const int *fine_to_coarse, int parity)
148  : TunableVectorYZ(out.SiteSubset(), fineColor/fine_colors_per_thread), out(out), in(in), V(V),
149  fine_to_coarse(fine_to_coarse), parity(parity), location(checkLocation(out, in, V))
150  {
151  strcpy(vol, out.VolString());
152  strcat(vol, ",");
153  strcat(vol, in.VolString());
154 
155  strcpy(aux, out.AuxString());
156  strcat(aux, ",");
157  strcat(aux, in.AuxString());
158  }
159 
160  virtual ~ProlongateLaunch() { }
161 
162  void apply(const cudaStream_t &stream) {
163  if (location == QUDA_CPU_FIELD_LOCATION) {
164  if (out.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
165  ProlongateArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>
166  arg(out, in, V, fine_to_coarse, parity);
167  Prolongate<Float,fineSpin,fineColor,coarseSpin,coarseColor,fine_colors_per_thread>(arg);
168  } else {
169  errorQuda("Unsupported field order %d", out.FieldOrder());
170  }
171  } else {
172  if (out.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
173  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
174  ProlongateArg<Float,fineSpin,fineColor,coarseSpin,coarseColor,QUDA_FLOAT2_FIELD_ORDER>
175  arg(out, in, V, fine_to_coarse, parity);
176  ProlongateKernel<Float,fineSpin,fineColor,coarseSpin,coarseColor,fine_colors_per_thread>
177  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
178  } else {
179  errorQuda("Unsupported field order %d", out.FieldOrder());
180  }
181  }
182  }
183 
184  TuneKey tuneKey() const { return TuneKey(vol, typeid(*this).name(), aux); }
185 
186  long long flops() const { return 8 * fineSpin * fineColor * coarseColor * out.SiteSubset()*out.VolumeCB(); }
187 
188  long long bytes() const {
189  size_t v_bytes = V.Bytes() / (V.SiteSubset() == out.SiteSubset() ? 1 : 2);
190  return in.Bytes() + out.Bytes() + v_bytes + out.SiteSubset()*out.VolumeCB()*sizeof(int);
191  }
192 
193  };
194 
195  template <typename Float, int fineSpin, int fineColor, int coarseSpin, int coarseColor>
196  void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
197  const int *fine_to_coarse, int parity) {
198 
199  // for all grids use 1 color per thread
200  constexpr int fine_colors_per_thread = 1;
201 
202  ProlongateLaunch<Float, fineSpin, fineColor, coarseSpin, coarseColor, fine_colors_per_thread>
203  prolongator(out, in, v, fine_to_coarse, parity);
204  prolongator.apply(0);
205 
207  }
208 
209 
210  template <typename Float, int fineSpin>
211  void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
212  int nVec, const int *fine_to_coarse, const int *spin_map, int parity) {
213 
214  if (in.Nspin() != 2) errorQuda("Coarse spin %d is not supported", in.Nspin());
215  const int coarseSpin = 2;
216 
217  // first check that the spin_map matches the spin_mapper
218  spin_mapper<fineSpin,coarseSpin> mapper;
219  for (int s=0; s<fineSpin; s++)
220  if (mapper(s) != spin_map[s]) errorQuda("Spin map does not match spin_mapper");
221 
222  if (out.Ncolor() == 3) {
223  const int fineColor = 3;
224  if (nVec == 2) {
225  Prolongate<Float,fineSpin,fineColor,coarseSpin,2>(out, in, v, fine_to_coarse, parity);
226  } else if (nVec == 4) {
227  Prolongate<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, parity);
228  } else if (nVec == 24) {
229  Prolongate<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, parity);
230  } else if (nVec == 32) {
231  Prolongate<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, parity);
232  } else {
233  errorQuda("Unsupported nVec %d", nVec);
234  }
235  } else if (out.Ncolor() == 2) {
236  const int fineColor = 2;
237  if (nVec == 2) { // these are probably only for debugging only
238  Prolongate<Float,fineSpin,fineColor,coarseSpin,2>(out, in, v, fine_to_coarse, parity);
239  } else if (nVec == 4) {
240  Prolongate<Float,fineSpin,fineColor,coarseSpin,4>(out, in, v, fine_to_coarse, parity);
241  } else {
242  errorQuda("Unsupported nVec %d", nVec);
243  }
244  } else if (out.Ncolor() == 24) {
245  const int fineColor = 24;
246  if (nVec == 24) { // to keep compilation under control coarse grids have same or more colors
247  Prolongate<Float,fineSpin,fineColor,coarseSpin,24>(out, in, v, fine_to_coarse, parity);
248  } else if (nVec == 32) {
249  Prolongate<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, parity);
250  } else {
251  errorQuda("Unsupported nVec %d", nVec);
252  }
253  } else if (out.Ncolor() == 32) {
254  const int fineColor = 32;
255  if (nVec == 32) {
256  Prolongate<Float,fineSpin,fineColor,coarseSpin,32>(out, in, v, fine_to_coarse, parity);
257  } else {
258  errorQuda("Unsupported nVec %d", nVec);
259  }
260  } else {
261  errorQuda("Unsupported nColor %d", out.Ncolor());
262  }
263  }
264 
265  template <typename Float>
266  void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v,
267  int Nvec, const int *fine_to_coarse, const int *spin_map, int parity) {
268 
269  if (out.Nspin() == 4) {
270  Prolongate<Float,4>(out, in, v, Nvec, fine_to_coarse, spin_map, parity);
271  } else if (out.Nspin() == 2) {
272  Prolongate<Float,2>(out, in, v, Nvec, fine_to_coarse, spin_map, parity);
273 #ifdef GPU_STAGGERED_DIRAC
274  } else if (out.Nspin() == 1) {
275  Prolongate<Float,1>(out, in, v, Nvec, fine_to_coarse, spin_map, parity);
276 #endif
277  } else {
278  errorQuda("Unsupported nSpin %d", out.Nspin());
279  }
280  }
281 
282 #endif // GPU_MULTIGRID
283 
285  int Nvec, const int *fine_to_coarse, const int *spin_map, int parity) {
286 #ifdef GPU_MULTIGRID
287  if (out.FieldOrder() != in.FieldOrder() || out.FieldOrder() != v.FieldOrder())
288  errorQuda("Field orders do not match (out=%d, in=%d, v=%d)",
289  out.FieldOrder(), in.FieldOrder(), v.FieldOrder());
290 
291  QudaPrecision precision = checkPrecision(out, in, v);
292 
293  if (precision == QUDA_DOUBLE_PRECISION) {
294 #ifdef GPU_MULTIGRID_DOUBLE
295  Prolongate<double>(out, in, v, Nvec, fine_to_coarse, spin_map, parity);
296 #else
297  errorQuda("Double precision multigrid has not been enabled");
298 #endif
299  } else if (precision == QUDA_SINGLE_PRECISION) {
300  Prolongate<float>(out, in, v, Nvec, fine_to_coarse, spin_map, parity);
301  } else {
302  errorQuda("Unsupported precision %d", out.Precision());
303  }
304 
306 #else
307  errorQuda("Multigrid has not been built");
308 #endif
309  }
310 
311 } // end namespace quda
dim3 dim3 blockDim
enum QudaPrecision_s QudaPrecision
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define checkPrecision(...)
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
cpuColorSpinorField * in
This is just a dummy structure we use for trove to define the required structure size.
for(int s=0;s< param.dc.Ls;s++)
int V
Definition: test_util.cpp:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
#define checkLocation(...)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
#define checkCudaError()
Definition: util_quda.h:129
static const int volume_n
Definition: tune_key.h:10
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
QudaFieldOrder FieldOrder() const
unsigned long long bytes
Definition: blas_quda.cu:43
void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, int Nvec, const int *fine_to_coarse, const int *spin_map, int parity=QUDA_INVALID_PARITY)
Apply the prolongation operator.
Definition: prolongator.cu:284