QUDA 1.0.0
transfer.cpp
#include <transfer.h>
#include <blas_quda.h>

#include <multigrid.h>
#include <malloc_quda.h>

#include <iostream>
#include <algorithm>
#include <vector>

namespace quda {

  /*
   * For the staggered case there is no spin blocking; however, we do
   * even-odd blocking to preserve chirality (that is straightforward).
   */
  Transfer::Transfer(const std::vector<ColorSpinorField *> &B, int Nvec, int n_block_ortho, int *geo_bs, int spin_bs,
                     QudaPrecision null_precision, TimeProfile &profile) :
    B(B),
    Nvec(Nvec),
    NblockOrtho(n_block_ortho),
    null_precision(null_precision),
    V_h(nullptr),
    V_d(nullptr),
    fine_tmp_h(nullptr),
    fine_tmp_d(nullptr),
    coarse_tmp_h(nullptr),
    coarse_tmp_d(nullptr),
    geo_bs(nullptr),
    fine_to_coarse_h(nullptr),
    coarse_to_fine_h(nullptr),
    fine_to_coarse_d(nullptr),
    coarse_to_fine_d(nullptr),
    spin_bs(spin_bs),
    spin_map(0),
    nspin_fine(B[0]->Nspin()),
    site_subset(QUDA_FULL_SITE_SUBSET),
    parity(QUDA_INVALID_PARITY),
    enable_gpu(false),
    enable_cpu(false),
    use_gpu(true),
    flops_(0),
    profile(profile)
  {
    postTrace();
    int ndim = B[0]->Ndim();

    for (int d = 0; d < ndim; d++) {
      while (geo_bs[d] > 0) {
        if (d == 0 && B[0]->X(0) == geo_bs[0])
          warningQuda("X-dimension length %d cannot be blocked with block length %d", B[0]->X(0), geo_bs[0]);
        else if ((B[0]->X(d) / geo_bs[d] + 1) % 2 == 0)
          warningQuda("Indexing does not (yet) support odd coarse dimensions: X(%d) = %d", d, B[0]->X(d) / geo_bs[d]);
        else if ((B[0]->X(d) / geo_bs[d]) * geo_bs[d] != B[0]->X(d))
          warningQuda("Cannot block dim[%d]=%d with block size = %d", d, B[0]->X(d), geo_bs[d]);
        else
          break; // this is a valid block size so let's use it
        geo_bs[d] /= 2;
      }
      if (geo_bs[d] == 0) errorQuda("Unable to block dimension %d", d);
    }

    this->geo_bs = new int[ndim];
    int total_block_size = 1;
    for (int d = 0; d < ndim; d++) {
      this->geo_bs[d] = geo_bs[d];
      total_block_size *= geo_bs[d];
    }

    if (total_block_size == 1) errorQuda("Total geometric block size is 1");

    std::string block_str = std::to_string(geo_bs[0]);
    for (int d = 1; d < ndim; d++) block_str += " x " + std::to_string(geo_bs[d]);
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer: using block size %s\n", block_str.c_str());

    createV(B[0]->Location());          // allocate V field
    createTmp(QUDA_CPU_FIELD_LOCATION); // allocate temporaries

    // allocate and compute the fine-to-coarse and coarse-to-fine site maps
    fine_to_coarse_h = static_cast<int*>(pool_pinned_malloc(B[0]->Volume()*sizeof(int)));
    coarse_to_fine_h = static_cast<int*>(pool_pinned_malloc(B[0]->Volume()*sizeof(int)));

    if (enable_gpu) {
      fine_to_coarse_d = static_cast<int*>(pool_device_malloc(B[0]->Volume()*sizeof(int)));
      coarse_to_fine_d = static_cast<int*>(pool_device_malloc(B[0]->Volume()*sizeof(int)));
    }

    createGeoMap(geo_bs);

    // allocate the fine-to-coarse spin map
    spin_map = static_cast<int**>(safe_malloc(nspin_fine*sizeof(int*)));
    for (int s = 0; s < B[0]->Nspin(); s++) spin_map[s] = static_cast<int*>(safe_malloc(2*sizeof(int)));
    createSpinMap(spin_bs);

    reset();
    postTrace();
  }
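
  // Illustrative sketch (not part of QUDA): the per-dimension legalization
  // loop in the constructor above keeps halving a requested block size until
  // it divides the dimension evenly and leaves an even number of blocks. The
  // same logic standalone, with a plain int in place of the field query
  // B[0]->X(d) and a hypothetical helper name:
  //
  //   #include <cstdio>
  //
  //   // halve bs until it divides X evenly with an even coarse extent;
  //   // return 0 if no admissible block size exists
  //   int legalize_block(int X, int bs)
  //   {
  //     while (bs > 0) {
  //       if ((X / bs) * bs == X && (X / bs) % 2 == 0) return bs;
  //       bs /= 2;
  //     }
  //     return 0;
  //   }
  //
  //   int main()
  //   {
  //     printf("%d\n", legalize_block(16, 4)); // 4: 16/4 = 4 blocks, even
  //     printf("%d\n", legalize_block(12, 4)); // 2: 12/4 = 3 blocks is odd, so halve
  //   }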

  void Transfer::createV(QudaFieldLocation location) const
  {
    postTrace();
    // create the storage for the final block orthogonal elements
    ColorSpinorParam param(*B[0]); // takes the geometry from the null-space vectors

    // the ordering of the V vector is defined by these parameters and
    // the Packed functions in ColorSpinorFieldOrder

    param.nSpin = B[0]->Nspin();        // spin has direct mapping
    param.nColor = B[0]->Ncolor()*Nvec; // nColor = number of colors * number of vectors
    param.nVec = Nvec;
    param.create = QUDA_NULL_FIELD_CREATE;
    // the V field is defined on all sites regardless of B field (maybe the B fields are always full?)
    if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) {
      // keep it the same for staggered:
      param.siteSubset = QUDA_FULL_SITE_SUBSET;
      param.x[0] *= 2;
    }
    param.location = location;
    param.setPrecision(location == QUDA_CUDA_FIELD_LOCATION ? null_precision : B[0]->Precision());

    if (location == QUDA_CUDA_FIELD_LOCATION) {
      V_d = ColorSpinorField::Create(param);
      enable_gpu = true;
    } else {
      V_h = ColorSpinorField::Create(param);
      enable_cpu = true;
    }
    postTrace();
  }
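
  // Worked example: for a Wilson-like fine level with Ncolor = 3 and
  // Nvec = 24 null vectors, the V field created above is allocated with
  // nColor = 3 * 24 = 72, i.e. all null vectors are packed side by side in
  // the color index of a single field.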

  void Transfer::createTmp(QudaFieldLocation location) const
  {
    postTrace();
    ColorSpinorParam param(*B[0]);
    param.create = QUDA_NULL_FIELD_CREATE;
    param.location = location;

    if (location == QUDA_CUDA_FIELD_LOCATION) {
      if (fine_tmp_d && coarse_tmp_d) return;
      fine_tmp_d = ColorSpinorField::Create(param);
      coarse_tmp_d = fine_tmp_d->CreateCoarse(geo_bs, spin_bs, Nvec);
    } else {
      if (fine_tmp_h && coarse_tmp_h) return;
      fine_tmp_h = ColorSpinorField::Create(param);
      coarse_tmp_h = fine_tmp_h->CreateCoarse(geo_bs, spin_bs, Nvec);
    }
    postTrace();
  }

  void Transfer::initializeLazy(QudaFieldLocation location) const
  {
    if (!enable_cpu && !enable_gpu) errorQuda("Neither CPU nor GPU coarse fields initialized");

    // delayed allocating this temporary until we need it
    if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) createTmp(QUDA_CUDA_FIELD_LOCATION);

    switch (location) {
    case QUDA_CUDA_FIELD_LOCATION:
      if (enable_gpu) return;
      createV(location);
      *V_d = *V_h;
      createTmp(location);
      fine_to_coarse_d = static_cast<int*>(pool_device_malloc(B[0]->Volume()*sizeof(int)));
      coarse_to_fine_d = static_cast<int*>(pool_device_malloc(B[0]->Volume()*sizeof(int)));
      qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume()*sizeof(int), cudaMemcpyHostToDevice);
      qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume()*sizeof(int), cudaMemcpyHostToDevice);
      break;
    case QUDA_CPU_FIELD_LOCATION:
      if (enable_cpu) return;
      createV(location);
      *V_h = *V_d;
      break;
    default:
      errorQuda("Unknown location %d", location);
    }
  }

  void Transfer::reset()
  {
    postTrace();
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer: block orthogonalizing\n");

    if (B[0]->Location() == QUDA_CUDA_FIELD_LOCATION) {
      if (!enable_gpu) errorQuda("enable_gpu = %d so cannot reset", enable_gpu);
      BlockOrthogonalize(*V_d, B, fine_to_coarse_d, coarse_to_fine_d, geo_bs, spin_bs, NblockOrtho);
      if (enable_cpu) {
        *V_h = *V_d;
        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transferred prolongator back to CPU\n");
      }
    } else {
      if (!enable_cpu) errorQuda("enable_cpu = %d so cannot reset", enable_cpu);
      BlockOrthogonalize(*V_h, B, fine_to_coarse_h, coarse_to_fine_h, geo_bs, spin_bs, NblockOrtho);
      if (enable_gpu) { // if the GPU field has been initialized then we need to update it
        *V_d = *V_h;
        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transferred prolongator to GPU\n");
      }
    }
    postTrace();
  }

  Transfer::~Transfer() {
    if (spin_map)
    {
      for (int s = 0; s < nspin_fine; s++) { if (spin_map[s]) host_free(spin_map[s]); }
      host_free(spin_map);
    }
    if (coarse_to_fine_d) pool_device_free(coarse_to_fine_d);
    if (fine_to_coarse_d) pool_device_free(fine_to_coarse_d);
    if (coarse_to_fine_h) pool_pinned_free(coarse_to_fine_h);
    if (fine_to_coarse_h) pool_pinned_free(fine_to_coarse_h);

    if (V_h) delete V_h;
    if (V_d) delete V_d;

    if (fine_tmp_h) delete fine_tmp_h;
    if (fine_tmp_d) delete fine_tmp_d;

    if (coarse_tmp_h) delete coarse_tmp_h;
    if (coarse_tmp_d) delete coarse_tmp_d;

    if (geo_bs) delete [] geo_bs;
  }

  void Transfer::setSiteSubset(QudaSiteSubset site_subset_, QudaParity parity_)
  {
    if (site_subset_ == QUDA_PARITY_SITE_SUBSET && parity_ != QUDA_EVEN_PARITY && parity_ != QUDA_ODD_PARITY)
      errorQuda("Undefined parity %d", parity_);
    parity = parity_;

    if (site_subset == site_subset_) return;
    site_subset = site_subset_;
  }

  struct Int2 {
    int x, y;
    Int2() : x(0), y(0) { }
    Int2(int x, int y) : x(x), y(y) { }

    bool operator<(const Int2 &a) const { return (x < a.x) || (x == a.x && y < a.y); }
  };

  // compute the fine-to-coarse site map
  void Transfer::createGeoMap(int *geo_bs)
  {
    int x[QUDA_MAX_DIM];

    ColorSpinorField &fine(*fine_tmp_h);
    ColorSpinorField &coarse(*coarse_tmp_h);

    // compute the coarse grid point for every site (assuming parity ordering currently)
    for (int i = 0; i < fine.Volume(); i++) {
      // compute the lattice-site index for this offset index
      fine.LatticeIndex(x, i);

      //printfQuda("fine idx %d = fine (%d,%d,%d,%d), ", i, x[0], x[1], x[2], x[3]);

      // compute the corresponding coarse-grid index given the block size
      for (int d = 0; d < fine.Ndim(); d++) x[d] /= geo_bs[d];

      // compute the coarse-offset index and store it in fine_to_coarse
      int k;
      coarse.OffsetIndex(k, x); // this index is parity ordered
      fine_to_coarse_h[i] = k;

      //printfQuda("coarse after (%d,%d,%d,%d), coarse idx %d\n", x[0], x[1], x[2], x[3], k);
    }

    // now create an inverse-like variant of this
    std::vector<Int2> geo_sort(B[0]->Volume());
    for (unsigned int i = 0; i < geo_sort.size(); i++) geo_sort[i] = Int2(fine_to_coarse_h[i], i);
    std::sort(geo_sort.begin(), geo_sort.end());
    for (unsigned int i = 0; i < geo_sort.size(); i++) coarse_to_fine_h[i] = geo_sort[i].y;

    if (enable_gpu) {
      qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume()*sizeof(int), cudaMemcpyHostToDevice);
      qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume()*sizeof(int), cudaMemcpyHostToDevice);
      checkCudaError();
    }
  }
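
  // Illustrative sketch (not part of QUDA): the "inverse-like variant" built
  // above groups fine sites by their coarse block via a lexicographic sort of
  // (coarse index, fine index) pairs. The same idea on a toy map, using
  // std::pair in place of Int2:
  //
  //   #include <algorithm>
  //   #include <cstdio>
  //   #include <vector>
  //
  //   int main()
  //   {
  //     // toy fine-to-coarse map: 8 fine sites blocked onto 4 coarse sites
  //     std::vector<int> fine_to_coarse = {2, 0, 3, 1, 0, 2, 1, 3};
  //
  //     std::vector<std::pair<int,int>> geo_sort(fine_to_coarse.size());
  //     for (size_t i = 0; i < geo_sort.size(); i++) geo_sort[i] = {fine_to_coarse[i], (int)i};
  //     std::sort(geo_sort.begin(), geo_sort.end());
  //
  //     // the second components now list fine sites ordered by coarse block,
  //     // which is exactly what coarse_to_fine stores
  //     for (auto &p : geo_sort) printf("coarse %d <- fine %d\n", p.first, p.second);
  //   }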

  // compute the fine spin and checkerboard to coarse spin map
  void Transfer::createSpinMap(int spin_bs)
  {
    if (spin_bs == 0) // staggered
    {
      spin_map[0][0] = 0; // fine even
      spin_map[0][1] = 1; // fine odd
    }
    else
    {
      for (int s = 0; s < B[0]->Nspin(); s++) {
        spin_map[s][0] = s / spin_bs; // not staggered; doesn't care about parity
        spin_map[s][1] = s / spin_bs;
      }
    }
  }
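
  // Worked example (illustrative sketch): with 4 fine spins and spin_bs = 2,
  // the map above sends fine spins {0,1} to coarse spin 0 and {2,3} to coarse
  // spin 1, independent of parity; with spin_bs = 0 (staggered) the single
  // fine spin maps to coarse spin 0 on even sites and 1 on odd sites.
  //
  //   #include <cstdio>
  //
  //   int main()
  //   {
  //     const int nspin = 4, spin_bs = 2;
  //     for (int s = 0; s < nspin; s++)
  //       printf("fine spin %d -> coarse spin %d\n", s, s / spin_bs);
  //   }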

  // apply the prolongator
  void Transfer::P(ColorSpinorField &out, const ColorSpinorField &in) const
  {
    profile.TPSTART(QUDA_PROFILE_COMPUTE);

    ColorSpinorField *input = const_cast<ColorSpinorField*>(&in);
    ColorSpinorField *output = &out;
    initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION);
    const ColorSpinorField *V = use_gpu ? V_d : V_h;
    const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h;

    if (use_gpu) {
      if (in.Location() == QUDA_CPU_FIELD_LOCATION) input = coarse_tmp_d;
      if (out.Location() == QUDA_CPU_FIELD_LOCATION || out.GammaBasis() != V->GammaBasis())
        output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even();
      if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU");
    } else {
      if (out.Location() == QUDA_CUDA_FIELD_LOCATION)
        output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h : &fine_tmp_h->Even();
    }

    *input = in; // copy the source into the input field (aliasing is handled automatically)

    if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out.SiteSubset() == QUDA_FULL_SITE_SUBSET)
      errorQuda("Cannot prolongate to a full field since we only have single-parity null-space components");

    if ((V->Nspin() != 1) && ((output->GammaBasis() != V->GammaBasis()) || (input->GammaBasis() != V->GammaBasis()))) {
      errorQuda("Cannot apply prolongator using fields in a different basis from the null space (%d,%d) != %d",
                output->GammaBasis(), in.GammaBasis(), V->GammaBasis());
    }

    Prolongate(*output, *input, *V, Nvec, fine_to_coarse, spin_map, parity);

    out = *output; // copy the result to the out field (aliasing is handled automatically)

    flops_ += 8*in.Ncolor()*out.Ncolor()*out.VolumeCB()*out.SiteSubset();

    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
  }

  // apply the restrictor
  void Transfer::R(ColorSpinorField &out, const ColorSpinorField &in) const
  {
    profile.TPSTART(QUDA_PROFILE_COMPUTE);

    ColorSpinorField *input = &const_cast<ColorSpinorField&>(in);
    ColorSpinorField *output = &out;
    initializeLazy(use_gpu ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION);
    const ColorSpinorField *V = use_gpu ? V_d : V_h;
    const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h;
    const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h;

    if (use_gpu) {
      if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = coarse_tmp_d;
      if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis())
        input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even();
      if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU");
    } else {
      if (in.Location() == QUDA_CUDA_FIELD_LOCATION)
        input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h : &fine_tmp_h->Even();
    }

    *input = in; // copy the source into the input field (aliasing is handled automatically)

    if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in.SiteSubset() == QUDA_FULL_SITE_SUBSET)
      errorQuda("Cannot restrict a full field since we only have single-parity null-space components");

    if (V->Nspin() != 1 && (output->GammaBasis() != V->GammaBasis() || input->GammaBasis() != V->GammaBasis()))
      errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d",
                out.GammaBasis(), input->GammaBasis(), V->GammaBasis());

    Restrict(*output, *input, *V, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);

    out = *output; // copy the result to the out field (aliasing is handled automatically)

    // only need to synchronize if we're transferring from GPU to CPU
    if (out.Location() == QUDA_CPU_FIELD_LOCATION && in.Location() == QUDA_CUDA_FIELD_LOCATION)
      qudaDeviceSynchronize();

    flops_ += 8*out.Ncolor()*in.Ncolor()*in.VolumeCB()*in.SiteSubset();

    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
  }
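
  // Hedged usage sketch (illustrative, not taken from this file): assuming
  // null vectors B, a TimeProfile, and fine/coarse fields already exist at
  // some multigrid level, a Transfer is built once and P/R then move fields
  // between grids. The block sizes, Nvec, and precision below are placeholder
  // values.
  //
  //   int geo_bs[] = {4, 4, 4, 4};      // 4^4 geometric blocking
  //   Transfer T(B, 24, 1, geo_bs, 2, QUDA_HALF_PRECISION, profile);
  //   T.R(coarse, fine);                // restrict:   fine   -> coarse
  //   T.P(fine, coarse);                // prolongate: coarse -> fine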

  double Transfer::flops() const {
    double rtn = flops_;
    flops_ = 0;
    return rtn;
  }

} // namespace quda