QUDA  0.9.0
transfer.cpp
Go to the documentation of this file.
1 #include <transfer.h>
2 #include <blas_quda.h>
3 
4 #include <transfer.h>
5 #include <multigrid.h>
6 #include <malloc_quda.h>
7 
8 #include <iostream>
9 #include <algorithm>
10 #include <vector>
11 
12 #include <nvToolsExt.h>
13 
14 namespace quda {
15 
16  /*
17  * for the staggered case, there is no spin blocking,
18  * however we do even-odd to preserve chirality (that is straightforward)
19  */
20 
// Construct the multigrid transfer operator (prolongator/restrictor) from the
// Nvec null-space vectors in B, blocking the fine lattice geometrically by
// geo_bs[d] in each dimension and the spin degrees of freedom by spin_bs.
// NOTE(review): this is a Doxygen-extracted listing; gaps in the embedded
// line numbering (e.g. 79/81, 92-97, 101/104/107, 119, 124, 129) show that
// several original source lines are missing from this view -- presumably the
// ColorSpinorField::Create calls assigning V_h/V_d and the fine/coarse
// temporaries, plus the createGeoMap/createSpinMap/BlockOrthogonalize calls.
// Confirm against the upstream transfer.cpp before editing.
21  Transfer::Transfer(const std::vector<ColorSpinorField*> &B, int Nvec, int *geo_bs, int spin_bs, bool enable_gpu, TimeProfile &profile)
22  : B(B), Nvec(Nvec), V_h(0), V_d(0), fine_tmp_h(0), fine_tmp_d(0), coarse_tmp_h(0), coarse_tmp_d(0), geo_bs(0),
23  fine_to_coarse_h(0), coarse_to_fine_h(0),
24  fine_to_coarse_d(0), coarse_to_fine_d(0),
25  spin_bs(spin_bs), spin_map(0), site_subset(QUDA_FULL_SITE_SUBSET), parity(QUDA_INVALID_PARITY),
26  enable_gpu(enable_gpu), use_gpu(enable_gpu), // by default we apply the transfer operator according to enable_gpu flag but can be overridden
27  flops_(0), profile(profile)
28  {
29  int ndim = B[0]->Ndim();
30 
// Validate the requested geometric block in each dimension, halving it until
// it (a) does not swallow the whole X dimension, (b) yields an even coarse
// dimension, and (c) divides the fine dimension exactly; error out if no
// valid size remains.
31  for (int d = 0; d < ndim; d++) {
32  while (geo_bs[d] > 0) {
33  if (d==0 && B[0]->X(0) == geo_bs[0])
34  warningQuda("X-dimension length %d cannot block length %d", B[0]->X(0), geo_bs[0]);
35  else if ( (B[0]->X(d)/geo_bs[d]+1)%2 == 0)
36  warningQuda("Indexing does not (yet) support odd coarse dimensions: X(%d) = %d", d, B[0]->X(d)/geo_bs[d]);
37  else if ( (B[0]->X(d)/geo_bs[d]) * geo_bs[d] != B[0]->X(d) )
38  warningQuda("cannot block dim[%d]=%d with block size = %d", d, B[0]->X(d), geo_bs[d]);
39  else
40  break; // this is a valid block size so let's use it
41  geo_bs[d] /= 2;
42  }
43  if (geo_bs[d] == 0) errorQuda("Unable to block dimension %d", d);
44  }
45 
// Keep a private copy of the (possibly adjusted) block sizes; the caller's
// geo_bs array was modified in place above.
46  this->geo_bs = new int[ndim];
47  int total_block_size = 1;
48  for (int d = 0; d < ndim; d++) {
49  this->geo_bs[d] = geo_bs[d];
50  total_block_size *= geo_bs[d];
51  }
52 
53  if (total_block_size == 1) errorQuda("Total geometric block size is 1");
54 
55  char block_str[128];
56  sprintf(block_str, "%d", geo_bs[0]);
// NOTE(review): passing block_str as both the destination and a %s source of
// sprintf is undefined behavior (overlapping objects, C standard 7.21.6.6);
// should append via an offset (e.g. strlen) or build into a second buffer.
57  for (int d=1; d<ndim; d++) sprintf(block_str, "%s x %d", block_str, geo_bs[d]);
58  printfQuda("Transfer: using block size %s\n", block_str);
59 
60  // create the storage for the final block orthogonal elements
61  ColorSpinorParam param(*B[0]); // takes the geometry from the null-space vectors
62 
63  // the ordering of the V vector is defined by these parameters and
64  // the Packed functions in ColorSpinorFieldOrder
65 
66  param.nSpin = B[0]->Nspin(); // spin has direct mapping
67  param.nColor = B[0]->Ncolor()*Nvec; // nColor = number of colors * number of vectors
69  // the V field is defined on all sites regardless of B field (maybe the B fields are always full?)
70  if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) {
71  //keep it the same for staggered:
72  param.siteSubset = QUDA_FULL_SITE_SUBSET;
73  param.x[0] *= 2;
74  }
75 
76  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Transfer: creating V field with location %d\n", param.location);
77 
78  // for cpu transfer this is the V field, for gpu it's just a temporary until we port the block orthogonalization
// NOTE(review): original line 81 is missing here -- presumably the
// ColorSpinorField::Create(param) call assigning V_h.
80 
82  param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;//ok for staggered
83 
// NOTE(review): original line 84 is missing -- presumably the creation of V_d
// when enable_gpu is set.
85 
86  printfQuda("Transfer: filling V field with zero\n");
87  fillV(*V_h); // copy the null space vectors into V
88 
89  param = ColorSpinorParam(*B[0]);
90 
91  // used for cpu<->gpu transfers
// NOTE(review): original lines 92-96 are missing -- presumably creation of
// fine_tmp_h and (via CreateCoarse) coarse_tmp_h.
94 
95  // useful to have around
97 
98  // create temporaries we use to enable us to change basis and for cpu<->gpu transfers
99  if (enable_gpu) {
100  param = ColorSpinorParam(*B[0]);
102  param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
103  param.create = QUDA_NULL_FIELD_CREATE;
// NOTE(review): original lines 104-107 are missing -- presumably creation of
// fine_tmp_d and coarse_tmp_d.
105 
106  // used for basis changing
108  }
109 
110  // allocate and compute the fine-to-coarse and coarse-to-fine site maps
111  fine_to_coarse_h = static_cast<int*>(safe_malloc(B[0]->Volume()*sizeof(int)));
112  coarse_to_fine_h = static_cast<int*>(safe_malloc(B[0]->Volume()*sizeof(int)));
113 
114  if (enable_gpu) {
115  fine_to_coarse_d = static_cast<int*>(device_malloc(B[0]->Volume()*sizeof(int)));
116  coarse_to_fine_d = static_cast<int*>(device_malloc(B[0]->Volume()*sizeof(int)));
117  }
118 
// NOTE(review): original line 119 is missing -- presumably the
// createGeoMap(geo_bs) call that fills the maps allocated above.
120 
121  // allocate the fine-to-coarse spin map (don't need it for staggered.)
122  if (param.nSpin != 1){
123  spin_map = static_cast<int*>(safe_malloc(B[0]->Nspin()*sizeof(int)));
// NOTE(review): original line 124 is missing -- presumably createSpinMap(spin_bs).
125  }
126 
127  // orthogonalize the blocks
128  printfQuda("Transfer: block orthogonalizing\n");
// NOTE(review): original line 129 is missing -- presumably the
// BlockOrthogonalize(...) call.
130 
// If running on the GPU, mirror the (host-built) prolongator into device memory.
131  if (enable_gpu) {
132  *V_d = *V_h;
133  printfQuda("Transferred prolongator to GPU\n");
134  }
135  }
136 
// Destructor body: release the null-space field copies, temporaries, and the
// geometric block-size array.  NOTE(review): the "Transfer::~Transfer()"
// signature (definition starts at original line 137 per the index below) and
// original lines 137-142 are missing from this extracted listing -- they
// presumably freed fine_to_coarse_h/coarse_to_fine_h (host_free), the
// device maps (device_free), and spin_map; confirm against upstream.
143  if (V_h) delete V_h;
144  if (V_d) delete V_d;
145 
146  if (fine_tmp_h) delete fine_tmp_h;
147  if (fine_tmp_d) delete fine_tmp_d;
148 
149  if (coarse_tmp_h) delete coarse_tmp_h;
150  if (coarse_tmp_d) delete coarse_tmp_d;
151 
// geo_bs was allocated with new[] in the constructor, hence delete[].
152  if (geo_bs) delete []geo_bs;
153  }
154 
// Set whether the transfer operator acts on full fields or single-parity
// fields and, if single parity, which parity.  Only does real work when the
// operator lives on the GPU: there it swaps the device copy of V between a
// full-field and a single-parity-field allocation to save device memory.
// NOTE(review): extracted listing is missing original lines 165, 170-171,
// 176, 184-186 and 188 -- presumably the opening
// "if (site_subset == QUDA_PARITY_SITE_SUBSET) {" branch, the
// ColorSpinorParam setup from *V_h, and the ColorSpinorField::Create calls
// reassigning V_d; confirm against upstream transfer.cpp.
155  void Transfer::setSiteSubset(QudaSiteSubset site_subset_, QudaParity parity_) {
156  if (parity_ != QUDA_EVEN_PARITY && parity_ != QUDA_ODD_PARITY) errorQuda("Undefined parity %d", parity_);
157  parity = parity_;
158 
// Nothing to do if the subset is unchanged (parity above is always updated).
159  if (site_subset == site_subset_) return;
160  site_subset = site_subset_;
161 
162  // this function only does something non-trivial if the operator is on the GPU
163  if (!enable_gpu) return;
164 
166  // if doing single-parity then delete full field V and replace with single parity
167 
168  delete V_d;
169 
// Halve x[0] and mark the param as a parity subset so the recreated V_d
// holds only one parity of the prolongator.
172  param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
173  param.x[0] /= 2;
174  param.siteSubset = QUDA_PARITY_SITE_SUBSET;
175 
// Copy the requested parity of the host prolongator to the device.
177  *V_d = parity == QUDA_EVEN_PARITY ? V_h->Even() : V_h->Odd();
178 
179  } else if (site_subset == QUDA_FULL_SITE_SUBSET) {
180  // if doing full field then delete single parity V and replace with single parity
181 
182  delete V_d;
183 
186  param.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
187 
// Restore the full-field device copy from the host prolongator.
189  *V_d = *V_h;
190 
191  } else {
192  errorQuda("Undefined site_subset %d", site_subset_);
193  }
194 
195  }
196 
// Copy the null-space vectors B into the packed prolongator field V by
// delegating to the free function FillV.  NOTE(review): the signature line
// "void Transfer::fillV(ColorSpinorField &V) {" (original line 197, per the
// index below) is missing from this extracted listing.
198  FillV(V, B, Nvec); //printfQuda("V fill check %e\n", norm2(*V));
199  }
200 
// Small helper pair used when building the coarse_to_fine map: sorting a
// vector of Int2(coarse_index, fine_index) with std::sort groups fine sites
// by their coarse block (ties broken by fine index).
struct Int2 {
  int x, y;
  Int2() : x(0), y(0) { }
  Int2(int x, int y) : x(x), y(y) { }

  // Strict weak lexicographic ordering: compare x first, break ties on y.
  // (Replaces the original chained-ternary "(c) ? true : (c2) ? true : false"
  // with the equivalent, idiomatic boolean expression.)
  bool operator<(const Int2 &a) const {
    return x < a.x || (x == a.x && y < a.y);
  }
};
210 
211  // compute the fine-to-coarse site map
// Build the fine->coarse site lookup table (fine_to_coarse_h) and its
// sorted inverse-like companion (coarse_to_fine_h), copying both to the
// device when enable_gpu is set.  NOTE(review): original line 216 is missing
// from this extracted listing -- presumably
// "ColorSpinorField &fine(*fine_tmp_h);", which the loop below requires;
// confirm against upstream transfer.cpp.
212  void Transfer::createGeoMap(int *geo_bs) {
213 
// Scratch lattice coordinate, sized for the maximum dimensionality QUDA supports.
214  int x[QUDA_MAX_DIM];
215 
217  ColorSpinorField &coarse(*coarse_tmp_h);
218 
219  // compute the coarse grid point for every site (assuming parity ordering currently)
220  for (int i=0; i<fine.Volume(); i++) {
221  // compute the lattice-site index for this offset index
222  fine.LatticeIndex(x, i);
223 
224  //printfQuda("fine idx %d = fine (%d,%d,%d,%d), ", i, x[0], x[1], x[2], x[3]);
225 
226  // compute the corresponding coarse-grid index given the block size
227  for (int d=0; d<fine.Ndim(); d++) x[d] /= geo_bs[d];
228 
229  // compute the coarse-offset index and store in fine_to_coarse
230  int k;
231  coarse.OffsetIndex(k, x); // this index is parity ordered
232  fine_to_coarse_h[i] = k;
233 
234  //printfQuda("coarse after (%d,%d,%d,%d), coarse idx %d\n", x[0], x[1], x[2], x[3], k);
235  }
236 
237  // now create an inverse-like variant of this
238 
// Sort (coarse index, fine index) pairs so that coarse_to_fine_h lists fine
// sites grouped by coarse block, in ascending fine-index order within a block.
239  std::vector<Int2> geo_sort(B[0]->Volume());
240  for (unsigned int i=0; i<geo_sort.size(); i++) geo_sort[i] = Int2(fine_to_coarse_h[i], i);
241  std::sort(geo_sort.begin(), geo_sort.end());
242  for (unsigned int i=0; i<geo_sort.size(); i++) coarse_to_fine_h[i] = geo_sort[i].y;
243 
// Mirror both lookup tables to device memory for GPU transfer application.
244  if (enable_gpu) {
245  qudaMemcpy(fine_to_coarse_d, fine_to_coarse_h, B[0]->Volume()*sizeof(int), cudaMemcpyHostToDevice);
246  qudaMemcpy(coarse_to_fine_d, coarse_to_fine_h, B[0]->Volume()*sizeof(int), cudaMemcpyHostToDevice);
247  checkCudaError();
248  }
249 
250  }
251 
252  // compute the fine spin to coarse spin map
253  void Transfer::createSpinMap(int spin_bs) {
254 
255  for (int s=0; s<B[0]->Nspin(); s++) {
256  spin_map[s] = s / spin_bs;
257  }
258 
259  }
260 
// apply the prolongator
// Prolongate the coarse field `in` onto the fine field `out` using the
// packed null-space field V.  Handles CPU<->GPU location mismatches and
// gamma-basis mismatches by staging through the pre-allocated temporaries.
// NOTE(review): the signature line
// "void Transfer::P(ColorSpinorField &out, const ColorSpinorField &in) const"
// (original line 262, per the index below) and the closing
// profile.TPSTOP(QUDA_PROFILE_COMPUTE) (original line 296) are missing from
// this extracted listing.
263  profile.TPSTART(QUDA_PROFILE_COMPUTE);
264 
// Default to operating in place; reseat input/output onto temporaries below
// when location or basis requires staging.
265  ColorSpinorField *input = const_cast<ColorSpinorField*>(&in);
266  ColorSpinorField *output = &out;
267  const ColorSpinorField *V = use_gpu ? V_d : V_h;
268  const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h;
269 
270  if (use_gpu) {
// Coarse input staged through the device coarse temporary if it lives on the CPU.
271  if (in.Location() == QUDA_CPU_FIELD_LOCATION) input = coarse_tmp_d;
272  if (out.Location() == QUDA_CPU_FIELD_LOCATION || out.GammaBasis() != V->GammaBasis())
273  output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even();
274  if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU");
275  } else {
276  if (out.Location() == QUDA_CUDA_FIELD_LOCATION)
277  output = (out.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h : &fine_tmp_h->Even();
278  }
279 
280  *input = in; // copy result to input field (aliasing handled automatically)
281 
282  if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && out.SiteSubset() == QUDA_FULL_SITE_SUBSET)
283  errorQuda("Cannot prolongate to a full field since only have single parity null-space components");
284 
// Basis check is skipped for staggered (Nspin == 1) where there is no gamma basis.
285  if ((V->Nspin() != 1) && ((output->GammaBasis() != V->GammaBasis()) || (input->GammaBasis() != V->GammaBasis()))){
286  errorQuda("Cannot apply prolongator using fields in a different basis from the null space (%d,%d) != %d",
287  output->GammaBasis(), in.GammaBasis(), V->GammaBasis());
288  }
289 
290  Prolongate(*output, *input, *V, Nvec, fine_to_coarse, spin_map, parity);
291 
292  out = *output; // copy result to out field (aliasing handled automatically)
293 
// Account 8 flops per (coarse color, fine color) pair per site (complex mul-add).
294  flops_ += 8*in.Ncolor()*out.Ncolor()*out.VolumeCB()*out.SiteSubset();
296 
297  }
298 
// apply the restrictor
// Restrict the fine field `in` down to the coarse field `out` using the
// packed null-space field V; the mirror operation of P above, with the same
// location/basis staging through the temporaries.  NOTE(review): the
// signature line
// "void Transfer::R(ColorSpinorField &out, const ColorSpinorField &in) const"
// (original line 300, per the index below), the conditional
// qudaDeviceSynchronize (original line 335), and the closing
// profile.TPSTOP(QUDA_PROFILE_COMPUTE) (original line 339) are missing from
// this extracted listing.
302  profile.TPSTART(QUDA_PROFILE_COMPUTE);
303 
304  ColorSpinorField *input = &const_cast<ColorSpinorField&>(in);
305  ColorSpinorField *output = &out;
306  const ColorSpinorField *V = use_gpu ? V_d : V_h;
307  const int *fine_to_coarse = use_gpu ? fine_to_coarse_d : fine_to_coarse_h;
308  const int *coarse_to_fine = use_gpu ? coarse_to_fine_d : coarse_to_fine_h;
309 
310  if (use_gpu) {
// Coarse output staged through the device coarse temporary if it lives on the CPU.
311  if (out.Location() == QUDA_CPU_FIELD_LOCATION) output = coarse_tmp_d;
312  if (in.Location() == QUDA_CPU_FIELD_LOCATION || in.GammaBasis() != V->GammaBasis())
313  input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_d : &fine_tmp_d->Even();
314  if (!enable_gpu) errorQuda("not created with enable_gpu set, so cannot run on GPU");
315  } else {
316  if (in.Location() == QUDA_CUDA_FIELD_LOCATION)
317  input = (in.SiteSubset() == QUDA_FULL_SITE_SUBSET) ? fine_tmp_h : &fine_tmp_h->Even();
318  }
319 
320  *input = in;
321 
322  if (V->SiteSubset() == QUDA_PARITY_SITE_SUBSET && in.SiteSubset() == QUDA_FULL_SITE_SUBSET)
323  errorQuda("Cannot restrict a full field since only have single parity null-space components");
324 
// Basis check is skipped for staggered (Nspin == 1) where there is no gamma basis.
325  if ( V->Nspin() != 1 && ( output->GammaBasis() != V->GammaBasis() || input->GammaBasis() != V->GammaBasis() ) )
326  errorQuda("Cannot apply restrictor using fields in a different basis from the null space (%d,%d) != %d",
327  out.GammaBasis(), input->GammaBasis(), V->GammaBasis());
328 
329  Restrict(*output, *input, *V, Nvec, fine_to_coarse, coarse_to_fine, spin_map, parity);
330 
331  out = *output; // copy result to out field (aliasing handled automatically)
332 
333  // only need to synchronize if we're transferring from GPU to CPU
334  if (out.Location() == QUDA_CPU_FIELD_LOCATION && in.Location() == QUDA_CUDA_FIELD_LOCATION)
336 
// Account 8 flops per (coarse color, fine color) pair per site (complex mul-add).
337  flops_ += 8*out.Ncolor()*in.Ncolor()*in.VolumeCB()*in.SiteSubset();
338 
340  }
341 
342  double Transfer::flops() const {
343  double rtn = flops_;
344  flops_ = 0;
345  return rtn;
346  }
347 
348 } // namespace quda
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
bool enable_gpu
Definition: transfer.h:96
void OffsetIndex(int &i, int *y) const
TimeProfile & profile
Definition: transfer.h:156
const int Nvec
Definition: transfer.h:38
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void R(ColorSpinorField &out, const ColorSpinorField &in) const
Definition: transfer.cpp:300
#define host_free(ptr)
Definition: malloc_quda.h:59
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
static ColorSpinorField * Create(const ColorSpinorParam &param)
QudaGammaBasis GammaBasis() const
int * fine_to_coarse_d
Definition: transfer.h:76
void P(ColorSpinorField &out, const ColorSpinorField &in) const
Definition: transfer.cpp:262
ColorSpinorField * fine_tmp_h
Definition: transfer.h:47
double flops_
Definition: transfer.h:150
void setSiteSubset(QudaSiteSubset site_subset, QudaParity parity)
Sets whether the transfer operator is to act on full fields or single parity fields, and if single-parity which parity. If site_subset is QUDA_FULL_SITE_SUBSET, the transfer operator can still be applied to single-parity fields, however, if site_subset is QUDA_PARITY_SITE_SUBSET, then the transfer operator cannot be applied to full fields, and setSiteSubset will need to be called first to reset to QUDA_FULL_SITE_SUBSET. This method exists to reduce GPU memory overhead - if only transfering single-parity fine fields then we only store a single-parity copy of the null space components on the device.
Definition: transfer.cpp:155
QudaParity parity
Definition: transfer.h:93
int Nspin
Definition: blas_test.cu:45
ColorSpinorField * coarse_tmp_d
Definition: transfer.h:56
void FillV(ColorSpinorField &V, const std::vector< ColorSpinorField *> &B, int Nvec)
QudaGaugeParam param
Definition: pack_test.cpp:17
static int ndim
Definition: layout_hyper.c:53
int * coarse_to_fine_h
Definition: transfer.h:70
QudaSiteSubset site_subset
Definition: transfer.h:90
void createGeoMap(int *geo_bs)
Definition: transfer.cpp:212
cpuColorSpinorField * in
ColorSpinorField * coarse_tmp_h
Definition: transfer.h:53
int V
Definition: test_util.cpp:28
#define warningQuda(...)
Definition: util_quda.h:101
void Restrict(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, int Nvec, const int *fine_to_coarse, const int *coarse_to_fine, const int *spin_map, int parity=QUDA_INVALID_PARITY)
Apply the restriction operator.
Definition: restrictor.cu:509
int * spin_map
Definition: transfer.h:87
ColorSpinorField * V_h
Definition: transfer.h:41
enum QudaParity_s QudaParity
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define safe_malloc(size)
Definition: malloc_quda.h:54
enum QudaSiteSubset_s QudaSiteSubset
int * fine_to_coarse_h
Definition: transfer.h:65
ColorSpinorField * CreateCoarse(const int *geoblockSize, int spinBlockSize, int Nvec, QudaFieldLocation location=QUDA_INVALID_FIELD_LOCATION)
QudaFieldLocation location
Definition: quda.h:27
cpuColorSpinorField * out
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
#define printfQuda(...)
Definition: util_quda.h:84
void fillV(ColorSpinorField &V)
Definition: transfer.cpp:197
Transfer(const std::vector< ColorSpinorField *> &B, int Nvec, int *geo_bs, int spin_bs, bool enable_gpu, TimeProfile &profile)
Definition: transfer.cpp:21
#define device_malloc(size)
Definition: malloc_quda.h:52
void LatticeIndex(int *y, int i) const
ColorSpinorField * V_d
Definition: transfer.h:44
bool operator<(const Int2 &a) const
Definition: transfer.cpp:206
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
ColorSpinorField * fine_tmp_d
Definition: transfer.h:50
void createSpinMap(int spin_bs)
Definition: transfer.cpp:253
void BlockOrthogonalize(ColorSpinorField &V, int Nvec, const int *geo_bs, const int *fine_to_coarse, int spin_bs)
Block orthogonnalize the matrix field, where the blocks are defined by lookup tables that map the fin...
Int2(int x, int y)
Definition: transfer.cpp:204
const std::vector< ColorSpinorField * > & B
Definition: transfer.h:35
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
double flops() const
Definition: transfer.cpp:342
#define a
int * coarse_to_fine_d
Definition: transfer.h:81
int * geo_bs
Definition: transfer.h:59
virtual ~Transfer()
Definition: transfer.cpp:137
void Prolongate(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &v, int Nvec, const int *fine_to_coarse, const int *spin_map, int parity=QUDA_INVALID_PARITY)
Apply the prolongation operator.
Definition: prolongator.cu:284
#define device_free(ptr)
Definition: malloc_quda.h:57