QUDA  0.9.0
transfer_util.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
3 #include <tune_quda.h>
4 #include <typeinfo>
5 #include <vector>
6 #include <assert.h>
7 
8 namespace quda {
9 
10 #ifdef GPU_MULTIGRID
11 
12  using namespace quda::colorspinor;
13 
  // Kernel/functor argument binding the aggregate field V to the v-th
  // null-space vector B[v], for the FillV copy routines below.
  // NOTE(review): the declarations of the V and B accessor members appear to
  // have been dropped from this listing (the constructor initializes V and B,
  // but only the member `v` is declared here) -- confirm against the original
  // source before editing.
  template<typename real, int nSpin, int nColor, int nVec, QudaFieldOrder order>
  struct FillVArg {

  const int v; // index of the null-space vector being copied into V

  // Bind V and dereference the v-th entry of the null-space vector list.
  FillVArg(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B, int v)
  : V(V), B(*(B[v])), v(v) { }

  };
25 
26  // CPU routine to copy the null-space vectors into the V-field
27  template <typename Float, int nSpin, int nColor, int nVec, typename Arg>
28  void FillVCPU(Arg &arg, int v) {
29 
30  for (int parity=0; parity<arg.V.Nparity(); parity++) {
31  for (int x_cb=0; x_cb<arg.V.VolumeCB(); x_cb++) {
32  for (int s=0; s<nSpin; s++) {
33  for (int c=0; c<nColor; c++) {
34  arg.V(parity, x_cb, s, c, arg.v) = arg.B(parity, x_cb, s, c);
35  }
36  }
37  }
38  }
39 
40  }
41 
42  // GPU kernel to copy the null-space vectors into the V-field
43  template <typename Float, int nSpin, int nColor, int nVec, typename Arg>
44  __global__ void FillVGPU(Arg arg, int v) {
45 
46  int x_cb = threadIdx.x + blockDim.x*blockIdx.x;
47  int parity = threadIdx.y + blockDim.y*blockIdx.y;
48 
49  for (int s=0; s<nSpin; s++) {
50  for (int c=0; c<nColor; c++) {
51  arg.V(parity, x_cb, s, c, arg.v) = arg.B(parity, x_cb, s, c);
52  }
53  }
54 
55  }
56 
57  template <typename real, int nSpin, int nColor, int nVec>
58  class FillVLaunch : public TunableVectorY {
59 
60  ColorSpinorField &V;
61  const std::vector<ColorSpinorField*> &B;
62  const int v;
63  unsigned int minThreads() const { return V.VolumeCB(); }
64 
65  public:
66  FillVLaunch(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B, const int v)
67  : TunableVectorY(2), V(V), B(B), v(v) {
68  (V.Location() == QUDA_CPU_FIELD_LOCATION) ? strcpy(aux,"CPU") : strcpy(aux,"GPU");
69  }
70  virtual ~FillVLaunch() { }
71 
72  void apply(const cudaStream_t &stream) {
73  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
74  if (V.Location() == QUDA_CPU_FIELD_LOCATION) {
75  if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
76  FillVArg<real,nSpin,nColor,nVec,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER> arg(V,B,v);
77  FillVCPU<real,nSpin,nColor,nVec>(arg,v);
78  } else {
79  errorQuda("Field order not implemented %d", V.FieldOrder());
80  }
81  } else {
82  if (V.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
83  FillVArg<real,nSpin,nColor,nVec,QUDA_FLOAT2_FIELD_ORDER> arg(V,B,v);
84  FillVGPU<real,nSpin,nColor,nVec> <<<tp.grid,tp.block,tp.shared_bytes>>>(arg,v);
85  } else {
86  errorQuda("Field order not implemented %d", V.FieldOrder());
87  }
88  }
89  }
90 
91  bool advanceTuneParam(TuneParam &param) const { return false; }
92 
93  TuneKey tuneKey() const { return TuneKey(V.VolString(), typeid(*this).name(), aux); }
94 
95  long long flops() const { return 0; }
96  long long bytes() const { return 2*V.Bytes(); }
97  };
98 
99 
100  template <typename real, int nSpin, int nColor, int nVec>
101  void FillV(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B) {
102  for (int v=0; v<nVec; v++) {
103  FillVLaunch<real,nSpin,nColor,nVec> f(V,B,v);
104  f.apply(0);
105  }
106  }
107 
108  // For staggered this does not include factor 2 due to parity decomposition!
109  template <typename Float, int nSpin, int nColor>
110  void FillV(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B, int Nvec) {
111  if (Nvec == 2) {
112  FillV<Float,nSpin,nColor,2>(V,B);
113  } else if (Nvec == 4) {
114  FillV<Float,nSpin,nColor,4>(V,B);
115  } else if (Nvec == 8) {
116  FillV<Float,nSpin,nColor,8>(V,B);
117  } else if (Nvec == 12) {
118  FillV<Float,nSpin,nColor,12>(V,B);
119  } else if (Nvec == 16) {
120  FillV<Float,nSpin,nColor,16>(V,B);
121  } else if (Nvec == 20) {
122  FillV<Float,nSpin,nColor,20>(V,B);
123  } else if (Nvec == 24) {
124  FillV<Float,nSpin,nColor,24>(V,B);
125  } else if (Nvec == 32) {
126  FillV<Float,nSpin,nColor,32>(V,B);
127  } else if (Nvec == 48) {
128  FillV<Float,nSpin,nColor,48>(V,B);
129  } else {
130  errorQuda("Unsupported Nvec %d", Nvec);
131  }
132  }
133 
134  template <typename Float, int nSpin>
135  void FillV(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B, int Nvec) {
136  if (B[0]->Ncolor()*Nvec != V.Ncolor()) errorQuda("Something wrong here");
137 
138  if (B[0]->Ncolor() == 2) {
139  FillV<Float,nSpin,2>(V,B,Nvec);
140  } else if(B[0]->Ncolor() == 3) {
141  FillV<Float,nSpin,3>(V,B,Nvec);
142  } else if(B[0]->Ncolor() == 8) {
143  FillV<Float,nSpin,8>(V,B,Nvec);
144  } else if(B[0]->Ncolor() == 16) {
145  FillV<Float,nSpin,16>(V,B,Nvec);
146  } else if(B[0]->Ncolor() == 24) {
147  FillV<Float,nSpin,24>(V,B,Nvec);
148  } else if(B[0]->Ncolor() == 32) {
149  FillV<Float,nSpin,32>(V,B,Nvec);
150  } else {
151  errorQuda("Unsupported nColor %d", B[0]->Ncolor());
152  }
153  }
154 
155  template <typename Float>
156  void FillV(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B, int Nvec) {
157  if (V.Nspin() == 4) {
158  FillV<Float,4>(V,B,Nvec);
159  } else if (V.Nspin() == 2) {
160  FillV<Float,2>(V,B,Nvec);
161 #ifdef GPU_STAGGERED_DIRAC
162  } else if (V.Nspin() == 1) {
163  FillV<Float,1>(V,B,Nvec);
164 #endif
165  } else {
166  errorQuda("Unsupported nSpin %d", V.Nspin());
167  }
168  }
169 
170 #endif // GPU_MULTIGRID
171 
172  void FillV(ColorSpinorField &V, const std::vector<ColorSpinorField*> &B, int Nvec) {
173 
174 #ifdef GPU_MULTIGRID
175  if (V.Precision() == QUDA_DOUBLE_PRECISION) {
176 #ifdef GPU_MULTIGRID_DOUBLE
177  FillV<double>(V,B,Nvec);
178 #else
179  errorQuda("Double precision multigrid has not been enabled");
180 #endif
181  } else if (V.Precision() == QUDA_SINGLE_PRECISION) {
182  FillV<float>(V,B,Nvec);
183  } else {
184  errorQuda("Unsupported precision %d", V.Precision());
185  }
186 #else
187  errorQuda("Multigrid has not been built");
188 #endif
189  }
190 
191 #ifdef GPU_MULTIGRID
192 
  // Creates a block-ordered version of a ColorSpinorField
  // N.B.: Only works for the V field, as we need to block spin.
  //
  // toBlock=true scatters `in` into the flat array `out`; toBlock=false
  // gathers from `out` back into `in`.  The flat index produced below orders
  // the data as:
  //   [geo block][chirality][vector][local site][block spin][color]
  // where chirality = s / spin_bs is the coarse-spin (chiral) index.
  template <bool toBlock, int nVec, class Complex, class FieldOrder>
  void blockOrderV(Complex *out, FieldOrder &in,
                   const int *geo_map, const int *geo_bs, int spin_bs,
                   const cpuColorSpinorField &V) {
    //printfQuda("in.Ncolor = %d\n", in.Ncolor());
    int nSpin_coarse = in.Nspin() / spin_bs; // this is number of chiral blocks

    //Compute the size of each block
    int geoBlockSize = 1;
    for (int d=0; d<in.Ndim(); d++) geoBlockSize *= geo_bs[d];
    // NOTE(review): blockSize is computed but never used in this routine
    int blockSize = geoBlockSize * in.Ncolor() * spin_bs; // blockSize includes internal dof

    int x[QUDA_MAX_DIM]; // global coordinates
    int y[QUDA_MAX_DIM]; // local coordinates within a block (full site ordering)

    // one entry per fine degree of freedom; used to verify complete coverage
    int checkLength = in.Nparity() * in.VolumeCB() * in.Ncolor() * in.Nspin() * in.Nvec();
    int *check = new int[checkLength];
    int count = 0;

    // Run through the fine grid and do the block ordering
    for (int parity = 0; parity<in.Nparity(); parity++) {
      for (int x_cb=0; x_cb<in.VolumeCB(); x_cb++) {
        int i = parity*in.VolumeCB() + x_cb;

        // Get fine grid coordinates
        V.LatticeIndex(x, i);

        //Compute the geometric offset within a block
        // (x fastest direction, t is slowest direction, non-parity ordered)
        int blockOffset = 0;
        for (int d=in.Ndim()-1; d>=0; d--) {
          y[d] = x[d]%geo_bs[d];
          blockOffset *= geo_bs[d];
          blockOffset += y[d];
        }

        //Take the block-ordered offset from the coarse grid offset (geo_map)
        int offset = geo_map[i]*nSpin_coarse*nVec*geoBlockSize*in.Ncolor()*spin_bs;

        for (int v=0; v<in.Nvec(); v++) {
          for (int s=0; s<in.Nspin(); s++) {
            for (int c=0; c<in.Ncolor(); c++) {

              int chirality = s / spin_bs; // chirality is the coarse spin
              int blockSpin = s % spin_bs; // the remaining spin dof left in each block

              int index = offset +                                        // geo block
                chirality * nVec * geoBlockSize * spin_bs * in.Ncolor() + // chiral block
                v * geoBlockSize * spin_bs * in.Ncolor() +                // vector
                blockOffset * spin_bs * in.Ncolor() +                     // local geometry
                blockSpin*in.Ncolor() +                                   // block spin
                c;                                                        // color

              if (toBlock) out[index] = in(parity, x_cb, s, c, v); // going to block order
              else in(parity, x_cb, s, c, v) = out[index]; // coming from block order

              check[count++] = index;
            }
          }
        }
      }

      //printf("blockOrderV done %d / %d\n", i, in.Volume());
    }

    // every degree of freedom must have been touched exactly once
    if (count != checkLength) {
      errorQuda("Number of elements packed %d does not match expected value %d nvec=%d nspin=%d ncolor=%d",
                count, checkLength, in.Nvec(), in.Nspin(), in.Ncolor());
    }

    /*
    // need non-quadratic check
    for (int i=0; i<checkLength; i++) {
      for (int j=0; j<i; j++) {
        if (check[i] == check[j]) errorQuda("Collision detected in block ordering\n");
      }
    }
    */
    delete []check;
  }
275 
276 
  // Creates a block-ordered version of a ColorSpinorField, with parity blocking (for staggered fields)
  // N.B.: same as above but parity are separated.
  //
  // Here spin is trivial (s fixed to 0) and the role of chirality is played by
  // the fine-grid site parity (x[0]+x[1]+x[2]+x[3]) % 2.
  // NOTE(review): unlike blockOrderV, `offset` does not carry a factor for the
  // two chirality sub-blocks -- presumably geo_map already accounts for the
  // parity split in the staggered setup; verify against the caller.
  template <bool toBlock, int nVec, class Complex, class FieldOrder>
  void blockCBOrderV(Complex *out, FieldOrder &in,
                     const int *geo_map, const int *geo_bs, int spin_bs,
                     const cpuColorSpinorField &V) {
    //Compute the size of each block
    int geoBlockSize = 1;
    for (int d=0; d<in.Ndim(); d++) geoBlockSize *= geo_bs[d];
    // NOTE(review): blockSize is computed but never used in this routine
    int blockSize = geoBlockSize * in.Ncolor(); // blockSize includes internal dof

    int x[QUDA_MAX_DIM]; // global coordinates
    int y[QUDA_MAX_DIM]; // local coordinates within a block (full site ordering)

    // one entry per fine degree of freedom; used to verify complete coverage
    int checkLength = in.Nparity() * in.VolumeCB() * in.Ncolor() * in.Nvec();
    int *check = new int[checkLength];
    int count = 0;

    // Run through the fine grid and do the block ordering
    for (int parity = 0; parity<in.Nparity(); parity++) {
      for (int x_cb=0; x_cb<in.VolumeCB(); x_cb++) {
        int i = parity*in.VolumeCB() + x_cb;

        // Get fine grid coordinates
        V.LatticeIndex(x, i);

        //Compute the geometric offset within a block
        // (x fastest direction, t is slowest direction, non-parity ordered)
        int blockOffset = 0;
        for (int d=in.Ndim()-1; d>=0; d--) {
          y[d] = x[d]%geo_bs[d];
          blockOffset *= geo_bs[d];
          blockOffset += y[d];
        }

        //Take the block-ordered offset from the coarse grid offset (geo_map)
        //A.S.: geo_map introduced for the full site ordering, so ok to use it for the offset
        int offset = geo_map[i]*nVec*geoBlockSize*in.Ncolor();

        const int s = 0; // staggered: single spin component

        for (int v=0; v<in.Nvec(); v++) {
          for (int c=0; c<in.Ncolor(); c++) {

            int chirality = (x[0]+x[1]+x[2]+x[3])%2; // chirality is the fine-grid parity flag

            int index = offset +                              // geo block
              chirality * nVec * geoBlockSize * in.Ncolor() + // chiral block
              v * geoBlockSize * in.Ncolor() +                // vector
              blockOffset * in.Ncolor() +                     // local geometry
              c;                                              // color

            if (toBlock) out[index] = in(parity, x_cb, s, c, v); // going to block order
            else in(parity, x_cb, s, c, v) = out[index]; // coming from block order

            check[count++] = index;
          }
        }

        //printf("blockOrderV done %d / %d\n", i, in.Volume());
      } // x_cb
    } // parity

    // every degree of freedom must have been touched exactly once
    if (count != checkLength) {
      errorQuda("Number of elements packed %d does not match expected value %d nvec=%d ncolor=%d",
                count, checkLength, in.Nvec(), in.Ncolor());
    }

    delete []check;
  }
347 
348 
349 
350 
351  // Orthogonalise the nc vectors v[] of length n
352  // this assumes the ordering v[(b * Nvec + v) * blocksize + i]
353 
354  template <typename sumFloat, typename Float, int N>
355  void blockGramSchmidt(complex<Float> *v, int nBlocks, int blockSize) {
356 
357  for (int b=0; b<nBlocks; b++) {
358  for (int jc=0; jc<N; jc++) {
359 
360  for (int ic=0; ic<jc; ic++) {
361  // Calculate dot product.
362  complex<Float> dot = 0.0;
363  for (int i=0; i<blockSize; i++)
364  dot += conj(v[(b*N+ic)*blockSize+i]) * v[(b*N+jc)*blockSize+i];
365 
366  // Subtract the blocks to orthogonalise
367  for (int i=0; i<blockSize; i++)
368  v[(b*N+jc)*blockSize+i] -= dot * v[(b*N+ic)*blockSize+i];
369  }
370 
371  // Normalize the block
372  // nrm2 is pure real, but need to use Complex because of template.
373  sumFloat nrm2 = 0.0;
374  for (int i=0; i<blockSize; i++) nrm2 += norm(v[(b*N+jc)*blockSize+i]);
375  sumFloat scale = nrm2 > 0.0 ? 1.0/sqrt(nrm2) : 0.0;
376  for (int i=0; i<blockSize; i++) v[(b*N+jc)*blockSize+i] *= scale;
377  }
378 
379  /*
380  for (int jc=0; jc<N; jc++) {
381  complex<sumFloat> nrm2 = 0.0;
382  for(int i=0; i<blockSize; i++) nrm2 += norm(v[(b*N+jc)*blockSize+i]);
383  //printfQuda("block = %d jc = %d nrm2 = %f\n", b, jc, nrm2.real());
384  }
385  */
386 
387  //printf("blockGramSchmidt done %d / %d\n", b, nBlocks);
388  }
389 
390  }
391 
392  template <typename sumType, typename real, int N>
393  class BlockGramSchmidt : public Tunable {
394 
395  complex<real> *v;
396  int nBlock;
397  int blockSize;
398  const ColorSpinorField &meta;
399 
400  unsigned int sharedBytesPerThread() const { return 0; }
401  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
402 
403  public:
404  BlockGramSchmidt(complex<real> *v, int nBlock, int blockSize, const ColorSpinorField &meta)
405  : v(v), nBlock(nBlock), blockSize(blockSize), meta(meta) {
406  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) sprintf(aux, "nBlock=%d,blockSize=%d,CPU", nBlock, blockSize);
407  else sprintf(aux, "nBlock=%d,blockSize=%d,GPU", nBlock, blockSize);
408  }
409 
410  virtual ~BlockGramSchmidt() { }
411 
412  void apply(const cudaStream_t &stream) {
413  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
414  if (meta.Location() == QUDA_CPU_FIELD_LOCATION) {
415  blockGramSchmidt<sumType, real, N>(v, nBlock, blockSize);
416  } else {
417  errorQuda("Not implemented for GPU");
418  }
419  }
420 
421  bool advanceTuneParam(TuneParam &param) const { return false; }
422 
423  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
424 
425  long long flops() const { return nBlock * N * ((N-1) * (8l + 8l) + 2l) * blockSize; }
426  long long bytes() const { return 2*meta.Bytes(); }
427  };
428 
429  template <bool toBlock, int N, typename real, typename Order>
430  class BlockOrderV : public Tunable {
431 
432  complex<real> *vBlock;
433  Order &vOrder;
434  const int *geo_map;
435  const int *geo_bs;
436  int spin_bs;
437  const ColorSpinorField &V;
438 
439  unsigned int sharedBytesPerThread() const { return 0; }
440  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
441 
442  public:
443  BlockOrderV(complex<real> *vBlock, Order &vOrder, const int *geo_map, const int *geo_bs, int spin_bs, const ColorSpinorField &V)
444  : vBlock(vBlock), vOrder(vOrder), geo_map(geo_map), geo_bs(geo_bs), spin_bs(spin_bs), V(V) {
445  (V.Location() == QUDA_CPU_FIELD_LOCATION) ? strcpy(aux, "CPU") : strcpy(aux,"GPU");
446  }
447 
448  virtual ~BlockOrderV() { }
449 
450  void apply(const cudaStream_t &stream) {
451  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
452  if (V.Location() == QUDA_CPU_FIELD_LOCATION) {
453  blockOrderV<toBlock,N,complex<real>,Order>(vBlock,vOrder,geo_map,geo_bs,spin_bs,V);
454  } else {
455  errorQuda("Not implemented for GPU");
456  }
457  }
458 
459  bool advanceTuneParam(TuneParam &param) const { return false; }
460 
461  TuneKey tuneKey() const { return TuneKey(V.VolString(), typeid(*this).name(), aux); }
462 
463  long long flops() const { return 0; }
464  long long bytes() const { return 2*V.Bytes(); }
465  };
466 
#if 0
  // NOTE(review): this whole region is compiled out (#if 0).  It is an
  // unfinished prototype of a fused block-orthogonalization path: the
  // constructors initialize members (out, in) that are never declared, the
  // routine below references undefined symbols (nBlocks, blockSize,
  // coarseColor, coarse_colors_per_thread, fineColor, v, parity, x_cb, c,
  // rotateCoarseColor), `constexpr spinBlocks` is missing its type, and the
  // braces are unbalanced (the `b` loop is never closed).  Kept verbatim for
  // reference only -- do not enable without completing it.
  using namespace quda::colorspinor;

  // Kernel-argument struct for the (prototype) fused block-ortho path.
  template <typename Out, typename In, typename Rotator, int fineSpin, int coarseSpin>
  struct BlockOrthoArg {
    const Rotator V;
    const int *fine_to_coarse;
    const int *coarse_to_fine;
    const spin_mapper<fineSpin,coarseSpin> spin_map;
    const int parity; // the parity of the input field (if single parity)
    const int nParity; // number of parities of input fine field
    int swizzle; // swizzle factor for transposing blockIdx.x mapping to coarse grid coordinate

    BlockOrthoArg(Rotator &V, const int *fine_to_coarse, const int *coarse_to_fine,
                  int parity, const ColorSpinorField &meta) :
      out(out), in(in), V(V), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
      spin_map(), parity(parity), nParity(meta.SiteSubset()), swizzle(1)
    { }

    BlockOrthoArg(const BlockOrthoArg<Out,In,Rotator,fineSpin,coarseSpin> &arg) :
      out(arg.out), in(arg.in), V(arg.V),
      fine_to_coarse(arg.fine_to_coarse), coarse_to_fine(arg.coarse_to_fine), spin_map(),
      parity(arg.parity), nParity(arg.nParity), swizzle(arg.swizzle)
    { }
  };

  // Prototype CPU reference for the fused block orthogonalization.
  template <typename Float, int nVec, int fineSpin, int coarseSpin, typename Arg>
  void BlockOrtho(Arg &arg) {

    constexpr spinBlocks = fineSpin / coarseSpin;

    for (int b=0; b<nBlocks; b++) {
      for (int s=0; s<spinBlocks; s++) {

        for (int k=0; k<nVec; k++) {

          for (int l=0; l<k; l++) {
            complex<Float> dot = 0.0;

            for (int i=0; i<blockSize; i++) {

              dot += conj(v(parity, x_cb, s, c, l)) * v(parity, x_cb, s, c, k);

            }

          }

        }
      }

    for (int parity_coarse=0; parity_coarse<2; parity_coarse++)
      for (int x_coarse_cb=0; x_coarse_cb<arg.out.VolumeCB(); x_coarse_cb++)
        for (int s=0; s<coarseSpin; s++)
          for (int c=0; c<coarseColor; c++)
            arg.out(parity_coarse, x_coarse_cb, s, c) = 0.0;

    // loop over fine degrees of freedom
    for (int parity=0; parity<arg.nParity; parity++) {
      parity = (arg.nParity == 2) ? parity : arg.parity;

      for (int x_cb=0; x_cb<arg.in.VolumeCB(); x_cb++) {

        int x = parity*arg.in.VolumeCB() + x_cb;
        int x_coarse = arg.fine_to_coarse[x];
        int parity_coarse = (x_coarse >= arg.out.VolumeCB()) ? 1 : 0;
        int x_coarse_cb = x_coarse - parity_coarse*arg.out.VolumeCB();

        for (int coarse_color_block=0; coarse_color_block<coarseColor; coarse_color_block+=coarse_colors_per_thread) {
          complex<Float> tmp[fineSpin*coarse_colors_per_thread];
          rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
            (tmp, arg.in, arg.V, parity, arg.nParity, x_cb, coarse_color_block);

          for (int s=0; s<fineSpin; s++) {
            for (int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
              int c = coarse_color_block + coarse_color_local;
              arg.out(parity_coarse,x_coarse_cb,arg.spin_map(s),c) += tmp[s*coarse_colors_per_thread+coarse_color_local];
            }
          }

        }
      }
    }

  }
#endif
555 
  // Block-orthonormalize the nVec null-space vectors packed in V: gather V
  // into a flat block-ordered buffer, run Gram-Schmidt on each block (with
  // double-precision accumulation), then scatter the result back into V.
  // NOTE(review): the definition of VectorField (the accessor typedef for
  // QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) is not visible in this listing --
  // confirm it is declared just above its first use.
  template<typename Float, int nSpin, int nColor, int nVec>
  void BlockOrthogonalize(ColorSpinorField &V, const int *geo_bs, const int *geo_map, int spin_bs) {
    // scratch buffer holding the full field in block order
    complex<Float> *Vblock = new complex<Float>[V.Volume()*V.Nspin()*V.Ncolor()];

    if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {

      VectorField vOrder(const_cast<ColorSpinorField&>(V));

      // number of sites per geometric block
      int geo_blocksize = 1;
      for (int d = 0; d < V.Ndim(); d++) geo_blocksize *= geo_bs[d];

      int blocksize = geo_blocksize * vOrder.Ncolor() * spin_bs;
      int chiralBlocks = (V.Nspin() == 1) ? 2 : vOrder.Nspin() / spin_bs; //always 2 for staggered.
      int numblocks = (V.Volume()/geo_blocksize) * chiralBlocks;
      if (V.Nspin() == 1) blocksize /= chiralBlocks; //for staggered chiral block size is a parity block size

      printfQuda("Block Orthogonalizing %d blocks of %d length and width %d\n", numblocks, blocksize, nVec);

#if 0
      BlockOrthoArg<> arg(V);
      BlockOrtho ortho();
      otho.apply(0);
#endif

      // gather V into block order
      BlockOrderV<true,nVec,Float,VectorField> reorder(Vblock, vOrder, geo_map, geo_bs, spin_bs, V);
      reorder.apply(0);

      // orthonormalize each block
      BlockGramSchmidt<double,Float,nVec> ortho(Vblock, numblocks, blocksize, V);
      ortho.apply(0);

      // scatter the orthonormalized data back into V's native layout
      BlockOrderV<false,nVec,Float,VectorField> reset(Vblock, vOrder, geo_map, geo_bs, spin_bs, V);
      reset.apply(0);

      delete []Vblock;

    } else {
      errorQuda("Unsupported field order %d\n", V.FieldOrder());
    }

  }
598 
599  template<typename Float, int nSpin, int nColor>
600  void BlockOrthogonalize(ColorSpinorField &V, int Nvec, const int *geo_bs, const int *geo_map, int spin_bs) {
601  if (Nvec == 2) {
602  BlockOrthogonalize<Float,nSpin,nColor,2>(V, geo_bs, geo_map, spin_bs);
603  } else if (Nvec == 4) {
604  BlockOrthogonalize<Float,nSpin,nColor,4>(V, geo_bs, geo_map, spin_bs);
605  } else if (Nvec == 8) {
606  BlockOrthogonalize<Float,nSpin,nColor,8>(V, geo_bs, geo_map, spin_bs);
607  } else if (Nvec == 12) {
608  BlockOrthogonalize<Float,nSpin,nColor,12>(V, geo_bs, geo_map, spin_bs);
609  } else if (Nvec == 16) {
610  BlockOrthogonalize<Float,nSpin,nColor,16>(V, geo_bs, geo_map, spin_bs);
611  } else if (Nvec == 20) {
612  BlockOrthogonalize<Float,nSpin,nColor,20>(V, geo_bs, geo_map, spin_bs);
613  } else if (Nvec == 24) {
614  BlockOrthogonalize<Float,nSpin,nColor,24>(V, geo_bs, geo_map, spin_bs);
615  } else if (Nvec == 32) {
616  BlockOrthogonalize<Float,nSpin,nColor,32>(V, geo_bs, geo_map, spin_bs);
617  } else if (Nvec == 48) {
618  BlockOrthogonalize<Float,nSpin,nColor,48>(V, geo_bs, geo_map, spin_bs);
619  } else {
620  errorQuda("Unsupported nVec %d\n", Nvec);
621  }
622  }
623 
624  template<typename Float, int nSpin>
625  void BlockOrthogonalize(ColorSpinorField &V, int Nvec,
626  const int *geo_bs, const int *geo_map, int spin_bs) {
627  if (V.Ncolor()/Nvec == 3) {
628  BlockOrthogonalize<Float,nSpin,3>(V, Nvec, geo_bs, geo_map, spin_bs);
629  } else if (V.Ncolor()/Nvec == 2) {
630  BlockOrthogonalize<Float,nSpin,2>(V, Nvec, geo_bs, geo_map, spin_bs);
631  } else if (V.Ncolor()/Nvec == 8) {
632  BlockOrthogonalize<Float,nSpin,8>(V, Nvec, geo_bs, geo_map, spin_bs);
633  } else if (V.Ncolor()/Nvec == 16) {
634  BlockOrthogonalize<Float,nSpin,16>(V, Nvec, geo_bs, geo_map, spin_bs);
635  } else if (V.Ncolor()/Nvec == 24) {
636  BlockOrthogonalize<Float,nSpin,24>(V, Nvec, geo_bs, geo_map, spin_bs);
637  } else if (V.Ncolor()/Nvec == 32) {
638  BlockOrthogonalize<Float,nSpin,32>(V, Nvec, geo_bs, geo_map, spin_bs);
639  } else if (V.Ncolor()/Nvec == 48) {
640  BlockOrthogonalize<Float,nSpin,48>(V, Nvec, geo_bs, geo_map, spin_bs); //for staggered, even-odd blocking presumed
641  }
642  else {
643  errorQuda("Unsupported nColor %d\n", V.Ncolor()/Nvec);
644  }
645  }
646 
647  template<typename Float>
648  void BlockOrthogonalize(ColorSpinorField &V, int Nvec,
649  const int *geo_bs, const int *geo_map, int spin_bs) {
650  if (V.Nspin() == 4) {
651  BlockOrthogonalize<Float,4>(V, Nvec, geo_bs, geo_map, spin_bs);
652  } else if(V.Nspin() ==2) {
653  BlockOrthogonalize<Float,2>(V, Nvec, geo_bs, geo_map, spin_bs);
654  } else if (V.Nspin() == 1) {
655  BlockOrthogonalize<Float,1>(V, Nvec, geo_bs, geo_map, 1);
656  }
657  else {
658  errorQuda("Unsupported nSpin %d\n", V.Nspin());
659  }
660  }
661 
662 #endif // GPU_MULTIGRID
663 
665  const int *geo_bs, const int *geo_map, int spin_bs) {
666 #ifdef GPU_MULTIGRID
667  if (V.Precision() == QUDA_DOUBLE_PRECISION) {
668 #ifdef GPU_MULTIGRID_DOUBLE
669  BlockOrthogonalize<double>(V, Nvec, geo_bs, geo_map, spin_bs);
670 #else
671  errorQuda("Double precision multigrid has not been enabled");
672 #endif
673  } else if (V.Precision() == QUDA_SINGLE_PRECISION) {
674  BlockOrthogonalize<float>(V, Nvec, geo_bs, geo_map, spin_bs);
675  } else {
676  errorQuda("Unsupported precision %d\n", V.Precision());
677  }
678 #else
679  errorQuda("Multigrid has not been built");
680 #endif // GPU_MULTIGRID
681  }
682 
683 } // namespace quda
dim3 dim3 blockDim
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
enum QudaFieldOrder_s QudaFieldOrder
std::complex< double > Complex
Definition: eig_variables.h:13
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
void checkLength(const ColorSpinorField &a, ColorSpinorField &b)
Definition: reduce_quda.cu:49
char * strcpy(char *__dst, const char *__src)
void FillV(ColorSpinorField &V, const std::vector< ColorSpinorField *> &B, int Nvec)
size_t size_t offset
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
const int nColor
Definition: covdev_test.cpp:77
cpuColorSpinorField * in
for(int s=0;s< param.dc.Ls;s++)
int V
Definition: test_util.cpp:28
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
int int int enum cudaChannelFormatKind f
cpuColorSpinorField * out
int Ncolor
Definition: blas_test.cu:46
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
const void int blockSize
#define printfQuda(...)
Definition: util_quda.h:84
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
void BlockOrthogonalize(ColorSpinorField &V, int Nvec, const int *geo_bs, const int *fine_to_coarse, int spin_bs)
Block orthogonnalize the matrix field, where the blocks are defined by lookup tables that map the fin...
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
static void dot(sFloat *res, gFloat *a, sFloat *b)
Definition: dslash_util.h:56