QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
block_orthogonalize.cu
Go to the documentation of this file.
1 #include <color_spinor_field.h>
2 #include <tune_quda.h>
3 #include <uint_to_char.h>
4 #include <typeinfo>
5 #include <vector>
6 #include <assert.h>
7 
8 // ensure compatibilty with C++11
9 #if __cplusplus >= 201402L
10 #include <utility>
11 #else
12 #include <integer_sequence.hpp> // C++11 version of this C++14 feature
13 #endif
14 
15 #include <launch_kernel.cuh>
16 
17 #include <jitify_helper.cuh>
18 
19 #ifdef GPU_MULTIGRID
21 #endif
22 
23 namespace quda {
24 
25 #ifdef GPU_MULTIGRID
26 
27  using namespace quda::colorspinor;
28 
29  template <typename sumType, typename vFloat, typename bFloat, int nSpin, int spinBlockSize, int nColor_, int coarseSpin, int nVec>
30  class BlockOrtho : public Tunable {
31 
32  // we only support block-format on fine grid where Ncolor=3
33  static constexpr int nColor = isFixed<bFloat>::value ? 3 : nColor_;
34 
35  typedef typename mapper<vFloat>::type RegType;
36  ColorSpinorField &V;
37  const std::vector<ColorSpinorField*> B;
38  const int *fine_to_coarse;
39  const int *coarse_to_fine;
40  const int *geo_bs;
41  const int n_block_ortho;
42  int geoBlockSize;
43  int nBlock;
44 
45  unsigned int sharedBytesPerThread() const { return 0; }
46  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
47  unsigned int minThreads() const { return V.VolumeCB(); } // fine parity is the block y dimension
48 
49  public:
50  BlockOrtho(ColorSpinorField &V, const std::vector<ColorSpinorField *> B, const int *fine_to_coarse,
51  const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho) :
52  V(V),
53  B(B),
54  fine_to_coarse(fine_to_coarse),
55  coarse_to_fine(coarse_to_fine),
56  geo_bs(geo_bs),
57  n_block_ortho(n_block_ortho)
58  {
59  if (nColor_ != nColor)
60  errorQuda("Number of colors %d not supported with this precision %lu\n", nColor_, sizeof(bFloat));
61 
62  if (V.Location() == QUDA_CUDA_FIELD_LOCATION) {
63 #ifdef JITIFY
64  create_jitify_program("kernels/block_orthogonalize.cuh");
65 #endif
66  }
67  strcat(aux, compile_type_str(V));
68  strcat(aux, V.AuxString());
69  strcat(aux,",block_size=");
70 
71  geoBlockSize = 1;
72  char geo_str[16];
73  for (int d = 0; d < V.Ndim(); d++) {
74  geoBlockSize *= geo_bs[d];
75  i32toa(geo_str, geo_bs[d]);
76  strcat(aux, geo_str);
77  if (d < V.Ndim() - 1) strcat(aux, "x");
78  }
79 
80  strcat(aux, ",n_block_ortho=");
81  char n_ortho_str[2];
82  i32toa(n_ortho_str, n_block_ortho);
83  strcat(aux, n_ortho_str);
84 
85  if (V.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());
86 
87  int chiralBlocks = (nSpin==1) ? 2 : V.Nspin() / spinBlockSize; //always 2 for staggered.
88  nBlock = (V.Volume()/geoBlockSize) * chiralBlocks;
89  }
90 
91  virtual ~BlockOrtho() { }
92 
99  template <typename Rotator, typename Vector, std::size_t... S>
100  void CPU(const std::vector<ColorSpinorField*> &B, std::index_sequence<S...>) {
101  typedef BlockOrthoArg<Rotator,Vector,nSpin,spinBlockSize,coarseSpin,nVec> Arg;
102  Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...);
103  blockOrthoCPU<sumType,RegType,nSpin,spinBlockSize,nColor,coarseSpin,nVec,Arg>(arg);
104  }
105 
112  template <typename Rotator, typename Vector, std::size_t... S>
113  void GPU(const TuneParam &tp, const cudaStream_t &stream, const std::vector<ColorSpinorField*> &B, std::index_sequence<S...>) {
114  typedef typename mapper<vFloat>::type RegType; // need to redeclare typedef (WAR for CUDA 7 and 8)
115  typedef BlockOrthoArg<Rotator,Vector,nSpin,spinBlockSize,coarseSpin,nVec> Arg;
116  Arg arg(V, fine_to_coarse, coarse_to_fine, QUDA_INVALID_PARITY, geo_bs, n_block_ortho, V, B[S]...);
117  arg.swizzle = tp.aux.x;
118 #ifdef JITIFY
119  using namespace jitify::reflection;
120  auto instance = program->kernel("quda::blockOrthoGPU")
121  .instantiate((int)tp.block.x,Type<sumType>(),Type<RegType>(),nSpin,spinBlockSize,nColor,coarseSpin,nVec,Type<Arg>());
122  cuMemcpyHtoDAsync(instance.get_constant_ptr("quda::B_array_d"), B_array_h, MAX_MATRIX_SIZE, stream);
123  jitify_error = instance.configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
124 #else
125  cudaMemcpyToSymbolAsync(B_array_d, B_array_h, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, stream);
126  LAUNCH_KERNEL_MG_BLOCK_SIZE(blockOrthoGPU,tp,stream,arg,sumType,RegType,nSpin,spinBlockSize,nColor,coarseSpin,nVec,Arg);
127 #endif
128  }
129 
130  void apply(const cudaStream_t &stream) {
131  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
132  if (V.Location() == QUDA_CPU_FIELD_LOCATION) {
133  if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER && B[0]->FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
136  CPU<Rotator,Vector>(B, std::make_index_sequence<nVec>());
137  } else {
138  errorQuda("Unsupported field order %d\n", V.FieldOrder());
139  }
140  } else {
141 #if __COMPUTE_CAPABILITY__ >= 300
142  if (V.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && B[0]->FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
145  GPU<Rotator,Vector>(tp,stream,B,std::make_index_sequence<nVec>());
146  } else {
147  errorQuda("Unsupported field order V=%d B=%d\n", V.FieldOrder(), B[0]->FieldOrder());
148  }
149 #else
150  errorQuda("GPU block orthogonalization not supported on this GPU architecture");
151 #endif
152  }
153  }
154 
155  bool advanceAux(TuneParam &param) const
156  {
157 #ifdef SWIZZLE
158  if (param.aux.x < 2*deviceProp.multiProcessorCount) {
159  param.aux.x++;
160  return true;
161  } else {
162  param.aux.x = 1;
163  return false;
164  }
165 #else
166  return false;
167 #endif
168  }
169 
170  bool advanceTuneParam(TuneParam &param) const {
171  if (V.Location() == QUDA_CUDA_FIELD_LOCATION) {
172  return advanceSharedBytes(param) || advanceAux(param);
173  } else {
174  return false;
175  }
176  }
177 
178  TuneKey tuneKey() const { return TuneKey(V.VolString(), typeid(*this).name(), aux); }
179 
180  void initTuneParam(TuneParam &param) const { defaultTuneParam(param); }
181 
183  void defaultTuneParam(TuneParam &param) const {
184  param.block = dim3(geoBlockSize/2, V.SiteSubset(), 1);
185  param.grid = dim3((minThreads() + param.block.x - 1) / param.block.x, 1, coarseSpin);
186  param.shared_bytes = 0;
187  param.aux.x = 1; // swizzle factor
188  }
189 
190  long long flops() const
191  {
192  return n_block_ortho * nBlock * (geoBlockSize / 2) * (spinBlockSize == 0 ? 1 : 2 * spinBlockSize) / 2 * nColor
193  * (nVec * ((nVec - 1) * (8l + 8l)) + 6l);
194  }
195 
196  long long bytes() const
197  {
198  return nVec * B[0]->Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes()
199  + (n_block_ortho - 1) * (V.Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes());
200  }
201 
202  char *saveOut, *saveOutNorm;
203 
204  void preTune() { V.backup(); }
205  void postTune() { V.restore(); }
206 
207  };
208 
209  template <typename vFloat, typename bFloat, int nSpin, int spinBlockSize, int nColor, int nVec>
210  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
211  const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho)
212  {
213 
214  int geo_blocksize = 1;
215  for (int d = 0; d < V.Ndim(); d++) geo_blocksize *= geo_bs[d];
216 
217  int blocksize = geo_blocksize * V.Ncolor();
218  if (spinBlockSize == 0) { blocksize /= 2; } else { blocksize *= spinBlockSize; }
219  int chiralBlocks = (spinBlockSize == 0) ? 2 : V.Nspin() / spinBlockSize; //always 2 for staggered.
220  int numblocks = (V.Volume()/geo_blocksize) * chiralBlocks;
221  constexpr int coarseSpin = (nSpin == 4 || nSpin == 2 || spinBlockSize == 0) ? 2 : 1;
222 
223  if (getVerbosity() >= QUDA_VERBOSE)
224  printfQuda("Block Orthogonalizing %d blocks of %d length and width %d repeating %d times\n", numblocks, blocksize,
225  nVec, n_block_ortho);
226 
227  V.Scale(1.0); // by definition this is true
228  BlockOrtho<double, vFloat, bFloat, nSpin, spinBlockSize, nColor, coarseSpin, nVec> ortho(
229  V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
230  ortho.apply(0);
231  checkCudaError();
232  }
233 
234  template <typename vFloat, typename bFloat, int nSpin, int spinBlockSize>
235  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
236  const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho)
237  {
238 
239  const int Nvec = B.size();
240  if (V.Ncolor()/Nvec == 3) {
241 
242  constexpr int nColor = 3;
243  if (Nvec == 6) { // for Wilson free field
244  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 6>(V, B, fine_to_coarse, coarse_to_fine,
245  geo_bs, n_block_ortho);
246  } else if (Nvec == 24) {
247  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 24>(V, B, fine_to_coarse, coarse_to_fine,
248  geo_bs, n_block_ortho);
249  } else if (Nvec == 32) {
250  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
251  geo_bs, n_block_ortho);
252  } else if (Nvec == 48) {
253  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 48>(V, B, fine_to_coarse, coarse_to_fine,
254  geo_bs, n_block_ortho);
255  } else {
256  errorQuda("Unsupported nVec %d\n", Nvec);
257  }
258 
259  } else if (V.Ncolor()/Nvec == 6) {
260 
261  constexpr int nColor = 6;
262  if (Nvec == 6) {
263  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 6>(V, B, fine_to_coarse, coarse_to_fine,
264  geo_bs, n_block_ortho);
265  } else {
266  errorQuda("Unsupported nVec %d\n", Nvec);
267  }
268 
269  } else if (V.Ncolor()/Nvec == 24) {
270 
271  constexpr int nColor = 24;
272  if (Nvec == 24) {
273  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 24>(V, B, fine_to_coarse, coarse_to_fine,
274  geo_bs, n_block_ortho);
275  } else if (Nvec == 32) {
276  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
277  geo_bs, n_block_ortho);
278  } else {
279  errorQuda("Unsupported nVec %d\n", Nvec);
280  }
281 
282  } else if (V.Ncolor()/Nvec == 32) {
283 
284  constexpr int nColor = 32;
285  if (Nvec == 32) {
286  BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
287  geo_bs, n_block_ortho);
288  } else {
289  errorQuda("Unsupported nVec %d\n", Nvec);
290  }
291  } else {
292  errorQuda("Unsupported nColor %d\n", V.Ncolor()/Nvec);
293  }
294  }
295 
296  template <typename vFloat, typename bFloat>
297  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
298  const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
299  {
300  if(V.Nspin() ==2 && spin_bs == 1) { //coarsening coarse fermions w/ chirality.
301  BlockOrthogonalize<vFloat, bFloat, 2, 1>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
302 #ifdef GPU_WILSON_DIRAC
303  } else if (V.Nspin() == 4 && spin_bs == 2) { // coarsening Wilson-like fermions.
304  BlockOrthogonalize<vFloat, bFloat, 4, 2>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
305 #endif
306 #ifdef GPU_STAGGERED_DIRAC
307  } else if (V.Nspin() == 1 && spin_bs == 1) { // coarsening Laplace-like operators.
308  BlockOrthogonalize<vFloat, bFloat, 1, 1>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
309 #endif
310  } else {
311  errorQuda("Unsupported nSpin %d and spinBlockSize %d combination.\n", V.Nspin(), spin_bs);
312  }
313  }
314 
315 #endif // GPU_MULTIGRID
316 
317  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
318  const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
319  {
320 #ifdef GPU_MULTIGRID
321  if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0]->Precision() == QUDA_DOUBLE_PRECISION) {
322 #ifdef GPU_MULTIGRID_DOUBLE
323  BlockOrthogonalize<double>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
324 #else
325  errorQuda("Double precision multigrid has not been enabled");
326 #endif
327  } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) {
328  BlockOrthogonalize<float, float>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
329  } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) {
330 #if QUDA_PRECISION & 2
331  BlockOrthogonalize<short, float>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
332 #else
333  errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
334 #endif
335  } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_HALF_PRECISION) {
336 #if QUDA_PRECISION & 2
337  BlockOrthogonalize<short, short>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
338 #else
339  errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
340 #endif
341  } else {
342  errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0]->Precision());
343  }
344 #else
345  errorQuda("Multigrid has not been built");
346 #endif // GPU_MULTIGRID
347  }
348 
349 } // namespace quda
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
Helper file when using jitify run-time compilation. This file should be included in source code...
cudaStream_t * stream
static const bool value
const char * compile_type_str(const LatticeField &meta, QudaFieldLocation location_=QUDA_INVALID_FIELD_LOCATION)
Helper function for setting auxilary string.
void i32toa(char *buffer, int32_t value)
Definition: uint_to_char.h:117
QudaGaugeParam param
Definition: pack_test.cpp:17
const int nColor
Definition: covdev_test.cpp:75
This is just a dummy structure we use for trove to define the required structure size.
static __constant__ signed char B_array_d[MAX_MATRIX_SIZE]
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
#define MAX_MATRIX_SIZE
char * getOmpThreadStr()
Returns a string of the form ",omp_threads=$OMP_NUM_THREADS", which can be used for storing the numbe...
Definition: util_quda.cpp:134
int V
Definition: test_util.cpp:27
static signed char B_array_h[MAX_MATRIX_SIZE]
#define printfQuda(...)
Definition: util_quda.h:115
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
VectorXcd Vector
#define checkCudaError()
Definition: util_quda.h:161
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
int n_block_ortho[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1673
QudaPrecision Precision() const
#define LAUNCH_KERNEL_MG_BLOCK_SIZE(kernel, tp, stream, arg,...)
unsigned long long bytes
Definition: blas_quda.cu:23
void BlockOrthogonalize(ColorSpinorField &V, const std::vector< ColorSpinorField *> &B, const int *fine_to_coarse, const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
Block orthogonnalize the matrix field, where the blocks are defined by lookup tables that map the fin...