QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
block_orthogonalize.cuh
Go to the documentation of this file.
1 #include <multigrid_helper.cuh>
2 #include <fast_intdiv.h>
3 
4 // this removes ghost accessor reducing the parameter space needed
5 #define DISABLE_GHOST true // do not rename this (it is both a template parameter and a macro)
6 
8 #include <cub_helper.cuh>
9 
10 // enabling CTA swizzling improves spatial locality of MG blocks reducing cache line wastage
11 //#define SWIZZLE
12 
13 namespace quda {
14 
15 #define MAX_MATRIX_SIZE 4096
16  static __constant__ signed char B_array_d[MAX_MATRIX_SIZE];
17 
18  // to avoid overflowing the parameter space we put the B array into a separate constant memory buffer
19  static signed char B_array_h[MAX_MATRIX_SIZE];
20 
24  template <typename Rotator, typename Vector, int fineSpin, int spinBlockSize, int coarseSpin, int nVec>
25  struct BlockOrthoArg {
26  Rotator V;
27  const int *fine_to_coarse;
28  const int *coarse_to_fine;
30  const int parity; // the parity of the input field (if single parity)
31  const int nParity; // number of parities of input fine field
32  const int nBlockOrtho; // number of times we Gram-Schmidt
35  int geoBlockSizeCB; // number of geometric elements in each checkerboarded block
36  int_fastdiv swizzle; // swizzle factor for transposing blockIdx.x mapping to coarse grid coordinate
37  const Vector *B;
38  template <typename... T>
39  BlockOrthoArg(ColorSpinorField &V, const int *fine_to_coarse, const int *coarse_to_fine, int parity,
40  const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta, T... B) :
41  V(V),
42  fine_to_coarse(fine_to_coarse),
43  coarse_to_fine(coarse_to_fine),
44  spin_map(),
45  parity(parity),
46  nParity(meta.SiteSubset()),
47  nBlockOrtho(n_block_ortho),
48  B(V.Location() == QUDA_CPU_FIELD_LOCATION ? reinterpret_cast<Vector *>(B_array_h) : nullptr)
49  {
50  const Vector Btmp[nVec]{*B...};
51  if (sizeof(Btmp) > MAX_MATRIX_SIZE) errorQuda("B array size (%lu) is larger than maximum allowed (%d)\n", sizeof(Btmp), MAX_MATRIX_SIZE);
52  memcpy(B_array_h, (void *)Btmp, sizeof(Btmp));
53  int geoBlockSize = 1;
54  for (int d = 0; d < V.Ndim(); d++) geoBlockSize *= geo_bs[d];
55  geoBlockSizeCB = geoBlockSize/2;
56  coarseVolume = meta.Volume() / geoBlockSize;
57  fineVolumeCB = meta.VolumeCB();
58  if (nParity != 2) errorQuda("BlockOrtho only presently supports full fields");
59  }
60  };
61 
62  template <int nColor, typename sumType, typename real>
63  inline __device__ __host__ void colorInnerProduct(complex<sumType> &dot, int i, complex<real> v[nColor],
64  complex<real> w[nColor])
65  {
66 #pragma unroll
67  for (int c = 0; c < nColor; c++) {
68  dot.x += w[c].real() * v[c].real();
69  dot.x += w[c].imag() * v[c].imag();
70  dot.y += w[c].real() * v[c].imag();
71  dot.y -= w[c].imag() * v[c].real();
72  }
73  }
74 
75  template <int nColor, typename sumType, typename real>
76  inline __device__ __host__ void colorNorm(sumType &nrm, complex<real> v[nColor])
77  {
78 #pragma unroll
79  for (int c = 0; c < nColor; c++) {
80  nrm += v[c].real() * v[c].real();
81  nrm += v[c].imag() * v[c].imag();
82  }
83  }
84 
85  template <typename real, int nColor>
86  inline __device__ __host__ void colorScaleSubtract(complex<real> v[nColor], complex<real> a, complex<real> w[nColor])
87  {
88 #pragma unroll
89  for (int c = 0; c < nColor; c++) {
90  v[c].x -= a.real() * w[c].real();
91  v[c].x += a.imag() * w[c].imag();
92  v[c].y -= a.real() * w[c].imag();
93  v[c].y -= a.imag() * w[c].real();
94  }
95  }
96 
97  template <typename real, int nColor> inline __device__ __host__ void colorScale(complex<real> v[nColor], real a)
98  {
99 #pragma unroll
100  for (int c=0; c<nColor; c++) v[c] *= a;
101  }
102 
103 #ifndef __CUDACC_RTC___
104  template <typename sumFloat, typename Float, int nSpin, int spinBlockSize, int nColor, int coarseSpin, int nVec, typename Arg>
106 
107  // loop over geometric blocks
108 #pragma omp parallel for
109  for (int x_coarse=0; x_coarse<arg.coarseVolume; x_coarse++) {
110 
111  // loop over number of block orthos
112  for (int n = 0; n < arg.nBlockOrtho; n++) {
113  for (int j = 0; j < nVec; j++) {
114 
115  for (int i = 0; i < j; i++) {
116 
117  // compute (j,i) block inner products
118 
119  complex<sumFloat> dot[coarseSpin];
120  for (int s = 0; s < coarseSpin; s++) dot[s] = 0.0;
121  for (int parity = 0; parity < arg.nParity; parity++) {
122  parity = (arg.nParity == 2) ? parity : arg.parity;
123 
124  for (int b = 0; b < arg.geoBlockSizeCB; b++) {
125 
126  int x = arg.coarse_to_fine[(x_coarse * 2 + parity) * arg.geoBlockSizeCB + b];
127  int x_cb = x - parity * arg.fineVolumeCB;
128 
129  complex<Float> v[nSpin][nColor];
130 
131  if (n == 0) { // load from B on first Gram-Schmidt, otherwise V.
132  for (int s = 0; s < nSpin; s++)
133  for (int c = 0; c < nColor; c++) v[s][c] = arg.B[j](parity, x_cb, s, c);
134  } else {
135  for (int s = 0; s < nSpin; s++)
136  for (int c = 0; c < nColor; c++) v[s][c] = arg.V(parity, x_cb, s, c, j);
137  }
138 
139  for (int s = 0; s < nSpin; s++) {
140  complex<Float> vis[nColor];
141  for (int c = 0; c < nColor; c++) vis[c] = arg.V(parity, x_cb, s, c, i);
142  colorInnerProduct<nColor>(dot[arg.spin_map(s, parity)], i, v[s], vis);
143  }
144  }
145  }
146 
147  // subtract the i blocks to orthogonalise
148  for (int parity = 0; parity < arg.nParity; parity++) {
149  parity = (arg.nParity == 2) ? parity : arg.parity;
150 
151  for (int b = 0; b < arg.geoBlockSizeCB; b++) {
152 
153  int x = arg.coarse_to_fine[(x_coarse * 2 + parity) * arg.geoBlockSizeCB + b];
154  int x_cb = x - parity * arg.fineVolumeCB;
155 
156  complex<Float> v[nSpin][nColor];
157  if (i == 0)
158  for (int s = 0; s < nSpin; s++)
159  for (int c = 0; c < nColor; c++) v[s][c] = arg.B[j](parity, x_cb, s, c);
160  else
161  for (int s = 0; s < nSpin; s++)
162  for (int c = 0; c < nColor; c++) v[s][c] = arg.V(parity, x_cb, s, c, j);
163 
164  for (int s = 0; s < nSpin; s++) {
165  complex<Float> vis[nColor];
166  for (int c = 0; c < nColor; c++) vis[c] = arg.V(parity, x_cb, s, c, i);
167  colorScaleSubtract<Float, nColor>(v[s], static_cast<complex<Float>>(dot[arg.spin_map(s, parity)]), vis);
168  }
169 
170  for (int s = 0; s < nSpin; s++)
171  for (int c = 0; c < nColor; c++) arg.V(parity, x_cb, s, c, j) = v[s][c];
172  }
173  }
174 
175  } // i
176 
177  sumFloat nrm[coarseSpin] = {};
178  for (int parity = 0; parity < arg.nParity; parity++) {
179  parity = (arg.nParity == 2) ? parity : arg.parity;
180 
181  for (int b = 0; b < arg.geoBlockSizeCB; b++) {
182 
183  int x = arg.coarse_to_fine[(x_coarse * 2 + parity) * arg.geoBlockSizeCB + b];
184  int x_cb = x - parity * arg.fineVolumeCB;
185 
186  complex<Float> v[nSpin][nColor];
187  if (j == 0)
188  for (int s = 0; s < nSpin; s++)
189  for (int c = 0; c < nColor; c++) v[s][c] = arg.B[j](parity, x_cb, s, c);
190  else
191  for (int s = 0; s < nSpin; s++)
192  for (int c = 0; c < nColor; c++) v[s][c] = arg.V(parity, x_cb, s, c, j);
193 
194  for (int s = 0; s < nSpin; s++) { colorNorm<nColor>(nrm[arg.spin_map(s, parity)], v[s]); }
195  }
196  }
197 
198  for (int s = 0; s < coarseSpin; s++) nrm[s] = nrm[s] > 0.0 ? rsqrt(nrm[s]) : 0.0;
199 
200  for (int parity = 0; parity < arg.nParity; parity++) {
201  parity = (arg.nParity == 2) ? parity : arg.parity;
202 
203  for (int b = 0; b < arg.geoBlockSizeCB; b++) {
204 
205  int x = arg.coarse_to_fine[(x_coarse * 2 + parity) * arg.geoBlockSizeCB + b];
206  int x_cb = x - parity * arg.fineVolumeCB;
207 
208  complex<Float> v[nSpin][nColor];
209  if (j == 0)
210  for (int s = 0; s < nSpin; s++)
211  for (int c = 0; c < nColor; c++) v[s][c] = arg.B[j](parity, x_cb, s, c);
212  else
213  for (int s = 0; s < nSpin; s++)
214  for (int c = 0; c < nColor; c++) v[s][c] = arg.V(parity, x_cb, s, c, j);
215 
216  for (int s = 0; s < nSpin; s++) { colorScale<Float, nColor>(v[s], nrm[arg.spin_map(s, parity)]); }
217 
218  for (int s = 0; s < nSpin; s++)
219  for (int c = 0; c < nColor; c++) arg.V(parity, x_cb, s, c, j) = v[s][c];
220  }
221  }
222 
223  } // j
224 
225  } // n
226 
227  } // x_coarse
228  }
229 #endif
230 
231  template <int block_size, typename sumFloat, typename Float, int nSpin, int spinBlockSize, int nColor, int coarseSpin,
232  int nVec, typename Arg>
233  __launch_bounds__(2 * block_size) __global__ void blockOrthoGPU(Arg arg)
234  {
235 
236  int x_coarse = blockIdx.x;
237 #ifdef SWIZZLE
238  // the portion of the grid that is exactly divisible by the number of SMs
239  const int gridp = gridDim.x - gridDim.x % arg.swizzle;
240 
241  if (blockIdx.x < gridp) {
242  // this is the portion of the block that we are going to transpose
243  const int i = blockIdx.x % arg.swizzle;
244  const int j = blockIdx.x / arg.swizzle;
245 
246  // tranpose the coordinates
247  x_coarse = i * (gridp / arg.swizzle) + j;
248  }
249 #endif
250  int parity = (arg.nParity == 2) ? threadIdx.y + blockIdx.y*blockDim.y : arg.parity;
251  int x = arg.coarse_to_fine[ (x_coarse*2 + parity) * blockDim.x + threadIdx.x];
252  int x_cb = x - parity*arg.fineVolumeCB;
253  if (x_cb >= arg.fineVolumeCB) return;
254  int chirality = blockIdx.z; // which chiral block we're working on (if chirality is present)
255 
256  constexpr int spinBlock = nSpin / coarseSpin; // size of spin block
257  typedef cub::BlockReduce<complex<sumFloat>, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> dotReduce;
258  typedef cub::BlockReduce<sumFloat, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> normReduce;
259 
260  __shared__ typename dotReduce::TempStorage dot_storage;
261  typename normReduce::TempStorage *norm_storage = (typename normReduce::TempStorage *)&dot_storage;
262  complex<sumFloat> *dot_ = (complex<sumFloat> *)&dot_storage;
263  sumFloat *nrm_ = (sumFloat *)&dot_storage;
264 
265  // cast the constant memory buffer to a Vector array
266  typedef typename std::remove_reference<decltype(*arg.B)>::type Vector;
267  const Vector *B = reinterpret_cast<const Vector *>(B_array_d);
268 
269  // loop over number of block orthos
270  for (int n = 0; n < arg.nBlockOrtho; n++) {
271  for (int j = 0; j < nVec; j++) {
272 
273  complex<Float> v[spinBlock][nColor];
274  if (n == 0) { // load from B on first Gram-Schmidt, otherwise V.
275 #pragma unroll
276  for (int s = 0; s < spinBlock; s++)
277 #pragma unroll
278  for (int c = 0; c < nColor; c++) v[s][c] = B[j](parity, x_cb, chirality * spinBlock + s, c);
279  } else {
280 #pragma unroll
281  for (int s = 0; s < spinBlock; s++)
282 #pragma unroll
283  for (int c = 0; c < nColor; c++) v[s][c] = arg.V(parity, x_cb, chirality * spinBlock + s, c, j);
284  }
285 
286  for (int i = 0; i < j; i++) {
287 
288  complex<Float> dot = 0.0;
289 
290  // compute (j,i) block inner products
291  complex<Float> vi[spinBlock][nColor];
292 #pragma unroll
293  for (int s = 0; s < spinBlock; s++)
294 #pragma unroll
295  for (int c = 0; c < nColor; c++) vi[s][c] = arg.V(parity, x_cb, chirality * spinBlock + s, c, i);
296 
297 #pragma unroll
298  for (int s = 0; s < spinBlock; s++) { colorInnerProduct<nColor>(dot, i, v[s], vi[s]); }
299 
300  __syncthreads();
301  dot = dotReduce(dot_storage).Sum(dot);
302  if (threadIdx.x == 0 && threadIdx.y == 0) *dot_ = dot;
303  __syncthreads();
304  dot = *dot_;
305 
306  // subtract the blocks to orthogonalise
307 #pragma unroll
308  for (int s = 0; s < spinBlock; s++) { colorScaleSubtract<Float, nColor>(v[s], dot, vi[s]); }
309 
310  } // i
311 
312  // normalize the block
313  sumFloat nrm = static_cast<sumFloat>(0.0);
314 
315 #pragma unroll
316  for (int s = 0; s < spinBlock; s++) { colorNorm<nColor>(nrm, v[s]); }
317 
318  __syncthreads();
319  nrm = normReduce(*norm_storage).Sum(nrm);
320  if (threadIdx.x == 0 && threadIdx.y == 0) *nrm_ = nrm;
321  __syncthreads();
322  nrm = *nrm_;
323 
324  nrm = nrm > 0.0 ? rsqrt(nrm) : 0.0;
325 
326 #pragma unroll
327  for (int s = 0; s < spinBlock; s++) { colorScale<Float, nColor>(v[s], nrm); }
328 
329 #pragma unroll
330  for (int s = 0; s < spinBlock; s++)
331 #pragma unroll
332  for (int c = 0; c < nColor; c++) arg.V(parity, x_cb, chirality * spinBlock + s, c, j) = v[s][c];
333 
334  } // j
335  } // n
336  }
337 
338 } // namespace quda
#define errorQuda(...)
Definition: util_quda.h:121
BlockOrthoArg(ColorSpinorField &V, const int *fine_to_coarse, const int *coarse_to_fine, int parity, const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta, T... B)
const spin_mapper< fineSpin, coarseSpin > spin_map
void blockOrthoCPU(Arg &arg)
__device__ __host__ void colorNorm(sumType &nrm, complex< real > v[nColor])
const int nColor
Definition: covdev_test.cpp:75
static __constant__ signed char B_array_d[MAX_MATRIX_SIZE]
__device__ __host__ void colorScaleSubtract(complex< real > v[nColor], complex< real > a, complex< real > w[nColor])
#define MAX_MATRIX_SIZE
__device__ __host__ void colorScale(complex< real > v[nColor], real a)
const int nParity
Definition: spinor_noise.cu:25
__shared__ float s[]
static signed char B_array_h[MAX_MATRIX_SIZE]
__device__ __host__ void colorInnerProduct(complex< sumType > &dot, int i, complex< real > v[nColor], complex< real > w[nColor])
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
colorspinor::FieldOrderCB< real, Ns, Nc, 1, order > V
Definition: spinor_noise.cu:23
VectorXcd Vector
int n_block_ortho[QUDA_MAX_MG_LEVEL]
Definition: test_util.cpp:1673
__launch_bounds__(2 *block_size) __global__ void blockOrthoGPU(Arg arg)
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
static void dot(sFloat *res, gFloat *a, sFloat *b)
Definition: dslash_util.h:56