dslash_coarse.cuh (QUDA 1.0.0)
#include <gauge_field_order.h>
#include <color_spinor_field_order.h>
#include <index_helper.cuh>
#include <cub_helper.cuh> // for vector_type
#if (__COMPUTE_CAPABILITY__ >= 300 || __CUDA_ARCH__ >= 300)
#include <generics/shfl.h>
#endif

// splitting the dot-product between threads is buggy with CUDA 7.0
#if __COMPUTE_CAPABILITY__ >= 300 && CUDA_VERSION >= 7050
#define DOT_PRODUCT_SPLIT
#endif

namespace quda {

  enum DslashType {
    DSLASH_INTERIOR,
    DSLASH_EXTERIOR,
    DSLASH_FULL
  };

  template <typename Float, typename yFloat, typename ghostFloat, int coarseSpin, int coarseColor, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder>
  struct DslashCoarseArg {
    typedef typename colorspinor::FieldOrderCB<Float,coarseSpin,coarseColor,1,csOrder,Float,ghostFloat> F;
    typedef typename gauge::FieldOrder<Float,coarseColor*coarseSpin,coarseSpin,gOrder,true,yFloat> GY;

    F out;
    const F inA;
    const F inB;
    const GY Y;
    const GY X;
    const Float kappa;
    const int parity; // only use this for single parity fields
    const int nParity; // number of parities we're working on
    const int nFace; // hard code to 1 for now
    const int_fastdiv X0h; // X[0]/2
    const int_fastdiv dim[5]; // full lattice dimensions
    const int commDim[4]; // whether a given dimension is partitioned or not
    const int volumeCB;

    DslashCoarseArg(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                    const GaugeField &Y, const GaugeField &X, Float kappa, int parity)
      : out(const_cast<ColorSpinorField&>(out)), inA(const_cast<ColorSpinorField&>(inA)),
        inB(const_cast<ColorSpinorField&>(inB)), Y(const_cast<GaugeField&>(Y)),
        X(const_cast<GaugeField&>(X)), kappa(kappa), parity(parity),
        nParity(out.SiteSubset()), nFace(1), X0h( ((3-nParity) * out.X(0)) /2),
        dim{ (3-nParity) * out.X(0), out.X(1), out.X(2), out.X(3), out.Ndim() == 5 ? out.X(4) : 1 },
        commDim{ comm_dim_partitioned(0), comm_dim_partitioned(1), comm_dim_partitioned(2), comm_dim_partitioned(3) },
        volumeCB(out.VolumeCB()/dim[4])
    { }
  };
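  // Usage sketch (editorial addition, not part of the original header): the Arg
  // struct above bundles the field accessors and geometry for one application of
  // the coarse operator; it is built on the host and passed by value to the
  // kernels below.  The template values here (double precision, 2 coarse spins,
  // 24 coarse colors, float2 field orders) are purely illustrative.
  //
  //   using Arg = DslashCoarseArg<double, double, double, 2, 24,
  //                               QUDA_FLOAT2_FIELD_ORDER, QUDA_FLOAT2_GAUGE_ORDER>;
  //   Arg arg(out, inA, inB, Y, X, kappa, parity);
  //   // schematically, the tuned host launcher then invokes something like
  //   // coarseDslashKernel<double,4,2,24,Mc,...><<<grid, block, shared_bytes>>>(arg);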

  /**
     Helper function to determine if we should do halo computation
  */
  template <DslashType type>
  static __host__ __device__ bool doHalo() {
    switch(type) {
    case DSLASH_EXTERIOR:
    case DSLASH_FULL:
      return true;
    default:
      return false;
    }
  }

  /**
     Helper function to determine if we should do interior computation
  */
  template <DslashType type>
  static __host__ __device__ bool doBulk() {
    switch(type) {
    case DSLASH_INTERIOR:
    case DSLASH_FULL:
      return true;
    default:
      return false;
    }
  }

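  /**
     applyDslash accumulates the coarse hopping term for a single checkerboard
     site x_cb.  Depending on thread_dir it performs the forward (+mu) or
     backward (-mu) gather for the dimensions assigned to this thread,
     contracting the neighbor spinor with the coarse link Y; sites on a
     partitioned boundary read from the ghost buffers instead.  On the device,
     the partial sums of the direction/dimension-split threads are recombined
     through the dynamically allocated shared-memory buffer s[].
  */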
  extern __shared__ float s[];
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_stride, int thread_dir, int thread_dim, bool dagger, DslashType type, typename Arg>
  __device__ __host__ inline void applyDslash(complex<Float> out[], Arg &arg, int x_cb, int src_idx, int parity, int s_row, int color_block, int color_offset) {
    const int their_spinor_parity = (arg.nParity == 2) ? 1-parity : 0;

    int coord[5];
    getCoordsCB(coord, x_cb, arg.dim, arg.X0h, parity);
    coord[4] = src_idx;

#ifdef __CUDA_ARCH__
    complex<Float> *shared_sum = (complex<Float>*)s;
    if (!thread_dir) {
#endif

      //Forward gather - compute fwd offset for spinor fetch
#pragma unroll
      for(int d = thread_dim; d < nDim; d+=dim_stride) // loop over dimension
      {
        const int fwd_idx = linkIndexP1(coord, arg.dim, d);

        if ( arg.commDim[d] && (coord[d] + arg.nFace >= arg.dim[d]) ) {
          if (doHalo<type>()) {
            int ghost_idx = ghostFaceIndex<1, 5>(coord, arg.dim, d, arg.nFace);

#pragma unroll
            for(int color_local = 0; color_local < Mc; color_local++) { //Color row
              int c_row = color_block + color_local; // global color index
              int row = s_row*Nc + c_row;
#pragma unroll
              for(int s_col = 0; s_col < Ns; s_col++) { //Spin column
#pragma unroll
                for(int c_col = 0; c_col < Nc; c_col+=color_stride) { //Color column
                  int col = s_col*Nc + c_col + color_offset;
                  if (!dagger)
                    out[color_local] += arg.Y(d+4, parity, x_cb, row, col)
                      * arg.inA.Ghost(d, 1, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                  else
                    out[color_local] += arg.Y(d, parity, x_cb, row, col)
                      * arg.inA.Ghost(d, 1, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                }
              }
            }
          }
        } else if (doBulk<type>()) {
#pragma unroll
          for(int color_local = 0; color_local < Mc; color_local++) { //Color row
            int c_row = color_block + color_local; // global color index
            int row = s_row*Nc + c_row;
#pragma unroll
            for(int s_col = 0; s_col < Ns; s_col++) { //Spin column
#pragma unroll
              for(int c_col = 0; c_col < Nc; c_col+=color_stride) { //Color column
                int col = s_col*Nc + c_col + color_offset;
                if (!dagger)
                  out[color_local] += arg.Y(d+4, parity, x_cb, row, col)
                    * arg.inA(their_spinor_parity, fwd_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                else
                  out[color_local] += arg.Y(d, parity, x_cb, row, col)
                    * arg.inA(their_spinor_parity, fwd_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
              }
            }
          }
        }

      } // nDim

#if defined(__CUDA_ARCH__)
    if (thread_dim > 0) { // only need to write to shared memory if not master thread
#pragma unroll
      for (int color_local=0; color_local < Mc; color_local++) {
        shared_sum[((color_local * blockDim.z + threadIdx.z )*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x] = out[color_local];
      }
    }
#endif

#ifdef __CUDA_ARCH__
    } else {
#endif

      //Backward gather - compute back offset for spinor and gauge fetch
#pragma unroll
      for(int d = thread_dim; d < nDim; d+=dim_stride)
      {
        const int back_idx = linkIndexM1(coord, arg.dim, d);
        const int gauge_idx = back_idx;
        if ( arg.commDim[d] && (coord[d] - arg.nFace < 0) ) {
          if (doHalo<type>()) {
            const int ghost_idx = ghostFaceIndex<0, 5>(coord, arg.dim, d, arg.nFace);
#pragma unroll
            for (int color_local=0; color_local<Mc; color_local++) {
              int c_row = color_block + color_local;
              int row = s_row*Nc + c_row;
#pragma unroll
              for (int s_col=0; s_col<Ns; s_col++)
#pragma unroll
                for (int c_col=0; c_col<Nc; c_col+=color_stride) {
                  int col = s_col*Nc + c_col + color_offset;
                  if (!dagger)
                    out[color_local] += conj(arg.Y.Ghost(d, 1-parity, ghost_idx, col, row))
                      * arg.inA.Ghost(d, 0, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                  else
                    out[color_local] += conj(arg.Y.Ghost(d+4, 1-parity, ghost_idx, col, row))
                      * arg.inA.Ghost(d, 0, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                }
            }
          }
        } else if (doBulk<type>()) {
#pragma unroll
          for(int color_local = 0; color_local < Mc; color_local++) {
            int c_row = color_block + color_local;
            int row = s_row*Nc + c_row;
#pragma unroll
            for(int s_col = 0; s_col < Ns; s_col++)
#pragma unroll
              for(int c_col = 0; c_col < Nc; c_col+=color_stride) {
                int col = s_col*Nc + c_col + color_offset;
                if (!dagger)
                  out[color_local] += conj(arg.Y(d, 1-parity, gauge_idx, col, row))
                    * arg.inA(their_spinor_parity, back_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
                else
                  out[color_local] += conj(arg.Y(d+4, 1-parity, gauge_idx, col, row))
                    * arg.inA(their_spinor_parity, back_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
              }
          }
        }

      } //nDim

#if defined(__CUDA_ARCH__)

#pragma unroll
      for (int color_local=0; color_local < Mc; color_local++) {
        shared_sum[ ((color_local * blockDim.z + threadIdx.z )*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x] = out[color_local];
      }

    } // forwards / backwards thread split
#endif

#ifdef __CUDA_ARCH__ // CUDA path has to recombine the forward and backward results
    __syncthreads();

    // (colorspin * dim_stride + dim * 2 + dir)
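    // Each thread wrote its partial sum at
    //   ((color_local*blockDim.z + threadIdx.z)*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x,
    // so for a fixed (x,y) thread the blockDim.z slots hold the contributions of its
    // dimension/direction siblings, which the master thread (thread_dim==0, thread_dir==0)
    // now sums.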
    if (thread_dim == 0 && thread_dir == 0) {

      // full split over dimension and direction
#pragma unroll
      for (int d=1; d<dim_stride; d++) { // get remaining forward gathers (if any)
        // 4-way 1,2,3 (stride = 4)
        // 2-way 1 (stride = 2)
#pragma unroll
        for (int color_local=0; color_local < Mc; color_local++) {
          out[color_local] +=
            shared_sum[(((color_local*blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride + d * 2 + 0)*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x];
        }
      }

#pragma unroll
      for (int d=0; d<dim_stride; d++) { // get all backward gathers
#pragma unroll
        for (int color_local=0; color_local < Mc; color_local++) {
          out[color_local] +=
            shared_sum[(((color_local*blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride + d * 2 + 1)*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x];
        }
      }

      // apply kappa
#pragma unroll
      for (int color_local=0; color_local<Mc; color_local++) out[color_local] *= -arg.kappa;

    }

#else // !__CUDA_ARCH__
    for (int color_local=0; color_local<Mc; color_local++) out[color_local] *= -arg.kappa;
#endif

  }

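  /**
     applyClover applies the local coarse clover term: the X matrix (which
     already incorporates the kappa factor and the diagonal) acting on the
     input vector inB at site x_cb, accumulating into out[].
  */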
  template <typename Float, int Ns, int Nc, int Mc, int color_stride, bool dagger, typename Arg>
  __device__ __host__ inline void applyClover(complex<Float> out[], Arg &arg, int x_cb, int src_idx, int parity, int s, int color_block, int color_offset) {
    const int spinor_parity = (arg.nParity == 2) ? parity : 0;

    // M is number of colors per thread
#pragma unroll
    for(int color_local = 0; color_local < Mc; color_local++) { //Color out
      int c = color_block + color_local; // global color index
      int row = s*Nc + c;
#pragma unroll
      for (int s_col = 0; s_col < Ns; s_col++) //Spin in
#pragma unroll
        for (int c_col = 0; c_col < Nc; c_col+=color_stride) { //Color in
          //Factor of kappa and diagonal addition now incorporated in X
          int col = s_col*Nc + c_col + color_offset;
          if (!dagger) {
            out[color_local] += arg.X(0, parity, x_cb, row, col)
              * arg.inB(spinor_parity, x_cb+src_idx*arg.volumeCB, s_col, c_col+color_offset);
          } else {
            out[color_local] += conj(arg.X(0, parity, x_cb, col, row))
              * arg.inB(spinor_parity, x_cb+src_idx*arg.volumeCB, s_col, c_col+color_offset);
          }
        }
    }

  }

  //out(x) = M*in = \sum_mu Y_{-\mu}(x)in(x+mu) + Y^\dagger_mu(x-mu)in(x-mu)
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_thread_split,
            bool dslash, bool clover, bool dagger, DslashType type, int dir, int dim, typename Arg>
  __device__ __host__ inline void coarseDslash(Arg &arg, int x_cb, int src_idx, int parity, int s, int color_block, int color_offset)
  {
    vector_type<complex<Float>, Mc> out;
    if (dslash) applyDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dir,dim,dagger,type>(out.data, arg, x_cb, src_idx, parity, s, color_block, color_offset);
    if (doBulk<type>() && clover && dir==0 && dim==0) applyClover<Float,Ns,Nc,Mc,color_stride,dagger>(out.data, arg, x_cb, src_idx, parity, s, color_block, color_offset);

    if (dir==0 && dim==0) {
      const int my_spinor_parity = (arg.nParity == 2) ? parity : 0;
#if __CUDA_ARCH__ >= 300 // only have warp shuffle on Kepler and above

#pragma unroll
      for (int color_local=0; color_local<Mc; color_local++) {
        // reduce down to the first group of column-split threads
        constexpr int warp_size = 32; // FIXME - this is buggy when x-dim * color_stride < 32
#pragma unroll
        for (int offset = warp_size/2; offset >= warp_size/color_stride; offset /= 2)
#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
#define WARP_CONVERGED 0xffffffff // we know warp should be converged here
          out[color_local] += __shfl_down_sync(WARP_CONVERGED, out[color_local], offset);
#else
          out[color_local] += __shfl_down(out[color_local], offset);
#endif
      }

#endif // __CUDA_ARCH__ >= 300
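    // The shuffle reduction above folds the color_stride-way column split of a
    // site back onto the lanes with color_offset == 0 (groups of
    // warp_size/color_stride lanes); only those lanes store or accumulate below.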

#pragma unroll
      for (int color_local=0; color_local<Mc; color_local++) {
        int c = color_block + color_local; // global color index
        if (color_offset == 0) {
          // if not halo we just store, else we accumulate
          if (doBulk<type>()) arg.out(my_spinor_parity, x_cb+src_idx*arg.volumeCB, s, c) = out[color_local];
          else arg.out(my_spinor_parity, x_cb+src_idx*arg.volumeCB, s, c) += out[color_local];
        }
      }
    }

  }

  // CPU kernel for applying the coarse Dslash to a vector
  template <typename Float, int nDim, int Ns, int Nc, int Mc, bool dslash, bool clover, bool dagger, DslashType type, typename Arg>
  void coarseDslash(Arg arg)
  {
    // the fine-grained parameters are meaningless for the CPU variant
    const int color_stride = 1;
    const int color_offset = 0;
    const int dim_thread_split = 1;
    const int dir = 0;
    const int dim = 0;

    for (int parity = 0; parity < arg.nParity; parity++) {
      // for full fields set parity from the loop, else use the arg setting
      parity = (arg.nParity == 2) ? parity : arg.parity;

      for (int src_idx = 0; src_idx < arg.dim[4]; src_idx++) {
        //#pragma omp parallel for
        for(int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d volume
          for (int s=0; s<2; s++) {
            for (int color_block=0; color_block<Nc; color_block+=Mc) { // Mc=Nc means all colors in a thread
              coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,dir,dim>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
            }
          }
        } // 4-d volumeCB
      } // src index
    } // parity

  }

  // GPU kernel for applying the coarse Dslash to a vector
  template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_thread_split, bool dslash, bool clover, bool dagger, DslashType type, typename Arg>
  __global__ void coarseDslashKernel(Arg arg)
  {
    constexpr int warp_size = 32;
    const int lane_id = threadIdx.x % warp_size;
    const int warp_id = threadIdx.x / warp_size;
    const int vector_site_width = warp_size / color_stride;

    int x_cb = blockIdx.x*(blockDim.x/color_stride) + warp_id*(warp_size/color_stride) + lane_id % vector_site_width;

    const int color_offset = lane_id / vector_site_width;

    // for full fields set parity from the y thread index, else use the arg setting
#if 0 // disable multi-src since this has a measurable impact on single src performance
    int paritySrc = blockDim.y*blockIdx.y + threadIdx.y;
    if (paritySrc >= arg.nParity * arg.dim[4]) return;
    const int src_idx = (arg.nParity == 2) ? paritySrc / 2 : paritySrc; // maybe we want to swap the order of source and parity for improved locality of same parity
    const int parity = (arg.nParity == 2) ? paritySrc % 2 : arg.parity;
#else
    const int src_idx = 0;
    const int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
#endif

    // z thread dimension is (( s*(Nc/Mc) + color_block )*dim_thread_split + dim)*2 + dir
    int sMd = blockDim.z*blockIdx.z + threadIdx.z;
    int dir = sMd & 1;
    int sMdim = sMd >> 1;
    int dim = sMdim % dim_thread_split;
    int sM = sMdim / dim_thread_split;
    int s = sM / (Nc/Mc);
    int color_block = (sM % (Nc/Mc)) * Mc;
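    // Worked example (illustrative values): with Nc/Mc = 2 color blocks and
    // dim_thread_split = 2, the z index spans 2*2*2*2 = 16 values; sMd = 13
    // decodes to dir = 1, dim = 0, s = 1, color_block = Mc, matching the
    // encoding in the comment above.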

    if (x_cb >= arg.volumeCB) return;

    if (dir == 0) {
      if (dim == 0)      coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,0>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,1>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,2>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,3>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    } else if (dir == 1) {
      if (dim == 0)      coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,0>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,1>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,2>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
      else if (dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,3>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    }
  }

} // namespace quda