QUDA 1.0.0
color_spinor_pack.cuh
#include <index_helper.cuh>
#include <fast_intdiv.h>

namespace quda {

  template <typename Field>
  struct PackGhostArg {

    Field field;
    int_fastdiv X[QUDA_MAX_DIM];
    const int_fastdiv volumeCB;
    const int nDim;
    const int nFace;
    const int parity;
    const int nParity;
    const int dagger;
    const QudaPCType pc_type;
    int commDim[4]; // whether a given dimension is partitioned or not
    int_fastdiv nParity2dim_threads;

    PackGhostArg(Field field, const ColorSpinorField &a, int parity, int nFace, int dagger) :
      field(field),
      volumeCB(a.VolumeCB()),
      nDim(a.Ndim()),
      nFace(nFace),
      parity(parity),
      nParity(a.SiteSubset()),
      dagger(dagger),
      pc_type(a.PCType())
    {
      X[0] = ((nParity == 1) ? 2 : 1) * a.X(0); // set to full lattice dimensions
      for (int d=1; d<nDim; d++) X[d] = a.X(d);
      X[4] = (nDim == 5) ? a.X(4) : 1; // set fifth dimension correctly
      for (int i=0; i<4; i++) {
        commDim[i] = comm_dim_partitioned(i);
      }
    }
  };

  // this is the maximum number of colors for which we support block-float format
#define MAX_BLOCK_FLOAT_NC 32

  /**
     Compute the max element over the spin-color components of a given site.
  */
  template <typename Float, int Ns, int Ms, int Nc, int Mc, typename Arg>
  __device__ __host__ __forceinline__ Float compute_site_max(Arg &arg, int x_cb, int parity, int spinor_parity, int spin_block, int color_block, bool active) {
49 
50  Float thread_max = 0.0;
51  Float site_max = active ? 0.0 : 1.0;
52 
53 #ifdef __CUDA_ARCH__
54  // workout how big a shared-memory allocation we need
55  // just statically compute the largest size needed to avoid templating on block size
56  constexpr int max_block_size = 1024; // all supported GPUs have 1024 as their max block size
57  constexpr int bank_width = 32; // shared memory has 32 banks
58  constexpr int color_spin_threads = Nc <= MAX_BLOCK_FLOAT_NC ? (Ns/Ms) * (Nc/Mc) : 1;
59  // this is the largest size of blockDim.x (rounded up to multiples of bank_width)
60  constexpr int thread_width_x = ( (max_block_size / color_spin_threads + bank_width-1) / bank_width) * bank_width;
61  __shared__ Float v[ (Ns/Ms) * (Nc/Mc) * thread_width_x];
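    // Illustrative sizing (example values, not from the source): for Ns=4, Ms=1,
    // Nc=3, Mc=1 this gives color_spin_threads = (4/1)*(3/1) = 12 and
    // thread_width_x = ((1024/12 + 31)/32)*32 = ((85 + 31)/32)*32 = 96, so v
    // holds 12*96 = 1152 elements, i.e. 4.5 KiB when Float = float.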
    const auto &rhs = arg.field;
    if (active) {
#pragma unroll
      for (int spin_local=0; spin_local<Ms; spin_local++) {
        int s = spin_block + spin_local;
#pragma unroll
        for (int color_local=0; color_local<Mc; color_local++) {
          int c = color_block + color_local;
          complex<Float> z = rhs(spinor_parity, x_cb, s, c);
          thread_max = thread_max > fabs(z.real()) ? thread_max : fabs(z.real());
          thread_max = thread_max > fabs(z.imag()) ? thread_max : fabs(z.imag());
        }
      }
      v[ ( (spin_block/Ms) * (Nc/Mc) + (color_block/Mc)) * blockDim.x + threadIdx.x ] = thread_max;
    }

    __syncthreads();

    if (active) {
#pragma unroll
      for (int sc=0; sc<(Ns/Ms) * (Nc/Mc); sc++) {
        site_max = site_max > v[sc*blockDim.x + threadIdx.x] ? site_max : v[sc*blockDim.x + threadIdx.x];
      }
    }
#else
    errorQuda("Not supported on CPU");
#endif

    return site_max;
  }
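
  // The site_max returned above acts as the per-site scale in block-float
  // storage: each spin-color component is written in low precision relative to
  // the site maximum, which is stored once per site. A minimal sketch of the
  // encode step (hypothetical helper and scale constant, not the accessor's
  // actual code):
  //
  //   template <typename Float>
  //   __device__ __host__ short block_float_encode(Float z, Float site_max) {
  //     const Float fixed_scale = 32767.0; // assumed fixed-point range of a short
  //     return static_cast<short>(z / site_max * fixed_scale);
  //   }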

  template <typename Float, bool block_float, int Ns, int Ms, int Nc, int Mc, int nDim, int dim, int dir, typename Arg>
  __device__ __host__ __forceinline__ void packGhost(Arg &arg, int x_cb, int parity, int spinor_parity, int spin_block, int color_block) {

    int x[5] = { };
    if (nDim == 5) getCoords5(x, x_cb, arg.X, parity, arg.pc_type);
    else getCoords(x, x_cb, arg.X, parity);

    const auto &rhs = arg.field;

    {
      Float max = 1.0;
      if (block_float) {
        bool active = ( arg.commDim[dim] && ( (dir == 0 && x[dim] < arg.nFace) || (dir == 1 && x[dim] >= arg.X[dim] - arg.nFace) ) );
        max = compute_site_max<Float,Ns,Ms,Nc,Mc>(arg, x_cb, parity, spinor_parity, spin_block, color_block, active);
      }

      if (dir == 0 && arg.commDim[dim] && x[dim] < arg.nFace) {
        for (int spin_local=0; spin_local<Ms; spin_local++) {
          int s = spin_block + spin_local;
          for (int color_local=0; color_local<Mc; color_local++) {
            int c = color_block + color_local;
            arg.field.Ghost(dim, 0, spinor_parity, ghostFaceIndex<0, nDim>(x, arg.X, dim, arg.nFace), s, c, 0, max)
              = rhs(spinor_parity, x_cb, s, c);
          }
        }
      }

      if (dir == 1 && arg.commDim[dim] && x[dim] >= arg.X[dim] - arg.nFace) {
        for (int spin_local=0; spin_local<Ms; spin_local++) {
          int s = spin_block + spin_local;
          for (int color_local=0; color_local<Mc; color_local++) {
            int c = color_block + color_local;
            arg.field.Ghost(dim, 1, spinor_parity, ghostFaceIndex<1, nDim>(x, arg.X, dim, arg.nFace), s, c, 0, max)
              = rhs(spinor_parity, x_cb, s, c);
          }
        }
      }
    }
  }

  template <typename Float, bool block_float, int Ns, int Ms, int Nc, int Mc, int nDim, typename Arg>
  void GenericPackGhost(Arg &arg) {
    for (int parity=0; parity<arg.nParity; parity++) {
      parity = (arg.nParity == 2) ? parity : arg.parity;
      const int spinor_parity = (arg.nParity == 2) ? parity : 0;
      for (int dim=0; dim<4; dim++)
        for (int dir=0; dir<2; dir++)
          for (int x_cb=0; x_cb<arg.volumeCB; x_cb++)
            for (int spin_block=0; spin_block<Ns; spin_block+=Ms)
              for (int color_block=0; color_block<Nc; color_block+=Mc)
                switch(dir) {
                case 0: // backwards pack
                  switch(dim) {
                  case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  }
                  break;
                case 1: // forwards pack
                  switch(dim) {
                  case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
                  }
                }
    }
  }

  template <typename Float, bool block_float, int Ns, int Ms, int Nc, int Mc, int nDim, int dim_threads, typename Arg>
  __global__ void GenericPackGhostKernel(Arg arg) {
    int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
    int spin_color_block = blockDim.y*blockIdx.y + threadIdx.y;
    int parity_dim_dir = blockDim.z*blockIdx.z + threadIdx.z;

    // ensure all threads are always active so it is safe to synchronize
    x_cb %= arg.volumeCB;
    spin_color_block %= (Ns/Ms)*(Nc/Mc);
    parity_dim_dir %= arg.nParity2dim_threads;

    const int dim_dir = parity_dim_dir % (2*dim_threads);
    const int dim0 = dim_dir / 2; // this is the dim offset for each thread (=0 for dim_threads=1)
    const int dir = dim_dir % 2;
    const int parity = (arg.nParity == 2) ? (parity_dim_dir / (2*dim_threads) ) : arg.parity;
    const int spinor_parity = (arg.nParity == 2) ? parity : 0;
    const int spin_block = (spin_color_block / (Nc / Mc)) * Ms;
    const int color_block = (spin_color_block % (Nc / Mc)) * Mc;

#pragma unroll
    for (int dim=0; dim<4; dim+=dim_threads) {
      switch(dir) {
      case 0:
        switch(dim+dim0) {
        case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,0>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        }
        break;
      case 1:
        switch(dim+dim0) {
        case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,1>(arg, x_cb, parity, spinor_parity, spin_block, color_block); break;
        }
        break;
      }
    }
  }

} // namespace quda
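
// A minimal, hypothetical launch sketch (the block geometry and the "Accessor"
// field type are assumptions, not QUDA's actual dispatch code). The z block
// dimension indexes parity * dim * dir, y indexes the (Ns/Ms)*(Nc/Mc)
// spin-color blocks, and x indexes checkerboarded sites, matching the
// decomposition GenericPackGhostKernel performs:
//
//   using namespace quda;
//   PackGhostArg<Accessor> arg(accessor, a, parity, nFace, dagger);
//   const int dim_threads = 1;
//   arg.nParity2dim_threads = arg.nParity * 2 * dim_threads;
//   dim3 block(32, (Ns/Ms)*(Nc/Mc), arg.nParity * 2 * dim_threads);
//   dim3 grid(((int)arg.volumeCB + block.x - 1) / block.x, 1, 1);
//   GenericPackGhostKernel<Float, block_float, Ns, Ms, Nc, Mc, nDim, dim_threads>
//     <<<grid, block>>>(arg);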