QUDA 1.0.0
pgauge_exchange.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field.h>
#include <gauge_field_order.h>
#include <cub_helper.cuh>
#include <launch_kernel.cuh>
#include <comm_quda.h>

namespace quda {

#ifdef GPU_GAUGE_ALG

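// Launch-dispatch helper: picks a kernel instantiation that matches the autotuned launch
// parameters. tp.block.z selects between the two template branches (0 or 1), and the
// autotuned tp.block.x selects the instantiation, whose second template argument is
// tp.block.x / 8. Unsupported block sizes abort with errorQuda.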
#define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \
  if ( tp.block.z == 0 ) { \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<0, 32,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    case 512: \
      kernel<0, 64,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    case 768: \
      kernel<0, 96,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    case 1024: \
      kernel<0, 128,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else { \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<1, 32,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    case 512: \
      kernel<1, 64,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    case 768: \
      kernel<1, 96,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    case 1024: \
      kernel<1, 128,__VA_ARGS__> \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  }

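  // Kernel argument struct: carries the gauge-field accessor and the local lattice
  // dimensions by value, so the (un)packing kernel can be launched without further setup.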
  template <typename Gauge>
  struct GaugeFixUnPackArg {
    int X[4]; // grid dimensions
    Gauge dataOr;
    GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
      : dataOr(dataOr) {
      for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
    }
  };

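  // One thread per checkerboard site of the selected face. With pack == true the link of
  // direction dir at the boundary slice borderid is loaded, packed into NElems real
  // components and written to array as NElems/2 complexes; with pack == false the buffer
  // is read back, unpacked and stored into the gauge field.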
  template<int NElems, typename Float, typename Gauge, bool pack>
  __global__ void Kernel_UnPack(int size, GaugeFixUnPackArg<Gauge> arg,
                                complex<Float> *array, int parity, int face, int dir, int borderid){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx >= size ) return;
    int X[4];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    int x[4];
    int za, xodd;
    switch ( face ) {
    case 0: //X FACE
      za = idx / ( X[1] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[0] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[1] = (2 * idx + xodd) - za * X[1];
      break;
    case 1: //Y FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[1] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 2: //Z FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[1];
      x[1] = za - x[3] * X[1];
      x[2] = borderid;
      xodd = (borderid + x[1] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 3: //T FACE
      za = idx / ( X[0] / 2);
      x[2] = za / X[1];
      x[1] = za - x[2] * X[1];
      x[3] = borderid;
      xodd = (borderid + x[1] + x[2] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    }
    int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    typedef complex<Float> Complex;
    typedef typename mapper<Float>::type RegType;
    RegType tmp[NElems];
    Complex data[9];
    if ( pack ) {
      arg.dataOr.load(data, id, dir, parity);
      arg.dataOr.reconstruct.Pack(tmp, data, id);
      for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = Complex(tmp[2*i+0], tmp[2*i+1]);
    }
    else {
      for ( int i = 0; i < NElems / 2; ++i ) {
        tmp[2*i+0] = array[idx + size * i].real();
        tmp[2*i+1] = array[idx + size * i].imag();
      }
      arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
      arg.dataOr.save(data, id, dir, parity);
    }
  }

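// Persistent state for the halo exchange: device/host buffers, streams and message
// handles are allocated lazily on the first call to PGaugeExchange and reused until
// PGaugeExchangeFree() is called or the lattice dimensions change.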
#ifdef MULTI_GPU
  static void *send[4];
  static void *recv[4];
  static void *sendg[4];
  static void *recvg[4];
  static void *send_d[4];
  static void *recv_d[4];
  static void *sendg_d[4];
  static void *recvg_d[4];
  static void *hostbuffer_h[4];
  static cudaStream_t GFStream[2];
  static size_t offset[4];
  static size_t bytes[4];
  static size_t faceVolume[4];
  static size_t faceVolumeCB[4];
  // do the exchange
  static MsgHandle *mh_recv_back[4];
  static MsgHandle *mh_recv_fwd[4];
  static MsgHandle *mh_send_fwd[4];
  static MsgHandle *mh_send_back[4];
  static int X[4];
  static dim3 block[4];
  static dim3 grid[4];
  static bool notinitialized = true;
#endif // MULTI_GPU

  void PGaugeExchangeFree(){
#ifdef MULTI_GPU
    if ( notinitialized == false ) {
      cudaStreamDestroy(GFStream[0]);
      cudaStreamDestroy(GFStream[1]);
      for ( int d = 0; d < 4; d++ ) {
        if ( commDimPartitioned(d)) {
          comm_free(mh_send_fwd[d]);
          comm_free(mh_send_back[d]);
          comm_free(mh_recv_back[d]);
          comm_free(mh_recv_fwd[d]);
          device_free(send_d[d]);
          device_free(recv_d[d]);
          device_free(sendg_d[d]);
          device_free(recvg_d[d]);
#ifndef GPU_COMMS
          host_free(hostbuffer_h[d]);
#endif
        }
      }
      notinitialized = true;
    }
#endif
  }

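  // Exchanges the border links of direction dir and parity parity with the neighbouring
  // nodes. On the first call (or after the lattice dimensions change) the static buffers,
  // streams and message handles above are (re)allocated; the exchange itself is then done
  // per partitioned dimension on two CUDA streams, one per face.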
  template<typename Float, int NElems, typename Gauge>
  void PGaugeExchange( Gauge dataOr, cudaGaugeField& data, const int dir, const int parity) {

#ifdef MULTI_GPU
    if ( notinitialized == false ) {
      for ( int d = 0; d < 4; d++ ) {
        if ( X[d] != data.X()[d] ) {
          PGaugeExchangeFree();
          notinitialized = true;
          printfQuda("PGaugeExchange needs to be reinitialized...\n");
          break;
        }
      }
    }
    if ( notinitialized ) {
      for ( int d = 0; d < 4; d++ ) {
        X[d] = data.X()[d];
      }
      for ( int i = 0; i < 4; i++ ) {
        faceVolume[i] = 1;
        for ( int j = 0; j < 4; j++ ) {
          if ( i == j ) continue;
          faceVolume[i] *= X[j];
        }
        faceVolumeCB[i] = faceVolume[i] / 2;
      }

      cudaStreamCreate(&GFStream[0]);
      cudaStreamCreate(&GFStream[1]);
      for ( int d = 0; d < 4; d++ ) {
        if ( !commDimPartitioned(d)) continue;
        // store both parities and directions in each
        offset[d] = faceVolumeCB[d] * NElems;
        bytes[d] = sizeof(Float) * offset[d];
        send_d[d] = device_malloc(bytes[d]);
        recv_d[d] = device_malloc(bytes[d]);
        sendg_d[d] = device_malloc(bytes[d]);
        recvg_d[d] = device_malloc(bytes[d]);
#ifndef GPU_COMMS
        hostbuffer_h[d] = (void*)pinned_malloc(4 * bytes[d]);
#endif
        block[d] = make_uint3(128, 1, 1);
        grid[d] = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
      }

      for ( int d = 0; d < 4; d++ ) {
        if ( !commDimPartitioned(d)) continue;
#ifdef GPU_COMMS
        recv[d] = recv_d[d];
        send[d] = send_d[d];
        recvg[d] = recvg_d[d];
        sendg[d] = sendg_d[d];
#else
        recv[d] = hostbuffer_h[d];
        send[d] = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
        recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
        sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
#endif
        // look into storing these for later
        mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
        mh_recv_fwd[d] = comm_declare_receive_relative(recvg[d], d, +1, bytes[d]);
        mh_send_back[d] = comm_declare_send_relative(sendg[d], d, -1, bytes[d]);
        mh_send_fwd[d] = comm_declare_send_relative(send[d], d, +1, bytes[d]);
      }
      notinitialized = false;
    }
    GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);

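    // For each partitioned dimension: post the receives, pack the top and bottom boundary
    // faces on two independent streams (staging through pinned host memory when GPU_COMMS
    // is disabled), start the sends, and once the matching receives complete, unpack the
    // incoming halos into the extended border region of the gauge field.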
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d)) continue;
      comm_start(mh_recv_back[d]);
      comm_start(mh_recv_fwd[d]);

      //extract top face
      Kernel_UnPack<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[0] >>>
        (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), parity, d, dir, X[d] - data.R()[d] - 1);
      //extract bottom
      Kernel_UnPack<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[1] >>>
        (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), parity, d, dir, data.R()[d]);

#ifndef GPU_COMMS
      cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[0]);
      cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[1]);
#endif
      qudaStreamSynchronize(GFStream[0]);
      comm_start(mh_send_fwd[d]);

      qudaStreamSynchronize(GFStream[1]);
      comm_start(mh_send_back[d]);

#ifndef GPU_COMMS
      comm_wait(mh_recv_back[d]);
      cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[0]);
#endif
#ifdef GPU_COMMS
      comm_wait(mh_recv_back[d]);
#endif
      Kernel_UnPack<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[0] >>>
        (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), parity, d, dir, data.R()[d] - 1);

#ifndef GPU_COMMS
      comm_wait(mh_recv_fwd[d]);
      cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[1]);
#endif
#ifdef GPU_COMMS
      comm_wait(mh_recv_fwd[d]);
#endif
      Kernel_UnPack<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[1] >>>
        (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), parity, d, dir, X[d] - data.R()[d]);

      comm_wait(mh_send_back[d]);
      comm_wait(mh_send_fwd[d]);
      qudaStreamSynchronize(GFStream[0]);
      qudaStreamSynchronize(GFStream[1]);
    }
    checkCudaError();
    qudaDeviceSynchronize();
#endif
  }


  template<typename Float>
  void PGaugeExchange( cudaGaugeField& data, const int dir, const int parity) {

    // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
    // Need to fix this!!
    if ( data.isNative() ) {
      if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
        typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type G;
        PGaugeExchange<Float, 18>(G(data), data, dir, parity);
      } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
        typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type G;
        PGaugeExchange<Float, 12>(G(data), data, dir, parity);
      } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
        typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_8>::type G;
        PGaugeExchange<Float, 8>(G(data), data, dir, parity);
      } else {
        errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
      }
    } else {
      errorQuda("Invalid Gauge Order\n");
    }
  }

#endif // GPU_GAUGE_ALG

  void PGaugeExchange( cudaGaugeField& data, const int dir, const int parity) {

#ifdef GPU_GAUGE_ALG
#ifdef MULTI_GPU
    if ( comm_dim_partitioned(0) || comm_dim_partitioned(1) || comm_dim_partitioned(2) || comm_dim_partitioned(3) ) {
      if ( data.Precision() == QUDA_HALF_PRECISION ) {
        errorQuda("Half precision not supported\n");
      }
      if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
        PGaugeExchange<float> (data, dir, parity);
      } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
        PGaugeExchange<double>(data, dir, parity);
      } else {
        errorQuda("Precision %d not supported", data.Precision());
      }
    }
#endif
#else
    errorQuda("Pure gauge code has not been built");
#endif
  }
}
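
For context, a sketch of the intended call pattern, assuming an extended cudaGaugeField that has already been allocated with a communication border; the helper name and loop structure below are illustrative only, not part of QUDA:

  // Hypothetical call-site sketch: after updating the links of one direction and parity
  // on the local volume, refresh the just-modified border links before the next sweep.
  void updateAndExchange(quda::cudaGaugeField &U) {
    for ( int p = 0; p < 2; ++p ) {            // checkerboard parities
      for ( int mu = 0; mu < 4; ++mu ) {       // directions
        // ... update the links of direction mu on sites of parity p ...
        quda::PGaugeExchange(U, mu, p);        // exchange the border links of direction mu
      }
    }
    quda::PGaugeExchangeFree();                // release the static exchange buffers
  }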