QUDA  0.9.0
pgauge_exchange.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <cub/cub.cuh>
7 #include <launch_kernel.cuh>
8 
9 #include <device_functions.h>
10 
11 #include <comm_quda.h>
12 
13 
14 namespace quda {
15 
16 #ifdef GPU_GAUGE_ALG
17 
// Launch helper for the tunable gauge-fixing kernels.  The tuned block size
// tp.block.x selects the kernel template instantiation: the second template
// argument is tp.block.x / 8 (256 -> 32, 512 -> 64, 768 -> 96, 1024 -> 128),
// and the first template argument (0 or 1) is selected by tp.block.z.
// Block sizes outside {256, 512, 768, 1024} are a runtime error.
#define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \
  if ( tp.block.z == 0 ) { \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<0, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<0, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<0, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<0, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  } \
  else{ \
    switch ( tp.block.x ) { \
    case 256: \
      kernel<1, 32,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 512: \
      kernel<1, 64,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 768: \
      kernel<1, 96,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    case 1024: \
      kernel<1, 128,__VA_ARGS__> \
      << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \
      break; \
    default: \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \
    } \
  }
63 
64 
  /**
     Kernel argument struct for Kernel_UnPack: carries the gauge-field
     accessor and the local lattice dimensions to the device.
   */
  template <typename Gauge>
  struct GaugeFixUnPackArg {
    int X[4]; // grid dimensions
#ifdef MULTI_GPU
    // NOTE(review): border[] is never written by this constructor and is not
    // read by Kernel_UnPack — confirm it is set elsewhere before relying on it.
    int border[4];
#endif
    Gauge dataOr; // gauge-field accessor (load/save/reconstruct)
    GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
      : dataOr(dataOr) {
      for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
    }
  };
77 
78 
  /**
   * Pack (pack == true) one border slice of the gauge field into a contiguous
   * complex buffer, or unpack (pack == false) such a buffer back into the
   * field.  One thread per checkerboarded face site; launched with a 1-D grid.
   *
   * @param size      number of face sites of the given parity (faceVolumeCB)
   * @param arg       accessor plus local lattice dimensions
   * @param array     buffer holding NElems/2 complex values per site, laid out
   *                  component-major (stride 'size' between components)
   * @param parity    checkerboard parity of the sites to (un)pack
   * @param face      dimension of the face (0=X, 1=Y, 2=Z, 3=T)
   * @param dir       link direction to (un)pack
   * @param borderid  coordinate along 'face' of the slice to (un)pack
   */
  template<int NElems, typename Float, typename Gauge, bool pack>
  __global__ void Kernel_UnPack(int size, GaugeFixUnPackArg<Gauge> arg,
                                complex<Float> *array, int parity, int face, int dir, int borderid){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx >= size ) return; // guard: grid may overshoot the face volume
    int X[4];
    for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
    int x[4];
    int za, xodd;
    // Decode the full 4-d site coordinate x[] from the linear face index idx.
    // The coordinate along 'face' is pinned to borderid; the fastest-running
    // in-face coordinate is checkerboarded, with xodd chosen so the site has
    // the requested parity.
    switch ( face ) {
    case 0: //X FACE
      za = idx / ( X[1] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[0] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[1] = (2 * idx + xodd) - za * X[1];
      break;
    case 1: //Y FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[2];
      x[2] = za - x[3] * X[2];
      x[1] = borderid;
      xodd = (borderid + x[2] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 2: //Z FACE
      za = idx / ( X[0] / 2);
      x[3] = za / X[1];
      x[1] = za - x[3] * X[1];
      x[2] = borderid;
      xodd = (borderid + x[1] + x[3] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    case 3: //T FACE
      za = idx / ( X[0] / 2);
      x[2] = za / X[1];
      x[1] = za - x[2] * X[1];
      x[3] = borderid;
      xodd = (borderid + x[1] + x[2] + parity) & 1;
      x[0] = (2 * idx + xodd) - za * X[0];
      break;
    }
    // checkerboard-compressed (even/odd) linear site index
    int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    typedef complex<Float> Complex;
    typedef typename mapper<Float>::type RegType;
    RegType tmp[NElems];  // reconstructed/compressed representation
    RegType data[18];     // full 3x3 complex link matrix
    if ( pack ) {
      // load the full link, compress it, and scatter NElems/2 complex
      // components into the buffer with stride 'size'
      arg.dataOr.load(data, id, dir, parity);
      arg.dataOr.reconstruct.Pack(tmp, data, id);
      for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = ((Complex*)tmp)[i];
    }
    else{
      // gather the compressed components, reconstruct the full link, store it
      for ( int i = 0; i < NElems / 2; ++i ) ((Complex*)tmp)[i] = array[idx + size * i];
      // NOTE(review): the literal 0 appears to be a phase/parity argument of
      // the reconstruct Unpack — confirm against the gauge_field_order API
      arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
      arg.dataOr.save(data, id, dir, parity);
    }
  }
138 
#ifdef MULTI_GPU
  // Persistent state for PGaugeExchange(): allocated lazily on first use (or
  // after a lattice-size change) and released by PGaugeExchangeFree().
  static void *send[4];          // comm send buffers, +1 (forward) direction
  static void *recv[4];          // comm receive buffers, from -1 (backward) neighbor
  static void *sendg[4];         // comm send buffers, -1 (backward) direction
  static void *recvg[4];         // comm receive buffers, from +1 (forward) neighbor
  static void *send_d[4];        // device pack buffer, forward face
  static void *recv_d[4];        // device unpack buffer, backward face
  static void *sendg_d[4];       // device pack buffer, backward face
  static void *recvg_d[4];       // device unpack buffer, forward face
  static void *hostbuffer_h[4];  // pinned host staging area (4 x bytes[d]) when not using GPU_COMMS
  static cudaStream_t GFStream[2]; // one stream per direction so the two faces overlap
  static size_t offset[4];       // real-number elements per face buffer
  static size_t bytes[4];        // bytes per face buffer
  static size_t faceVolume[4];   // full face volumes
  static size_t faceVolumeCB[4]; // checkerboarded (half) face volumes
  // message handles for the border exchange
  static MsgHandle *mh_recv_back[4];
  static MsgHandle *mh_recv_fwd[4];
  static MsgHandle *mh_send_fwd[4];
  static MsgHandle *mh_send_back[4];
  static int X[4];               // lattice dimensions the buffers were sized for
  static dim3 block[4];          // kernel launch geometry per dimension
  static dim3 grid[4];
  static bool notinitialized = true; // lazy-initialization flag
#endif // MULTI_GPU
164 
168  void PGaugeExchangeFree(){
169 #ifdef MULTI_GPU
171  cudaStreamDestroy(GFStream[0]);
172  cudaStreamDestroy(GFStream[1]);
173  for ( int d = 0; d < 4; d++ ) {
174  if ( commDimPartitioned(d)) {
175  comm_free(mh_send_fwd[d]);
176  comm_free(mh_send_back[d]);
177  comm_free(mh_recv_back[d]);
178  comm_free(mh_recv_fwd[d]);
179  device_free(send_d[d]);
180  device_free(recv_d[d]);
181  device_free(sendg_d[d]);
182  device_free(recvg_d[d]);
183  #ifndef GPU_COMMS
184  host_free(hostbuffer_h[d]);
185  #endif
186  }
187  }
188  notinitialized = true;
189  }
190 #endif
191  }
192 
193 
194  template<typename Float, int NElems, typename Gauge>
195  void PGaugeExchange( Gauge dataOr, cudaGaugeField& data, const int dir, const int parity) {
196 
197 
198 #ifdef MULTI_GPU
199  if ( notinitialized == false ) {
200  for ( int d = 0; d < 4; d++ ) {
201  if ( X[d] != data.X()[d] ) {
203  notinitialized = true;
204  printfQuda("PGaugeExchange needs to be reinitialized...\n");
205  break;
206  }
207  }
208  }
209  if ( notinitialized ) {
210  for ( int d = 0; d < 4; d++ ) {
211  X[d] = data.X()[d];
212  }
213  for ( int i = 0; i < 4; i++ ) {
214  faceVolume[i] = 1;
215  for ( int j = 0; j < 4; j++ ) {
216  if ( i == j ) continue;
217  faceVolume[i] *= X[j];
218  }
219  faceVolumeCB[i] = faceVolume[i] / 2;
220  }
221 
222  cudaStreamCreate(&GFStream[0]);
223  cudaStreamCreate(&GFStream[1]);
224  for ( int d = 0; d < 4; d++ ) {
225  if ( !commDimPartitioned(d)) continue;
226  // store both parities and directions in each
227  offset[d] = faceVolumeCB[d] * NElems;
228  bytes[d] = sizeof(Float) * offset[d];
229  send_d[d] = device_malloc(bytes[d]);
230  recv_d[d] = device_malloc(bytes[d]);
231  sendg_d[d] = device_malloc(bytes[d]);
232  recvg_d[d] = device_malloc(bytes[d]);
233  #ifndef GPU_COMMS
234  hostbuffer_h[d] = (void*)pinned_malloc(4 * bytes[d]);
235  #endif
236  block[d] = make_uint3(128, 1, 1);
237  grid[d] = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
238  }
239 
240  for ( int d = 0; d < 4; d++ ) {
241  if ( !commDimPartitioned(d)) continue;
242  #ifdef GPU_COMMS
243  recv[d] = recv_d[d];
244  send[d] = send_d[d];
245  recvg[d] = recvg_d[d];
246  sendg[d] = sendg_d[d];
247  #else
248  recv[d] = hostbuffer_h[d];
249  send[d] = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
250  recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
251  sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
252  #endif
253  // look into storing these for later
254  mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
255  mh_recv_fwd[d] = comm_declare_receive_relative(recvg[d], d, +1, bytes[d]);
256  mh_send_back[d] = comm_declare_send_relative(sendg[d], d, -1, bytes[d]);
257  mh_send_fwd[d] = comm_declare_send_relative(send[d], d, +1, bytes[d]);
258  }
259  notinitialized = false;
260  }
261  GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);
262 
263 
264  for ( int d = 0; d < 4; d++ ) {
265  if ( !commDimPartitioned(d)) continue;
266  comm_start(mh_recv_back[d]);
267  comm_start(mh_recv_fwd[d]);
268 
269  //extract top face
270  Kernel_UnPack<NElems, Float, Gauge, true><< < grid[d], block[d], 0, GFStream[0] >> >
271  (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), parity, d, dir, X[d] - data.R()[d] - 1);
272  //extract bottom
273  Kernel_UnPack<NElems, Float, Gauge, true><< < grid[d], block[d], 0, GFStream[1] >> >
274  (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), parity, d, dir, data.R()[d]);
275 
276  #ifndef GPU_COMMS
277  cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[0]);
278  cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[1]);
279  #endif
280  qudaStreamSynchronize(GFStream[0]);
281  comm_start(mh_send_fwd[d]);
282 
283  qudaStreamSynchronize(GFStream[1]);
284  comm_start(mh_send_back[d]);
285 
286  #ifndef GPU_COMMS
287  comm_wait(mh_recv_back[d]);
288  cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[0]);
289  #endif
290  #ifdef GPU_COMMS
291  comm_wait(mh_recv_back[d]);
292  #endif
293  Kernel_UnPack<NElems, Float, Gauge, false><< < grid[d], block[d], 0, GFStream[0] >> >
294  (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), parity, d, dir, data.R()[d] - 1);
295 
296  #ifndef GPU_COMMS
297  comm_wait(mh_recv_fwd[d]);
298  cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[1]);
299  #endif
300 
301  #ifdef GPU_COMMS
302  comm_wait(mh_recv_fwd[d]);
303  #endif
304  Kernel_UnPack<NElems, Float, Gauge, false><< < grid[d], block[d], 0, GFStream[1] >> >
305  (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), parity, d, dir, X[d] - data.R()[d]);
306 
307  comm_wait(mh_send_back[d]);
308  comm_wait(mh_send_fwd[d]);
309  qudaStreamSynchronize(GFStream[0]);
310  qudaStreamSynchronize(GFStream[1]);
311  }
312  checkCudaError();
314  #endif
315 
316  }
317 
318 
319  template<typename Float>
320  void PGaugeExchange( cudaGaugeField& data, const int dir, const int parity) {
321 
322 
323  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
324  // Need to fix this!!
325  if ( data.isNative() ) {
326  if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
327  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
328  PGaugeExchange<Float, 18>(G(data), data, dir, parity);
329  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
330  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
331  PGaugeExchange<Float, 12>(G(data), data, dir, parity);
332  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
333  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
334  PGaugeExchange<Float, 8>(G(data), data, dir, parity);
335  } else {
336  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
337  }
338  } else {
339  errorQuda("Invalid Gauge Order\n");
340  }
341  }
342 
343 #endif // GPU_GAUGE_ALG
344 
345  void PGaugeExchange( cudaGaugeField& data, const int dir, const int parity) {
346 
347 #ifdef GPU_GAUGE_ALG
348 #ifdef MULTI_GPU
350  if ( data.Precision() == QUDA_HALF_PRECISION ) {
351  errorQuda("Half precision not supported\n");
352  }
353  if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
354  PGaugeExchange<float> (data, dir, parity);
355  } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
356  PGaugeExchange<double>(data, dir, parity);
357  } else {
358  errorQuda("Precision %d not supported", data.Precision());
359  }
360  }
361 #endif
362 #else
363  errorQuda("Pure gauge code has not been built");
364 #endif
365  }
366 }
dim3 dim3 blockDim
int commDimPartitioned(int dir)
#define pinned_malloc(size)
Definition: malloc_quda.h:55
#define errorQuda(...)
Definition: util_quda.h:90
#define host_free(ptr)
Definition: malloc_quda.h:59
std::complex< double > Complex
Definition: eig_variables.h:13
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
size_t size_t offset
void PGaugeExchange(cudaGaugeField &data, const int dir, const int parity)
Exchange the gauge-field border sites between neighboring nodes for the given link direction and parity (no-op when no dimension is partitioned).
void comm_free(MsgHandle *mh)
Definition: comm_mpi.cpp:252
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
Definition: comm_quda.h:59
void PGaugeExchangeFree()
Release all allocated memory used to exchange data between nodes.
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
Definition: comm_quda.h:74
void comm_start(MsgHandle *mh)
Definition: comm_mpi.cpp:260
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define printfQuda(...)
Definition: util_quda.h:84
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
#define device_malloc(size)
Definition: malloc_quda.h:52
int faceVolume[4]
Definition: test_util.cpp:32
#define checkCudaError()
Definition: util_quda.h:129
struct cudaExtent unsigned int cudaArray_t array
void comm_wait(MsgHandle *mh)
Definition: comm_mpi.cpp:266
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)
#define device_free(ptr)
Definition: malloc_quda.h:57