QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
face_buffer.cpp
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <face_quda.h>
3 #include <dslash_quda.h>
4 
5 #ifndef GPU_DIRECT
6 #include <string.h>
7 #endif
8 
9 using namespace quda;
10 
11 cudaStream_t *stream;
12 
13 bool globalReduce = true;
14 
15 
16 FaceBuffer::FaceBuffer(const int *X, const int nDim, const int Ninternal,
17  const int nFace, const QudaPrecision precision, const int Ls) :
18  Ninternal(Ninternal), precision(precision), nDim(nDim), nDimComms(nDim), nFace(nFace)
19 {
20  setupDims(X, Ls);
21 
22  // set these both = 0 separate streams for forwards and backwards comms
23  // sendBackStrmIdx = 0, and sendFwdStrmIdx = 1 for overlap
24  sendBackStrmIdx = 0;
25  sendFwdStrmIdx = 1;
26  recFwdStrmIdx = sendBackStrmIdx;
27  recBackStrmIdx = sendFwdStrmIdx;
28 
29  // Buffers hold half spinors
30  for (int i=0; i<nDimComms; i++) {
31  nbytes[i] = nFace*faceVolumeCB[i]*Ninternal*precision;
32  // add extra space for the norms for half precision
33  if (precision == QUDA_HALF_PRECISION) nbytes[i] += nFace*faceVolumeCB[i]*sizeof(float);
34 
35  my_back_face[i] = allocatePinned(2*nbytes[i]);
36  my_fwd_face[i] = (char*)my_back_face[i] + nbytes[i];
37  from_back_face[i] = allocatePinned(2*nbytes[i]);
38  from_fwd_face[i] = (char*)from_back_face[i] + nbytes[i];
39 
40 #ifdef GPU_DIRECT // just alias the pointer
41  ib_my_fwd_face[i] = my_fwd_face[i];
42  ib_my_back_face[i] = my_back_face[i];
43  ib_from_fwd_face[i] = from_fwd_face[i];
44  ib_from_back_face[i] = from_back_face[i];
45 #else // if no GPUDirect so need separate IB and GPU host buffers
46  ib_my_fwd_face[i] = safe_malloc(nbytes[i]);
47  ib_my_back_face[i] = safe_malloc(nbytes[i]);
48  ib_from_fwd_face[i] = safe_malloc(nbytes[i]);
49  ib_from_back_face[i] = safe_malloc(nbytes[i]);
50 #endif
51 
52  }
53 
54  for (int i=0; i<nDimComms; i++) {
55  mh_send_fwd[i] = comm_declare_send_relative(ib_my_fwd_face[i], i, 1, nbytes[i]);
56  mh_send_back[i] = comm_declare_send_relative(ib_my_back_face[i], i, -1, nbytes[i]);
57  mh_recv_fwd[i] = comm_declare_receive_relative(ib_from_fwd_face[i], i, +1, nbytes[i]);
58  mh_recv_back[i] = comm_declare_receive_relative(ib_from_back_face[i], i, -1, nbytes[i]);
59  }
60 
62 }
63 
64 
66  errorQuda("FaceBuffer copy constructor not implemented");
67 }
68 
69 
71 {
72  for (int i=0; i<nDimComms; i++) {
73 
74 #ifndef GPU_DIRECT
75  host_free(ib_my_fwd_face[i]);
76  host_free(ib_my_back_face[i]);
77  host_free(ib_from_fwd_face[i]);
78  host_free(ib_from_back_face[i]);
79 #endif
80 
81  comm_free(mh_send_fwd[i]);
82  comm_free(mh_send_back[i]);
83  comm_free(mh_recv_fwd[i]);
84  comm_free(mh_recv_back[i]);
85 
86  freePinned(from_back_face[i]);
87  freePinned(my_back_face[i]);
88  }
89 
90  for (int i=0; i<nDimComms; i++) {
91  ib_my_fwd_face[i] = NULL;
92  ib_my_back_face[i] = NULL;
93  ib_from_fwd_face[i] = NULL;
94  ib_from_back_face[i] = NULL;
95 
96  my_fwd_face[i] = NULL;
97  my_back_face[i] = NULL;
98  from_fwd_face[i] = NULL;
99  from_back_face[i] = NULL;
100 
101  mh_recv_fwd[i] = NULL;
102  mh_recv_back[i] = NULL;
103  mh_send_fwd[i] = NULL;
104  mh_send_back[i] = NULL;
105  }
106 
107  checkCudaError();
108 }
109 
110 
111 // X here is a checkboarded volume
112 void FaceBuffer::setupDims(const int* X, int Ls)
113 {
114  if (nDim > QUDA_MAX_DIM) errorQuda("nDim = %d is greater than the maximum of %d\n", nDim, QUDA_MAX_DIM);
115  for (int d=0; d<4; d++) this->X[d] = X[d];
116  if(nDim == 5) {
117  this->X[nDim-1] = Ls;
118  nDimComms = 4;
119  }
120 
121  Volume = 1;
122  for (int d=0; d<nDim; d++) Volume *= this->X[d];
123  VolumeCB = Volume/2;
124 
125  for (int i=0; i<nDim; i++) {
126  faceVolume[i] = 1;
127  for (int j=0; j<nDim; j++) {
128  if (i==j) continue;
129  faceVolume[i] *= this->X[j];
130  }
131  faceVolumeCB[i] = faceVolume[i]/2;
132  }
133 }
134 
135 
136 // cache of inactive allocations
137 std::multimap<size_t, void *> FaceBuffer::pinnedCache;
138 
139 // sizes of active allocations
140 std::map<void *, size_t> FaceBuffer::pinnedSize;
141 
142 
143 void *FaceBuffer::allocatePinned(size_t nbytes)
144 {
145  std::multimap<size_t, void *>::iterator it;
146  void *ptr = 0;
147 
148  if (pinnedCache.empty()) {
149  ptr = pinned_malloc(nbytes);
150  } else {
151  it = pinnedCache.lower_bound(nbytes);
152  if (it != pinnedCache.end()) { // sufficiently large allocation found
153  nbytes = it->first;
154  ptr = it->second;
155  pinnedCache.erase(it);
156  } else { // sacrifice the smallest cached allocation
157  it = pinnedCache.begin();
158  ptr = it->second;
159  pinnedCache.erase(it);
160  host_free(ptr);
161  ptr = pinned_malloc(nbytes);
162  }
163  }
164  pinnedSize[ptr] = nbytes;
165  return ptr;
166 }
167 
168 
169 void FaceBuffer::freePinned(void *ptr)
170 {
171  if (!pinnedSize.count(ptr)) {
172  errorQuda("Attempt to free invalid pointer");
173  }
174  pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
175  pinnedSize.erase(ptr);
176 }
177 
178 
180 {
181  std::multimap<size_t, void *>::iterator it;
182  for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
183  void *ptr = it->second;
184  host_free(ptr);
185  }
186  pinnedCache.clear();
187 }
188 
189 
190 void FaceBuffer::pack(cudaColorSpinorField &in, int parity, int dagger, cudaStream_t *stream_p)
191 {
192  in.allocateGhostBuffer(); // allocate the ghost buffer if not yet allocated
193  stream = stream_p;
194  in.packGhost((QudaParity)parity, dagger, &stream[Nstream-1]);
195 }
196 
197 
199 {
200  int dim = dir/2;
201  if(!commDimPartitioned(dim)) return;
202 
203  if (dir%2==0) {
204  // backwards copy to host
205  in.sendGhost(my_back_face[dim], dim, QUDA_BACKWARDS, dagger, &stream[2*dim+sendBackStrmIdx]);
206  } else {
207  // forwards copy to host
208  in.sendGhost(my_fwd_face[dim], dim, QUDA_FORWARDS, dagger, &stream[2*dim+sendFwdStrmIdx]);
209  }
210 }
211 
212 
213 // experimenting with callbacks for GPU -> MPI interaction.
214 // much slower though because callbacks are done on a background thread
215 //#define QUDA_CALLBACK
216 
217 #ifdef QUDA_CALLBACK
218 
219 struct commCallback_t {
220  MsgHandle *mh_recv;
221  MsgHandle *mh_send;
222  void *ib_buffer;
223  void *face_buffer;
224  size_t bytes;
225 };
226 
227 static commCallback_t commCB[2*QUDA_MAX_DIM];
228 
229 void CUDART_CB commCallback(cudaStream_t stream, cudaError_t status, void *data) {
230  const unsigned long long dir = (unsigned long long)data;
231 
232  comm_start(commCB[dir].mh_recv);
233 #ifndef GPU_DIRECT
234  memcpy(commCB[dir].ib_buffer, commCB[dir].face_buffer, commCB[dir].bytes);
235 #endif
236  comm_start(commCB[dir].mh_send);
237 
238 }
239 
240 void FaceBuffer::commsStart(int dir) {
241  int dim = dir / 2;
242  if(!commDimPartitioned(dim)) return;
243 
244  if (dir%2 == 0) { // sending backwards
245  commCB[dir].mh_recv = mh_recv_fwd[dim];
246  commCB[dir].mh_send = mh_send_back[dim];
247  commCB[dir].ib_buffer = ib_my_back_face[dim];
248  commCB[dir].face_buffer = my_back_face[dim];
249  commCB[dir].bytes = nbytes[dim];
250  } else { //sending forwards
251  commCB[dir].mh_recv = mh_recv_back[dim];
252  commCB[dir].mh_send = mh_send_fwd[dim];
253  commCB[dir].ib_buffer = ib_my_fwd_face[dim];
254  commCB[dir].face_buffer = my_fwd_face[dim];
255  commCB[dir].bytes = nbytes[dim];
256  }
257 
258  cudaStreamAddCallback(stream[dir], commCallback, (void*)dir, 0);
259 }
260 
261 #else // !defined(QUDA_CALLBACK)
262 
263 void FaceBuffer::commsStart(int dir) {
264  int dim = dir / 2;
265  if(!commDimPartitioned(dim)) return;
266 
267  if (dir%2 == 0) { // sending backwards
268  // Prepost receive
269  comm_start(mh_recv_fwd[dim]);
270 #ifndef GPU_DIRECT
271  memcpy(ib_my_back_face[dim], my_back_face[dim], nbytes[dim]);
272 #endif
273  comm_start(mh_send_back[dim]);
274  } else { //sending forwards
275  // Prepost receive
276  comm_start(mh_recv_back[dim]);
277  // Begin forward send
278 #ifndef GPU_DIRECT
279  memcpy(ib_my_fwd_face[dim], my_fwd_face[dim], nbytes[dim]);
280 #endif
281  comm_start(mh_send_fwd[dim]);
282  }
283 }
284 
285 #endif // QUDA_CALLBACK
286 
287 
289 {
290  int dim = dir / 2;
291  if(!commDimPartitioned(dim)) return 0;
292 
293  if(dir%2==0) {
294  if (comm_query(mh_recv_fwd[dim]) && comm_query(mh_send_back[dim])) {
295 #ifndef GPU_DIRECT
296  memcpy(from_fwd_face[dim], ib_from_fwd_face[dim], nbytes[dim]);
297 #endif
298  return 1;
299  }
300  } else {
301  if (comm_query(mh_recv_back[dim]) && comm_query(mh_send_fwd[dim])) {
302 #ifndef GPU_DIRECT
303  memcpy(from_back_face[dim], ib_from_back_face[dim], nbytes[dim]);
304 #endif
305  return 1;
306  }
307  }
308 
309  return 0;
310 }
311 
312 
314 {
315  int dim = dir/2;
316  if(!commDimPartitioned(dim)) return;
317 
318  // both scattering occurances now go through the same stream
319  if (dir%2==0) {// receive from forwards
320  out.unpackGhost(from_fwd_face[dim], dim, QUDA_FORWARDS, dagger, &stream[2*dim/*+recFwdStrmIdx*/]); // 0, 2, 4, 6
321  } else { // receive from backwards
322  out.unpackGhost(from_back_face[dim], dim, QUDA_BACKWARDS, dagger, &stream[2*dim/*+recBackStrmIdx*/]); // 1, 3, 5, 7
323  }
324 }
325 
326 
327 // This is just an initial hack for CPU comms - should be creating the message handlers at instantiation
329 {
330  // allocate the ghost buffer if not yet allocated
331  spinor.allocateGhostBuffer();
332 
333  for(int i=0;i < 4; i++){
334  spinor.packGhost(spinor.backGhostFaceSendBuffer[i], i, QUDA_BACKWARDS, (QudaParity)oddBit, dagger);
335  spinor.packGhost(spinor.fwdGhostFaceSendBuffer[i], i, QUDA_FORWARDS, (QudaParity)oddBit, dagger);
336  }
337 
338  MsgHandle *mh_send_fwd[4];
339  MsgHandle *mh_from_back[4];
340  MsgHandle *mh_from_fwd[4];
341  MsgHandle *mh_send_back[4];
342 
343  for (int i=0; i<nDimComms; i++) {
344  mh_send_fwd[i] = comm_declare_send_relative(spinor.fwdGhostFaceSendBuffer[i], i, +1, nbytes[i]);
345  mh_send_back[i] = comm_declare_send_relative(spinor.backGhostFaceSendBuffer[i], i, -1, nbytes[i]);
346  mh_from_fwd[i] = comm_declare_receive_relative(spinor.fwdGhostFaceBuffer[i], i, +1, nbytes[i]);
347  mh_from_back[i] = comm_declare_receive_relative(spinor.backGhostFaceBuffer[i], i, -1, nbytes[i]);
348  }
349 
350  for (int i=0; i<nDimComms; i++) {
351  comm_start(mh_from_back[i]);
352  comm_start(mh_from_fwd[i]);
353  comm_start(mh_send_fwd[i]);
354  comm_start(mh_send_back[i]);
355  }
356 
357  for (int i=0; i<nDimComms; i++) {
358  comm_wait(mh_send_fwd[i]);
359  comm_wait(mh_send_back[i]);
360  comm_wait(mh_from_back[i]);
361  comm_wait(mh_from_fwd[i]);
362  }
363 
364  for (int i=0; i<nDimComms; i++) {
365  comm_free(mh_send_fwd[i]);
366  comm_free(mh_send_back[i]);
367  comm_free(mh_from_back[i]);
368  comm_free(mh_from_fwd[i]);
369  }
370 }
371 
372 
373 void FaceBuffer::exchangeCpuLink(void** ghost_link, void** link_sendbuf)
374 {
375  MsgHandle *mh_from_back[4];
376  MsgHandle *mh_send_fwd[4];
377 
378  for (int i=0; i<nDimComms; i++) {
379  int len = 2*nFace*faceVolumeCB[i]*Ninternal;
380  mh_send_fwd[i] = comm_declare_send_relative(link_sendbuf[i], i, +1, len*precision);
381  mh_from_back[i] = comm_declare_receive_relative(ghost_link[i], i, -1, len*precision);
382  }
383 
384  for (int i=0; i<nDimComms; i++) {
385  comm_start(mh_send_fwd[i]);
386  comm_start(mh_from_back[i]);
387  }
388 
389  for (int i=0; i<nDimComms; i++) {
390  comm_wait(mh_send_fwd[i]);
391  comm_wait(mh_from_back[i]);
392  }
393 
394  for (int i=0; i<nDimComms; i++) {
395  comm_free(mh_send_fwd[i]);
396  comm_free(mh_from_back[i]);
397  }
398 }
399 
400 
401 void reduceMaxDouble(double &max) { comm_allreduce_max(&max); }
402 
403 void reduceDouble(double &sum) { if (globalReduce) comm_allreduce(&sum); }
404 
405 void reduceDoubleArray(double *sum, const int len)
406 { if (globalReduce) comm_allreduce_array(sum, len); }
407 
408 int commDim(int dir) { return comm_dim(dir); }
409 
410 int commCoords(int dir) { return comm_coord(dir); }
411 
412 int commDimPartitioned(int dir){ return comm_dim_partitioned(dir);}
413