QUDA v0.7.0
A library for QCD on GPUs
face_buffer.cpp
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <face_quda.h>
3 #include <dslash_quda.h>
4 
5 #include <string.h>
6 
7 using namespace quda;
8 
9 cudaStream_t *stream;
10 
11 bool globalReduce = true;
12 
13 
14 FaceBuffer::FaceBuffer(const int *X, const int nDim, const int Ninternal,
15  const int nFace, const QudaPrecision precision, const int Ls) :
16  my_face(0), from_face(0), Ninternal(Ninternal), precision(precision), nDim(nDim),
17  nDimComms(nDim), nFace(nFace)
18 {
19  setupDims(X, Ls);
20 
21  // set these both = 0 to use a single stream for forwards and backwards comms;
22  // use separate streams (sendBackStrmIdx = 0, sendFwdStrmIdx = 1) to overlap the two directions
23  sendBackStrmIdx = 0;
24  sendFwdStrmIdx = 1;
25  recFwdStrmIdx = sendBackStrmIdx;
26  recBackStrmIdx = sendFwdStrmIdx;
27 
28  // allocate one contiguous pinned buffer to back all of the face buffers
29  size_t faceBytes = 0;
30  for (int i=0; i<nDimComms; i++) {
31  nbytes[i] = nFace*faceVolumeCB[i]*Ninternal*precision;
32  // add extra space for the norms for half precision
33  if (precision == QUDA_HALF_PRECISION) nbytes[i] += nFace*faceVolumeCB[i]*sizeof(float);
34  if(!commDimPartitioned(i)) continue;
35  faceBytes += 2*nbytes[i];
36  }
37 
38  if (faceBytes > 0) {
39  my_face = allocatePinned(faceBytes);
40  from_face = allocatePinned(faceBytes);
41  }
42 
43  // assign pointers into the contiguous buffer; the buffers hold half spinors
44  size_t offset = 0;
45  for (int i=0; i<nDimComms; i++) {
46  if (!commDimPartitioned(i)) continue;
47 
48  my_back_face[i] = (char*)my_face + offset;
49  from_back_face[i] = (char*)from_face + offset;
50  offset += nbytes[i];
51 
52  my_fwd_face[i] = (char*)my_face + offset;
53  from_fwd_face[i] = (char*)from_face + offset;
54  offset += nbytes[i];
55 
56 #ifdef GPU_DIRECT // just alias the pointer
57  ib_my_fwd_face[i] = my_fwd_face[i];
58  ib_my_back_face[i] = my_back_face[i];
59  ib_from_fwd_face[i] = from_fwd_face[i];
60  ib_from_back_face[i] = from_back_face[i];
61 #else // without GPUDirect we need separate IB and GPU host buffers
62  ib_my_fwd_face[i] = safe_malloc(nbytes[i]);
63  ib_my_back_face[i] = safe_malloc(nbytes[i]);
64  ib_from_fwd_face[i] = safe_malloc(nbytes[i]);
65  ib_from_back_face[i] = safe_malloc(nbytes[i]);
66 #endif
67  }
68 
69  for (int i=0; i<nDimComms; i++) {
70  if (!commDimPartitioned(i)) continue;
71  mh_send_fwd[i] = comm_declare_send_relative(ib_my_fwd_face[i], i, 1, nbytes[i]);
72  mh_send_back[i] = comm_declare_send_relative(ib_my_back_face[i], i, -1, nbytes[i]);
73  mh_recv_fwd[i] = comm_declare_receive_relative(ib_from_fwd_face[i], i, +1, nbytes[i]);
74  mh_recv_back[i] = comm_declare_receive_relative(ib_from_back_face[i], i, -1, nbytes[i]);
75  }
76 
77  checkCudaError();
78 }
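// Buffer layout established above: my_face and from_face are single pinned
// allocations; for every partitioned dimension i the backwards face (nbytes[i]
// bytes) is immediately followed by the forwards face, and my_back_face[i],
// my_fwd_face[i], etc. are simply offsets into them.  Each face holds
// nFace*faceVolumeCB[i] sites of Ninternal*precision bytes, plus one float of
// norm data per site in half precision.  With GPU_DIRECT the InfiniBand (ib_*)
// pointers alias these pinned buffers; without it they are separate host
// allocations that the send path stages into with memcpy.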
79 
80 
81 FaceBuffer::FaceBuffer(const FaceBuffer &face) {
82  errorQuda("FaceBuffer copy constructor not implemented");
83 }
84 
85 
86 FaceBuffer::~FaceBuffer()
87 {
88  for (int i=0; i<nDimComms; i++) {
89  if (commDimPartitioned(i)) {
90 #ifndef GPU_DIRECT
91  host_free(ib_my_fwd_face[i]);
92  host_free(ib_my_back_face[i]);
93  host_free(ib_from_fwd_face[i]);
94  host_free(ib_from_back_face[i]);
95 #endif
96  comm_free(mh_send_fwd[i]);
97  comm_free(mh_send_back[i]);
98  comm_free(mh_recv_fwd[i]);
99  comm_free(mh_recv_back[i]);
100  }
101 
102  }
103 
104  for (int i=0; i<nDimComms; i++) {
105  ib_my_fwd_face[i] = NULL;
106  ib_my_back_face[i] = NULL;
107  ib_from_fwd_face[i] = NULL;
108  ib_from_back_face[i] = NULL;
109 
110  my_fwd_face[i] = NULL;
111  my_back_face[i] = NULL;
112  from_fwd_face[i] = NULL;
113  from_back_face[i] = NULL;
114 
115  mh_recv_fwd[i] = NULL;
116  mh_recv_back[i] = NULL;
117  mh_send_fwd[i] = NULL;
118  mh_send_back[i] = NULL;
119  }
120 
121  if (from_face) freePinned(from_face);
122  if (my_face) freePinned(my_face);
123 
124  checkCudaError();
125 }
126 
127 
128 // X here is a checkerboarded volume
129 void FaceBuffer::setupDims(const int* X, int Ls)
130 {
131  if (nDim > QUDA_MAX_DIM) errorQuda("nDim = %d is greater than the maximum of %d\n", nDim, QUDA_MAX_DIM);
132  for (int d=0; d<4; d++) this->X[d] = X[d];
133  if(nDim == 5) {
134  this->X[nDim-1] = Ls;
135  nDimComms = 4;
136  }
137 
138  Volume = 1;
139  for (int d=0; d<nDim; d++) Volume *= this->X[d];
140  VolumeCB = Volume/2;
141 
142  for (int i=0; i<nDim; i++) {
143  faceVolume[i] = 1;
144  for (int j=0; j<nDim; j++) {
145  if (i==j) continue;
146  faceVolume[i] *= this->X[j];
147  }
148  faceVolumeCB[i] = faceVolume[i]/2;
149  }
150 }
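// Worked example (illustrative numbers only): with nDim = 4 and local extents
// X = {8, 8, 8, 8},
//   Volume        = 8*8*8*8 = 4096,  VolumeCB        = 2048,
//   faceVolume[3] = 8*8*8   = 512,   faceVolumeCB[3] = 256,
// i.e. each T-face exchanged per parity and direction contains 256 sites
// (times nFace for multi-layer halos).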
151 
152 
153 // cache of inactive allocations
154 std::multimap<size_t, void *> FaceBuffer::pinnedCache;
155 
156 // sizes of active allocations
157 std::map<void *, size_t> FaceBuffer::pinnedSize;
158 
159 
160 void *FaceBuffer::allocatePinned(size_t nbytes)
161 {
162  std::multimap<size_t, void *>::iterator it;
163  void *ptr = 0;
164 
165  if (pinnedCache.empty()) {
166  ptr = pinned_malloc(nbytes);
167  } else {
168  it = pinnedCache.lower_bound(nbytes);
169  if (it != pinnedCache.end()) { // sufficiently large allocation found
170  nbytes = it->first;
171  ptr = it->second;
172  pinnedCache.erase(it);
173  } else { // sacrifice the smallest cached allocation
174  it = pinnedCache.begin();
175  ptr = it->second;
176  pinnedCache.erase(it);
177  host_free(ptr);
178  ptr = pinned_malloc(nbytes);
179  }
180  }
181  pinnedSize[ptr] = nbytes;
182  return ptr;
183 }
184 
185 
186 void FaceBuffer::freePinned(void *ptr)
187 {
188  if (!pinnedSize.count(ptr)) {
189  errorQuda("Attempt to free invalid pointer");
190  }
191  pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
192  pinnedSize.erase(ptr);
193 }
194 
195 
196 void FaceBuffer::flushPinnedCache()
197 {
198  std::multimap<size_t, void *>::iterator it;
199  for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
200  void *ptr = it->second;
201  host_free(ptr);
202  }
203  pinnedCache.clear();
204 }
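// allocatePinned()/freePinned()/flushPinnedCache() form a simple size-keyed caching
// allocator for pinned host memory: allocatePinned() reuses the smallest cached block
// that is at least nbytes via lower_bound(), and only when nothing is large enough
// does it evict (free) the smallest cached block and call pinned_malloc() afresh;
// freePinned() returns a block to the cache rather than releasing it, and
// flushPinnedCache() releases everything.  A rough usage sketch (sizes made up),
// from code with access to these members:
//
//   void *buf = allocatePinned(1 << 20);  // fresh pinned_malloc'd block
//   freePinned(buf);                      // cached for reuse, not freed
//   buf = allocatePinned(1 << 16);        // reuses the cached 1 MiB block
//   freePinned(buf);
//   flushPinnedCache();                   // only now is the memory released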
205 
206 
207 void FaceBuffer::pack(cudaColorSpinorField &in, int dim, int dir, int parity, int dagger,
208  cudaStream_t *stream_p, bool zeroCopyPack, double a, double b)
209 {
210  in.allocateGhostBuffer(nFace); // allocate the ghost buffer if not yet allocated
211  stream = stream_p;
212 
213  if (zeroCopyPack) {
214  void *my_face_d;
215  cudaHostGetDevicePointer(&my_face_d, my_face, 0); // set the matching device pointer
216  in.packGhost(nFace, (QudaParity)parity, dim, (QudaDirection)dir, dagger, &stream[0], my_face_d, a, b);
217  } else {
218  in.packGhost(nFace, (QudaParity)parity, dim, (QudaDirection)dir, dagger, &stream[Nstream-1], 0, a, b);
219  }
220 }
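// With zeroCopyPack the packing kernel writes straight into the pinned my_face
// buffer through the device mapping obtained from cudaHostGetDevicePointer(), using
// stream[0]; otherwise it packs into the field's own device-side ghost buffer on the
// last stream (stream[Nstream-1]) and a later gather() call copies the faces to the
// host.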
221 
222 void FaceBuffer::gather(cudaColorSpinorField &in, int dagger, int dir, int stream_idx){
223  int dim = dir/2;
224  if(!commDimPartitioned(dim)) return;
225 
226  if (dir%2==0) {
227  // backwards copy to host
228  in.sendGhost(my_back_face[dim], nFace, dim, QUDA_BACKWARDS, dagger, &stream[stream_idx]);
229  } else {
230  // forwards copy to host
231  in.sendGhost(my_fwd_face[dim], nFace, dim, QUDA_FORWARDS, dagger, &stream[stream_idx]);
232  }
233 }
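// gather() issues the device-to-host copy of a face packed above: even values of dir
// copy the backwards face of dimension dir/2 into my_back_face, odd values copy the
// forwards face into my_fwd_face, on the caller-selected stream so the copies can
// overlap with interior computation.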
234 
235 void FaceBuffer::pack(cudaColorSpinorField &in, int parity, int dagger, int dir,
236  cudaStream_t *stream_p, bool zeroCopyPack, double a, double b)
237 {
238  const int dim = -1;
239  pack(in, dim, dir, parity, dagger, stream_p, zeroCopyPack, a, b);
240 }
241 
242 void FaceBuffer::pack(cudaColorSpinorField &in, int parity, int dagger,
243  cudaStream_t *stream_p, bool zeroCopyPack, double a, double b)
244 {
245  const int dim = -1; // pack all partitioned space-time dimensions
246  const int dir = 2; // pack both forward and backwards directions
247  pack(in, dim, dir, parity, dagger, stream_p, zeroCopyPack, a, b);
248 
249 }
250 
251 void FaceBuffer::gather(cudaColorSpinorField &in, int dagger, int dir){
252 
253  if(!commDimPartitioned(dir/2)) return;
254 
255  const int stream_idx = (dir%2 == 0) ? dir+sendBackStrmIdx : dir-1+sendFwdStrmIdx;
256  gather(in, dagger, dir, stream_idx);
257 }
258 
259 // experimenting with callbacks for GPU -> MPI interaction.
260 // much slower though because callbacks are done on a background thread
261 //#define QUDA_CALLBACK
262 
263 #ifdef QUDA_CALLBACK
264 
265 struct commCallback_t {
266  MsgHandle *mh_recv;
267  MsgHandle *mh_send;
268  void *ib_buffer;
269  void *face_buffer;
270  size_t bytes;
271 };
272 
273 static commCallback_t commCB[2*QUDA_MAX_DIM];
274 
275 void CUDART_CB commCallback(cudaStream_t stream, cudaError_t status, void *data) {
276  const unsigned long long dir = (unsigned long long)data;
277 
278  comm_start(commCB[dir].mh_recv);
279 #ifndef GPU_DIRECT
280  memcpy(commCB[dir].ib_buffer, commCB[dir].face_buffer, commCB[dir].bytes);
281 #endif
282  comm_start(commCB[dir].mh_send);
283 
284 }
285 
286 void FaceBuffer::commsStart(int dir) {
287  int dim = dir / 2;
288  if(!commDimPartitioned(dim)) return;
289 
290  if (dir%2 == 0) { // sending backwards
291  commCB[dir].mh_recv = mh_recv_fwd[dim];
292  commCB[dir].mh_send = mh_send_back[dim];
293  commCB[dir].ib_buffer = ib_my_back_face[dim];
294  commCB[dir].face_buffer = my_back_face[dim];
295  commCB[dir].bytes = nbytes[dim];
296  } else { //sending forwards
297  commCB[dir].mh_recv = mh_recv_back[dim];
298  commCB[dir].mh_send = mh_send_fwd[dim];
299  commCB[dir].ib_buffer = ib_my_fwd_face[dim];
300  commCB[dir].face_buffer = my_fwd_face[dim];
301  commCB[dir].bytes = nbytes[dim];
302  }
303 
304  cudaStreamAddCallback(stream[dir], commCallback, (void*)dir, 0);
305 }
306 
307 #else // !defined(QUDA_CALLBACK)
308 
309 void FaceBuffer::commsStart(int dir) {
310  int dim = dir / 2;
311  if(!commDimPartitioned(dim)) return;
312 
313  if (dir%2 == 0) { // sending backwards
314  // Prepost receive
315  comm_start(mh_recv_fwd[dim]);
316 #ifndef GPU_DIRECT
317  memcpy(ib_my_back_face[dim], my_back_face[dim], nbytes[dim]);
318 #endif
319  comm_start(mh_send_back[dim]);
320  } else { //sending forwards
321  // Prepost receive
322  comm_start(mh_recv_back[dim]);
323  // Begin forward send
324 #ifndef GPU_DIRECT
325  memcpy(ib_my_fwd_face[dim], my_fwd_face[dim], nbytes[dim]);
326 #endif
327  comm_start(mh_send_fwd[dim]);
328  }
329 }
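// Note the ordering: the receive from the neighbouring rank is posted before the
// matching send is started, which helps the MPI layer avoid buffering unexpected
// messages.  Without GPU_DIRECT the pinned face buffer is first staged into the
// separate ib_my_*_face host buffer that the send handle was declared with.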
330 
331 void FaceBuffer::recvStart(int dir){
332  int dim = dir/2;
333  if(!commDimPartitioned(dim)) return;
334 
335  if(dir&1){
336  comm_start(mh_recv_back[dim]);
337  }else{
338  comm_start(mh_recv_fwd[dim]);
339  }
340  return;
341 }
342 
343 void FaceBuffer::sendStart(int dir){
344  int dim = dir/2;
345  if(!commDimPartitioned(dim)) return;
346 
347  if (dir%2 == 0) { // sending backwards
348 #ifndef GPU_DIRECT
349  memcpy(ib_my_back_face[dim], my_back_face[dim], nbytes[dim]);
350 #endif
351  comm_start(mh_send_back[dim]);
352  } else { //sending forwards
353  // Begin forward send
354 #ifndef GPU_DIRECT
355  memcpy(ib_my_fwd_face[dim], my_fwd_face[dim], nbytes[dim]);
356 #endif
357  comm_start(mh_send_fwd[dim]);
358  }
359 }
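// recvStart() and sendStart() split commsStart() into its two halves, so a caller
// can pre-post all receives early and trigger each send only once the corresponding
// gather() copy has completed.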
360 
361 #endif // QUDA_CALLBACK
362 
363 
364 int FaceBuffer::commsQuery(int dir)
365 {
366  int dim = dir / 2;
367  if(!commDimPartitioned(dim)) return 0;
368 
369  if(dir%2==0) {
370  if (comm_query(mh_recv_fwd[dim]) && comm_query(mh_send_back[dim])) {
371 #ifndef GPU_DIRECT
372  memcpy(from_fwd_face[dim], ib_from_fwd_face[dim], nbytes[dim]);
373 #endif
374  return 1;
375  }
376  } else {
377  if (comm_query(mh_recv_back[dim]) && comm_query(mh_send_fwd[dim])) {
378 #ifndef GPU_DIRECT
379  memcpy(from_back_face[dim], ib_from_back_face[dim], nbytes[dim]);
380 #endif
381  return 1;
382  }
383  }
384 
385  return 0;
386 }
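// commsQuery() is a non-blocking completion test: it returns 1 only once both the
// receive and the send for that direction have completed, and (without GPU_DIRECT)
// it copies the freshly received IB buffer into the pinned from_*_face buffer that
// scatter() will read from.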
387 
388 
389 void FaceBuffer::scatter(cudaColorSpinorField &out, int dagger, int dir, int stream_idx)
390 {
391  int dim = dir/2;
392  if(!commDimPartitioned(dim)) return;
393 
394  if(dir%2==0) { // receive from forwards
395  out.unpackGhost(from_fwd_face[dim], nFace, dim, QUDA_FORWARDS, dagger, &stream[stream_idx]);
396  }else{ // receive from backwards
397  out.unpackGhost(from_back_face[dim], nFace, dim, QUDA_BACKWARDS, dagger, &stream[stream_idx]);
398  }
399 }
400 
401 void FaceBuffer::scatter(cudaColorSpinorField &out, int dagger, int dir){
402  scatter(out, dagger, dir, (dir/2)*2);
403 }
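// Putting the pieces together, a rough sketch of one halo-exchange round as a
// dslash-style driver might issue it, given a FaceBuffer face and a
// cudaColorSpinorField in (loop order and stream handling are illustrative only):
//
//   face.pack(in, parity, dagger, streams, false);   // pack all faces on the GPU
//   for (int dir = 0; dir < 8; dir++) {
//     if (!commDimPartitioned(dir/2)) continue;
//     face.gather(in, dagger, dir);                  // copy packed faces to host
//   }
//   for (int dir = 0; dir < 8; dir++) {
//     if (!commDimPartitioned(dir/2)) continue;
//     face.commsStart(dir);                          // post receive, begin send
//   }
//   for (int dir = 0; dir < 8; dir++) {
//     if (!commDimPartitioned(dir/2)) continue;
//     while (!face.commsQuery(dir)) ;                // poll for completion
//     face.scatter(in, dagger, dir);                 // copy halo back to the GPU
//   }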
404 
405 
406 // This is just an initial hack for CPU comms - the message handles should be created at instantiation
407 void FaceBuffer::exchangeCpuSpinor(cpuColorSpinorField &spinor, int oddBit, int dagger)
408 {
409  // allocate the ghost buffer if not yet allocated
410  spinor.allocateGhostBuffer();
411 
412  for(int i=0;i < 4; i++){
413  spinor.packGhost(spinor.backGhostFaceSendBuffer[i], i,
414  QUDA_BACKWARDS, (QudaParity)oddBit, dagger);
415  spinor.packGhost(spinor.fwdGhostFaceSendBuffer[i], i,
416  QUDA_FORWARDS, (QudaParity)oddBit, dagger);
417  }
418 
419  MsgHandle *mh_send_fwd[4];
420  MsgHandle *mh_from_back[4];
421  MsgHandle *mh_from_fwd[4];
422  MsgHandle *mh_send_back[4];
423 
424  for (int i=0; i<nDimComms; i++) {
425  if (!commDimPartitioned(i)) continue;
426  mh_send_fwd[i] = comm_declare_send_relative(spinor.fwdGhostFaceSendBuffer[i], i, +1, nbytes[i]);
427  mh_send_back[i] = comm_declare_send_relative(spinor.backGhostFaceSendBuffer[i], i, -1, nbytes[i]);
428  mh_from_fwd[i] = comm_declare_receive_relative(spinor.fwdGhostFaceBuffer[i], i, +1, nbytes[i]);
429  mh_from_back[i] = comm_declare_receive_relative(spinor.backGhostFaceBuffer[i], i, -1, nbytes[i]);
430  }
431 
432  for (int i=0; i<nDimComms; i++) {
433  if (commDimPartitioned(i)) {
434  comm_start(mh_from_back[i]);
435  comm_start(mh_from_fwd[i]);
436  comm_start(mh_send_fwd[i]);
437  comm_start(mh_send_back[i]);
438  } else {
439  memcpy(spinor.backGhostFaceBuffer[i], spinor.fwdGhostFaceSendBuffer[i], nbytes[i]);
440  memcpy(spinor.fwdGhostFaceBuffer[i], spinor.backGhostFaceSendBuffer[i], nbytes[i]);
441  }
442  }
443 
444  for (int i=0; i<nDimComms; i++) {
445  if (!commDimPartitioned(i)) continue;
446  comm_wait(mh_send_fwd[i]);
447  comm_wait(mh_send_back[i]);
448  comm_wait(mh_from_back[i]);
449  comm_wait(mh_from_fwd[i]);
450  }
451 
452  for (int i=0; i<nDimComms; i++) {
453  if (!commDimPartitioned(i)) continue;
454  comm_free(mh_send_fwd[i]);
455  comm_free(mh_send_back[i]);
456  comm_free(mh_from_back[i]);
457  comm_free(mh_from_fwd[i]);
458  }
459 }
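// exchangeCpuSpinor() is a fully blocking exchange for CPU-resident fields: it packs
// both faces of every dimension, declares fresh message handles for this single call
// (hence the "initial hack" remark above), starts all receives and sends, waits for
// them, and frees the handles again.  Dimensions that are not partitioned across
// ranks are wrapped around locally with memcpy.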
460 
461 
462 void FaceBuffer::exchangeLink(void** ghost_link, void** link_sendbuf, QudaFieldLocation location)
463 {
464  MsgHandle *mh_from_back[4];
465  MsgHandle *mh_send_fwd[4];
466 
467  size_t bytes[4];
468  for (int i=0; i<nDimComms; i++) bytes[i] = 2*nFace*faceVolumeCB[i]*Ninternal*precision;
469 
470  void *send[4];
471  void *receive[4];
472  if (location == QUDA_CPU_FIELD_LOCATION) {
473  for (int i=0; i<nDimComms; i++) {
474  if (commDimPartitioned(i)) {
475  send[i] = link_sendbuf[i];
476  receive[i] = ghost_link[i];
477  } else {
478  memcpy(ghost_link[i], link_sendbuf[i], bytes[i]);
479  }
480  }
481  } else { // FIXME: for a CUDA field, copy back to the CPU
482  for (int i=0; i<nDimComms; i++) {
483  if (commDimPartitioned(i)) {
484  send[i] = allocatePinned(bytes[i]);
485  receive[i] = allocatePinned(bytes[i]);
486  cudaMemcpy(send[i], link_sendbuf[i], bytes[i], cudaMemcpyDeviceToHost);
487  } else {
488  cudaMemcpy(ghost_link[i], link_sendbuf[i], bytes[i], cudaMemcpyDeviceToDevice);
489  }
490  }
491  }
492 
493  for (int i=0; i<nDimComms; i++) {
494  if (!commDimPartitioned(i)) continue;
495  mh_send_fwd[i] = comm_declare_send_relative(send[i], i, +1, bytes[i]);
496  mh_from_back[i] = comm_declare_receive_relative(receive[i], i, -1, bytes[i]);
497  }
498 
499  for (int i=0; i<nDimComms; i++) {
500  if (!commDimPartitioned(i)) continue;
501  comm_start(mh_send_fwd[i]);
502  comm_start(mh_from_back[i]);
503  }
504 
505  for (int i=0; i<nDimComms; i++) {
506  if (!commDimPartitioned(i)) continue;
507  comm_wait(mh_send_fwd[i]);
508  comm_wait(mh_from_back[i]);
509  }
510 
511  if (location == QUDA_CUDA_FIELD_LOCATION) {
512  for (int i=0; i<nDimComms; i++) {
513  if (!commDimPartitioned(i)) continue;
514  cudaMemcpy(ghost_link[i], receive[i], bytes[i], cudaMemcpyHostToDevice);
515  freePinned(send[i]);
516  freePinned(receive[i]);
517  }
518  }
519 
520  for (int i=0; i<nDimComms; i++) {
521  if (!commDimPartitioned(i)) continue;
522  comm_free(mh_send_fwd[i]);
523  comm_free(mh_from_back[i]);
524  }
525 }
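// exchangeLink() moves gauge-link faces in one direction only: each rank sends its
// forward face and receives into ghost_link from its backward neighbour.  For fields
// resident on the GPU the data is staged through temporarily pinned host buffers with
// explicit cudaMemcpy calls (see the FIXME above); unpartitioned dimensions fall back
// to a plain local copy.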
526 
527 
528 void reduceMaxDouble(double &max) { comm_allreduce_max(&max); }
529 
530 void reduceDouble(double &sum) { if (globalReduce) comm_allreduce(&sum); }
531 
532 void reduceDoubleArray(double *sum, const int len)
533 { if (globalReduce) comm_allreduce_array(sum, len); }
534 
535 int commDim(int dir) { return comm_dim(dir); }
536 
537 int commCoords(int dir) { return comm_coord(dir); }
538 
539 int commDimPartitioned(int dir){ return comm_dim_partitioned(dir);}
540 