QUDA  0.9.0
face_gauge.cpp
Go to the documentation of this file.
1 #include <cstdio>
2 #include <cstdlib>
3 #include <string.h>
4 #include <sys/time.h>
5 #include <assert.h>
6 
7 #include <quda_internal.h>
8 #include <comm_quda.h>
9 
10 #include <test_util.h>
11 
12 using namespace quda;
13 
14 extern cudaStream_t *stream;
15 
16 /**************************************************************
17  * Staple exchange routine
18  * used in fat link computation
19  ***************************************************************/
20 
21 #if defined(MULTI_GPU) && (defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE)|| defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(CLOVER_FORCE)) || defined(GPU_CLOVER_DIRAC)
22 
// Direction labels for the eight neighbours of a site: the four positive
// ("UP") directions in order x,y,z,t, then the negative directions in
// reverse order, so that the opposite of direction d is 7-d.
enum {
  XUP = 0,
  YUP,
  ZUP,
  TUP,
  TDOWN,
  ZDOWN,
  YDOWN,
  XDOWN
};
33 
34 
//FIXME remove this legacy macro
#define gaugeSiteSize 18 // real numbers per gauge field site (3x3 complex matrix)

// Persistent pinned host buffers for the staple exchange, one per dimension.
// Presumably allocated lazily by the llfat init routine — released (and
// NULLed) by exchange_llfat_cleanup() below.
static void* fwd_nbr_staple[4];
static void* back_nbr_staple[4];
static void* fwd_nbr_staple_sendbuf[4];
static void* back_nbr_staple_sendbuf[4];

// Cached local lattice geometry, filled in by setup_dims():
static int dims[4];        // local lattice dimensions
static int X1,X2,X3,X4;    // the same dimensions, individually named
static int volumeCB;       // checkerboard (half-lattice) volume V/2
static int Vs[4], Vsh[4];  // 3-d surface volumes and their checkerboard halves
47 
48 #include "gauge_field.h"
49 extern void setup_dims_in_gauge(int *XX);
50 
51 static void
52 setup_dims(int* X)
53 {
54  V = 1;
55  for (int d=0; d< 4; d++) {
56  V *= X[d];
57  dims[d] = X[d];
58  }
59  volumeCB = V/2;
60 
61  X1=X[0];
62  X2=X[1];
63  X3=X[2];
64  X4=X[3];
65 
66  Vs[0] = Vs_x = X[1]*X[2]*X[3];
67  Vs[1] = Vs_y = X[0]*X[2]*X[3];
68  Vs[2] = Vs_z = X[0]*X[1]*X[3];
69  Vs[3] = Vs_t = X[0]*X[1]*X[2];
70 
71  Vsh[0] = Vsh_x = Vs_x/2;
72  Vsh[1] = Vsh_y = Vs_y/2;
73  Vsh[2] = Vsh_z = Vs_z/2;
74  Vsh[3] = Vsh_t = Vs_t/2;
75 }
76 
/**
 * Gather the nFace-deep back and forward ghost faces of the staple field
 * into separate per-dimension host buffers.
 *
 * @param cpuStaple    staple field in even/odd (checkerboard) order:
 *                     volumeCB even sites followed by volumeCB odd sites,
 *                     18 reals per site
 * @param cpuGhostBack per-dimension destination buffers for the backward face
 * @param cpuGhostFwd  per-dimension destination buffers for the forward face
 * @param nFace        depth of the ghost zone in lattice slices
 * @param X            local lattice dimensions
 */
template <typename Float>
void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack,Float**cpuGhostFwd, int nFace, int* X) {
  int XY=X[0]*X[1];
  int XYZ=X[0]*X[1]*X[2];
  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;  // checkerboard volume of the local lattice
  int faceVolumeCB[4]={                  // checkerboard volume of each 3-d face
    X[1]*X[2]*X[3]/2,
    X[0]*X[2]*X[3]/2,
    X[0]*X[1]*X[3]/2,
    X[0]*X[1]*X[2]/2
  };

  //loop variables: a, b, c with a the most significant and c the least significant
  //A, B, C the maximum value
  //we also loop in d over the nFace slices being packed
  int A[4], B[4], C[4];

  //X dimension
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];

  //Y dimension
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];

  //Z dimension
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];

  //T dimension
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];

  //multiplication factor to compute index in original cpu memory
  int f[4][4]={
    {XYZ, XY, X[0], 1},
    {XYZ, XY, 1, X[0]},
    {XYZ, X[0], 1, XY},
    { XY, X[0], 1, XYZ}
  };

  for(int ite = 0; ite < 2; ite++){
    //ite == 0: back
    //ite == 1: fwd
    Float** dst;
    if (ite == 0){
      dst = cpuGhostBack;
    }else{
      dst = cpuGhostFwd;
    }

    //collect back ghost staple
    for(int dir =0; dir < 4; dir++){
      int d;
      int a,b,c;
      //there is only one staple in the same location
      for(int linkdir=0; linkdir < 1; linkdir ++){
        Float* even_src = cpuStaple;
        Float* odd_src = cpuStaple + volumeCB*gaugeSiteSize;

        Float* even_dst;
        Float* odd_dst;

        //switching odd and even ghost cpuLink when that dimension size is odd
        //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
        if((X[dir] % 2 ==0) || (comm_dim(dir) == 1)){
          even_dst = dst[dir];
          odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }else{
          odd_dst = dst[dir];
          even_dst = dst[dir] + nFace*faceVolumeCB[dir]*gaugeSiteSize;
        }

        int even_dst_index = 0;
        int odd_dst_index = 0;
        int startd;
        int endd;
        if(ite == 0){ //back: first nFace slices
          startd = 0;
          endd= nFace;
        }else{//fwd: last nFace slices
          startd = X[dir] - nFace;
          endd =X[dir];
        }
        for(d = startd; d < endd; d++){
          for(a = 0; a < A[dir]; a++){
            for(b = 0; b < B[dir]; b++){
              for(c = 0; c < C[dir]; c++){
                //linear site index in the source field, halved for checkerboarding
                int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
                int oddness = (a+b+c+d)%2;
                if (oddness == 0){ //even
                  for(int i=0;i < 18;i++){
                    even_dst[18*even_dst_index+i] = even_src[18*index + i];
                  }
                  even_dst_index++;
                }else{ //odd
                  for(int i=0;i < 18;i++){
                    odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
                  }
                  odd_dst_index++;
                }
              }//c
            }//b
          }//a
        }//d
        assert( even_dst_index == nFace*faceVolumeCB[dir]);
        assert( odd_dst_index == nFace*faceVolumeCB[dir]);
      }//linkdir
    }//dir
  }//ite
}
184 
185 
186 void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void** cpuGhostStapleFwd,
187  int nFace, QudaPrecision precision, int* X) {
188  if (precision == QUDA_DOUBLE_PRECISION) {
189  packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**) cpuGhostStapleFwd, nFace, X);
190  } else {
191  packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
192  }
193 }
194 
/**
 * Pack the "diagonal" edge of the local sitelink field that a neighbor
 * displaced by dx[nu]=+1, dx[mu]=-1 needs.
 *
 *        nu |          |
 *           |__________|
 *               mu
 *
 * nu, mu are the directions we are working on.
 * Since we are packing our own data, we need to go to the north-west corner
 * in the diagram, i.e. x[nu] = X[nu]-1, x[mu] = 0, and loop through
 * x[dir1], x[dir2] in the remaining two directions; dir2 is the slowest
 * changing dim when computing the index.
 *
 * @param buf      destination buffer: one parity's sites first, the other after
 * @param X        local lattice dimensions
 * @param sitelink per-direction link fields in even/odd order; only sitelink[nu] is read
 * @param prec     precision, used here as bytes per real number
 */
void pack_gauge_diag(void* buf, int* X, void** sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
{
  //strides for converting an (x,y,z,t) coordinate into a linear site index
  int mul_factor[4]={1, X[0], X[1]*X[0], X[2]*X[1]*X[0]};

  int even_dst_idx = 0;
  int odd_dst_idx = 0;
  char* dst_even =(char*)buf;
  char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*gaugeSiteSize*prec;
  char* src_even = (char*)sitelink[nu];
  char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*gaugeSiteSize*prec;

  if( (X[nu]+X[mu]) % 2 == 1){
    //oddness will change between me and the diagonal neighbor
    //switch it now
    char* tmp = dst_odd;
    dst_odd = dst_even;
    dst_even = tmp;
  }

  for(int i=0;i < X[dir2]; i++){
    for(int j=0; j < X[dir1]; j++){
      //site at x[nu]=X[nu]-1, x[mu]=0, x[dir2]=i, x[dir1]=j (checkerboard index)
      int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
      int oddness = ( (X[nu]-1) + 0 + i + j) %2;
      if(oddness==0){
        //copy one site (gaugeSiteSize reals), one real at a time
        for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
          memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
        }
        even_dst_idx++;
      }else{
        for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
          memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
        }
        odd_dst_idx++;
      }//if
    }//for j
  }//for i
  //sanity check: each parity must have filled exactly half the 2-d slice
  if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
    errorQuda("even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
              even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
  }
  return ;
}
249 
/*
  This is the packing kernel for the multi-dimensional ghost zone in
  the padded region. This is called by cpuexchangesitelink in
  FaceBuffer (MPI only), which was called by loadLinkToGPU (defined at
  the bottom).

  Not currently included since it will be replaced by Guochun's new
  routine which uses an enlarged domain instead of a ghost zone.
*/
/**
 * Pack the nFace-deep back and forward ghost faces of dimension `dir`
 * for all four link directions.  Layout of each dst[dir] buffer: for each
 * linkdir (0..3), an even-parity face followed by an odd-parity face
 * (parities swapped when X[dir] is odd and the dimension is partitioned).
 *
 * @param cpuLink      per-link-direction fields in even/odd order
 * @param cpuGhostBack per-dimension destination buffers for the backward face
 * @param cpuGhostFwd  per-dimension destination buffers for the forward face
 * @param dir          the dimension whose faces are packed
 * @param nFace        depth of the ghost zone in lattice slices
 * @param X            local lattice dimensions
 */
template <typename Float>
void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack,Float**cpuGhostFwd, int dir, int nFace, int* X) {
  int XY=X[0]*X[1];
  int XYZ=X[0]*X[1]*X[2];
  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;  // checkerboard volume of the local lattice
  int faceVolumeCB[4]={                  // checkerboard volume of each 3-d face
    X[1]*X[2]*X[3]/2,
    X[0]*X[2]*X[3]/2,
    X[0]*X[1]*X[3]/2,
    X[0]*X[1]*X[2]/2
  };

  //loop variables: a, b, c with a the most significant and c the least significant
  //A, B, C the maximum value
  //we also loop in d over the nFace slices being packed
  int A[4], B[4], C[4];

  //X dimension
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];

  //Y dimension
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];

  //Z dimension
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];

  //T dimension
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];

  //multiplication factor to compute index in original cpu memory
  int f[4][4]={
    {XYZ, XY, X[0], 1},
    {XYZ, XY, 1, X[0]},
    {XYZ, X[0], 1, XY},
    { XY, X[0], 1, XYZ}
  };

  for(int ite = 0; ite < 2; ite++){
    //ite == 0: back
    //ite == 1: fwd
    Float** dst;
    if (ite == 0){
      dst = cpuGhostBack;
    }else{
      dst = cpuGhostFwd;
    }
    //collect back ghost gauge field
    //for(int dir =0; dir < 4; dir++){
    int d;
    int a,b,c;

    //we need copy all 4 links in the same location
    for(int linkdir=0; linkdir < 4; linkdir ++){
      Float* even_src = cpuLink[linkdir];
      Float* odd_src = cpuLink[linkdir] + volumeCB*gaugeSiteSize;
      Float* even_dst;
      Float* odd_dst;

      //switching odd and even ghost cpuLink when that dimension size is odd
      //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
      if((X[dir] % 2 ==0) || (comm_dim(dir) == 1)){
        even_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
        odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
      }else{
        odd_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
        even_dst = odd_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
      }

      int even_dst_index = 0;
      int odd_dst_index = 0;
      int startd;
      int endd;
      if(ite == 0){ //back: first nFace slices
        startd = 0;
        endd= nFace;
      }else{//fwd: last nFace slices
        startd = X[dir] - nFace;
        endd =X[dir];
      }
      for(d = startd; d < endd; d++){
        for(a = 0; a < A[dir]; a++){
          for(b = 0; b < B[dir]; b++){
            for(c = 0; c < C[dir]; c++){
              //linear site index in the source field, halved for checkerboarding
              int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
              int oddness = (a+b+c+d)%2;
              if (oddness == 0){ //even
                for(int i=0;i < 18;i++){
                  even_dst[18*even_dst_index+i] = even_src[18*index + i];
                }
                even_dst_index++;
              }else{ //odd
                for(int i=0;i < 18;i++){
                  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
                }
                odd_dst_index++;
              }
            }//c
          }//b
        }//a
      }//d
      assert( even_dst_index == nFace*faceVolumeCB[dir]);
      assert( odd_dst_index == nFace*faceVolumeCB[dir]);
    }//linkdir
  }//ite
}
364 
365 
366 void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void** cpuGhostFwd,
367  int dir, int nFace, QudaPrecision precision, int *X) {
368  if (precision == QUDA_DOUBLE_PRECISION) {
369  packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**) cpuGhostFwd, dir, nFace, X);
370  } else {
371  packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
372  }
373 }
374 
376 {
377  static bool initialized = false;
378 
379  if (initialized) return;
380  initialized = true;
381 
382  for (int i=0; i < 4; i++) {
383 
384  size_t packet_size = Vs[i]*gaugeSiteSize*prec;
385 
386  fwd_nbr_staple[i] = pinned_malloc(packet_size);
387  back_nbr_staple[i] = pinned_malloc(packet_size);
388  fwd_nbr_staple_sendbuf[i] = pinned_malloc(packet_size);
389  back_nbr_staple_sendbuf[i] = pinned_malloc(packet_size);
390 
391  }
392 }
393 
394 
/**
 * Exchange the 12 diagonal ("corner") sitelink ghost regions.
 *
 * @param X                  local lattice dimensions
 * @param sitelink           per-direction link fields in even/odd order
 * @param ghost_sitelink_diag receive buffers, indexed as [nu*4+mu]
 * @param optflag            if set, skip (nu,mu) pairs where either
 *                           dimension is not partitioned
 */
template<typename Float>
void exchange_sitelink_diag(int* X, Float** sitelink, Float** ghost_sitelink_diag, int optflag)
{
  /*
    nu |          |
       |__________|
           mu

  * There are total 12 different combinations for (nu,mu)
  * since nu/mu = X,Y,Z,T and nu != mu
  * For each combination, we need to communicate with the corresponding
  * neighbor and get the diag ghost data
  * The neighbor we need to get data from is dx[nu]=-1, dx[mu]= +1
  * and we need to send our data to neighbor with dx[nu]=+1, dx[mu]=-1
  */

  for(int nu = XUP; nu <=TUP; nu++){
    for(int mu = XUP; mu <= TUP; mu++){
      if(nu == mu){
        continue;
      }
      //with optflag set, only exchange when both dimensions are partitioned
      if(optflag && (!commDimPartitioned(mu) || !commDimPartitioned(nu))){
        continue;
      }

      int dir1, dir2; //other two dimensions
      for(dir1=0; dir1 < 4; dir1 ++){
        if(dir1 != nu && dir1 != mu){
          break;
        }
      }
      for(dir2=0; dir2 < 4; dir2 ++){
        if(dir2 != nu && dir2 != mu && dir2 != dir1){
          break;
        }
      }

      if(dir1 == 4 || dir2 == 4){
        errorQuda("Invalid dir1/dir2");
      }
      int len = X[dir1]*X[dir2]*gaugeSiteSize*sizeof(Float);
      void *sendbuf = safe_malloc(len);

      //precision is passed as bytes per real number (sizeof(Float))
      pack_gauge_diag(sendbuf, X, (void**)sitelink, nu, mu, dir1, dir2, (QudaPrecision)sizeof(Float));

      //receive from the diagonal neighbor at dx[nu]=-1, dx[mu]=+1
      int dx[4] = {0};
      dx[nu] = -1;
      dx[mu] = +1;
      MsgHandle *mh_recv = comm_declare_receive_displaced(ghost_sitelink_diag[nu*4+mu], dx, len);
      comm_start(mh_recv);

      //send to the opposite diagonal neighbor at dx[nu]=+1, dx[mu]=-1
      dx[nu] = +1;
      dx[mu] = -1;
      MsgHandle *mh_send = comm_declare_send_displaced(sendbuf, dx, len);
      comm_start(mh_send);

      comm_wait(mh_send);
      comm_wait(mh_recv);

      comm_free(mh_send);
      comm_free(mh_recv);

      host_free(sendbuf);
    }
  }
}
461 
462 
/**
 * Exchange one-slice-deep sitelink faces with both neighbors in every
 * (partitioned) dimension, then exchange the diagonal corner regions.
 *
 * ghost_sitelink[dir] layout: 8 back faces (4 link directions x 2 parities)
 * followed by 8 forward faces, matching the packGhostAllLinks buffer layout.
 */
template<typename Float>
void
exchange_sitelink(int*X, Float** sitelink, Float** ghost_sitelink, Float** ghost_sitelink_diag,
                  Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf, int optflag)
{

  int nFace =1;
  for(int dir=0; dir < 4; dir++){
    if(optflag && !commDimPartitioned(dir)) continue;
    pack_ghost_all_links((void**)sitelink, (void**)sitelink_back_sendbuf, (void**)sitelink_fwd_sendbuf, dir, nFace, (QudaPrecision)(sizeof(Float)), X);
  }

  for (int dir = 0; dir < 4; dir++) {
    if(optflag && !commDimPartitioned(dir)) continue;
    int len = Vsh[dir]*gaugeSiteSize*sizeof(Float);  // bytes per checkerboard face
    Float* ghost_sitelink_back = ghost_sitelink[dir];
    //forward faces start after the 8 back faces (4 link dirs x 2 parities)
    Float* ghost_sitelink_fwd = ghost_sitelink[dir] + 8*Vsh[dir]*gaugeSiteSize;

    MsgHandle *mh_recv_back;
    MsgHandle *mh_recv_fwd;
    MsgHandle *mh_send_fwd;
    MsgHandle *mh_send_back;

    //post all four transfers, then wait; 8*len covers all link dirs/parities
    mh_recv_back = comm_declare_receive_relative(ghost_sitelink_back, dir, -1, 8*len);
    mh_recv_fwd = comm_declare_receive_relative(ghost_sitelink_fwd, dir, +1, 8*len);
    mh_send_fwd = comm_declare_send_relative(sitelink_fwd_sendbuf[dir], dir, +1, 8*len);
    mh_send_back = comm_declare_send_relative(sitelink_back_sendbuf[dir], dir, -1, 8*len);

    comm_start(mh_recv_back);
    comm_start(mh_recv_fwd);
    comm_start(mh_send_fwd);
    comm_start(mh_send_back);

    comm_wait(mh_send_fwd);
    comm_wait(mh_send_back);
    comm_wait(mh_recv_back);
    comm_wait(mh_recv_fwd);

    comm_free(mh_send_fwd);
    comm_free(mh_send_back);
    comm_free(mh_recv_back);
    comm_free(mh_recv_fwd);
  }

  exchange_sitelink_diag(X, sitelink, ghost_sitelink_diag, optflag);
}
509 
510 
//this function is used for link fattening computation
//@optflag: if this flag is set, we only communicate in directions that are partitioned
//          if not set, then we communicate in all directions regardless of partitions
//NOTE: the `param` argument is not used in this routine
void exchange_cpu_sitelink(int* X,
                           void** sitelink, void** ghost_sitelink,
                           void** ghost_sitelink_diag,
                           QudaPrecision gPrecision, QudaGaugeParam* param, int optflag)
{
  setup_dims(X);
  static void* sitelink_fwd_sendbuf[4];
  static void* sitelink_back_sendbuf[4];

  //send buffers hold one face for each of the 4 link directions;
  //note they are (re)allocated on every call and freed at the end
  for (int i=0; i<4; i++) {
    int nbytes = 4*Vs[i]*gaugeSiteSize*gPrecision;
    sitelink_fwd_sendbuf[i] = safe_malloc(nbytes);
    sitelink_back_sendbuf[i] = safe_malloc(nbytes);
    memset(sitelink_fwd_sendbuf[i], 0, nbytes);
    memset(sitelink_back_sendbuf[i], 0, nbytes);
  }

  if (gPrecision == QUDA_DOUBLE_PRECISION){
    exchange_sitelink(X, (double**)sitelink, (double**)(ghost_sitelink), (double**)ghost_sitelink_diag,
                      (double**)sitelink_fwd_sendbuf, (double**)sitelink_back_sendbuf, optflag);
  }else{ //single
    exchange_sitelink(X, (float**)sitelink, (float**)(ghost_sitelink), (float**)ghost_sitelink_diag,
                      (float**)sitelink_fwd_sendbuf, (float**)sitelink_back_sendbuf, optflag);
  }

  for(int i=0;i < 4;i++){
    host_free(sitelink_fwd_sendbuf[i]);
    host_free(sitelink_back_sendbuf[i]);
  }
}
544 
545 
// Copy `num` consecutive checkerboard sites of link data from the extended
// sitelink grid into a ghost send buffer for dimension `dir`.  Relies on
// locals of the invoking function: src_oddness/dst_oddness (site parities),
// Vh_ex (extended checkerboard volume), R, slice_3d, cpu_order, gaugebytes.
// QDP order copies each of the 4 link-direction arrays separately; MILC
// order copies `geom` links per site from a single array.
#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \
  if(src_oddness) src_idx += Vh_ex;					\
  if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2;			\
  if(cpu_order == QUDA_QDP_GAUGE_ORDER) {				\
    for(int linkdir=0; linkdir < 4; linkdir++){				\
      char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes;	\
      char* dst = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num));				\
    }									\
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {			\
    char* src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes;		\
    char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes;	\
    memcpy(dst, src, (geom)*gaugebytes*(num));				\
  } else {								\
    errorQuda("Unsupported gauge order");				\
  }									\

// Copy `num` sites of link data into the ghost region of the extended
// sitelink grid, reading from a received ghost buffer when dimension `dir`
// is partitioned, or directly from the local grid (periodic wrap) when it
// is not.  Relies on locals of the invoking function: oddness, Vh_ex, R,
// slice_3d, cpu_order, gaugebytes.
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \
  if(oddness){								\
    if(commDimPartitioned(dir)){					\
      src_idx += R[dir]*slice_3d[dir]/2;				\
    }else{								\
      src_idx += Vh_ex;							\
    }									\
    dst_idx += Vh_ex;							\
  }									\
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){				\
    for(int linkdir=0; linkdir < 4; linkdir++){				\
      char* src;							\
      if(commDimPartitioned(dir)){					\
	src = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
      }else{								\
	src = ((char*)sitelink[linkdir])+ (src_idx)*gaugebytes;		\
      }									\
      char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes;	\
      memcpy(dst, src, gaugebytes*(num));				\
    }									\
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {			\
    char* src;								\
    if(commDimPartitioned(dir)){					\
      src=((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes;	\
    }else{								\
      src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes;		\
    }									\
    char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes;	\
    memcpy(dst, src, (geom)*gaugebytes*(num));				\
  } else {								\
    errorQuda("Unsupported gauge order");				\
  }
595 
// Fill one whole T-direction ghost slab (both parities) of the extended
// sitelink grid with a pair of bulk memcpys per link array.  Valid only for
// dir==3, where the ghost-slab layout matches the grid layout.  Source is
// the received ghost buffer when the dimension is partitioned, otherwise
// the local grid at slice `src_face` (periodic wrap).  Relies on locals of
// the invoking function: E, Vh_ex, R, slice_3d, cpu_order, gaugebytes.
// NOTE: declares variables, so each expansion needs its own scope.
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \
  /*even*/								\
  int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2;			\
  int even_src_idx;							\
  if(commDimPartitioned(dir)){						\
    even_src_idx = 0;							\
  }else{								\
    even_src_idx = (src_face*E[2]*E[1]*E[0])/2;				\
  }									\
  /*odd*/								\
  int odd_dst_idx = even_dst_idx+Vh_ex;					\
  int odd_src_idx;							\
  if(commDimPartitioned(dir)){						\
    odd_src_idx = R[dir]*slice_3d[dir]/2;				\
  }else{								\
    odd_src_idx = even_src_idx+Vh_ex;					\
  }									\
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){				\
    for(int linkdir=0; linkdir < 4; linkdir ++){			\
      char* dst = (char*)sitelink[linkdir];				\
      char* src;							\
      if(commDimPartitioned(dir)){					\
	src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \
      }else{								\
	src = (char*)sitelink[linkdir];					\
      }									\
      memcpy(dst + even_dst_idx * gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
      memcpy(dst + odd_dst_idx * gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
    }									\
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {			\
    char* dst = (char*)sitelink;					\
    char* src;								\
    if(commDimPartitioned(dir)){					\
      src = (char*)ghost_buf[dir];					\
    }else{								\
      src = (char*)sitelink;						\
    }									\
    memcpy(dst+(geom)*even_dst_idx*gaugebytes, src+(geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
    memcpy(dst+(geom)*odd_dst_idx*gaugebytes, src+(geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
  } else {								\
    errorQuda("Unsupported gauge order\n");				\
  }
638 
/* This function exchanges the sitelinks and stores them in the corresponding portion of
 * the extended sitelink memory region
 * @sitelink: this is stored according to dimension size (X4+R4) * (X1+R1) * (X2+R2) * (X3+R3)
 *
 * For each dimension the routine (1) packs the back/forward faces of the
 * local data into send buffers, (2) exchanges them with the -1/+1 neighbors,
 * and (3) scatters the received data (or, for non-partitioned dimensions,
 * the locally wrapped data) into the padding region of the extended grid.
 */
void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,
                              QudaPrecision gPrecision, int optflag, int geometry)
{
  //extended dimensions: R[i] padding slices on both ends of dimension i
  int E[4];
  for (int i=0; i<4; i++) E[i] = X[i] + 2*R[i];
  int Vh_ex = E[3]*E[2]*E[1]*E[0]/2;

  //per-dimension loop bounds of the three coordinates (a,b,c) orthogonal
  //to the exchanged coordinate d
  //...............x.........y.....z......t
  int starta[] = {R[3], R[3], R[3], 0};
  int enda[] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};

  int startb[] = {R[2], R[2], 0, 0};
  int endb[] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};

  int startc[] = {R[1], 0, 0, 0};
  int endc[] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};

  //strides of the (a,b,c,d) coordinates in the extended-volume field
  int f_main[4][4] = {
    {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
    {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
    {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
    {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
  };

  //strides of the (a,b,c,d) coordinates inside one ghost slab
  int f_bound[4][4]={
    {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
    {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
    {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
    {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
  };

  int slice_3d[] = { E[3]*E[2]*E[1], E[3]*E[2]*E[0], E[3]*E[1]*E[0], E[2]*E[1]*E[0]};
  int len[4];
  for(int i=0; i<4;i++){
    len[i] = slice_3d[i] * R[i] * geometry*gaugeSiteSize*gPrecision; //message size for dimension i
  }

  void* ghost_sitelink_fwd_sendbuf[4];
  void* ghost_sitelink_back_sendbuf[4];
  void* ghost_sitelink_fwd[4];
  void* ghost_sitelink_back[4];

  //buffers are only needed for dimensions we actually communicate in
  for(int i=0; i<4; i++) {
    if(!commDimPartitioned(i)) continue;
    ghost_sitelink_fwd_sendbuf[i] = safe_malloc(len[i]);
    ghost_sitelink_back_sendbuf[i] = safe_malloc(len[i]);
    ghost_sitelink_fwd[i] = safe_malloc(len[i]);
    ghost_sitelink_back[i] = safe_malloc(len[i]);
  }

  int gaugebytes = gaugeSiteSize*gPrecision;
  int a, b, c,d;
  for(int dir =0;dir < 4;dir++){
    if( (!commDimPartitioned(dir)) && optflag) continue;
    if(commDimPartitioned(dir)){
      //fill the sendbuf here
      //back
      for(d=R[dir]; d < 2*R[dir]; d++)
        for(a=starta[dir];a < enda[dir]; a++)
          for(b=startb[dir]; b < endb[dir]; b++)

            //when c is not unit-stride in either layout, copy site by site;
            //otherwise copy two strided runs (one per parity) below
            if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                  dst_oddness = 1-oddness;
                }

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, 1, dir, geometry);

              }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                c=startc[dir]+loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;

                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                    dst_oddness = 1-oddness;
                  }
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir, geometry);

                }//if c
              }//for loop
            }//if


      //fwd
      for(d=X[dir]; d < X[dir]+R[dir]; d++) {
        for(a=starta[dir];a < enda[dir]; a++) {
          for(b=startb[dir]; b < endb[dir]; b++) {

            if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                  dst_oddness = 1-oddness;
                }

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, 1,dir, geometry);
              }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                c=startc[dir]+loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;

                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
                    dst_oddness = 1-oddness;
                  }
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2,dir, geometry);
                }
              }//for loop
            }//if

          }
        }
      }

      //exchange the packed faces with both neighbors in this dimension
      MsgHandle *mh_recv_back;
      MsgHandle *mh_recv_fwd;
      MsgHandle *mh_send_fwd;
      MsgHandle *mh_send_back;

      mh_recv_back = comm_declare_receive_relative(ghost_sitelink_back[dir], dir, -1, len[dir]);
      mh_recv_fwd = comm_declare_receive_relative(ghost_sitelink_fwd[dir], dir, +1, len[dir]);
      mh_send_fwd = comm_declare_send_relative(ghost_sitelink_fwd_sendbuf[dir], dir, +1, len[dir]);
      mh_send_back = comm_declare_send_relative(ghost_sitelink_back_sendbuf[dir], dir, -1, len[dir]);

      comm_start(mh_recv_back);
      comm_start(mh_recv_fwd);
      comm_start(mh_send_fwd);
      comm_start(mh_send_back);

      comm_wait(mh_send_fwd);
      comm_wait(mh_send_back);
      comm_wait(mh_recv_back);
      comm_wait(mh_recv_fwd);

      comm_free(mh_send_fwd);
      comm_free(mh_send_back);
      comm_free(mh_recv_back);
      comm_free(mh_recv_fwd);

    }//if

    //use the messages to fill the sitelink data
    //back
    if (dir < 3 ) {

      for(d=0; d < R[dir]; d++) {
        for(a=starta[dir];a < enda[dir]; a++) {
          for(b=startb[dir]; b < endb[dir]; b++) {

            if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
                }else{
                  //not partitioned: read the periodic wrap from the local grid
                  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, 1, dir, geometry);

              }//c
            }else{
              //optimized copy
              //first half: startc[dir] -> end[dir] with step=2

              for(int loop =0;loop <2;loop++){
                int c=startc[dir]+loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int src_idx;
                  if(commDimPartitioned(dir)){
                    src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
                  }else{
                    src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
                  }

                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, (endc[dir]-c+1)/2, dir, geometry);

                }//if c
              }//for loop
            }//if

          }
        }
      }

    }else{
      //when dir == 3 (T direction), the data layout format in sitelink and the message is the same, we can do large copies

      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0, X[3], dir, geometry)
    }//if

    //fwd
    if( dir < 3 ){

      for(d=X[dir]+R[dir]; d < X[dir]+2*R[dir]; d++) {
        for(a=starta[dir];a < enda[dir]; a++) {
          for(b=startb[dir]; b < endb[dir]; b++) {

            if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
              for (c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
                }else{
                  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, 1, dir, geometry);

              }//c
            }else{
              for(int loop =0; loop < 2; loop++){
                //for (c=startc[dir]; c < endc[dir]; c++){
                c=startc[dir] + loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                  int src_idx;
                  if(commDimPartitioned(dir)){
                    src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
                  }else{
                    src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
                  }
                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, (endc[dir]-c+1)/2, dir, geometry);
                }//if
              }//for loop
            }//if

          }
        }
      }


    } else {

      //when dir == 3 (T direction), the data layout format in sitelink and the message is the same, we can do large copies
      //NOTE(review): the src_face argument of 2 (vs. R[3]?) was already flagged TESTME upstream — confirm against the macro's src_face semantics
      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (X[3]+R[3]), 2, dir, geometry) // TESTME 2

    }//if

  }//dir for loop


  for(int dir=0;dir < 4;dir++){
    if(!commDimPartitioned(dir)) continue;
    host_free(ghost_sitelink_fwd_sendbuf[dir]);
    host_free(ghost_sitelink_back_sendbuf[dir]);
    host_free(ghost_sitelink_fwd[dir]);
    host_free(ghost_sitelink_back[dir]);
  }

}
927 
928 
929 
/**
 * Pack the one-slice-deep staple faces and exchange them with both
 * neighbors in every dimension.
 *
 * ghost_staple[dir] layout: the back face (both parities) followed by the
 * forward face, matching the packGhostAllStaples buffer layout.
 */
template<typename Float>
void
do_exchange_cpu_staple(Float* staple, Float** ghost_staple, Float** staple_fwd_sendbuf, Float** staple_back_sendbuf, int* X)
{

  int nFace =1;
  pack_ghost_all_staples_cpu(staple, (void**)staple_back_sendbuf,
                             (void**)staple_fwd_sendbuf, nFace, (QudaPrecision)(sizeof(Float)), X);


  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};  //shadows the file-scope Vsh
  size_t len[4] = {
    Vsh_x*gaugeSiteSize*sizeof(Float),  // bytes per checkerboard face
    Vsh_y*gaugeSiteSize*sizeof(Float),
    Vsh_z*gaugeSiteSize*sizeof(Float),
    Vsh_t*gaugeSiteSize*sizeof(Float)
  };

  for (int dir=0;dir < 4; dir++) {

    Float *ghost_staple_back = ghost_staple[dir];
    //forward face starts after both parities of the back face
    Float *ghost_staple_fwd = ghost_staple[dir] + 2*Vsh[dir]*gaugeSiteSize;

    MsgHandle *mh_recv_back;
    MsgHandle *mh_recv_fwd;
    MsgHandle *mh_send_fwd;
    MsgHandle *mh_send_back;

    //post all four transfers, then wait; 2*len covers both parities
    mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2*len[dir]);
    mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2*len[dir]);
    mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2*len[dir]);
    mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2*len[dir]);

    comm_start(mh_recv_back);
    comm_start(mh_recv_fwd);
    comm_start(mh_send_fwd);
    comm_start(mh_send_back);

    comm_wait(mh_send_fwd);
    comm_wait(mh_send_back);
    comm_wait(mh_recv_back);
    comm_wait(mh_recv_fwd);

    comm_free(mh_send_fwd);
    comm_free(mh_send_back);
    comm_free(mh_recv_back);
    comm_free(mh_recv_fwd);
  }
}
979 
980 
981 //this function is used for link fattening computation
982 void exchange_cpu_staple(int* X, void* staple, void** ghost_staple, QudaPrecision gPrecision)
983 {
984  setup_dims(X);
985 
986  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
987  void *staple_fwd_sendbuf[4];
988  void *staple_back_sendbuf[4];
989 
990  for(int i=0;i < 4; i++){
991  staple_fwd_sendbuf[i] = safe_malloc(Vs[i]*gaugeSiteSize*gPrecision);
992  staple_back_sendbuf[i] = safe_malloc(Vs[i]*gaugeSiteSize*gPrecision);
993  }
994 
995  if (gPrecision == QUDA_DOUBLE_PRECISION) {
996  do_exchange_cpu_staple((double*)staple, (double**)ghost_staple,
997  (double**)staple_fwd_sendbuf, (double**)staple_back_sendbuf, X);
998  } else { //single
999  do_exchange_cpu_staple((float*)staple, (float**)ghost_staple,
1000  (float**)staple_fwd_sendbuf, (float**)staple_back_sendbuf, X);
1001  }
1002 
1003  for (int i=0;i < 4;i++) {
1004  host_free(staple_fwd_sendbuf[i]);
1005  host_free(staple_back_sendbuf[i]);
1006  }
1007 }
1008 
1009 void exchange_llfat_cleanup(void)
1010 {
1011  for (int i=0; i<4; i++) {
1012 
1013  if(fwd_nbr_staple[i]){
1014  host_free(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
1015  }
1016  if(back_nbr_staple[i]){
1017  host_free(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
1018  }
1019  if(fwd_nbr_staple_sendbuf[i]){
1020  host_free(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
1021  }
1022  if(back_nbr_staple_sendbuf[i]){
1023  host_free(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
1024  }
1025 
1026  }
1027  checkCudaError();
1028 }
1029 
1030 #undef gaugeSiteSize
1031 
1032 #endif
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
int commDimPartitioned(int dir)
double mu
Definition: test_util.cpp:1643
#define pinned_malloc(size)
Definition: malloc_quda.h:55
enum QudaPrecision_s QudaPrecision
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
Definition: comm_mpi.cpp:174
int Vs_z
Definition: test_util.cpp:30
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
#define errorQuda(...)
Definition: util_quda.h:90
#define host_free(ptr)
Definition: malloc_quda.h:59
#define TDOWN
Definition: misc.h:64
int comm_dim(int dim)
int Vs_y
Definition: test_util.cpp:30
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
#define XUP
static int R[4]
int E[4]
Definition: test_util.cpp:36
#define YUP
int Vsh_y
Definition: test_util.cpp:31
char * index(const char *, int)
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
void comm_free(MsgHandle *mh)
Definition: comm_mpi.cpp:252
else return(__swbuf(_c, _p))
int Vsh_t
Definition: test_util.cpp:31
int Vs_x
Definition: test_util.cpp:30
static int Vs[4]
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
Definition: comm_mpi.cpp:151
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
Definition: comm_quda.h:59
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
Definition: comm_quda.h:74
static bool initialized
Profiler for initQuda.
int Vh_ex
Definition: test_util.cpp:37
int src_idx
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
#define ZUP
int V
Definition: test_util.cpp:28
void comm_start(MsgHandle *mh)
Definition: comm_mpi.cpp:260
#define gaugeSiteSize
Definition: test_util.h:6
int commDim(int)
int int int enum cudaChannelFormatKind f
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void exchange_llfat_init(QudaPrecision prec)
void * memcpy(void *__dst, const void *__src, size_t __n)
#define safe_malloc(size)
Definition: malloc_quda.h:54
void exchange_llfat_cleanup(void)
#define TUP
void * memset(void *__b, int __c, size_t __len)
#define XDOWN
Definition: misc.h:67
int Vsh_z
Definition: test_util.cpp:31
#define ZDOWN
Definition: misc.h:65
const void * c
int Vs_t
Definition: test_util.cpp:30
cpuGaugeField * cpuLink
Definition: covdev_test.cpp:39
#define checkCudaError()
Definition: util_quda.h:129
void comm_wait(MsgHandle *mh)
Definition: comm_mpi.cpp:266
static __inline__ size_t size_t d
#define YDOWN
Definition: misc.h:66
QudaPrecision prec
Definition: test_util.cpp:1615
#define a
int Vsh_x
Definition: test_util.cpp:31
static int Vsh[4]