QUDA  v1.1.0
A library for QCD on GPUs
face_gauge.cpp
1 #include <cstdio>
2 #include <cstdlib>
3 #include <string.h>
4 #include <sys/time.h>
5 #include <assert.h>
6 
7 #include <quda_internal.h>
8 #include <comm_quda.h>
9 
10 #include <host_utils.h>
11 
12 using namespace quda;
13 
14 extern qudaStream_t *stream;
15 
16 /**************************************************************
17  * Staple exchange routine
18  * used in fat link computation
19  ***************************************************************/
20 
21 enum {
22  XUP = 0,
23  YUP = 1,
24  ZUP = 2,
25  TUP = 3,
26  TDOWN = 4,
27  ZDOWN = 5,
28  YDOWN = 6,
29  XDOWN = 7
30 };
31 
32 
33 //FIXME remove this legacy macro
34 #define gauge_site_size 18 // real numbers per gauge field
35 
36 static void* fwd_nbr_staple[4];
37 static void* back_nbr_staple[4];
38 static void* fwd_nbr_staple_sendbuf[4];
39 static void* back_nbr_staple_sendbuf[4];
40 
41 static int dims[4];
42 static int X1,X2,X3,X4;
43 static int volumeCB;
44 static int Vs[4], Vsh[4];
45 
46 #include "gauge_field.h"
47 // extern void setup_dims_in_gauge(int *XX);
48 
49 static void
50 setup_dims(int* X)
51 {
52  V = 1;
53  for (int d=0; d< 4; d++) {
54  V *= X[d];
55  dims[d] = X[d];
56  }
57  volumeCB = V/2;
58 
59  X1=X[0];
60  X2=X[1];
61  X3=X[2];
62  X4=X[3];
63 
64  Vs[0] = Vs_x = X[1]*X[2]*X[3];
65  Vs[1] = Vs_y = X[0]*X[2]*X[3];
66  Vs[2] = Vs_z = X[0]*X[1]*X[3];
67  Vs[3] = Vs_t = X[0]*X[1]*X[2];
68 
69  Vsh[0] = Vsh_x = Vs_x/2;
70  Vsh[1] = Vsh_y = Vs_y/2;
71  Vsh[2] = Vsh_z = Vs_z/2;
72  Vsh[3] = Vsh_t = Vs_t/2;
73 }
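// Worked example (added for illustration, not part of the original file): for a
// hypothetical local lattice X = {8, 8, 8, 16} the globals above become
//   V = 8*8*8*16 = 8192, volumeCB = 4096,
//   Vs  = {1024, 1024, 1024, 512}  (3-d face volumes),
//   Vsh = { 512,  512,  512, 256}  (checkerboarded half-face volumes).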
74 
75 template <typename Float>
76 void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack,Float**cpuGhostFwd, int nFace, int* X) {
77  int XY=X[0]*X[1];
78  int XYZ=X[0]*X[1]*X[2];
79  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
80  int faceVolumeCB[4]={
81  X[1]*X[2]*X[3]/2,
82  X[0]*X[2]*X[3]/2,
83  X[0]*X[1]*X[3]/2,
84  X[0]*X[1]*X[2]/2
85  };
86 
87  //loop variables: a, b, c with a the most significant and c the least significant
88  //A, B, C are the maximum values
89  //we also need to loop over d; d's value runs over dims[dir]-3, dims[dir]-2, dims[dir]-1
90  int A[4], B[4], C[4];
91 
92  //X dimension
93  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
94 
95  //Y dimension
96  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
97 
98  //Z dimension
99  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
100 
101  //T dimension
102  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
103 
104  //multiplication factor to compute index in original cpu memory
105  int f[4][4]={
106  {XYZ, XY, X[0], 1},
107  {XYZ, XY, 1, X[0]},
108  {XYZ, X[0], 1, XY},
109  { XY, X[0], 1, XYZ}
110  };
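 // Added example (not in the original source): for the back T face (dir == 3) of a
 // hypothetical 4^4 local lattice, XY = 16 and XYZ = 64, and the loop variables map
 // to (a,b,c,d) = (z,y,x,t). The site (1,2,3,0) therefore has full lexicographic index
 //   a*f[3][0] + b*f[3][1] + c*f[3][2] + d*f[3][3] = 1*16 + 2*4 + 3*1 + 0*64 = 27,
 // i.e. checkerboard index 27 >> 1 = 13, and (a+b+c+d) % 2 = 0 places it in the even block.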
111 
112  for(int ite = 0; ite < 2; ite++){
113  //ite == 0: back
114  //ite == 1: fwd
115  Float** dst;
116  if (ite == 0){
117  dst = cpuGhostBack;
118  }else{
119  dst = cpuGhostFwd;
120  }
121 
122  //collect the ghost staple (back for ite==0, fwd for ite==1)
123  for(int dir =0; dir < 4; dir++){
124  int d;
125  int a,b,c;
126  //there is only one staple at each location
127  for(int linkdir=0; linkdir < 1; linkdir ++){
128  Float* even_src = cpuStaple;
129  Float *odd_src = cpuStaple + volumeCB * gauge_site_size;
130 
131  Float *even_dst;
132  Float *odd_dst;
133 
134  // switch the odd and even ghost buffers when that dimension's size is odd
135  // only switch if X[dir] is odd and the grid size in that dimension is greater than 1
136  if ((X[dir] % 2 == 0) || (comm_dim(dir) == 1)) {
137  even_dst = dst[dir];
138  odd_dst = even_dst + nFace * faceVolumeCB[dir] * gauge_site_size;
139  } else {
140  odd_dst = dst[dir];
141  even_dst = dst[dir] + nFace * faceVolumeCB[dir] * gauge_site_size;
142  }
143 
144  int even_dst_index = 0;
145  int odd_dst_index = 0;
146  int startd;
147  int endd;
148  if (ite == 0) { // back
149  startd = 0;
150  endd = nFace;
151  } else { // fwd
152  startd = X[dir] - nFace;
153  endd = X[dir];
154  }
155  for (d = startd; d < endd; d++) {
156  for (a = 0; a < A[dir]; a++) {
157  for (b = 0; b < B[dir]; b++) {
158  for (c = 0; c < C[dir]; c++) {
159  int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
160  int oddness = (a+b+c+d)%2;
161  if (oddness == 0){ //even
162  for(int i=0;i < 18;i++){
163  even_dst[18*even_dst_index+i] = even_src[18*index + i];
164  }
165  even_dst_index++;
166  }else{ //odd
167  for(int i=0;i < 18;i++){
168  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
169  }
170  odd_dst_index++;
171  }
172  } // c
173  } // b
174  } // a
175  } // d
176  assert(even_dst_index == nFace * faceVolumeCB[dir]);
177  assert(odd_dst_index == nFace * faceVolumeCB[dir]);
178  }//linkdir
179  }//dir
180  }//ite
181 }
182 
183 
184 void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void** cpuGhostStapleFwd,
185  int nFace, QudaPrecision precision, int* X) {
186  if (precision == QUDA_DOUBLE_PRECISION) {
187  packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**) cpuGhostStapleFwd, nFace, X);
188  } else {
189  packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
190  }
191 }
192 
193 void pack_gauge_diag(void* buf, int* X, void** sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
194 {
195  /*
196     nu |          |
197        |__________|
198             mu
199  *
200  * nu, mu are the directions we are working on
201  * Since we are packing our own data, we need to go to the north-west corner in the diagram
202  * i.e. x[nu] = X[nu]-1, x[mu] = 0, and loop through x[dir1], x[dir2];
203  * of the remaining two directions (dir1/dir2), dir2 is the slowest-changing one when computing
204  * the index
205  */
206 
207  int mul_factor[4]={1, X[0], X[1]*X[0], X[2]*X[1]*X[0]};
208 
209  int even_dst_idx = 0;
210  int odd_dst_idx = 0;
211  char* dst_even =(char*)buf;
212  char *dst_odd = dst_even + (X[dir1] * X[dir2] / 2) * gauge_site_size * prec;
213  char* src_even = (char*)sitelink[nu];
214  char *src_odd = src_even + (X[0] * X[1] * X[2] * X[3] / 2) * gauge_site_size * prec;
215 
216  if( (X[nu]+X[mu]) % 2 == 1){
217  //oddness will change between me and the diagonal neighbor
218  //switch it now
219  char* tmp = dst_odd;
220  dst_odd = dst_even;
221  dst_even = tmp;
222  }
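 // Added note (not in the original source): the receiving neighbor sees these corner
 // sites shifted by -X[nu] in nu and +X[mu] in mu, so a site's coordinate-sum parity
 // flips exactly when X[nu]+X[mu] is odd; the swap above compensates for that.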
223 
224  for(int i=0;i < X[dir2]; i++){
225  for(int j=0; j < X[dir1]; j++){
226  int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
227  int oddness = ( (X[nu]-1) + 0 + i + j) %2;
228  if(oddness==0){
229  for (int tmpidx = 0; tmpidx < gauge_site_size; tmpidx++) {
230  memcpy(&dst_even[(18 * even_dst_idx + tmpidx) * prec], &src_even[(18 * src_idx + tmpidx) * prec], prec);
231  }
232  even_dst_idx++;
233  }else{
234  for (int tmpidx = 0; tmpidx < gauge_site_size; tmpidx++) {
235  memcpy(&dst_odd[(18 * odd_dst_idx + tmpidx) * prec], &src_odd[(18 * src_idx + tmpidx) * prec], prec);
236  }
237  odd_dst_idx++;
238  }//if
239  }//for j
240  }//for i
241  if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
242  errorQuda("even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
243  even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
244  }
245  return ;
246 }
247 
248 /*
249  This is the packing kernel for the multi-dimensional ghost zone in
250  the padded region. This is called by cpuexchangesitelink in
251  FaceBuffer (MPI only), which was called by loadLinkToGPU (defined at
252  the bottom).
253 
254  Not currently included since it will be replaced by Guochun's new
255  routine which uses an enlarged domain instead of a ghost zone.
256 */
257 template <typename Float>
258 void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack,Float**cpuGhostFwd, int dir, int nFace, int* X) {
259  int XY=X[0]*X[1];
260  int XYZ=X[0]*X[1]*X[2];
261  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
262  int faceVolumeCB[4]={
263  X[1]*X[2]*X[3]/2,
264  X[0]*X[2]*X[3]/2,
265  X[0]*X[1]*X[3]/2,
266  X[0]*X[1]*X[2]/2
267  };
268 
269  //loop variables: a, b, c with a the most significant and c the least significant
270  //A, B, C are the maximum values
271  //we also need to loop over d; d's value runs over dims[dir]-3, dims[dir]-2, dims[dir]-1
272  int A[4], B[4], C[4];
273 
274  //X dimension
275  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
276 
277  //Y dimension
278  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
279 
280  //Z dimension
281  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
282 
283  //T dimension
284  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
285 
286  //multiplication factor to compute index in original cpu memory
287  int f[4][4]={
288  {XYZ, XY, X[0], 1},
289  {XYZ, XY, 1, X[0]},
290  {XYZ, X[0], 1, XY},
291  { XY, X[0], 1, XYZ}
292  };
293 
294  for(int ite = 0; ite < 2; ite++){
295  //ite == 0: back
296  //ite == 1: fwd
297  Float** dst;
298  if (ite == 0){
299  dst = cpuGhostBack;
300  }else{
301  dst = cpuGhostFwd;
302  }
303  //collect the ghost gauge field (back for ite==0, fwd for ite==1)
304  //for(int dir =0; dir < 4; dir++){
305  int d;
306  int a,b,c;
307 
308  //we need to copy all 4 links at the same location
309  for(int linkdir=0; linkdir < 4; linkdir ++){
310  Float* even_src = cpuLink[linkdir];
311  Float *odd_src = cpuLink[linkdir] + volumeCB * gauge_site_size;
312  Float* even_dst;
313  Float* odd_dst;
314 
315  //switch the odd and even ghost buffers when that dimension's size is odd
316  //only switch if X[dir] is odd and the grid size in that dimension is greater than 1
317  if((X[dir] % 2 ==0) || (comm_dim(dir) == 1)){
318  even_dst = dst[dir] + 2 * linkdir * nFace * faceVolumeCB[dir] * gauge_site_size;
319  odd_dst = even_dst + nFace * faceVolumeCB[dir] * gauge_site_size;
320  }else{
321  odd_dst = dst[dir] + 2 * linkdir * nFace * faceVolumeCB[dir] * gauge_site_size;
322  even_dst = odd_dst + nFace * faceVolumeCB[dir] * gauge_site_size;
323  }
324 
325  int even_dst_index = 0;
326  int odd_dst_index = 0;
327  int startd;
328  int endd;
329  if(ite == 0){ //back
330  startd = 0;
331  endd= nFace;
332  }else{//fwd
333  startd = X[dir] - nFace;
334  endd =X[dir];
335  }
336  for(d = startd; d < endd; d++){
337  for(a = 0; a < A[dir]; a++){
338  for(b = 0; b < B[dir]; b++){
339  for(c = 0; c < C[dir]; c++){
340  int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
341  int oddness = (a+b+c+d)%2;
342  if (oddness == 0){ //even
343  for(int i=0;i < 18;i++){
344  even_dst[18*even_dst_index+i] = even_src[18*index + i];
345  }
346  even_dst_index++;
347  }else{ //odd
348  for(int i=0;i < 18;i++){
349  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
350  }
351  odd_dst_index++;
352  }
353  }//c
354  }//b
355  }//a
356  }//d
357  assert( even_dst_index == nFace*faceVolumeCB[dir]);
358  assert( odd_dst_index == nFace*faceVolumeCB[dir]);
359  }//linkdir
360  }//ite
361 }
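// Added note (not in the original source): on return, dst[dir] holds, for each of the
// four link directions in turn, nFace*faceVolumeCB[dir] even sites followed by
// nFace*faceVolumeCB[dir] odd sites, i.e. 8*nFace*faceVolumeCB[dir]*gauge_site_size
// reals per dimension; this is why exchange_sitelink() below posts messages of size 8*len.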
362 
363 
364 void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void** cpuGhostFwd,
365  int dir, int nFace, QudaPrecision precision, int *X) {
366  if (precision == QUDA_DOUBLE_PRECISION) {
367  packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**) cpuGhostFwd, dir, nFace, X);
368  } else {
369  packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
370  }
371 }
372 
373 void exchange_llfat_init(QudaPrecision prec)
374 {
375  static bool initialized = false;
376 
377  if (initialized) return;
378  initialized = true;
379 
380  for (int i=0; i < 4; i++) {
381 
382  size_t packet_size = Vs[i] * gauge_site_size * prec;
383 
384  fwd_nbr_staple[i] = pinned_malloc(packet_size);
385  back_nbr_staple[i] = pinned_malloc(packet_size);
386  fwd_nbr_staple_sendbuf[i] = pinned_malloc(packet_size);
387  back_nbr_staple_sendbuf[i] = pinned_malloc(packet_size);
388 
389  }
390 }
391 
392 
393 template<typename Float>
394 void exchange_sitelink_diag(int* X, Float** sitelink, Float** ghost_sitelink_diag, int optflag)
395 {
396  /*
397     nu |          |
398        |__________|
399             mu
400 
401  * There are in total 12 different combinations of (nu,mu),
402  * since nu,mu = X,Y,Z,T and nu != mu.
403  * For each combination we need to communicate with the corresponding
404  * neighbor and get the diagonal ghost data.
405  * The neighbor we need to get data from is at dx[nu] = -1, dx[mu] = +1,
406  * and we need to send our data to the neighbor at dx[nu] = +1, dx[mu] = -1.
407  */
408 
409  for(int nu = XUP; nu <=TUP; nu++){
410  for(int mu = XUP; mu <= TUP; mu++){
411  if(nu == mu){
412  continue;
413  }
414  if(optflag && (!commDimPartitioned(mu) || !commDimPartitioned(nu))){
415  continue;
416  }
417 
418  int dir1, dir2; //other two dimensions
419  for(dir1=0; dir1 < 4; dir1 ++){
420  if(dir1 != nu && dir1 != mu){
421  break;
422  }
423  }
424  for(dir2=0; dir2 < 4; dir2 ++){
425  if(dir2 != nu && dir2 != mu && dir2 != dir1){
426  break;
427  }
428  }
429 
430  if(dir1 == 4 || dir2 == 4){
431  errorQuda("Invalid dir1/dir2");
432  }
433  int len = X[dir1] * X[dir2] * gauge_site_size * sizeof(Float);
434  void *sendbuf = safe_malloc(len);
435 
436  pack_gauge_diag(sendbuf, X, (void**)sitelink, nu, mu, dir1, dir2, (QudaPrecision)sizeof(Float));
437 
438  int dx[4] = {0};
439  dx[nu] = -1;
440  dx[mu] = +1;
441  MsgHandle *mh_recv = comm_declare_receive_displaced(ghost_sitelink_diag[nu*4+mu], dx, len);
442  comm_start(mh_recv);
443 
444  dx[nu] = +1;
445  dx[mu] = -1;
446  MsgHandle *mh_send = comm_declare_send_displaced(sendbuf, dx, len);
447  comm_start(mh_send);
448 
449  comm_wait(mh_send);
450  comm_wait(mh_recv);
451 
452  comm_free(mh_send);
453  comm_free(mh_recv);
454 
455  host_free(sendbuf);
456  }
457  }
458 }
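// Added note (not in the original source): on return, ghost_sitelink_diag[nu*4+mu] holds
// the X[dir1]*X[dir2]*gauge_site_size reals received from the neighbor at dx[nu] = -1,
// dx[mu] = +1; the diagonal entries with nu == mu are never touched.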
459 
460 
461 template<typename Float>
462 void
463 exchange_sitelink(int*X, Float** sitelink, Float** ghost_sitelink, Float** ghost_sitelink_diag,
464  Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf, int optflag)
465 {
466 
467  int nFace =1;
468  for(int dir=0; dir < 4; dir++){
469  if(optflag && !commDimPartitioned(dir)) continue;
470  pack_ghost_all_links((void**)sitelink, (void**)sitelink_back_sendbuf, (void**)sitelink_fwd_sendbuf, dir, nFace, (QudaPrecision)(sizeof(Float)), X);
471  }
472 
473  for (int dir = 0; dir < 4; dir++) {
474  if(optflag && !commDimPartitioned(dir)) continue;
475  int len = Vsh[dir] * gauge_site_size * sizeof(Float);
476  Float* ghost_sitelink_back = ghost_sitelink[dir];
477  Float *ghost_sitelink_fwd = ghost_sitelink[dir] + 8 * Vsh[dir] * gauge_site_size;
478 
479  MsgHandle *mh_recv_back;
480  MsgHandle *mh_recv_fwd;
481  MsgHandle *mh_send_fwd;
482  MsgHandle *mh_send_back;
483 
484  mh_recv_back = comm_declare_receive_relative(ghost_sitelink_back, dir, -1, 8*len);
485  mh_recv_fwd = comm_declare_receive_relative(ghost_sitelink_fwd, dir, +1, 8*len);
486  mh_send_fwd = comm_declare_send_relative(sitelink_fwd_sendbuf[dir], dir, +1, 8*len);
487  mh_send_back = comm_declare_send_relative(sitelink_back_sendbuf[dir], dir, -1, 8*len);
488 
489  comm_start(mh_recv_back);
490  comm_start(mh_recv_fwd);
491  comm_start(mh_send_fwd);
492  comm_start(mh_send_back);
493 
494  comm_wait(mh_send_fwd);
495  comm_wait(mh_send_back);
496  comm_wait(mh_recv_back);
497  comm_wait(mh_recv_fwd);
498 
499  comm_free(mh_send_fwd);
500  comm_free(mh_send_back);
501  comm_free(mh_recv_back);
502  comm_free(mh_recv_fwd);
503  }
504 
505  exchange_sitelink_diag(X, sitelink, ghost_sitelink_diag, optflag);
506 }
507 
508 
509 //this function is used for link fattening computation
510 //@optflag: if this flag is set, we only communicate in directions that are partitioned
511 // if not set, then we communicate in all directions regardless of partitioning
512 void exchange_cpu_sitelink(int* X,
513  void** sitelink, void** ghost_sitelink,
514  void** ghost_sitelink_diag,
515  QudaPrecision gPrecision, QudaGaugeParam* param, int optflag)
516 {
517  setup_dims(X);
518  static void* sitelink_fwd_sendbuf[4];
519  static void* sitelink_back_sendbuf[4];
520 
521  for (int i=0; i<4; i++) {
522  int nbytes = 4 * Vs[i] * gauge_site_size * gPrecision;
523  sitelink_fwd_sendbuf[i] = safe_malloc(nbytes);
524  sitelink_back_sendbuf[i] = safe_malloc(nbytes);
525  memset(sitelink_fwd_sendbuf[i], 0, nbytes);
526  memset(sitelink_back_sendbuf[i], 0, nbytes);
527  }
528 
529  if (gPrecision == QUDA_DOUBLE_PRECISION){
530  exchange_sitelink(X, (double**)sitelink, (double**)(ghost_sitelink), (double**)ghost_sitelink_diag,
531  (double**)sitelink_fwd_sendbuf, (double**)sitelink_back_sendbuf, optflag);
532  }else{ //single
533  exchange_sitelink(X, (float**)sitelink, (float**)(ghost_sitelink), (float**)ghost_sitelink_diag,
534  (float**)sitelink_fwd_sendbuf, (float**)sitelink_back_sendbuf, optflag);
535  }
536 
537  for(int i=0;i < 4;i++){
538  host_free(sitelink_fwd_sendbuf[i]);
539  host_free(sitelink_back_sendbuf[i]);
540  }
541 }
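// Usage sketch (added for illustration; buffer names and sizes are inferred from the
// code above rather than from a documented interface):
//
//   int X[4] = {8, 8, 8, 16};                  // hypothetical local lattice
//   void *sitelink[4];                         // QDP-ordered links, filled by the caller
//   void *ghost[4], *ghost_diag[16];
//   for (int i = 0; i < 4; i++) {
//     int face = (X[0]*X[1]*X[2]*X[3]) / X[i]; // 3-d face volume Vs[i]
//     // back+fwd faces of all 4 link directions: 8 * face * gauge_site_size reals
//     ghost[i] = safe_malloc(8 * face * gauge_site_size * sizeof(double));
//   }
//   // ghost_diag[nu*4+mu] needs X[dir1]*X[dir2]*gauge_site_size doubles (see above)
//   exchange_cpu_sitelink(X, sitelink, ghost, ghost_diag, QUDA_DOUBLE_PRECISION, &gauge_param, 0);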
542 
543 
544 #define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \
545  if(src_oddness) src_idx += Vh_ex; \
546  if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2; \
547  if(cpu_order == QUDA_QDP_GAUGE_ORDER) { \
548  for(int linkdir=0; linkdir < 4; linkdir++){ \
549  char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes; \
550  char* dst = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
551  memcpy(dst, src, gaugebytes*(num)); \
552  } \
553  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
554  char* src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \
555  char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes; \
556  memcpy(dst, src, (geom)*gaugebytes*(num)); \
557  } else { \
558  errorQuda("Unsupported gauge order"); \
559  } \
560 
561 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \
562  if(oddness){ \
563  if(commDimPartitioned(dir)){ \
564  src_idx += R[dir]*slice_3d[dir]/2; \
565  }else{ \
566  src_idx += Vh_ex; \
567  } \
568  dst_idx += Vh_ex; \
569  } \
570  if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
571  for(int linkdir=0; linkdir < 4; linkdir++){ \
572  char* src; \
573  if(commDimPartitioned(dir)){ \
574  src = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
575  }else{ \
576  src = ((char*)sitelink[linkdir])+ (src_idx)*gaugebytes; \
577  } \
578  char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes; \
579  memcpy(dst, src, gaugebytes*(num)); \
580  } \
581  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
582  char* src; \
583  if(commDimPartitioned(dir)){ \
584  src=((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes; \
585  }else{ \
586  src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \
587  } \
588  char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes; \
589  memcpy(dst, src, (geom)*gaugebytes*(num)); \
590  } else { \
591  errorQuda("Unsupported gauge order"); \
592  }
593 
594 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \
595  /*even*/ \
596  int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2; \
597  int even_src_idx; \
598  if(commDimPartitioned(dir)){ \
599  even_src_idx = 0; \
600  }else{ \
601  even_src_idx = (src_face*E[2]*E[1]*E[0])/2; \
602  } \
603  /*odd*/ \
604  int odd_dst_idx = even_dst_idx+Vh_ex; \
605  int odd_src_idx; \
606  if(commDimPartitioned(dir)){ \
607  odd_src_idx = R[dir]*slice_3d[dir]/2; \
608  }else{ \
609  odd_src_idx = even_src_idx+Vh_ex; \
610  } \
611  if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
612  for(int linkdir=0; linkdir < 4; linkdir ++){ \
613  char* dst = (char*)sitelink[linkdir]; \
614  char* src; \
615  if(commDimPartitioned(dir)){ \
616  src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \
617  }else{ \
618  src = (char*)sitelink[linkdir]; \
619  } \
620  memcpy(dst + even_dst_idx * gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
621  memcpy(dst + odd_dst_idx * gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
622  } \
623  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
624  char* dst = (char*)sitelink; \
625  char* src; \
626  if(commDimPartitioned(dir)){ \
627  src = (char*)ghost_buf[dir]; \
628  }else{ \
629  src = (char*)sitelink; \
630  } \
631  memcpy(dst+(geom)*even_dst_idx*gaugebytes, src+(geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
632  memcpy(dst+(geom)*odd_dst_idx*gaugebytes, src+(geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
633  } else { \
634  errorQuda("Unsupported gauge order\n"); \
635  }
636 
637 /* This function exchanges the sitelinks and stores them in the corresponding portion of
638  * the extended sitelink memory region
639  * @sitelink: this is stored according to dimension size (X4+R4) * (X1+R1) * (X2+R2) * (X3+R3)
640  */
641 
642 // gauge_site_size
643 
644 void exchange_cpu_sitelink_ex(int* X, int *R, void** sitelink, QudaGaugeFieldOrder cpu_order,
645  QudaPrecision gPrecision, int optflag, int geometry)
646 {
647  int E[4];
648  for (int i=0; i<4; i++) E[i] = X[i] + 2*R[i];
649  int Vh_ex = E[3]*E[2]*E[1]*E[0]/2;
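 // Worked example (added for illustration): for X = {8, 8, 8, 16} and R = {2, 2, 2, 2}
 // the extended dimensions are E = {12, 12, 12, 20}, so Vh_ex = 12*12*12*20/2 = 17280.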
650 
651  //...............x.........y.....z......t
652  int starta[] = {R[3], R[3], R[3], 0};
653  int enda[] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};
654 
655  int startb[] = {R[2], R[2], 0, 0};
656  int endb[] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};
657 
658  int startc[] = {R[1], 0, 0, 0};
659  int endc[] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
660 
661  int f_main[4][4] = {
662  {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
663  {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
664  {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
665  {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
666  };
667 
668  int f_bound[4][4]={
669  {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
670  {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
671  {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
672  {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
673  };
674 
675  int slice_3d[] = { E[3]*E[2]*E[1], E[3]*E[2]*E[0], E[3]*E[1]*E[0], E[2]*E[1]*E[0]};
676  int len[4];
677  for(int i=0; i<4;i++){
678  len[i] = slice_3d[i] * R[i] * geometry * gauge_site_size * gPrecision; // 2 slices, 4 directions' links
679  }
680 
681  void* ghost_sitelink_fwd_sendbuf[4];
682  void* ghost_sitelink_back_sendbuf[4];
683  void* ghost_sitelink_fwd[4];
684  void* ghost_sitelink_back[4];
685 
686  for(int i=0; i<4; i++) {
687  if(!commDimPartitioned(i)) continue;
688  ghost_sitelink_fwd_sendbuf[i] = safe_malloc(len[i]);
689  ghost_sitelink_back_sendbuf[i] = safe_malloc(len[i]);
690  ghost_sitelink_fwd[i] = safe_malloc(len[i]);
691  ghost_sitelink_back[i] = safe_malloc(len[i]);
692  }
693 
694  int gaugebytes = gauge_site_size * gPrecision;
695  int a, b, c,d;
696  for(int dir =0;dir < 4;dir++){
697  if( (!commDimPartitioned(dir)) && optflag) continue;
698  if(commDimPartitioned(dir)){
699  //fill the sendbuf here
700  //back
701  for(d=R[dir]; d < 2*R[dir]; d++)
702  for(a=starta[dir];a < enda[dir]; a++)
703  for(b=startb[dir]; b < endb[dir]; b++)
704 
705  if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
706  for (c=startc[dir]; c < endc[dir]; c++){
707  int oddness = (a+b+c+d)%2;
708  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
709  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;
710 
711  int src_oddness = oddness;
712  int dst_oddness = oddness;
713  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
714  dst_oddness = 1-oddness;
715  }
716 
717  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, 1, dir, geometry);
718 
719  }//c
720  }else{
721  for(int loop=0; loop < 2; loop++){
722  c=startc[dir]+loop;
723  if(c < endc[dir]){
724  int oddness = (a+b+c+d)%2;
725  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
726  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;
727 
728  int src_oddness = oddness;
729  int dst_oddness = oddness;
730  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
731  dst_oddness = 1-oddness;
732  }
733  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir, geometry);
734 
735  }//if c
736  }//for loop
737  }//if
738 
739 
740  //fwd
741  for(d=X[dir]; d < X[dir]+R[dir]; d++) {
742  for(a=starta[dir];a < enda[dir]; a++) {
743  for(b=startb[dir]; b < endb[dir]; b++) {
744 
745  if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
746  for (c=startc[dir]; c < endc[dir]; c++){
747  int oddness = (a+b+c+d)%2;
748  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
749  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
750 
751  int src_oddness = oddness;
752  int dst_oddness = oddness;
753  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
754  dst_oddness = 1-oddness;
755  }
756 
757  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, 1,dir, geometry);
758  }//c
759  }else{
760  for(int loop=0; loop < 2; loop++){
761  c=startc[dir]+loop;
762  if(c < endc[dir]){
763  int oddness = (a+b+c+d)%2;
764  int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
765  int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
766 
767  int src_oddness = oddness;
768  int dst_oddness = oddness;
769  if((X[dir] % 2 ==1) && (commDim(dir) > 1)){ //switch even/odd position
770  dst_oddness = 1-oddness;
771  }
772  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2,dir, geometry);
773  }
774  }//for loop
775  }//if
776 
777  }
778  }
779  }
780 
781  MsgHandle *mh_recv_back;
782  MsgHandle *mh_recv_fwd;
783  MsgHandle *mh_send_fwd;
784  MsgHandle *mh_send_back;
785 
786  mh_recv_back = comm_declare_receive_relative(ghost_sitelink_back[dir], dir, -1, len[dir]);
787  mh_recv_fwd = comm_declare_receive_relative(ghost_sitelink_fwd[dir], dir, +1, len[dir]);
788  mh_send_fwd = comm_declare_send_relative(ghost_sitelink_fwd_sendbuf[dir], dir, +1, len[dir]);
789  mh_send_back = comm_declare_send_relative(ghost_sitelink_back_sendbuf[dir], dir, -1, len[dir]);
790 
791  comm_start(mh_recv_back);
792  comm_start(mh_recv_fwd);
793  comm_start(mh_send_fwd);
794  comm_start(mh_send_back);
795 
796  comm_wait(mh_send_fwd);
797  comm_wait(mh_send_back);
798  comm_wait(mh_recv_back);
799  comm_wait(mh_recv_fwd);
800 
801  comm_free(mh_send_fwd);
802  comm_free(mh_send_back);
803  comm_free(mh_recv_back);
804  comm_free(mh_recv_fwd);
805 
806  }//if
807 
808  //use the messages to fill the sitelink data
809  //back
810  if (dir < 3 ) {
811 
812  for(d=0; d < R[dir]; d++) {
813  for(a=starta[dir];a < enda[dir]; a++) {
814  for(b=startb[dir]; b < endb[dir]; b++) {
815 
816  if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
817  for (c=startc[dir]; c < endc[dir]; c++){
818  int oddness = (a+b+c+d)%2;
819  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
820  int src_idx;
821  if(commDimPartitioned(dir)){
822  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
823  }else{
824  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
825  }
826 
827  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, 1, dir, geometry);
828 
829  }//c
830  }else{
831  //optimized copy
832  //first half: startc[dir] -> endc[dir] with step=2
833 
834  for(int loop =0;loop <2;loop++){
835  int c=startc[dir]+loop;
836  if(c < endc[dir]){
837  int oddness = (a+b+c+d)%2;
838  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
839  int src_idx;
840  if(commDimPartitioned(dir)){
841  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
842  }else{
843  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
844  }
845 
846  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, (endc[dir]-c+1)/2, dir, geometry);
847 
848  }//if c
849  }//for loop
850  }//if
851 
852  }
853  }
854  }
855 
856  }else{
857  //when dir == 3 (T direction), the data layout in sitelink and in the message is the same, so we can do large copies
858 
859  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0, X[3], dir, geometry)
860  }//if
861 
862  //fwd
863  if( dir < 3 ){
864 
865  for(d=X[dir]+R[dir]; d < X[dir]+2*R[dir]; d++) {
866  for(a=starta[dir];a < enda[dir]; a++) {
867  for(b=startb[dir]; b < endb[dir]; b++) {
868 
869  if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
870  for (c=startc[dir]; c < endc[dir]; c++){
871  int oddness = (a+b+c+d)%2;
872  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
873  int src_idx;
874  if(commDimPartitioned(dir)){
875  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
876  }else{
877  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
878  }
879 
880  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, 1, dir, geometry);
881 
882  }//c
883  }else{
884  for(int loop =0; loop < 2; loop++){
885  //for (c=startc[dir]; c < endc[dir]; c++){
886  c=startc[dir] + loop;
887  if(c < endc[dir]){
888  int oddness = (a+b+c+d)%2;
889  int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
890  int src_idx;
891  if(commDimPartitioned(dir)){
892  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
893  }else{
894  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
895  }
896  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, (endc[dir]-c+1)/2, dir, geometry);
897  }//if
898  }//for loop
899  }//if
900 
901  }
902  }
903  }
904 
905 
906  } else {
907 
908  //when dir == 3 (T direction), the data layout in sitelink and in the message is the same, so we can do large copies
909  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (X[3]+R[3]), 2, dir, geometry) // TESTME 2
910 
911  }//if
912 
913  }//dir for loop
914 
915 
916  for(int dir=0;dir < 4;dir++){
917  if(!commDimPartitioned(dir)) continue;
918  host_free(ghost_sitelink_fwd_sendbuf[dir]);
919  host_free(ghost_sitelink_back_sendbuf[dir]);
920  host_free(ghost_sitelink_fwd[dir]);
921  host_free(ghost_sitelink_back[dir]);
922  }
923 
924 }
925 
926 
927 
928 template<typename Float>
929 void
930 do_exchange_cpu_staple(Float* staple, Float** ghost_staple, Float** staple_fwd_sendbuf, Float** staple_back_sendbuf, int* X)
931 {
932 
933  int nFace =1;
934  pack_ghost_all_staples_cpu(staple, (void**)staple_back_sendbuf,
935  (void**)staple_fwd_sendbuf, nFace, (QudaPrecision)(sizeof(Float)), X);
936 
937 
938  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};
939  size_t len[4] = {Vsh_x * gauge_site_size * sizeof(Float), Vsh_y * gauge_site_size * sizeof(Float),
940  Vsh_z * gauge_site_size * sizeof(Float), Vsh_t * gauge_site_size * sizeof(Float)};
941 
942  for (int dir=0;dir < 4; dir++) {
943 
944  Float *ghost_staple_back = ghost_staple[dir];
945  Float *ghost_staple_fwd = ghost_staple[dir] + 2 * Vsh[dir] * gauge_site_size;
946 
947  MsgHandle *mh_recv_back;
948  MsgHandle *mh_recv_fwd;
949  MsgHandle *mh_send_fwd;
950  MsgHandle *mh_send_back;
951 
952  mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2*len[dir]);
953  mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2*len[dir]);
954  mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2*len[dir]);
955  mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2*len[dir]);
956 
957  comm_start(mh_recv_back);
958  comm_start(mh_recv_fwd);
959  comm_start(mh_send_fwd);
960  comm_start(mh_send_back);
961 
962  comm_wait(mh_send_fwd);
963  comm_wait(mh_send_back);
964  comm_wait(mh_recv_back);
965  comm_wait(mh_recv_fwd);
966 
967  comm_free(mh_send_fwd);
968  comm_free(mh_send_back);
969  comm_free(mh_recv_back);
970  comm_free(mh_recv_fwd);
971  }
972 }
973 
974 
975 //this function is used for link fattening computation
976 void exchange_cpu_staple(int* X, void* staple, void** ghost_staple, QudaPrecision gPrecision)
977 {
978  setup_dims(X);
979 
980  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
981  void *staple_fwd_sendbuf[4];
982  void *staple_back_sendbuf[4];
983 
984  for(int i=0;i < 4; i++){
985  staple_fwd_sendbuf[i] = safe_malloc(Vs[i] * gauge_site_size * gPrecision);
986  staple_back_sendbuf[i] = safe_malloc(Vs[i] * gauge_site_size * gPrecision);
987  }
988 
989  if (gPrecision == QUDA_DOUBLE_PRECISION) {
990  do_exchange_cpu_staple((double*)staple, (double**)ghost_staple,
991  (double**)staple_fwd_sendbuf, (double**)staple_back_sendbuf, X);
992  } else { //single
993  do_exchange_cpu_staple((float*)staple, (float**)ghost_staple,
994  (float**)staple_fwd_sendbuf, (float**)staple_back_sendbuf, X);
995  }
996 
997  for (int i=0;i < 4;i++) {
998  host_free(staple_fwd_sendbuf[i]);
999  host_free(staple_back_sendbuf[i]);
1000  }
1001 }
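// Usage sketch (added for illustration, not a documented interface): each ghost buffer
// must hold the back and forward one-deep staple faces, i.e. 2 * Vs[i] * gauge_site_size reals:
//
//   void *ghost_staple[4];
//   for (int i = 0; i < 4; i++) {
//     int face = (X[0]*X[1]*X[2]*X[3]) / X[i];   // 3-d face volume Vs[i]
//     ghost_staple[i] = safe_malloc(2 * face * gauge_site_size * sizeof(double));
//   }
//   exchange_cpu_staple(X, staple, ghost_staple, QUDA_DOUBLE_PRECISION);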
1002 
1003 void exchange_llfat_cleanup(void)
1004 {
1005  for (int i=0; i<4; i++) {
1006 
1007  if(fwd_nbr_staple[i]){
1008  host_free(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
1009  }
1010  if(back_nbr_staple[i]){
1011  host_free(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
1012  }
1013  if(fwd_nbr_staple_sendbuf[i]){
1014  host_free(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
1015  }
1016  if(back_nbr_staple_sendbuf[i]){
1017  host_free(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
1018  }
1019 
1020  }
1021 }
1022 
1023 #undef gauge_site_size