// face_gauge.cpp (from QUDA v0.7.0 -- "A library for QCD on GPUs")
#include <cstdio>
#include <cstdlib>
#include <string.h>
#include <sys/time.h>

#include <quda_internal.h>
#include <comm_quda.h>
#include <fat_force_quda.h>
#include <face_quda.h>

using namespace quda;

extern cudaStream_t *stream;

/**************************************************************
 * Staple exchange routines used in the fat-link computation
 **************************************************************/
//#ifndef CLOVER_FORCE
//#define CLOVER_FORCE
//#endif

#if defined(MULTI_GPU) && (defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(CLOVER_FORCE)) || defined(GPU_CLOVER_DIRAC)

enum {
  XUP = 0,
  YUP = 1,
  ZUP = 2,
  TUP = 3,
  TDOWN = 4,
  ZDOWN = 5,
  YDOWN = 6,
  XDOWN = 7
};

#define gaugeSiteSize 18
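// 18 reals per site per link: one 3x3 complex color matrix = 9 complex = 18 real numbers.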

#ifndef GPU_DIRECT
static void* fwd_nbr_staple_cpu[4];
static void* back_nbr_staple_cpu[4];
static void* fwd_nbr_staple_sendbuf_cpu[4];
static void* back_nbr_staple_sendbuf_cpu[4];
#endif

static void* fwd_nbr_staple_gpu[4];
static void* back_nbr_staple_gpu[4];

static void* fwd_nbr_staple[4];
static void* back_nbr_staple[4];
static void* fwd_nbr_staple_sendbuf[4];
static void* back_nbr_staple_sendbuf[4];

static int dims[4];
static int X1, X2, X3, X4;
static int V;
static int volumeCB;
static int Vs[4], Vsh[4];
static int Vs_x, Vs_y, Vs_z, Vs_t;
static int Vsh_x, Vsh_y, Vsh_z, Vsh_t;

static struct {
  MsgHandle *fwd[4];
  MsgHandle *back[4];
} llfat_recv, llfat_send;

#include "gauge_field.h"
extern void setup_dims_in_gauge(int *XX);

static void setup_dims(int* X)
{
  V = 1;
  for (int d=0; d<4; d++) {
    V *= X[d];
    dims[d] = X[d];
  }
  volumeCB = V/2;

  X1 = X[0];
  X2 = X[1];
  X3 = X[2];
  X4 = X[3];

  Vs[0] = Vs_x = X[1]*X[2]*X[3];
  Vs[1] = Vs_y = X[0]*X[2]*X[3];
  Vs[2] = Vs_z = X[0]*X[1]*X[3];
  Vs[3] = Vs_t = X[0]*X[1]*X[2];

  Vsh[0] = Vsh_x = Vs_x/2;
  Vsh[1] = Vsh_y = Vs_y/2;
  Vsh[2] = Vsh_z = Vs_z/2;
  Vsh[3] = Vsh_t = Vs_t/2;
}
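// Worked example (illustrative numbers, not from the original file): for a local
// lattice X = {8,8,8,16}: V = 8192 sites, volumeCB = 4096, the x-face has
// Vs_x = 8*8*16 = 1024 sites and Vsh_x = 512 per parity; Vs_t = 8*8*8 = 512, Vsh_t = 256.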


void exchange_llfat_init(QudaPrecision prec)
{
  static bool initialized = false;

  if (initialized) return;
  initialized = true;

  for (int i=0; i < 4; i++) {

    size_t packet_size = Vs[i]*gaugeSiteSize*prec;

    fwd_nbr_staple_gpu[i] = device_malloc(packet_size);
    back_nbr_staple_gpu[i] = device_malloc(packet_size);

    fwd_nbr_staple[i] = pinned_malloc(packet_size);
    back_nbr_staple[i] = pinned_malloc(packet_size);
    fwd_nbr_staple_sendbuf[i] = pinned_malloc(packet_size);
    back_nbr_staple_sendbuf[i] = pinned_malloc(packet_size);

#ifndef GPU_DIRECT
    fwd_nbr_staple_cpu[i] = safe_malloc(packet_size);
    back_nbr_staple_cpu[i] = safe_malloc(packet_size);
    fwd_nbr_staple_sendbuf_cpu[i] = safe_malloc(packet_size);
    back_nbr_staple_sendbuf_cpu[i] = safe_malloc(packet_size);
#endif

  }
}


template<typename Float>
void exchange_sitelink_diag(int* X, Float** sitelink, Float** ghost_sitelink_diag, int optflag)
{
  /*
     nu |          |
        |__________|
            mu

   * There are in total 12 different (nu,mu) combinations,
   * since nu and mu each run over X,Y,Z,T and nu != mu.
   * For each combination we need to communicate with the corresponding
   * neighbor to obtain the diagonal ghost data.
   * The neighbor we receive data from sits at dx[nu]=-1, dx[mu]=+1,
   * and we send our data to the neighbor at dx[nu]=+1, dx[mu]=-1.
   */
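  /* Example (illustrative): for (nu,mu) = (YUP,XUP) we receive from the neighbor at
   * displacement dx = {+1,-1,0,0} (forward in x, back in y) and send to the neighbor
   * at dx = {-1,+1,0,0}; dir1 and dir2 then pick out the remaining z and t dimensions. */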

  for(int nu = XUP; nu <= TUP; nu++){
    for(int mu = XUP; mu <= TUP; mu++){
      if(nu == mu){
        continue;
      }
      if(optflag && (!commDimPartitioned(mu) || !commDimPartitioned(nu))){
        continue;
      }

      int dir1, dir2; // the other two dimensions
      for(dir1=0; dir1 < 4; dir1++){
        if(dir1 != nu && dir1 != mu){
          break;
        }
      }
      for(dir2=0; dir2 < 4; dir2++){
        if(dir2 != nu && dir2 != mu && dir2 != dir1){
          break;
        }
      }

      if(dir1 == 4 || dir2 == 4){
        errorQuda("Invalid dir1/dir2");
      }
      int len = X[dir1]*X[dir2]*gaugeSiteSize*sizeof(Float);
      void *sendbuf = safe_malloc(len);

      pack_gauge_diag(sendbuf, X, (void**)sitelink, nu, mu, dir1, dir2, (QudaPrecision)sizeof(Float));

      int dx[4] = {0};
      dx[nu] = -1;
      dx[mu] = +1;
      MsgHandle *mh_recv = comm_declare_receive_displaced(ghost_sitelink_diag[nu*4+mu], dx, len);
      comm_start(mh_recv);

      dx[nu] = +1;
      dx[mu] = -1;
      MsgHandle *mh_send = comm_declare_send_displaced(sendbuf, dx, len);
      comm_start(mh_send);

      comm_wait(mh_send);
      comm_wait(mh_recv);

      comm_free(mh_send);
      comm_free(mh_recv);

      host_free(sendbuf);
    }
  }
}


template<typename Float>
void exchange_sitelink(int* X, Float** sitelink, Float** ghost_sitelink, Float** ghost_sitelink_diag,
                       Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf, int optflag)
{

#if 0
  int i;
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);
  for(i=0; i < 4; i++){
    Float* even_sitelink_back_src = sitelink[i];
    Float* odd_sitelink_back_src = sitelink[i] + volumeCB*gaugeSiteSize;
    Float* sitelink_back_dst = sitelink_back_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;

    if(dims[3] % 2 == 0){
      memcpy(sitelink_back_dst, even_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, odd_sitelink_back_src, len);
    }else{
      // switch the odd and even ghost sitelinks
      memcpy(sitelink_back_dst, odd_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, even_sitelink_back_src, len);
    }
  }

  for(i=0; i < 4; i++){
    Float* even_sitelink_fwd_src = sitelink[i] + (volumeCB - Vsh_t)*gaugeSiteSize;
    Float* odd_sitelink_fwd_src = sitelink[i] + volumeCB*gaugeSiteSize + (volumeCB - Vsh_t)*gaugeSiteSize;
    Float* sitelink_fwd_dst = sitelink_fwd_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;
    if(dims[3] % 2 == 0){
      memcpy(sitelink_fwd_dst, even_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, odd_sitelink_fwd_src, len);
    }else{
      // switch the odd and even ghost sitelinks
      memcpy(sitelink_fwd_dst, odd_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, even_sitelink_fwd_src, len);
    }
  }
#else
  int nFace = 1;
  for(int dir=0; dir < 4; dir++){
    if(optflag && !commDimPartitioned(dir)) continue;
    pack_ghost_all_links((void**)sitelink, (void**)sitelink_back_sendbuf, (void**)sitelink_fwd_sendbuf, dir, nFace, (QudaPrecision)(sizeof(Float)), X);
  }
#endif

  for (int dir = 0; dir < 4; dir++) {
    if(optflag && !commDimPartitioned(dir)) continue;
    int len = Vsh[dir]*gaugeSiteSize*sizeof(Float);
    Float* ghost_sitelink_back = ghost_sitelink[dir];
    Float* ghost_sitelink_fwd = ghost_sitelink[dir] + 8*Vsh[dir]*gaugeSiteSize;

    MsgHandle *mh_recv_back;
    MsgHandle *mh_recv_fwd;
    MsgHandle *mh_send_fwd;
    MsgHandle *mh_send_back;

    mh_recv_back = comm_declare_receive_relative(ghost_sitelink_back, dir, -1, 8*len);
    mh_recv_fwd = comm_declare_receive_relative(ghost_sitelink_fwd, dir, +1, 8*len);
    mh_send_fwd = comm_declare_send_relative(sitelink_fwd_sendbuf[dir], dir, +1, 8*len);
    mh_send_back = comm_declare_send_relative(sitelink_back_sendbuf[dir], dir, -1, 8*len);

    comm_start(mh_recv_back);
    comm_start(mh_recv_fwd);
    comm_start(mh_send_fwd);
    comm_start(mh_send_back);

    comm_wait(mh_send_fwd);
    comm_wait(mh_send_back);
    comm_wait(mh_recv_back);
    comm_wait(mh_recv_fwd);

    comm_free(mh_send_fwd);
    comm_free(mh_send_back);
    comm_free(mh_recv_back);
    comm_free(mh_recv_fwd);
  }

  exchange_sitelink_diag(X, sitelink, ghost_sitelink_diag, optflag);
}


// This function is used in the link-fattening computation.
// @optflag: if this flag is set, we only communicate in directions that are partitioned;
//           if not set, we communicate in all directions regardless of partitioning.
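/* Hedged usage sketch (illustrative, not from the original file): each per-face
 * ghost buffer must hold 8*Vs[d] site-links (8 = 4 link directions x 2 parities,
 * matching the 8*len messages in exchange_sitelink above), and each diagonal
 * buffer ghost_sitelink_diag[nu*4+mu] holds X[dir1]*X[dir2] site-links:
 *
 *   void *ghost[4], *ghost_diag[16];
 *   for (int d=0; d<4; d++)
 *     ghost[d] = safe_malloc(8*Vs[d]*gaugeSiteSize*sizeof(double));
 *   // ... allocate ghost_diag[nu*4+mu] for each nu != mu ...
 *   exchange_cpu_sitelink(X, sitelink, ghost, ghost_diag, QUDA_DOUBLE_PRECISION, &param, 1);
 */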
void exchange_cpu_sitelink(int* X,
                           void** sitelink, void** ghost_sitelink,
                           void** ghost_sitelink_diag,
                           QudaPrecision gPrecision, QudaGaugeParam* param, int optflag)
{
  setup_dims(X);
  static void* sitelink_fwd_sendbuf[4];
  static void* sitelink_back_sendbuf[4];
  static bool allocated = false;

  if (!allocated) {
    for (int i=0; i<4; i++) {
      int nbytes = 4*Vs[i]*gaugeSiteSize*gPrecision;
      sitelink_fwd_sendbuf[i] = safe_malloc(nbytes);
      sitelink_back_sendbuf[i] = safe_malloc(nbytes);
      memset(sitelink_fwd_sendbuf[i], 0, nbytes);
      memset(sitelink_back_sendbuf[i], 0, nbytes);
    }
    allocated = true;
  }

  if (gPrecision == QUDA_DOUBLE_PRECISION){
    exchange_sitelink(X, (double**)sitelink, (double**)(ghost_sitelink), (double**)ghost_sitelink_diag,
                      (double**)sitelink_fwd_sendbuf, (double**)sitelink_back_sendbuf, optflag);
  }else{ // single
    exchange_sitelink(X, (float**)sitelink, (float**)(ghost_sitelink), (float**)ghost_sitelink_diag,
                      (float**)sitelink_fwd_sendbuf, (float**)sitelink_back_sendbuf, optflag);
  }

  if (!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_BUFFER)) {
    for(int i=0; i < 4; i++){
      host_free(sitelink_fwd_sendbuf[i]);
      host_free(sitelink_back_sendbuf[i]);
    }
    allocated = false;
  }
}


#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \
  if(src_oddness) src_idx += Vh_ex; \
  if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2; \
  if(cpu_order == QUDA_QDP_GAUGE_ORDER) { \
    for(int linkdir=0; linkdir < 4; linkdir++){ \
      char* src = (char*)sitelink[linkdir] + (src_idx)*gaugebytes; \
      char* dst = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num)); \
    } \
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
    char* src = ((char*)sitelink) + (geom)*(src_idx)*gaugebytes; \
    char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes; \
    memcpy(dst, src, (geom)*gaugebytes*(num)); \
  } else { \
    errorQuda("Unsupported gauge order"); \
  }

#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \
  if(oddness){ \
    if(commDimPartitioned(dir)){ \
      src_idx += R[dir]*slice_3d[dir]/2; \
    }else{ \
      src_idx += Vh_ex; \
    } \
    dst_idx += Vh_ex; \
  } \
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
    for(int linkdir=0; linkdir < 4; linkdir++){ \
      char* src; \
      if(commDimPartitioned(dir)){ \
        src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
      }else{ \
        src = ((char*)sitelink[linkdir]) + (src_idx)*gaugebytes; \
      } \
      char* dst = (char*)sitelink[linkdir] + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num)); \
    } \
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
    char* src; \
    if(commDimPartitioned(dir)){ \
      src = ((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes; \
    }else{ \
      src = ((char*)sitelink) + (geom)*(src_idx)*gaugebytes; \
    } \
    char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes; \
    memcpy(dst, src, (geom)*gaugebytes*(num)); \
  } else { \
    errorQuda("Unsupported gauge order"); \
  }

#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \
  /* even */ \
  int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2; \
  int even_src_idx; \
  if(commDimPartitioned(dir)){ \
    even_src_idx = 0; \
  }else{ \
    even_src_idx = (src_face*E[2]*E[1]*E[0])/2; \
  } \
  /* odd */ \
  int odd_dst_idx = even_dst_idx + Vh_ex; \
  int odd_src_idx; \
  if(commDimPartitioned(dir)){ \
    odd_src_idx = R[dir]*slice_3d[dir]/2; \
  }else{ \
    odd_src_idx = even_src_idx + Vh_ex; \
  } \
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
    for(int linkdir=0; linkdir < 4; linkdir++){ \
      char* dst = (char*)sitelink[linkdir]; \
      char* src; \
      if(commDimPartitioned(dir)){ \
        src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \
      }else{ \
        src = (char*)sitelink[linkdir]; \
      } \
      memcpy(dst + even_dst_idx*gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
      memcpy(dst + odd_dst_idx*gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
    } \
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
    char* dst = (char*)sitelink; \
    char* src; \
    if(commDimPartitioned(dir)){ \
      src = (char*)ghost_buf[dir]; \
    }else{ \
      src = (char*)sitelink; \
    } \
    memcpy(dst + (geom)*even_dst_idx*gaugebytes, src + (geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
    memcpy(dst + (geom)*odd_dst_idx*gaugebytes, src + (geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
  } else { \
    errorQuda("Unsupported gauge order"); \
  }

/* This function exchanges the sitelinks and stores them in the corresponding portion
 * of the extended sitelink memory region.
 * @sitelink: this is stored according to dimension size (X4+R4) * (X1+R1) * (X2+R2) * (X3+R3)
 */

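/* Worked example (illustrative numbers, not from the original file): with local dims
 * X = {8,8,8,16} and halo widths R = {2,2,2,2}, the extended dims are
 * E[i] = X[i] + 2*R[i] = {12,12,12,20}, giving Vh_ex = 12*12*12*20/2 = 17280
 * sites per checkerboard half of the extended lattice. */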
void exchange_cpu_sitelink_ex(int* X, int* R, void** sitelink, QudaGaugeFieldOrder cpu_order,
                              QudaPrecision gPrecision, int optflag, int geometry)
{
  int E[4];
  for (int i=0; i<4; i++) E[i] = X[i] + 2*R[i];
  int Vh_ex = E[3]*E[2]*E[1]*E[0]/2;

  //...............x.........y.....z......t
  int starta[] = {R[3], R[3], R[3], 0};
  int enda[] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};

  int startb[] = {R[2], R[2], 0, 0};
  int endb[] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};

  int startc[] = {R[1], 0, 0, 0};
  int endc[] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};

  int f_main[4][4] = {
    {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
    {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
    {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
    {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
  };

  int f_bound[4][4] = {
    {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
    {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
    {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
    {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
  };

  int slice_3d[] = {E[3]*E[2]*E[1], E[3]*E[2]*E[0], E[3]*E[1]*E[0], E[2]*E[1]*E[0]};
  int len[4];
  for(int i=0; i<4; i++){
    len[i] = slice_3d[i] * R[i] * geometry*gaugeSiteSize*gPrecision; // R[i] slices, geometry links per site
  }

  void* ghost_sitelink_fwd_sendbuf[4];
  void* ghost_sitelink_back_sendbuf[4];
  void* ghost_sitelink_fwd[4];
  void* ghost_sitelink_back[4];

  for(int i=0; i<4; i++) {
    if(!commDimPartitioned(i)) continue;
    ghost_sitelink_fwd_sendbuf[i] = safe_malloc(len[i]);
    ghost_sitelink_back_sendbuf[i] = safe_malloc(len[i]);
    ghost_sitelink_fwd[i] = safe_malloc(len[i]);
    ghost_sitelink_back[i] = safe_malloc(len[i]);
  }

  int gaugebytes = gaugeSiteSize*gPrecision;
  int a, b, c, d;
  for(int dir=0; dir < 4; dir++){
    if((!commDimPartitioned(dir)) && optflag) continue;
    if(commDimPartitioned(dir)){
      // fill the sendbuf here
      // back
      for(d=R[dir]; d < 2*R[dir]; d++)
        for(a=starta[dir]; a < enda[dir]; a++)
          for(b=startb[dir]; b < endb[dir]; b++)

            if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
              for(c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3]) >> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 == 1) && (commDim(dir) > 1)){ // switch even/odd position
                  dst_oddness = 1 - oddness;
                }

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, 1, dir, geometry);

              }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                c = startc[dir] + loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3]) >> 1;

                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if((X[dir] % 2 == 1) && (commDim(dir) > 1)){ // switch even/odd position
                    dst_oddness = 1 - oddness;
                  }
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir, geometry);

                }//if c
              }//for loop
            }//if


      // fwd
      for(d=X[dir]; d < X[dir]+R[dir]; d++) {
        for(a=starta[dir]; a < enda[dir]; a++) {
          for(b=startb[dir]; b < endb[dir]; b++) {

            if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
              for(c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3]) >> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 == 1) && (commDim(dir) > 1)){ // switch even/odd position
                  dst_oddness = 1 - oddness;
                }

                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, 1, dir, geometry);
              }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                c = startc[dir] + loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int dst_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3]) >> 1;

                  int src_oddness = oddness;
                  int dst_oddness = oddness;
                  if((X[dir] % 2 == 1) && (commDim(dir) > 1)){ // switch even/odd position
                    dst_oddness = 1 - oddness;
                  }
                  MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir, geometry);
                }
              }//for loop
            }//if

          }
        }
      }

      MsgHandle *mh_recv_back;
      MsgHandle *mh_recv_fwd;
      MsgHandle *mh_send_fwd;
      MsgHandle *mh_send_back;

      mh_recv_back = comm_declare_receive_relative(ghost_sitelink_back[dir], dir, -1, len[dir]);
      mh_recv_fwd = comm_declare_receive_relative(ghost_sitelink_fwd[dir], dir, +1, len[dir]);
      mh_send_fwd = comm_declare_send_relative(ghost_sitelink_fwd_sendbuf[dir], dir, +1, len[dir]);
      mh_send_back = comm_declare_send_relative(ghost_sitelink_back_sendbuf[dir], dir, -1, len[dir]);

      comm_start(mh_recv_back);
      comm_start(mh_recv_fwd);
      comm_start(mh_send_fwd);
      comm_start(mh_send_back);

      comm_wait(mh_send_fwd);
      comm_wait(mh_send_back);
      comm_wait(mh_recv_back);
      comm_wait(mh_recv_fwd);

      comm_free(mh_send_fwd);
      comm_free(mh_send_back);
      comm_free(mh_recv_back);
      comm_free(mh_recv_fwd);

    }//if

    // use the received messages to fill the sitelink data
    // back
    if (dir < 3) {

      for(d=0; d < R[dir]; d++) {
        for(a=starta[dir]; a < enda[dir]; a++) {
          for(b=startb[dir]; b < endb[dir]; b++) {

            if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
              for(c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + d*f_bound[dir][3]) >> 1;
                }else{
                  src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d+X[dir])*f_main[dir][3]) >> 1;
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, 1, dir, geometry);

              }//c
            }else{
              // optimized copy
              // first half: startc[dir] -> endc[dir] with step=2

              for(int loop=0; loop < 2; loop++){
                int c = startc[dir] + loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int src_idx;
                  if(commDimPartitioned(dir)){
                    src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + d*f_bound[dir][3]) >> 1;
                  }else{
                    src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d+X[dir])*f_main[dir][3]) >> 1;
                  }

                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, (endc[dir]-c+1)/2, dir, geometry);

                }//if c
              }//for loop
            }//if

          }
        }
      }

    }else{
      // when dir == 3 (T direction), the data layout in sitelink and in the message
      // is the same, so we can do large copies
      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0, X[3], dir, geometry)
    }//if

    // fwd
    if(dir < 3){

      for(d=X[dir]+R[dir]; d < X[dir]+2*R[dir]; d++) {
        for(a=starta[dir]; a < enda[dir]; a++) {
          for(b=startb[dir]; b < endb[dir]; b++) {

            if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
              for(c=startc[dir]; c < endc[dir]; c++){
                int oddness = (a+b+c+d)%2;
                int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3]) >> 1;
                }else{
                  src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d-X[dir])*f_main[dir][3]) >> 1;
                }

                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, 1, dir, geometry);

              }//c
            }else{
              for(int loop=0; loop < 2; loop++){
                //for (c=startc[dir]; c < endc[dir]; c++){
                c = startc[dir] + loop;
                if(c < endc[dir]){
                  int oddness = (a+b+c+d)%2;
                  int dst_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3]) >> 1;
                  int src_idx;
                  if(commDimPartitioned(dir)){
                    src_idx = (a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3]) >> 1;
                  }else{
                    src_idx = (a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d-X[dir])*f_main[dir][3]) >> 1;
                  }
                  MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, (endc[dir]-c+1)/2, dir, geometry);
                }//if
              }//for loop
            }//if

          }
        }
      }

    } else {

      // when dir == 3 (T direction), the data layout in sitelink and in the message
      // is the same, so we can do large copies
      MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (X[3]+R[3]), 2, dir, geometry) // TESTME 2

    }//if

  }//dir for loop


  for(int dir=0; dir < 4; dir++){
    if(!commDimPartitioned(dir)) continue;
    host_free(ghost_sitelink_fwd_sendbuf[dir]);
    host_free(ghost_sitelink_back_sendbuf[dir]);
    host_free(ghost_sitelink_fwd[dir]);
    host_free(ghost_sitelink_back[dir]);
  }

}



template<typename Float>
void do_exchange_cpu_staple(Float* staple, Float** ghost_staple, Float** staple_fwd_sendbuf, Float** staple_back_sendbuf, int* X)
{

#if 0
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);
  Float* even_staple_back_src = staple;
  Float* odd_staple_back_src = staple + volumeCB*gaugeSiteSize;
  Float* staple_back_dst = staple_back_sendbuf[3];

  if(dims[3] % 2 == 0){
    memcpy(staple_back_dst, even_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, odd_staple_back_src, len);
  }else{
    // switch the odd and even ghost staples
    memcpy(staple_back_dst, odd_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, even_staple_back_src, len);
  }

  Float* even_staple_fwd_src = staple + (volumeCB - Vsh_t)*gaugeSiteSize;
  Float* odd_staple_fwd_src = staple + volumeCB*gaugeSiteSize + (volumeCB - Vsh_t)*gaugeSiteSize;
  Float* staple_fwd_dst = staple_fwd_sendbuf[3];
  if(dims[3] % 2 == 0){
    memcpy(staple_fwd_dst, even_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, odd_staple_fwd_src, len);
  }else{
    // switch the odd and even ghost staples
    memcpy(staple_fwd_dst, odd_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, even_staple_fwd_src, len);
  }
#else
  int nFace = 1;
  pack_ghost_all_staples_cpu(staple, (void**)staple_back_sendbuf,
                             (void**)staple_fwd_sendbuf, nFace, (QudaPrecision)(sizeof(Float)), X);

#endif
743  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};
744  size_t len[4] = {
745  Vsh_x*gaugeSiteSize*sizeof(Float),
746  Vsh_y*gaugeSiteSize*sizeof(Float),
747  Vsh_z*gaugeSiteSize*sizeof(Float),
748  Vsh_t*gaugeSiteSize*sizeof(Float)
749  };
750 
751  for (int dir=0;dir < 4; dir++) {
752 
753  Float *ghost_staple_back = ghost_staple[dir];
754  Float *ghost_staple_fwd = ghost_staple[dir] + 2*Vsh[dir]*gaugeSiteSize;
755 
756  MsgHandle *mh_recv_back;
757  MsgHandle *mh_recv_fwd;
758  MsgHandle *mh_send_fwd;
759  MsgHandle *mh_send_back;
760 
761  mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2*len[dir]);
762  mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2*len[dir]);
763  mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2*len[dir]);
764  mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2*len[dir]);
765 
766  comm_start(mh_recv_back);
767  comm_start(mh_recv_fwd);
768  comm_start(mh_send_fwd);
769  comm_start(mh_send_back);
770 
771  comm_wait(mh_send_fwd);
772  comm_wait(mh_send_back);
773  comm_wait(mh_recv_back);
774  comm_wait(mh_recv_fwd);
775 
776  comm_free(mh_send_fwd);
777  comm_free(mh_send_back);
778  comm_free(mh_recv_back);
779  comm_free(mh_recv_fwd);
780  }
781 }
782 
783 
// This function is used in the link-fattening computation.
void exchange_cpu_staple(int* X, void* staple, void** ghost_staple, QudaPrecision gPrecision)
{
  setup_dims(X);

  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
  void *staple_fwd_sendbuf[4];
  void *staple_back_sendbuf[4];

  for(int i=0; i < 4; i++){
    staple_fwd_sendbuf[i] = safe_malloc(Vs[i]*gaugeSiteSize*gPrecision);
    staple_back_sendbuf[i] = safe_malloc(Vs[i]*gaugeSiteSize*gPrecision);
  }

  if (gPrecision == QUDA_DOUBLE_PRECISION) {
    do_exchange_cpu_staple((double*)staple, (double**)ghost_staple,
                           (double**)staple_fwd_sendbuf, (double**)staple_back_sendbuf, X);
  } else { // single
    do_exchange_cpu_staple((float*)staple, (float**)ghost_staple,
                           (float**)staple_fwd_sendbuf, (float**)staple_back_sendbuf, X);
  }

  for (int i=0; i < 4; i++) {
    host_free(staple_fwd_sendbuf[i]);
    host_free(staple_back_sendbuf[i]);
  }
}
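
/* Hedged usage sketch (illustrative only): each ghost_staple[d] must hold the back
 * and forward faces contiguously, i.e. 2*Vs[d] sites, since the 2*len[dir] messages
 * in do_exchange_cpu_staple land back-to-back:
 *
 *   void *ghost_staple[4];
 *   for (int d = 0; d < 4; d++)
 *     ghost_staple[d] = safe_malloc(2*Vs[d]*gaugeSiteSize*sizeof(double));
 *   exchange_cpu_staple(X, staple, ghost_staple, QUDA_DOUBLE_PRECISION);
 */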

// @whichway indicates the send direction
void exchange_gpu_staple_start(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t* stream)
{
  setup_dims(X);

  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;
  exchange_llfat_init(cudaStaple->Precision());


  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volumeCB = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  packGhostStaple(X, even, odd, volumeCB, prec, stride,
                  dir, whichway, fwd_nbr_staple_gpu, back_nbr_staple_gpu,
                  fwd_nbr_staple_sendbuf, back_nbr_staple_sendbuf, stream);
}


void exchange_gpu_staple_comms(int* X, void* _cudaStaple, int dim, int send_dir, cudaStream_t* stream)
{
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;
  QudaPrecision prec = cudaStaple->Precision();

  cudaStreamSynchronize(*stream);

  int recv_dir = (send_dir == QUDA_BACKWARDS) ? QUDA_FORWARDS : QUDA_BACKWARDS;

  int len = Vs[dim]*gaugeSiteSize*prec;

  if (recv_dir == QUDA_BACKWARDS) {

#ifdef GPU_DIRECT
    llfat_recv.back[dim] = comm_declare_receive_relative(back_nbr_staple[dim], dim, -1, len);
    llfat_send.fwd[dim] = comm_declare_send_relative(fwd_nbr_staple_sendbuf[dim], dim, +1, len);
#else
    llfat_recv.back[dim] = comm_declare_receive_relative(back_nbr_staple_cpu[dim], dim, -1, len);
    memcpy(fwd_nbr_staple_sendbuf_cpu[dim], fwd_nbr_staple_sendbuf[dim], len);
    llfat_send.fwd[dim] = comm_declare_send_relative(fwd_nbr_staple_sendbuf_cpu[dim], dim, +1, len);
#endif

    comm_start(llfat_recv.back[dim]);
    comm_start(llfat_send.fwd[dim]);

  } else { // QUDA_FORWARDS

#ifdef GPU_DIRECT
    llfat_recv.fwd[dim] = comm_declare_receive_relative(fwd_nbr_staple[dim], dim, +1, len);
    llfat_send.back[dim] = comm_declare_send_relative(back_nbr_staple_sendbuf[dim], dim, -1, len);
#else
    llfat_recv.fwd[dim] = comm_declare_receive_relative(fwd_nbr_staple_cpu[dim], dim, +1, len);
    memcpy(back_nbr_staple_sendbuf_cpu[dim], back_nbr_staple_sendbuf[dim], len);
    llfat_send.back[dim] = comm_declare_send_relative(back_nbr_staple_sendbuf_cpu[dim], dim, -1, len);
#endif

    comm_start(llfat_recv.fwd[dim]);
    comm_start(llfat_send.back[dim]);

  }
}


// @whichway indicates the send direction;
// we use recv_whichway to indicate the receive direction
void exchange_gpu_staple_wait(int* X, void* _cudaStaple, int dim, int send_dir, cudaStream_t* stream)
{
  cudaGaugeField* cudaStaple = (cudaGaugeField*) _cudaStaple;

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volumeCB = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  int recv_dir = (send_dir == QUDA_BACKWARDS) ? QUDA_FORWARDS : QUDA_BACKWARDS;

#ifndef GPU_DIRECT
  int len = Vs[dim]*gaugeSiteSize*prec;
#endif

  if (recv_dir == QUDA_BACKWARDS) {

    comm_wait(llfat_send.fwd[dim]);
    comm_wait(llfat_recv.back[dim]);

    comm_free(llfat_send.fwd[dim]);
    comm_free(llfat_recv.back[dim]);

#ifdef GPU_DIRECT
    unpackGhostStaple(X, even, odd, volumeCB, prec, stride,
                      dim, QUDA_BACKWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#else
    memcpy(back_nbr_staple[dim], back_nbr_staple_cpu[dim], len);
    unpackGhostStaple(X, even, odd, volumeCB, prec, stride,
                      dim, QUDA_BACKWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#endif

  } else { // QUDA_FORWARDS

    comm_wait(llfat_send.back[dim]);
    comm_wait(llfat_recv.fwd[dim]);

    comm_free(llfat_send.back[dim]);
    comm_free(llfat_recv.fwd[dim]);

#ifdef GPU_DIRECT
    unpackGhostStaple(X, even, odd, volumeCB, prec, stride,
                      dim, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#else
    memcpy(fwd_nbr_staple[dim], fwd_nbr_staple_cpu[dim], len);
    unpackGhostStaple(X, even, odd, volumeCB, prec, stride,
                      dim, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
#endif

  }
}
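
/* Hedged usage sketch of the three-phase GPU staple exchange (illustrative only;
 * actual callers may overlap phases across dimensions and streams):
 *
 *   cudaGaugeField *staple = ...;          // staple field resident on the device
 *   for (int dim = 0; dim < 4; dim++) {
 *     if (!commDimPartitioned(dim)) continue;
 *     exchange_gpu_staple_start(X, staple, dim, QUDA_BACKWARDS, stream);  // pack ghost zone on GPU
 *     exchange_gpu_staple_comms(X, staple, dim, QUDA_BACKWARDS, stream);  // sync stream, post send/recv
 *     exchange_gpu_staple_wait (X, staple, dim, QUDA_BACKWARDS, stream);  // wait, then unpack
 *   }
 */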


void exchange_llfat_cleanup(void)
{
  for (int i=0; i<4; i++) {

    if(fwd_nbr_staple_gpu[i]){
      device_free(fwd_nbr_staple_gpu[i]); fwd_nbr_staple_gpu[i] = NULL;
    }
    if(back_nbr_staple_gpu[i]){
      device_free(back_nbr_staple_gpu[i]); back_nbr_staple_gpu[i] = NULL;
    }

#ifndef GPU_DIRECT
    if(fwd_nbr_staple_cpu[i]){
      host_free(fwd_nbr_staple_cpu[i]); fwd_nbr_staple_cpu[i] = NULL;
    }
    if(back_nbr_staple_cpu[i]){
      host_free(back_nbr_staple_cpu[i]); back_nbr_staple_cpu[i] = NULL;
    }
    if(fwd_nbr_staple_sendbuf_cpu[i]){
      host_free(fwd_nbr_staple_sendbuf_cpu[i]); fwd_nbr_staple_sendbuf_cpu[i] = NULL;
    }
    if(back_nbr_staple_sendbuf_cpu[i]){
      host_free(back_nbr_staple_sendbuf_cpu[i]); back_nbr_staple_sendbuf_cpu[i] = NULL;
    }
#endif

    if(fwd_nbr_staple[i]){
      host_free(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
    }
    if(back_nbr_staple[i]){
      host_free(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
    }
    if(fwd_nbr_staple_sendbuf[i]){
      host_free(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
    }
    if(back_nbr_staple_sendbuf[i]){
      host_free(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
    }

  }
  checkCudaError();
}

#endif