14 extern cudaStream_t *
stream;
34 #define gaugeSiteSize 18 // real numbers per gauge field 53 for (
int d=0; d< 4; d++) {
64 Vs[0] =
Vs_x = X[1]*X[2]*X[3];
65 Vs[1] =
Vs_y = X[0]*X[2]*X[3];
66 Vs[2] =
Vs_z = X[0]*X[1]*X[3];
67 Vs[3] =
Vs_t = X[0]*X[1]*X[2];
75 template <
typename Float>
78 int XYZ=X[0]*X[1]*X[2];
79 int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
93 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
96 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
99 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
102 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
112 for(
int ite = 0; ite < 2; ite++){
123 for(
int dir =0; dir < 4; dir++){
127 for(
int linkdir=0; linkdir < 1; linkdir ++){
128 Float* even_src = cpuStaple;
136 if((X[dir] % 2 ==0) || (
comm_dim(dir) == 1)){
144 int even_dst_index = 0;
145 int odd_dst_index = 0;
152 startd = X[dir] - nFace;
155 for(d = startd; d < endd; d++){
156 for(a = 0; a < A[dir]; a++){
157 for(b = 0; b < B[dir]; b++){
158 for(c = 0; c < C[dir]; c++){
159 int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
160 int oddness = (a+b+c+d)%2;
162 for(
int i=0;i < 18;i++){
163 even_dst[18*even_dst_index+i] = even_src[18*index + i];
167 for(
int i=0;i < 18;i++){
168 odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
176 assert( even_dst_index == nFace*faceVolumeCB[dir]);
177 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
187 packGhostAllStaples((
double*)staple, (
double**)cpuGhostStapleBack, (
double**) cpuGhostStapleFwd, nFace, X);
189 packGhostAllStaples((
float*)staple, (
float**)cpuGhostStapleBack, (
float**)cpuGhostStapleFwd, nFace, X);
207 int mul_factor[4]={1, X[0], X[1]*X[0], X[2]*X[1]*X[0]};
209 int even_dst_idx = 0;
211 char* dst_even =(
char*)buf;
212 char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*
gaugeSiteSize*prec;
213 char* src_even = (
char*)sitelink[nu];
214 char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*
gaugeSiteSize*prec;
216 if( (X[nu]+X[mu]) % 2 == 1){
224 for(
int i=0;i < X[dir2]; i++){
225 for(
int j=0; j < X[dir1]; j++){
226 int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
227 int oddness = ( (X[nu]-1) + 0 + i + j) %2;
230 memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
235 memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
241 if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
242 errorQuda(
"even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
243 even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
257 template <
typename Float>
260 int XYZ=X[0]*X[1]*X[2];
261 int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
262 int faceVolumeCB[4]={
272 int A[4], B[4], C[4];
275 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
278 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
281 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
284 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
294 for(
int ite = 0; ite < 2; ite++){
309 for(
int linkdir=0; linkdir < 4; linkdir ++){
310 Float* even_src = cpuLink[linkdir];
317 if((X[dir] % 2 ==0) || (
comm_dim(dir) == 1)){
318 even_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*
gaugeSiteSize;
321 odd_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*
gaugeSiteSize;
325 int even_dst_index = 0;
326 int odd_dst_index = 0;
333 startd = X[dir] - nFace;
336 for(d = startd; d < endd; d++){
337 for(a = 0; a < A[dir]; a++){
338 for(b = 0; b < B[dir]; b++){
339 for(c = 0; c < C[dir]; c++){
340 int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
341 int oddness = (a+b+c+d)%2;
343 for(
int i=0;i < 18;i++){
344 even_dst[18*even_dst_index+i] = even_src[18*index + i];
348 for(
int i=0;i < 18;i++){
349 odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
357 assert( even_dst_index == nFace*faceVolumeCB[dir]);
358 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
367 packGhostAllLinks((
double**)cpuLink, (
double**)cpuGhostBack, (
double**) cpuGhostFwd, dir, nFace, X);
369 packGhostAllLinks((
float**)cpuLink, (
float**)cpuGhostBack, (
float**)cpuGhostFwd, dir, nFace, X);
377 if (initialized)
return;
380 for (
int i=0; i < 4; i++) {
393 template<
typename Float>
409 for(
int nu =
XUP; nu <=
TUP; nu++){
419 for(dir1=0; dir1 < 4; dir1 ++){
420 if(dir1 != nu && dir1 !=
mu){
424 for(dir2=0; dir2 < 4; dir2 ++){
425 if(dir2 != nu && dir2 !=
mu && dir2 != dir1){
430 if(dir1 == 4 || dir2 == 4){
461 template<
typename Float>
464 Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf,
int optflag)
468 for(
int dir=0; dir < 4; dir++){
473 for (
int dir = 0; dir < 4; dir++) {
476 Float* ghost_sitelink_back = ghost_sitelink[dir];
513 void** sitelink,
void** ghost_sitelink,
514 void** ghost_sitelink_diag,
518 static void* sitelink_fwd_sendbuf[4];
519 static void* sitelink_back_sendbuf[4];
521 for (
int i=0; i<4; i++) {
525 memset(sitelink_fwd_sendbuf[i], 0, nbytes);
526 memset(sitelink_back_sendbuf[i], 0, nbytes);
530 exchange_sitelink(X, (
double**)sitelink, (
double**)(ghost_sitelink), (
double**)ghost_sitelink_diag,
531 (
double**)sitelink_fwd_sendbuf, (
double**)sitelink_back_sendbuf, optflag);
533 exchange_sitelink(X, (
float**)sitelink, (
float**)(ghost_sitelink), (
float**)ghost_sitelink_diag,
534 (
float**)sitelink_fwd_sendbuf, (
float**)sitelink_back_sendbuf, optflag);
537 for(
int i=0;i < 4;i++){
544 #define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \ 545 if(src_oddness) src_idx += Vh_ex; \ 546 if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2; \ 547 if(cpu_order == QUDA_QDP_GAUGE_ORDER) { \ 548 for(int linkdir=0; linkdir < 4; linkdir++){ \ 549 char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes; \ 550 char* dst = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \ 551 memcpy(dst, src, gaugebytes*(num)); \ 553 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \ 554 char* src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \ 555 char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes; \ 556 memcpy(dst, src, (geom)*gaugebytes*(num)); \ 558 errorQuda("Unsupported gauge order"); \ 561 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \ 563 if(commDimPartitioned(dir)){ \ 564 src_idx += R[dir]*slice_3d[dir]/2; \ 570 if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \ 571 for(int linkdir=0; linkdir < 4; linkdir++){ \ 573 if(commDimPartitioned(dir)){ \ 574 src = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \ 576 src = ((char*)sitelink[linkdir])+ (src_idx)*gaugebytes; \ 578 char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes; \ 579 memcpy(dst, src, gaugebytes*(num)); \ 581 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \ 583 if(commDimPartitioned(dir)){ \ 584 src=((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes; \ 586 src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \ 588 char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes; \ 589 memcpy(dst, src, (geom)*gaugebytes*(num)); \ 591 errorQuda("Unsupported gauge order"); \ 594 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \ 596 int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2; \ 598 if(commDimPartitioned(dir)){ \ 601 even_src_idx = (src_face*E[2]*E[1]*E[0])/2; \ 604 int odd_dst_idx = even_dst_idx+Vh_ex; \ 606 if(commDimPartitioned(dir)){ \ 607 odd_src_idx = R[dir]*slice_3d[dir]/2; \ 609 odd_src_idx = even_src_idx+Vh_ex; \ 611 if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \ 612 for(int linkdir=0; linkdir < 4; linkdir ++){ \ 613 char* dst = (char*)sitelink[linkdir]; \ 615 if(commDimPartitioned(dir)){ \ 616 src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \ 618 src = (char*)sitelink[linkdir]; \ 620 memcpy(dst + even_dst_idx * gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \ 621 memcpy(dst + odd_dst_idx * gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \ 623 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \ 624 char* dst = (char*)sitelink; \ 626 if(commDimPartitioned(dir)){ \ 627 src = (char*)ghost_buf[dir]; \ 629 src = (char*)sitelink; \ 631 memcpy(dst+(geom)*even_dst_idx*gaugebytes, src+(geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \ 632 memcpy(dst+(geom)*odd_dst_idx*gaugebytes, src+(geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \ 634 errorQuda("Unsupported gauge order\n"); \ 648 for (
int i=0; i<4; i++) E[i] = X[i] + 2*R[i];
649 int Vh_ex = E[3]*E[2]*E[1]*E[0]/2;
652 int starta[] = {R[3], R[3], R[3], 0};
653 int enda[] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};
655 int startb[] = {R[2], R[2], 0, 0};
656 int endb[] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};
658 int startc[] = {R[1], 0, 0, 0};
659 int endc[] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
662 {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
663 {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
664 {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
665 {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
669 {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
670 {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
671 {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
672 {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
675 int slice_3d[] = { E[3]*E[2]*E[1], E[3]*E[2]*E[0], E[3]*E[1]*E[0], E[2]*E[1]*E[0]};
677 for(
int i=0; i<4;i++){
678 len[i] = slice_3d[i] * R[i] * geometry*
gaugeSiteSize*gPrecision;
681 void* ghost_sitelink_fwd_sendbuf[4];
682 void* ghost_sitelink_back_sendbuf[4];
683 void* ghost_sitelink_fwd[4];
684 void* ghost_sitelink_back[4];
686 for(
int i=0; i<4; i++) {
688 ghost_sitelink_fwd_sendbuf[i] =
safe_malloc(len[i]);
689 ghost_sitelink_back_sendbuf[i] =
safe_malloc(len[i]);
696 for(
int dir =0;dir < 4;dir++){
701 for(d=R[dir]; d < 2*R[dir]; d++)
702 for(a=starta[dir];a < enda[dir]; a++)
703 for(b=startb[dir]; b < endb[dir]; b++)
705 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
706 for (c=startc[dir]; c < endc[dir]; c++){
707 int oddness = (a+b+c+d)%2;
708 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
709 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;
711 int src_oddness = oddness;
712 int dst_oddness = oddness;
713 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
714 dst_oddness = 1-oddness;
721 for(
int loop=0; loop < 2; loop++){
724 int oddness = (a+b+c+d)%2;
725 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
726 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;
728 int src_oddness = oddness;
729 int dst_oddness = oddness;
730 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
731 dst_oddness = 1-oddness;
741 for(d=X[dir]; d < X[dir]+R[dir]; d++) {
742 for(a=starta[dir];a < enda[dir]; a++) {
743 for(b=startb[dir]; b < endb[dir]; b++) {
745 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
746 for (c=startc[dir]; c < endc[dir]; c++){
747 int oddness = (a+b+c+d)%2;
748 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
749 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
751 int src_oddness = oddness;
752 int dst_oddness = oddness;
753 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
754 dst_oddness = 1-oddness;
760 for(
int loop=0; loop < 2; loop++){
763 int oddness = (a+b+c+d)%2;
764 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
765 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
767 int src_oddness = oddness;
768 int dst_oddness = oddness;
769 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
770 dst_oddness = 1-oddness;
812 for(d=0; d < R[dir]; d++) {
813 for(a=starta[dir];a < enda[dir]; a++) {
814 for(b=startb[dir]; b < endb[dir]; b++) {
816 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
817 for (c=startc[dir]; c < endc[dir]; c++){
818 int oddness = (a+b+c+d)%2;
819 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
822 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
824 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
834 for(
int loop =0;loop <2;loop++){
835 int c=startc[dir]+loop;
837 int oddness = (a+b+c+d)%2;
838 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
841 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
843 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
865 for(d=X[dir]+R[dir]; d < X[dir]+2*R[dir]; d++) {
866 for(a=starta[dir];a < enda[dir]; a++) {
867 for(b=startb[dir]; b < endb[dir]; b++) {
869 if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
870 for (c=startc[dir]; c < endc[dir]; c++){
871 int oddness = (a+b+c+d)%2;
872 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
875 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
877 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
884 for(
int loop =0; loop < 2; loop++){
886 c=startc[dir] + loop;
888 int oddness = (a+b+c+d)%2;
889 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
892 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
894 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
916 for(
int dir=0;dir < 4;dir++){
918 host_free(ghost_sitelink_fwd_sendbuf[dir]);
919 host_free(ghost_sitelink_back_sendbuf[dir]);
928 template<
typename Float>
935 (
void**)staple_fwd_sendbuf, nFace, (
QudaPrecision)(
sizeof(Float)), X);
946 for (
int dir=0;dir < 4; dir++) {
948 Float *ghost_staple_back = ghost_staple[dir];
949 Float *ghost_staple_fwd = ghost_staple[dir] + 2*Vsh[dir]*
gaugeSiteSize;
985 void *staple_fwd_sendbuf[4];
986 void *staple_back_sendbuf[4];
988 for(
int i=0;i < 4; i++){
995 (
double**)staple_fwd_sendbuf, (
double**)staple_back_sendbuf, X);
998 (
float**)staple_fwd_sendbuf, (
float**)staple_back_sendbuf, X);
1001 for (
int i=0;i < 4;i++) {
1009 for (
int i=0; i<4; i++) {
1028 #undef gaugeSiteSize void setup_dims_in_gauge(int *XX)
void exchange_sitelink(int *X, Float **sitelink, Float **ghost_sitelink, Float **ghost_sitelink_diag, Float **sitelink_fwd_sendbuf, Float **sitelink_back_sendbuf, int optflag)
int commDimPartitioned(int dir)
#define pinned_malloc(size)
enum QudaPrecision_s QudaPrecision
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom)
cudaColorSpinorField * tmp
static void * fwd_nbr_staple[4]
void do_exchange_cpu_staple(Float *staple, Float **ghost_staple, Float **staple_fwd_sendbuf, Float **staple_back_sendbuf, int *X)
void exchange_llfat_cleanup(void)
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack, Float **cpuGhostFwd, int dir, int nFace, int *X)
void exchange_sitelink_diag(int *X, Float **sitelink, Float **ghost_sitelink_diag, int optflag)
void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void **cpuGhostFwd, int dir, int nFace, QudaPrecision precision, int *X)
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom)
void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack, Float **cpuGhostFwd, int nFace, int *X)
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
void exchange_llfat_init(QudaPrecision prec)
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
static bool initialized
Profiler for initQuda.
void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void **cpuGhostStapleFwd, int nFace, QudaPrecision precision, int *X)
void comm_start(MsgHandle *mh)
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void comm_free(MsgHandle *&mh)
static void * back_nbr_staple_sendbuf[4]
#define safe_malloc(size)
void * memset(void *s, int c, size_t n)
static int index(int ndim, const int *dims, const int *x)
static void * fwd_nbr_staple_sendbuf[4]
static int commDim[QUDA_MAX_DIM]
static void * back_nbr_staple[4]
void comm_wait(MsgHandle *mh)
static void setup_dims(int *X)
void pack_gauge_diag(void *buf, int *X, void **sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom)
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)