34 #define gauge_site_size 18
36 static void* fwd_nbr_staple[4];
37 static void* back_nbr_staple[4];
38 static void* fwd_nbr_staple_sendbuf[4];
39 static void* back_nbr_staple_sendbuf[4];
42 static int X1,X2,X3,X4;
44 static int Vs[4], Vsh[4];
53 for (
int d=0; d< 4; d++) {
64 Vs[0] =
Vs_x = X[1]*X[2]*X[3];
65 Vs[1] =
Vs_y = X[0]*X[2]*X[3];
66 Vs[2] =
Vs_z = X[0]*X[1]*X[3];
67 Vs[3] =
Vs_t = X[0]*X[1]*X[2];
75 template <
typename Float>
78 int XYZ=X[0]*X[1]*X[2];
79 int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
93 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
96 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
99 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
102 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
112 for(
int ite = 0; ite < 2; ite++){
123 for(
int dir =0; dir < 4; dir++){
127 for(
int linkdir=0; linkdir < 1; linkdir ++){
128 Float* even_src = cpuStaple;
136 if ((X[dir] % 2 == 0) || (
comm_dim(dir) == 1)) {
144 int even_dst_index = 0;
145 int odd_dst_index = 0;
152 startd = X[dir] - nFace;
155 for (d = startd; d < endd; d++) {
156 for (a = 0; a < A[dir]; a++) {
157 for (b = 0; b < B[dir]; b++) {
158 for (c = 0; c < C[dir]; c++) {
159 int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
160 int oddness = (a+b+c+d)%2;
162 for(
int i=0;i < 18;i++){
163 even_dst[18*even_dst_index+i] = even_src[18*index + i];
167 for(
int i=0;i < 18;i++){
168 odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
176 assert(even_dst_index == nFace * faceVolumeCB[dir]);
177 assert(odd_dst_index == nFace * faceVolumeCB[dir]);
187 packGhostAllStaples((
double*)staple, (
double**)cpuGhostStapleBack, (
double**) cpuGhostStapleFwd, nFace, X);
189 packGhostAllStaples((
float*)staple, (
float**)cpuGhostStapleBack, (
float**)cpuGhostStapleFwd, nFace, X);
207 int mul_factor[4]={1, X[0], X[1]*X[0], X[2]*X[1]*X[0]};
209 int even_dst_idx = 0;
211 char* dst_even =(
char*)buf;
213 char* src_even = (
char*)sitelink[nu];
216 if( (X[nu]+X[
mu]) % 2 == 1){
224 for(
int i=0;i < X[dir2]; i++){
225 for(
int j=0; j < X[dir1]; j++){
226 int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[
mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
227 int oddness = ( (X[nu]-1) + 0 + i + j) %2;
230 memcpy(&dst_even[(18 * even_dst_idx + tmpidx) *
prec], &src_even[(18 * src_idx + tmpidx) *
prec],
prec);
235 memcpy(&dst_odd[(18 * odd_dst_idx + tmpidx) *
prec], &src_odd[(18 * src_idx + tmpidx) *
prec],
prec);
241 if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
242 errorQuda(
"even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
243 even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
257 template <
typename Float>
260 int XYZ=X[0]*X[1]*X[2];
261 int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
262 int faceVolumeCB[4]={
272 int A[4], B[4], C[4];
275 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
278 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
281 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
284 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
294 for(
int ite = 0; ite < 2; ite++){
309 for(
int linkdir=0; linkdir < 4; linkdir ++){
317 if((X[dir] % 2 ==0) || (
comm_dim(dir) == 1)){
318 even_dst = dst[dir] + 2 * linkdir * nFace * faceVolumeCB[dir] *
gauge_site_size;
321 odd_dst = dst[dir] + 2 * linkdir * nFace * faceVolumeCB[dir] *
gauge_site_size;
325 int even_dst_index = 0;
326 int odd_dst_index = 0;
333 startd = X[dir] - nFace;
336 for(d = startd; d < endd; d++){
337 for(a = 0; a < A[dir]; a++){
338 for(b = 0; b < B[dir]; b++){
339 for(c = 0; c < C[dir]; c++){
340 int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
341 int oddness = (a+b+c+d)%2;
343 for(
int i=0;i < 18;i++){
344 even_dst[18*even_dst_index+i] = even_src[18*index + i];
348 for(
int i=0;i < 18;i++){
349 odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
357 assert( even_dst_index == nFace*faceVolumeCB[dir]);
358 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
375 static bool initialized =
false;
377 if (initialized)
return;
380 for (
int i=0; i < 4; i++) {
393 template<
typename Float>
409 for(
int nu =
XUP; nu <=
TUP; nu++){
419 for(dir1=0; dir1 < 4; dir1 ++){
420 if(dir1 != nu && dir1 !=
mu){
424 for(dir2=0; dir2 < 4; dir2 ++){
425 if(dir2 != nu && dir2 !=
mu && dir2 != dir1){
430 if(dir1 == 4 || dir2 == 4){
461 template<
typename Float>
464 Float** sitelink_fwd_sendbuf,
Float** sitelink_back_sendbuf,
int optflag)
468 for(
int dir=0; dir < 4; dir++){
473 for (
int dir = 0; dir < 4; dir++) {
476 Float* ghost_sitelink_back = ghost_sitelink[dir];
513 void** sitelink,
void** ghost_sitelink,
514 void** ghost_sitelink_diag,
518 static void* sitelink_fwd_sendbuf[4];
519 static void* sitelink_back_sendbuf[4];
521 for (
int i=0; i<4; i++) {
525 memset(sitelink_fwd_sendbuf[i], 0, nbytes);
526 memset(sitelink_back_sendbuf[i], 0, nbytes);
530 exchange_sitelink(X, (
double**)sitelink, (
double**)(ghost_sitelink), (
double**)ghost_sitelink_diag,
531 (
double**)sitelink_fwd_sendbuf, (
double**)sitelink_back_sendbuf, optflag);
533 exchange_sitelink(X, (
float**)sitelink, (
float**)(ghost_sitelink), (
float**)ghost_sitelink_diag,
534 (
float**)sitelink_fwd_sendbuf, (
float**)sitelink_back_sendbuf, optflag);
537 for(
int i=0;i < 4;i++){
544 #define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \
545 if(src_oddness) src_idx += Vh_ex; \
546 if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2; \
547 if(cpu_order == QUDA_QDP_GAUGE_ORDER) { \
548 for(int linkdir=0; linkdir < 4; linkdir++){ \
549 char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes; \
550 char* dst = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
551 memcpy(dst, src, gaugebytes*(num)); \
553 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
554 char* src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \
555 char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes; \
556 memcpy(dst, src, (geom)*gaugebytes*(num)); \
558 errorQuda("Unsupported gauge order"); \
561 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \
563 if(commDimPartitioned(dir)){ \
564 src_idx += R[dir]*slice_3d[dir]/2; \
570 if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
571 for(int linkdir=0; linkdir < 4; linkdir++){ \
573 if(commDimPartitioned(dir)){ \
574 src = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
576 src = ((char*)sitelink[linkdir])+ (src_idx)*gaugebytes; \
578 char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes; \
579 memcpy(dst, src, gaugebytes*(num)); \
581 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
583 if(commDimPartitioned(dir)){ \
584 src=((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes; \
586 src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \
588 char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes; \
589 memcpy(dst, src, (geom)*gaugebytes*(num)); \
591 errorQuda("Unsupported gauge order"); \
594 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \
596 int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2; \
598 if(commDimPartitioned(dir)){ \
601 even_src_idx = (src_face*E[2]*E[1]*E[0])/2; \
604 int odd_dst_idx = even_dst_idx+Vh_ex; \
606 if(commDimPartitioned(dir)){ \
607 odd_src_idx = R[dir]*slice_3d[dir]/2; \
609 odd_src_idx = even_src_idx+Vh_ex; \
611 if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
612 for(int linkdir=0; linkdir < 4; linkdir ++){ \
613 char* dst = (char*)sitelink[linkdir]; \
615 if(commDimPartitioned(dir)){ \
616 src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \
618 src = (char*)sitelink[linkdir]; \
620 memcpy(dst + even_dst_idx * gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
621 memcpy(dst + odd_dst_idx * gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
623 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
624 char* dst = (char*)sitelink; \
626 if(commDimPartitioned(dir)){ \
627 src = (char*)ghost_buf[dir]; \
629 src = (char*)sitelink; \
631 memcpy(dst+(geom)*even_dst_idx*gaugebytes, src+(geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
632 memcpy(dst+(geom)*odd_dst_idx*gaugebytes, src+(geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
634 errorQuda("Unsupported gauge order\n"); \
648 for (
int i=0; i<4; i++)
E[i] = X[i] + 2*R[i];
652 int starta[] = {R[3], R[3], R[3], 0};
653 int enda[] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};
655 int startb[] = {R[2], R[2], 0, 0};
656 int endb[] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};
658 int startc[] = {R[1], 0, 0, 0};
659 int endc[] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
662 {
E[2]*
E[1]*
E[0],
E[1]*
E[0],
E[0], 1},
663 {
E[2]*
E[1]*
E[0],
E[1]*
E[0], 1,
E[0]},
664 {
E[2]*
E[1]*
E[0],
E[0], 1,
E[1]*
E[0]},
665 {
E[1]*
E[0],
E[0], 1,
E[2]*
E[1]*
E[0]}
669 {
E[2]*
E[1],
E[1], 1,
E[3]*
E[2]*
E[1]},
670 {
E[2]*
E[0],
E[0], 1,
E[3]*
E[2]*
E[0]},
671 {
E[1]*
E[0],
E[0], 1,
E[3]*
E[1]*
E[0]},
672 {
E[1]*
E[0],
E[0], 1,
E[2]*
E[1]*
E[0]}
675 int slice_3d[] = {
E[3]*
E[2]*
E[1],
E[3]*
E[2]*
E[0],
E[3]*
E[1]*
E[0],
E[2]*
E[1]*
E[0]};
677 for(
int i=0; i<4;i++){
681 void* ghost_sitelink_fwd_sendbuf[4];
682 void* ghost_sitelink_back_sendbuf[4];
683 void* ghost_sitelink_fwd[4];
684 void* ghost_sitelink_back[4];
686 for(
int i=0; i<4; i++) {
688 ghost_sitelink_fwd_sendbuf[i] =
safe_malloc(len[i]);
689 ghost_sitelink_back_sendbuf[i] =
safe_malloc(len[i]);
696 for(
int dir =0;dir < 4;dir++){
701 for(d=R[dir]; d < 2*R[dir]; d++)
702 for(a=starta[dir];a < enda[dir]; a++)
703 for(b=startb[dir]; b < endb[dir]; b++)
705 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
706 for (c=startc[dir]; c < endc[dir]; c++){
707 int oddness = (a+b+c+d)%2;
708 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
709 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;
711 int src_oddness = oddness;
712 int dst_oddness = oddness;
713 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
714 dst_oddness = 1-oddness;
721 for(
int loop=0; loop < 2; loop++){
724 int oddness = (a+b+c+d)%2;
725 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
726 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;
728 int src_oddness = oddness;
729 int dst_oddness = oddness;
730 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
731 dst_oddness = 1-oddness;
741 for(d=X[dir]; d < X[dir]+R[dir]; d++) {
742 for(a=starta[dir];a < enda[dir]; a++) {
743 for(b=startb[dir]; b < endb[dir]; b++) {
745 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
746 for (c=startc[dir]; c < endc[dir]; c++){
747 int oddness = (a+b+c+d)%2;
748 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
749 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
751 int src_oddness = oddness;
752 int dst_oddness = oddness;
753 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
754 dst_oddness = 1-oddness;
760 for(
int loop=0; loop < 2; loop++){
763 int oddness = (a+b+c+d)%2;
764 int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
765 int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;
767 int src_oddness = oddness;
768 int dst_oddness = oddness;
769 if((X[dir] % 2 ==1) && (
commDim(dir) > 1)){
770 dst_oddness = 1-oddness;
812 for(d=0; d < R[dir]; d++) {
813 for(a=starta[dir];a < enda[dir]; a++) {
814 for(b=startb[dir]; b < endb[dir]; b++) {
816 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
817 for (c=startc[dir]; c < endc[dir]; c++){
818 int oddness = (a+b+c+d)%2;
819 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
822 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
824 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
834 for(
int loop =0;loop <2;loop++){
835 int c=startc[dir]+loop;
837 int oddness = (a+b+c+d)%2;
838 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
841 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
843 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
865 for(d=X[dir]+R[dir]; d < X[dir]+2*R[dir]; d++) {
866 for(a=starta[dir];a < enda[dir]; a++) {
867 for(b=startb[dir]; b < endb[dir]; b++) {
869 if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
870 for (c=startc[dir]; c < endc[dir]; c++){
871 int oddness = (a+b+c+d)%2;
872 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
875 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
877 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
884 for(
int loop =0; loop < 2; loop++){
886 c=startc[dir] + loop;
888 int oddness = (a+b+c+d)%2;
889 int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + d*f_main[dir][3])>> 1;
892 src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1]+ c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
894 src_idx = ( a*f_main[dir][0] + b*f_main[dir][1]+ c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
916 for(
int dir=0;dir < 4;dir++){
918 host_free(ghost_sitelink_fwd_sendbuf[dir]);
919 host_free(ghost_sitelink_back_sendbuf[dir]);
928 template<
typename Float>
942 for (
int dir=0;dir < 4; dir++) {
944 Float *ghost_staple_back = ghost_staple[dir];
981 void *staple_fwd_sendbuf[4];
982 void *staple_back_sendbuf[4];
984 for(
int i=0;i < 4; i++){
991 (
double**)staple_fwd_sendbuf, (
double**)staple_back_sendbuf, X);
994 (
float**)staple_fwd_sendbuf, (
float**)staple_back_sendbuf, X);
997 for (
int i=0;i < 4;i++) {
1005 for (
int i=0; i<4; i++) {
1007 if(fwd_nbr_staple[i]){
1008 host_free(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
1010 if(back_nbr_staple[i]){
1011 host_free(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
1013 if(fwd_nbr_staple_sendbuf[i]){
1014 host_free(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
1016 if(back_nbr_staple_sendbuf[i]){
1017 host_free(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
1023 #undef gauge_site_size
void comm_start(MsgHandle *mh)
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
void comm_wait(MsgHandle *mh)
void comm_free(MsgHandle *&mh)
int commDimPartitioned(int dir)
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
void * memset(void *s, int c, size_t n)
cudaColorSpinorField * tmp
enum QudaPrecision_s QudaPrecision
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom)
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom)
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom)
void pack_gauge_diag(void *buf, int *X, void **sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
void exchange_sitelink_diag(int *X, Float **sitelink, Float **ghost_sitelink_diag, int optflag)
void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack, Float **cpuGhostFwd, int nFace, int *X)
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
void exchange_sitelink(int *X, Float **sitelink, Float **ghost_sitelink, Float **ghost_sitelink_diag, Float **sitelink_fwd_sendbuf, Float **sitelink_back_sendbuf, int optflag)
void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack, Float **cpuGhostFwd, int dir, int nFace, int *X)
void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void **cpuGhostFwd, int dir, int nFace, QudaPrecision precision, int *X)
void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void **cpuGhostStapleFwd, int nFace, QudaPrecision precision, int *X)
void exchange_llfat_cleanup(void)
void exchange_llfat_init(QudaPrecision prec)
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
void do_exchange_cpu_staple(Float *staple, Float **ghost_staple, Float **staple_fwd_sendbuf, Float **staple_back_sendbuf, int *X)
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
#define safe_malloc(size)
#define pinned_malloc(size)
FloatingPoint< float > Float
cudaStream_t qudaStream_t