14 extern cudaStream_t *
stream;
21 #if defined(MULTI_GPU) && (defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE)|| defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(CLOVER_FORCE)) || defined(GPU_CLOVER_DIRAC) 36 #define gaugeSiteSize 18 // real numbers per gauge field 38 static void* fwd_nbr_staple[4];
39 static void* back_nbr_staple[4];
40 static void* fwd_nbr_staple_sendbuf[4];
41 static void* back_nbr_staple_sendbuf[4];
44 static int X1,X2,X3,X4;
46 static int Vs[4],
Vsh[4];
49 extern void setup_dims_in_gauge(
int *XX);
55 for (
int d=0;
d< 4;
d++) {
77 template <
typename Float>
78 void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack,Float**cpuGhostFwd,
int nFace,
int*
X) {
80 int XYZ=
X[0]*
X[1]*
X[2];
81 int volumeCB =
X[0]*
X[1]*
X[2]*
X[3]/2;
95 A[0] =
X[3]; B[0] =
X[2]; C[0] =
X[1];
98 A[1] =
X[3]; B[1] =
X[2]; C[1] =
X[0];
101 A[2] =
X[3]; B[2] =
X[1]; C[2] =
X[0];
104 A[3] =
X[2]; B[3] =
X[1]; C[3] =
X[0];
114 for(
int ite = 0; ite < 2; ite++){
125 for(
int dir =0; dir < 4; dir++){
129 for(
int linkdir=0; linkdir < 1; linkdir ++){
130 Float* even_src = cpuStaple;
138 if((
X[dir] % 2 ==0) || (
comm_dim(dir) == 1)){
146 int even_dst_index = 0;
147 int odd_dst_index = 0;
154 startd =
X[dir] - nFace;
157 for(
d = startd;
d < endd;
d++){
158 for(
a = 0;
a < A[dir];
a++){
159 for(
b = 0;
b < B[dir];
b++){
160 for(
c = 0;
c < C[dir];
c++){
161 int index = (
a*
f[dir][0] +
b*
f[dir][1]+
c*
f[dir][2] +
d*
f[dir][3])>> 1;
162 int oddness = (
a+
b+
c+
d)%2;
164 for(
int i=0;
i < 18;
i++){
165 even_dst[18*even_dst_index+
i] = even_src[18*
index +
i];
169 for(
int i=0;
i < 18;
i++){
170 odd_dst[18*odd_dst_index+
i] = odd_src[18*
index +
i];
178 assert( even_dst_index == nFace*faceVolumeCB[dir]);
179 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
186 void pack_ghost_all_staples_cpu(
void *staple,
void **cpuGhostStapleBack,
void** cpuGhostStapleFwd,
189 packGhostAllStaples((
double*)staple, (
double**)cpuGhostStapleBack, (
double**) cpuGhostStapleFwd, nFace,
X);
191 packGhostAllStaples((
float*)staple, (
float**)cpuGhostStapleBack, (
float**)cpuGhostStapleFwd, nFace,
X);
195 void pack_gauge_diag(
void* buf,
int*
X,
void** sitelink,
int nu,
int mu,
int dir1,
int dir2,
QudaPrecision prec)
209 int mul_factor[4]={1,
X[0],
X[1]*
X[0],
X[2]*
X[1]*
X[0]};
211 int even_dst_idx = 0;
213 char* dst_even =(
char*)buf;
215 char* src_even = (
char*)sitelink[nu];
218 if( (
X[nu]+
X[
mu]) % 2 == 1){
226 for(
int i=0;
i <
X[dir2];
i++){
227 for(
int j=0; j <
X[dir1]; j++){
228 int src_idx = ((
X[nu]-1)*mul_factor[nu]+ 0*mul_factor[
mu]+
i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
229 int oddness = ( (
X[nu]-1) + 0 +
i + j) %2;
243 if( (even_dst_idx !=
X[dir1]*
X[dir2]/2)|| (odd_dst_idx !=
X[dir1]*
X[dir2]/2)){
244 errorQuda(
"even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
245 even_dst_idx, odd_dst_idx,
X[dir1]*
X[dir2]/2);
259 template <
typename Float>
260 void packGhostAllLinks(Float **
cpuLink, Float **cpuGhostBack,Float**cpuGhostFwd,
int dir,
int nFace,
int*
X) {
262 int XYZ=
X[0]*
X[1]*
X[2];
263 int volumeCB =
X[0]*
X[1]*
X[2]*
X[3]/2;
264 int faceVolumeCB[4]={
274 int A[4], B[4], C[4];
277 A[0] =
X[3]; B[0] =
X[2]; C[0] =
X[1];
280 A[1] =
X[3]; B[1] =
X[2]; C[1] =
X[0];
283 A[2] =
X[3]; B[2] =
X[1]; C[2] =
X[0];
286 A[3] =
X[2]; B[3] =
X[1]; C[3] =
X[0];
296 for(
int ite = 0; ite < 2; ite++){
311 for(
int linkdir=0; linkdir < 4; linkdir ++){
312 Float* even_src =
cpuLink[linkdir];
319 if((
X[dir] % 2 ==0) || (
comm_dim(dir) == 1)){
320 even_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*
gaugeSiteSize;
323 odd_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*
gaugeSiteSize;
327 int even_dst_index = 0;
328 int odd_dst_index = 0;
335 startd =
X[dir] - nFace;
338 for(
d = startd;
d < endd;
d++){
339 for(
a = 0;
a < A[dir];
a++){
340 for(
b = 0;
b < B[dir];
b++){
341 for(
c = 0;
c < C[dir];
c++){
342 int index = (
a*
f[dir][0] +
b*
f[dir][1]+
c*
f[dir][2] +
d*
f[dir][3])>> 1;
343 int oddness = (
a+
b+
c+
d)%2;
345 for(
int i=0;
i < 18;
i++){
346 even_dst[18*even_dst_index+
i] = even_src[18*
index +
i];
350 for(
int i=0;
i < 18;
i++){
351 odd_dst[18*odd_dst_index+
i] = odd_src[18*
index +
i];
359 assert( even_dst_index == nFace*faceVolumeCB[dir]);
360 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
366 void pack_ghost_all_links(
void **
cpuLink,
void **cpuGhostBack,
void** cpuGhostFwd,
369 packGhostAllLinks((
double**)
cpuLink, (
double**)cpuGhostBack, (
double**) cpuGhostFwd, dir, nFace,
X);
371 packGhostAllLinks((
float**)
cpuLink, (
float**)cpuGhostBack, (
float**)cpuGhostFwd, dir, nFace,
X);
382 for (
int i=0;
i < 4;
i++) {
395 template<
typename Float>
396 void exchange_sitelink_diag(
int*
X, Float** sitelink, Float** ghost_sitelink_diag,
int optflag)
411 for(
int nu =
XUP; nu <=
TUP; nu++){
421 for(dir1=0; dir1 < 4; dir1 ++){
422 if(dir1 != nu && dir1 !=
mu){
426 for(dir2=0; dir2 < 4; dir2 ++){
427 if(dir2 != nu && dir2 !=
mu && dir2 != dir1){
432 if(dir1 == 4 || dir2 == 4){
438 pack_gauge_diag(sendbuf,
X, (
void**)sitelink, nu,
mu, dir1, dir2, (
QudaPrecision)
sizeof(Float));
463 template<
typename Float>
465 exchange_sitelink(
int*
X, Float** sitelink, Float** ghost_sitelink, Float** ghost_sitelink_diag,
466 Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf,
int optflag)
470 for(
int dir=0; dir < 4; dir++){
472 pack_ghost_all_links((
void**)sitelink, (
void**)sitelink_back_sendbuf, (
void**)sitelink_fwd_sendbuf, dir, nFace, (
QudaPrecision)(
sizeof(Float)),
X);
475 for (
int dir = 0; dir < 4; dir++) {
478 Float* ghost_sitelink_back = ghost_sitelink[dir];
507 exchange_sitelink_diag(
X, sitelink, ghost_sitelink_diag, optflag);
515 void** sitelink,
void** ghost_sitelink,
516 void** ghost_sitelink_diag,
520 static void* sitelink_fwd_sendbuf[4];
521 static void* sitelink_back_sendbuf[4];
523 for (
int i=0;
i<4;
i++) {
527 memset(sitelink_fwd_sendbuf[
i], 0, nbytes);
528 memset(sitelink_back_sendbuf[
i], 0, nbytes);
532 exchange_sitelink(
X, (
double**)sitelink, (
double**)(ghost_sitelink), (
double**)ghost_sitelink_diag,
533 (
double**)sitelink_fwd_sendbuf, (
double**)sitelink_back_sendbuf, optflag);
535 exchange_sitelink(
X, (
float**)sitelink, (
float**)(ghost_sitelink), (
float**)ghost_sitelink_diag,
536 (
float**)sitelink_fwd_sendbuf, (
float**)sitelink_back_sendbuf, optflag);
539 for(
int i=0;
i < 4;
i++){
546 #define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \ 547 if(src_oddness) src_idx += Vh_ex; \ 548 if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2; \ 549 if(cpu_order == QUDA_QDP_GAUGE_ORDER) { \ 550 for(int linkdir=0; linkdir < 4; linkdir++){ \ 551 char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes; \ 552 char* dst = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \ 553 memcpy(dst, src, gaugebytes*(num)); \ 555 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \ 556 char* src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \ 557 char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes; \ 558 memcpy(dst, src, (geom)*gaugebytes*(num)); \ 560 errorQuda("Unsupported gauge order"); \ 563 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \ 565 if(commDimPartitioned(dir)){ \ 566 src_idx += R[dir]*slice_3d[dir]/2; \ 572 if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \ 573 for(int linkdir=0; linkdir < 4; linkdir++){ \ 575 if(commDimPartitioned(dir)){ \ 576 src = ((char*)ghost_buf[dir])+ linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \ 578 src = ((char*)sitelink[linkdir])+ (src_idx)*gaugebytes; \ 580 char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes; \ 581 memcpy(dst, src, gaugebytes*(num)); \ 583 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \ 585 if(commDimPartitioned(dir)){ \ 586 src=((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes; \ 588 src = ((char*)sitelink)+ (geom)*(src_idx)*gaugebytes; \ 590 char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes; \ 591 memcpy(dst, src, (geom)*gaugebytes*(num)); \ 593 errorQuda("Unsupported gauge order"); \ 596 #define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \ 598 int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2; \ 600 if(commDimPartitioned(dir)){ \ 603 even_src_idx = (src_face*E[2]*E[1]*E[0])/2; \ 606 int odd_dst_idx = even_dst_idx+Vh_ex; \ 608 if(commDimPartitioned(dir)){ \ 609 odd_src_idx = R[dir]*slice_3d[dir]/2; \ 611 odd_src_idx = even_src_idx+Vh_ex; \ 613 if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \ 614 for(int linkdir=0; linkdir < 4; linkdir ++){ \ 615 char* dst = (char*)sitelink[linkdir]; \ 617 if(commDimPartitioned(dir)){ \ 618 src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \ 620 src = (char*)sitelink[linkdir]; \ 622 memcpy(dst + even_dst_idx * gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \ 623 memcpy(dst + odd_dst_idx * gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \ 625 } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \ 626 char* dst = (char*)sitelink; \ 628 if(commDimPartitioned(dir)){ \ 629 src = (char*)ghost_buf[dir]; \ 631 src = (char*)sitelink; \ 633 memcpy(dst+(geom)*even_dst_idx*gaugebytes, src+(geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \ 634 memcpy(dst+(geom)*odd_dst_idx*gaugebytes, src+(geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \ 636 errorQuda("Unsupported gauge order\n"); \ 650 for (
int i=0;
i<4;
i++)
E[
i] =
X[
i] + 2*
R[
i];
654 int starta[] = {
R[3],
R[3],
R[3], 0};
655 int enda[] = {
X[3]+
R[3],
X[3]+
R[3],
X[3]+
R[3],
X[2]+2*
R[2]};
657 int startb[] = {
R[2],
R[2], 0, 0};
658 int endb[] = {
X[2]+
R[2],
X[2]+
R[2],
X[1]+2*
R[1],
X[1]+2*
R[1]};
660 int startc[] = {
R[1], 0, 0, 0};
661 int endc[] = {
X[1]+
R[1],
X[0]+2*
R[0],
X[0]+2*
R[0],
X[0]+2*
R[0]};
664 {
E[2]*
E[1]*
E[0],
E[1]*
E[0],
E[0], 1},
665 {
E[2]*
E[1]*
E[0],
E[1]*
E[0], 1,
E[0]},
666 {
E[2]*
E[1]*
E[0],
E[0], 1,
E[1]*
E[0]},
667 {
E[1]*
E[0],
E[0], 1,
E[2]*
E[1]*
E[0]}
671 {
E[2]*
E[1],
E[1], 1,
E[3]*
E[2]*
E[1]},
672 {
E[2]*
E[0],
E[0], 1,
E[3]*
E[2]*
E[0]},
673 {
E[1]*
E[0],
E[0], 1,
E[3]*
E[1]*
E[0]},
674 {
E[1]*
E[0],
E[0], 1,
E[2]*
E[1]*
E[0]}
677 int slice_3d[] = {
E[3]*
E[2]*
E[1],
E[3]*
E[2]*
E[0],
E[3]*
E[1]*
E[0],
E[2]*
E[1]*
E[0]};
679 for(
int i=0;
i<4;
i++){
683 void* ghost_sitelink_fwd_sendbuf[4];
684 void* ghost_sitelink_back_sendbuf[4];
685 void* ghost_sitelink_fwd[4];
686 void* ghost_sitelink_back[4];
688 for(
int i=0;
i<4;
i++) {
698 for(
int dir =0;dir < 4;dir++){
703 for(
d=
R[dir];
d < 2*
R[dir];
d++)
704 for(
a=starta[dir];
a < enda[dir];
a++)
705 for(
b=startb[dir];
b < endb[dir];
b++)
707 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
708 for (
c=startc[dir];
c < endc[dir];
c++){
709 int oddness = (
a+
b+
c+
d)%2;
710 int src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
711 int dst_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] + (
d-
R[dir])*f_bound[dir][3])>> 1;
713 int src_oddness = oddness;
714 int dst_oddness = oddness;
715 if((
X[dir] % 2 ==1) && (
commDim(dir) > 1)){
716 dst_oddness = 1-oddness;
719 MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink,
src_idx, 1, dir, geometry);
723 for(
int loop=0; loop < 2; loop++){
726 int oddness = (
a+
b+
c+
d)%2;
727 int src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
728 int dst_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] + (
d-
R[dir])*f_bound[dir][3])>> 1;
730 int src_oddness = oddness;
731 int dst_oddness = oddness;
732 if((
X[dir] % 2 ==1) && (
commDim(dir) > 1)){
733 dst_oddness = 1-oddness;
735 MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink,
src_idx, (endc[dir]-
c+1)/2, dir, geometry);
743 for(
d=
X[dir];
d <
X[dir]+
R[dir];
d++) {
744 for(
a=starta[dir];
a < enda[dir];
a++) {
745 for(
b=startb[dir];
b < endb[dir];
b++) {
747 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
748 for (
c=startc[dir];
c < endc[dir];
c++){
749 int oddness = (
a+
b+
c+
d)%2;
750 int src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
751 int dst_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] + (
d-
X[dir])*f_bound[dir][3])>> 1;
753 int src_oddness = oddness;
754 int dst_oddness = oddness;
755 if((
X[dir] % 2 ==1) && (
commDim(dir) > 1)){
756 dst_oddness = 1-oddness;
759 MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink,
src_idx, 1,dir, geometry);
762 for(
int loop=0; loop < 2; loop++){
765 int oddness = (
a+
b+
c+
d)%2;
766 int src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
767 int dst_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] + (
d-
X[dir])*f_bound[dir][3])>> 1;
769 int src_oddness = oddness;
770 int dst_oddness = oddness;
771 if((
X[dir] % 2 ==1) && (
commDim(dir) > 1)){
772 dst_oddness = 1-oddness;
774 MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink,
src_idx, (endc[dir]-
c+1)/2,dir, geometry);
814 for(
d=0;
d <
R[dir];
d++) {
815 for(
a=starta[dir];
a < enda[dir];
a++) {
816 for(
b=startb[dir];
b < endb[dir];
b++) {
818 if(f_main[dir][2] != 1 || f_bound[dir][2] !=1){
819 for (
c=startc[dir];
c < endc[dir];
c++){
820 int oddness = (
a+
b+
c+
d)%2;
821 int dst_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
824 src_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] +
d*f_bound[dir][3])>> 1;
826 src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] + (
d+
X[dir])*f_main[dir][3])>> 1;
829 MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back,
src_idx, 1, dir, geometry);
836 for(
int loop =0;loop <2;loop++){
837 int c=startc[dir]+loop;
839 int oddness = (
a+
b+
c+
d)%2;
840 int dst_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
843 src_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] +
d*f_bound[dir][3])>> 1;
845 src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] + (
d+
X[dir])*f_main[dir][3])>> 1;
848 MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back,
src_idx, (endc[dir]-
c+1)/2, dir, geometry);
861 MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0,
X[3], dir, geometry)
867 for(
d=
X[dir]+
R[dir];
d <
X[dir]+2*
R[dir];
d++) {
868 for(
a=starta[dir];
a < enda[dir];
a++) {
869 for(
b=startb[dir];
b < endb[dir];
b++) {
871 if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
872 for (
c=startc[dir];
c < endc[dir];
c++){
873 int oddness = (
a+
b+
c+
d)%2;
874 int dst_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
877 src_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] + (
d-
X[dir]-
R[dir])*f_bound[dir][3])>> 1;
879 src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] + (
d-
X[dir])*f_main[dir][3])>> 1;
882 MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd,
src_idx, 1, dir, geometry);
886 for(
int loop =0; loop < 2; loop++){
888 c=startc[dir] + loop;
890 int oddness = (
a+
b+
c+
d)%2;
891 int dst_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] +
d*f_main[dir][3])>> 1;
894 src_idx = (
a*f_bound[dir][0] +
b*f_bound[dir][1]+
c*f_bound[dir][2] + (
d-
X[dir]-
R[dir])*f_bound[dir][3])>> 1;
896 src_idx = (
a*f_main[dir][0] +
b*f_main[dir][1]+
c*f_main[dir][2] + (
d-
X[dir])*f_main[dir][3])>> 1;
898 MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd,
src_idx, (endc[dir]-
c+1)/2, dir, geometry);
911 MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (
X[3]+
R[3]), 2, dir, geometry)
918 for(
int dir=0;dir < 4;dir++){
920 host_free(ghost_sitelink_fwd_sendbuf[dir]);
921 host_free(ghost_sitelink_back_sendbuf[dir]);
930 template<
typename Float>
932 do_exchange_cpu_staple(Float* staple, Float** ghost_staple, Float** staple_fwd_sendbuf, Float** staple_back_sendbuf,
int*
X)
936 pack_ghost_all_staples_cpu(staple, (
void**)staple_back_sendbuf,
937 (
void**)staple_fwd_sendbuf, nFace, (
QudaPrecision)(
sizeof(Float)),
X);
948 for (
int dir=0;dir < 4; dir++) {
950 Float *ghost_staple_back = ghost_staple[dir];
987 void *staple_fwd_sendbuf[4];
988 void *staple_back_sendbuf[4];
990 for(
int i=0;
i < 4;
i++){
996 do_exchange_cpu_staple((
double*)staple, (
double**)ghost_staple,
997 (
double**)staple_fwd_sendbuf, (
double**)staple_back_sendbuf,
X);
999 do_exchange_cpu_staple((
float*)staple, (
float**)ghost_staple,
1000 (
float**)staple_fwd_sendbuf, (
float**)staple_back_sendbuf,
X);
1003 for (
int i=0;
i < 4;
i++) {
1011 for (
int i=0;
i<4;
i++) {
1013 if(fwd_nbr_staple[
i]){
1014 host_free(fwd_nbr_staple[
i]); fwd_nbr_staple[
i] = NULL;
1016 if(back_nbr_staple[
i]){
1017 host_free(back_nbr_staple[
i]); back_nbr_staple[
i] = NULL;
1019 if(fwd_nbr_staple_sendbuf[
i]){
1020 host_free(fwd_nbr_staple_sendbuf[
i]); fwd_nbr_staple_sendbuf[
i] = NULL;
1022 if(back_nbr_staple_sendbuf[
i]){
1023 host_free(back_nbr_staple_sendbuf[
i]); back_nbr_staple_sendbuf[
i] = NULL;
1030 #undef gaugeSiteSize void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order, QudaPrecision gPrecision, int optflag, int geometry)
int commDimPartitioned(int dir)
#define pinned_malloc(size)
enum QudaPrecision_s QudaPrecision
MsgHandle * comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes)
void exchange_cpu_sitelink(int *X, void **sitelink, void **ghost_sitelink, void **ghost_sitelink_diag, QudaPrecision gPrecision, QudaGaugeParam *param, int optflag)
cudaColorSpinorField * tmp
char * index(const char *, int)
void comm_free(MsgHandle *mh)
else return(__swbuf(_c, _p))
MsgHandle * comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes)
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
static bool initialized
Profiler for initQuda.
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
void comm_start(MsgHandle *mh)
int int int enum cudaChannelFormatKind f
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void exchange_llfat_init(QudaPrecision prec)
void * memcpy(void *__dst, const void *__src, size_t __n)
#define safe_malloc(size)
void exchange_llfat_cleanup(void)
void * memset(void *__b, int __c, size_t __len)
void comm_wait(MsgHandle *mh)
static __inline__ size_t size_t d