extern cudaStream_t *stream;
#if defined(MULTI_GPU) && (defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_CLOVER_DIRAC))
#define gaugeSiteSize 18 // real numbers per gauge link: 3x3 complex matrix
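/* Staple ghost-zone buffers, one per dimension: the *_gpu set lives in
   device memory, the unsuffixed set is what the comms layer sends and
   receives, and the *_cpu set provides plain host staging copies for the
   path where the comms layer cannot touch pinned/device memory directly. */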
static void* fwd_nbr_staple_cpu[4];
static void* back_nbr_staple_cpu[4];
static void* fwd_nbr_staple_sendbuf_cpu[4];
static void* back_nbr_staple_sendbuf_cpu[4];

static void* fwd_nbr_staple_gpu[4];
static void* back_nbr_staple_gpu[4];

static void* fwd_nbr_staple[4];
static void* back_nbr_staple[4];
static void* fwd_nbr_staple_sendbuf[4];
static void* back_nbr_staple_sendbuf[4];
static int Vs[4], Vsh[4];
static struct {
  MsgHandle *fwd[4];
  MsgHandle *back[4];
} llfat_recv, llfat_send;
extern void setup_dims_in_gauge(int *XX);
  for (int d = 0; d < 4; d++) {
    dims[d] = X[d]; // cache the local dimensions (used by the T-parity logic below)
  }
  Vs[0] = Vs_x = X[1]*X[2]*X[3];
  Vs[1] = Vs_y = X[0]*X[2]*X[3];
  Vs[2] = Vs_z = X[0]*X[1]*X[3];
  Vs[3] = Vs_t = X[0]*X[1]*X[2];

  Vsh[0] = Vsh_x = Vs_x/2;
  Vsh[1] = Vsh_y = Vs_y/2;
  Vsh[2] = Vsh_z = Vs_z/2;
  Vsh[3] = Vsh_t = Vs_t/2;
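/* Example: for a local volume X = {4,4,4,8} the time-slice surface is
   Vs_t = 4*4*4 = 64 sites, and the checkerboarded half-surface used for
   one parity is Vsh_t = Vs_t/2 = 32 sites. */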
void exchange_llfat_init(QudaPrecision prec)
{
  static bool initialized = false;

  if (initialized) return;
  initialized = true;
  for (int i = 0; i < 4; i++) {
    int packet_size = Vs[i]*gaugeSiteSize*prec;
    // ... allocate the fwd/back staple buffers: device_malloc for the
    //     *_gpu set, pinned_malloc for the comms-visible set ...
    fwd_nbr_staple_sendbuf_cpu[i] = safe_malloc(packet_size);
    back_nbr_staple_sendbuf_cpu[i] = safe_malloc(packet_size);
  }
}
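/* The fat-link computation needs ghost links not only on the 3-D faces but
   also along "diagonal" strips where two partitioned directions meet.
   exchange_sitelink_diag() below gathers those strips: for each plane
   (nu, mu) it identifies the two remaining directions dir1, dir2 and
   exchanges a displaced slab with the corner neighbor. */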
template<typename Float>
void exchange_sitelink_diag(int* X, Float** sitelink, Float** ghost_sitelink_diag, int optflag)
{
  for (int nu = XUP; nu <= TUP; nu++) {
    // ... for each partitioned direction mu != nu ...

    // find the two remaining directions dir1, dir2
    int dir1, dir2;
    for (dir1 = 0; dir1 < 4; dir1++) {
      if (dir1 != nu && dir1 != mu) {
        break;
      }
    }
    for (dir2 = 0; dir2 < 4; dir2++) {
      if (dir2 != nu && dir2 != mu && dir2 != dir1) {
        break;
      }
    }

    if (dir1 == 4 || dir2 == 4) {
      errorQuda("Invalid dir1/dir2");
    }

    // ... pack the (dir1,dir2) slab (pack_gauge_diag) and exchange it with
    //     the displaced neighbor (comm_declare_send_displaced /
    //     comm_declare_receive_displaced) ...
  }
}
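/* Example: for nu = XUP (0) and mu = TUP (3) the scans above select
   dir1 = 1 and dir2 = 2, the two directions spanning the diagonal slab.
   dir1 == 4 or dir2 == 4 would mean a scan fell through without finding a
   free direction, which cannot happen for valid nu != mu; the check is
   purely defensive. */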
template<typename Float>
void exchange_sitelink(int* X, Float** sitelink, Float** ghost_sitelink,
                       Float** ghost_sitelink_diag,
                       Float** sitelink_fwd_sendbuf, Float** sitelink_back_sendbuf,
                       int optflag)
{
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);

  // pack the T-direction faces; each of the four link directions i
  // contributes one even and one odd half-face
  for (int i = 0; i < 4; i++) {
    Float* even_sitelink_back_src = sitelink[i];
    Float* odd_sitelink_back_src = sitelink[i] + volumeCB*gaugeSiteSize;
    Float* sitelink_back_dst = sitelink_back_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;

    if (dims[3] % 2 == 0) {
      // even T extent: parities line up with the neighbor
      memcpy(sitelink_back_dst, even_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, odd_sitelink_back_src, len);
    } else {
      // odd T extent: parities swap across the boundary
      memcpy(sitelink_back_dst, odd_sitelink_back_src, len);
      memcpy(sitelink_back_dst + Vsh_t*gaugeSiteSize, even_sitelink_back_src, len);
    }

    Float* even_sitelink_fwd_src = sitelink[i] + (volumeCB - Vsh_t)*gaugeSiteSize;
    Float* odd_sitelink_fwd_src = sitelink[i] + volumeCB*gaugeSiteSize + (volumeCB - Vsh_t)*gaugeSiteSize;
    Float* sitelink_fwd_dst = sitelink_fwd_sendbuf[3] + 2*i*Vsh_t*gaugeSiteSize;

    if (dims[3] % 2 == 0) {
      memcpy(sitelink_fwd_dst, even_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, odd_sitelink_fwd_src, len);
    } else {
      memcpy(sitelink_fwd_dst, odd_sitelink_fwd_src, len);
      memcpy(sitelink_fwd_dst + Vsh_t*gaugeSiteSize, even_sitelink_fwd_src, len);
    }
  }
  // pack the X, Y and Z faces (cf. pack_ghost_all_links)
  for (int dir = 0; dir < 4; dir++) {
    // ...
  }
  // exchange the packed faces with the neighbors in each direction
  for (int dir = 0; dir < 4; dir++) {
    // ...
    int len = Vsh[dir]*gaugeSiteSize*sizeof(Float);
    Float* ghost_sitelink_back = ghost_sitelink[dir];
    // ...
  }
  exchange_sitelink_diag(X, sitelink, ghost_sitelink_diag, optflag);
}
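/* Typical host-side usage (an illustrative sketch; the buffer names and
   the exact ghost allocation sizes are placeholders, not taken from this
   file):

     void *links[4];        // QDP order: one array of 2*volumeCB sites per direction
     void *ghost[4];        // per-dimension face ghost buffers
     void *ghost_diag[16];  // diagonal ghost buffers, indexed by nu*4+mu
     // ... allocate all three sets, load the links ...
     exchange_cpu_sitelink(X, links, ghost, ghost_diag,
                           QUDA_DOUBLE_PRECISION, &gauge_param, 1);
*/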
void exchange_cpu_sitelink(int* X, void** sitelink, void** ghost_sitelink,
                           void** ghost_sitelink_diag, QudaPrecision gPrecision,
                           QudaGaugeParam* param, int optflag)
{
  static void* sitelink_fwd_sendbuf[4];
  static void* sitelink_back_sendbuf[4];
  static bool allocated = false;
  if (!allocated) {
    for (int i = 0; i < 4; i++) {
      int nbytes = 4*Vs[i]*gaugeSiteSize*gPrecision; // four link directions per face site
      sitelink_fwd_sendbuf[i] = safe_malloc(nbytes);
      sitelink_back_sendbuf[i] = safe_malloc(nbytes);
      memset(sitelink_fwd_sendbuf[i], 0, nbytes);
      memset(sitelink_back_sendbuf[i], 0, nbytes);
    }
    allocated = true;
  }
  if (gPrecision == QUDA_DOUBLE_PRECISION) {
    exchange_sitelink(X, (double**)sitelink, (double**)ghost_sitelink, (double**)ghost_sitelink_diag,
                      (double**)sitelink_fwd_sendbuf, (double**)sitelink_back_sendbuf, optflag);
  } else { // single precision
    exchange_sitelink(X, (float**)sitelink, (float**)ghost_sitelink, (float**)ghost_sitelink_diag,
                      (float**)sitelink_fwd_sendbuf, (float**)sitelink_back_sendbuf, optflag);
  }
  for (int i = 0; i < 4; i++) {
    // ...
  }
}
/* Copy one run of sites from the extended local link field into a face
   send buffer, honoring the host gauge order. */
#define MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_buf, dst_idx, sitelink, src_idx, num, dir, geom) \
  if(src_oddness) src_idx += Vh_ex; \
  if(dst_oddness) dst_idx += R[dir]*slice_3d[dir]/2; \
  if(cpu_order == QUDA_QDP_GAUGE_ORDER) { \
    for(int linkdir=0; linkdir < 4; linkdir++){ \
      char* src = (char*) sitelink[linkdir] + (src_idx)*gaugebytes; \
      char* dst = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num)); \
    } \
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
    char* src = ((char*)sitelink) + (geom)*(src_idx)*gaugebytes; \
    char* dst = ((char*)ghost_buf[dir]) + (geom)*(dst_idx)*gaugebytes; \
    memcpy(dst, src, (geom)*gaugebytes*(num)); \
  } else { \
    errorQuda("Unsupported gauge order"); \
  }
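/* QUDA_QDP_GAUGE_ORDER stores four separate arrays, one per link direction,
   so each direction's face goes into its own segment of the ghost buffer
   (one memcpy per linkdir).  QUDA_MILC_GAUGE_ORDER stores the four links of
   each site contiguously, so a single memcpy of (geom)*gaugebytes per site
   run suffices. */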
/* Copy one run of sites from a receive buffer (or, if the direction is not
   partitioned, from the local field itself) back into the extended grid. */
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_buf, src_idx, num, dir, geom) \
  if(oddness){ \
    if(commDimPartitioned(dir)){ \
      src_idx += R[dir]*slice_3d[dir]/2; \
    }else{ \
      src_idx += Vh_ex; \
    } \
    dst_idx += Vh_ex; \
  } \
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
    for(int linkdir=0; linkdir < 4; linkdir++){ \
      char* src; \
      if(commDimPartitioned(dir)){ \
        src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes + (src_idx)*gaugebytes; \
      }else{ \
        src = ((char*)sitelink[linkdir]) + (src_idx)*gaugebytes; \
      } \
      char* dst = (char*) sitelink[linkdir] + (dst_idx)*gaugebytes; \
      memcpy(dst, src, gaugebytes*(num)); \
    } \
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
    char* src; \
    if(commDimPartitioned(dir)){ \
      src = ((char*)ghost_buf[dir]) + (geom)*(src_idx)*gaugebytes; \
    }else{ \
      src = ((char*)sitelink) + (geom)*(src_idx)*gaugebytes; \
    } \
    char* dst = ((char*)sitelink) + (geom)*(dst_idx)*gaugebytes; \
    memcpy(dst, src, (geom)*gaugebytes*(num)); \
  } else { \
    errorQuda("Unsupported gauge order"); \
  }
/* Copy a whole T face from the receive buffer (or from the local field if T
   is not partitioned) into the extended grid, both parities at once. */
#define MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_buf, dst_face, src_face, dir, geom) \
  /* even half */ \
  int even_dst_idx = (dst_face*E[2]*E[1]*E[0])/2; \
  int even_src_idx; \
  if(commDimPartitioned(dir)){ \
    even_src_idx = 0; \
  }else{ \
    even_src_idx = (src_face*E[2]*E[1]*E[0])/2; \
  } \
  /* odd half */ \
  int odd_dst_idx = even_dst_idx+Vh_ex; \
  int odd_src_idx; \
  if(commDimPartitioned(dir)){ \
    odd_src_idx = R[dir]*slice_3d[dir]/2; \
  }else{ \
    odd_src_idx = even_src_idx+Vh_ex; \
  } \
  if(cpu_order == QUDA_QDP_GAUGE_ORDER){ \
    for(int linkdir=0; linkdir < 4; linkdir++){ \
      char* dst = (char*)sitelink[linkdir]; \
      char* src; \
      if(commDimPartitioned(dir)){ \
        src = ((char*)ghost_buf[dir]) + linkdir*R[dir]*slice_3d[dir]*gaugebytes; \
      }else{ \
        src = (char*)sitelink[linkdir]; \
      } \
      memcpy(dst + even_dst_idx*gaugebytes, src + even_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
      memcpy(dst + odd_dst_idx*gaugebytes, src + odd_src_idx*gaugebytes, R[dir]*slice_3d[dir]*gaugebytes/2); \
    } \
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { \
    char* dst = (char*)sitelink; \
    char* src; \
    if(commDimPartitioned(dir)){ \
      src = (char*)ghost_buf[dir]; \
    }else{ \
      src = (char*)sitelink; \
    } \
    memcpy(dst+(geom)*even_dst_idx*gaugebytes, src+(geom)*even_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
    memcpy(dst+(geom)*odd_dst_idx*gaugebytes, src+(geom)*odd_src_idx*gaugebytes, (geom)*R[dir]*slice_3d[dir]*gaugebytes/2); \
  } else { \
    errorQuda("Unsupported gauge order"); \
  }
void exchange_cpu_sitelink_ex(int *X, int *R, void **sitelink, QudaGaugeFieldOrder cpu_order,
                              QudaPrecision gPrecision, int optflag, int geometry)
{
  int E[4];
  for (int i = 0; i < 4; i++) E[i] = X[i] + 2*R[i]; // extended local extents
  int Vh_ex = E[3]*E[2]*E[1]*E[0]/2;                // extended half volume (per parity)
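/* Example: X = {4,4,4,8} with R = {2,2,2,2} gives extended extents
   E = {8,8,8,12}, hence Vh_ex = 8*8*8*12/2 = 3072 sites per parity. */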
  // per-direction loop bounds for the slab coordinates (a,b,c);
  // d is the coordinate along the partitioned direction
  int starta[] = {R[3], R[3], R[3], 0};
  int enda[] = {X[3]+R[3], X[3]+R[3], X[3]+R[3], X[2]+2*R[2]};

  int startb[] = {R[2], R[2], 0, 0};
  int endb[] = {X[2]+R[2], X[2]+R[2], X[1]+2*R[1], X[1]+2*R[1]};

  int startc[] = {R[1], 0, 0, 0};
  int endc[] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
  int f_main[4][4] = {
    {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
    {E[2]*E[1]*E[0], E[1]*E[0], 1, E[0]},
    {E[2]*E[1]*E[0], E[0], 1, E[1]*E[0]},
    {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
  };

  int f_bound[4][4] = {
    {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
    {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
    {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
    {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
  };
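/* f_main[dir] holds the strides that map the loop tuple (a,b,c,d) to a
   linear site index in the extended grid; f_bound[dir] holds the matching
   strides within the face buffer, whose depth in the partitioned direction
   is only R[dir].  The >>1 at the use sites converts a full-lattice index
   into a checkerboard (single-parity) index. */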
  int slice_3d[] = { E[3]*E[2]*E[1], E[3]*E[2]*E[0], E[3]*E[1]*E[0], E[2]*E[1]*E[0] };
  int len[4];
  for (int i = 0; i < 4; i++) {
    len[i] = slice_3d[i] * R[i] * geometry*gaugeSiteSize*gPrecision; // bytes per face message
  }
  void* ghost_sitelink_fwd_sendbuf[4];
  void* ghost_sitelink_back_sendbuf[4];
  void* ghost_sitelink_fwd[4];
  void* ghost_sitelink_back[4];
  for (int i = 0; i < 4; i++) {
    if (!commDimPartitioned(i)) continue;
    ghost_sitelink_fwd_sendbuf[i] = safe_malloc(len[i]);
    ghost_sitelink_back_sendbuf[i] = safe_malloc(len[i]);
    ghost_sitelink_fwd[i] = safe_malloc(len[i]);
    ghost_sitelink_back[i] = safe_malloc(len[i]);
  }
  int gaugebytes = gaugeSiteSize*gPrecision;
  for (int dir = 0; dir < 4; dir++) {
    // ...
    // pack the back face: source slab R[dir] <= d < 2*R[dir]
    for(d=R[dir]; d < 2*R[dir]; d++)
      for(a=starta[dir]; a < enda[dir]; a++)
        for(b=startb[dir]; b < endb[dir]; b++)

          if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
            // generic path: c does not have unit stride, copy site by site
            for (c=startc[dir]; c < endc[dir]; c++){
              int oddness = (a+b+c+d)%2;
              int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
              int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;

              int src_oddness = oddness;
              int dst_oddness = oddness;
              if((X[dir] % 2 == 1) && (commDim(dir) > 1)){ // switch even/odd parity across the cut
                dst_oddness = 1-oddness;
              }
              MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, 1, dir, geometry);
            }
          }else{
            // fast path: unit stride in c, copy same-parity runs in one go
            for(int loop=0; loop < 2; loop++){
              int c=startc[dir]+loop;
              if(c < endc[dir]){
                int oddness = (a+b+c+d)%2;
                int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-R[dir])*f_bound[dir][3])>> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 == 1) && (commDim(dir) > 1)){
                  dst_oddness = 1-oddness;
                }
                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_back_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir, geometry);
              }
            }
          }
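    /* In the fast path c has unit stride, so after the >>1 parity division
       the sites of one parity form a contiguous run of (endc[dir]-c+1)/2
       entries; the two passes (loop = 0, 1) cover the two parities with one
       bulk copy each instead of a per-site copy. */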
    // pack the forward face: source slab X[dir] <= d < X[dir]+R[dir]
    for(d=X[dir]; d < X[dir]+R[dir]; d++) {
      for(a=starta[dir]; a < enda[dir]; a++) {
        for(b=startb[dir]; b < endb[dir]; b++) {

          if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
            for (c=startc[dir]; c < endc[dir]; c++){
              int oddness = (a+b+c+d)%2;
              int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
              int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;

              int src_oddness = oddness;
              int dst_oddness = oddness;
              if((X[dir] % 2 == 1) && (commDim(dir) > 1)){
                dst_oddness = 1-oddness;
              }
              MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, 1, dir, geometry);
            }
          }else{
            for(int loop=0; loop < 2; loop++){
              int c=startc[dir]+loop;
              if(c < endc[dir]){
                int oddness = (a+b+c+d)%2;
                int src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int dst_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir])*f_bound[dir][3])>> 1;

                int src_oddness = oddness;
                int dst_oddness = oddness;
                if((X[dir] % 2 == 1) && (commDim(dir) > 1)){
                  dst_oddness = 1-oddness;
                }
                MEMCOPY_GAUGE_FIELDS_GRID_TO_BUF(ghost_sitelink_fwd_sendbuf, dst_idx, sitelink, src_idx, (endc[dir]-c+1)/2, dir, geometry);
              }
            }
          }
        }
      }
    }
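    /* The communication step elided here follows the usual pattern of this
       file's comms API (a sketch; the exact buffer/direction pairing is
       illustrative):

         MsgHandle *recv_back = comm_declare_receive_relative(ghost_sitelink_back[dir], dir, -1, len[dir]);
         MsgHandle *recv_fwd  = comm_declare_receive_relative(ghost_sitelink_fwd[dir],  dir, +1, len[dir]);
         MsgHandle *send_fwd  = comm_declare_send_relative(ghost_sitelink_fwd_sendbuf[dir],  dir, +1, len[dir]);
         MsgHandle *send_back = comm_declare_send_relative(ghost_sitelink_back_sendbuf[dir], dir, -1, len[dir]);
         comm_start(recv_back); comm_start(recv_fwd);
         comm_start(send_fwd);  comm_start(send_back);
         comm_wait(send_fwd);   comm_wait(send_back);
         comm_wait(recv_back);  comm_wait(recv_fwd);
         comm_free(send_fwd);   comm_free(send_back);
         comm_free(recv_back);  comm_free(recv_fwd);
    */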
    // unpack the received back ghost slab into 0 <= d < R[dir]
    for(d=0; d < R[dir]; d++) {
      for(a=starta[dir]; a < enda[dir]; a++) {
        for(b=startb[dir]; b < endb[dir]; b++) {
          if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
            for (c=startc[dir]; c < endc[dir]; c++){
              int oddness = (a+b+c+d)%2;
              int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
              int src_idx;
              if(commDimPartitioned(dir)){
                src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
              }else{
                // not partitioned: wrap around within the local volume
                src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
              }
              MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, 1, dir, geometry);
            }
          }else{
            for(int loop=0; loop < 2; loop++){
              int c=startc[dir]+loop;
              if(c < endc[dir]){
                int oddness = (a+b+c+d)%2;
                int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + d*f_bound[dir][3])>> 1;
                }else{
                  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d+X[dir])*f_main[dir][3])>> 1;
                }
                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_back, src_idx, (endc[dir]-c+1)/2, dir, geometry);
              }
            }
          }
        }
      }
    }
    // T direction: the face is contiguous in memory, copy both parities in one piece each
    MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_back, 0, X[3], dir, geometry)
    // unpack the received forward ghost slab into X[dir]+R[dir] <= d < X[dir]+2*R[dir]
    for(d=X[dir]+R[dir]; d < X[dir]+2*R[dir]; d++) {
      for(a=starta[dir]; a < enda[dir]; a++) {
        for(b=startb[dir]; b < endb[dir]; b++) {
          if(f_main[dir][2] != 1 || f_bound[dir][2] != 1){
            for (c=startc[dir]; c < endc[dir]; c++){
              int oddness = (a+b+c+d)%2;
              int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
              int src_idx;
              if(commDimPartitioned(dir)){
                src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
              }else{
                src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
              }
              MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, 1, dir, geometry);
            }
          }else{
            for(int loop=0; loop < 2; loop++){
              int c=startc[dir]+loop;
              if(c < endc[dir]){
                int oddness = (a+b+c+d)%2;
                int dst_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + d*f_main[dir][3])>> 1;
                int src_idx;
                if(commDimPartitioned(dir)){
                  src_idx = ( a*f_bound[dir][0] + b*f_bound[dir][1] + c*f_bound[dir][2] + (d-X[dir]-R[dir])*f_bound[dir][3])>> 1;
                }else{
                  src_idx = ( a*f_main[dir][0] + b*f_main[dir][1] + c*f_main[dir][2] + (d-X[dir])*f_main[dir][3])>> 1;
                }
                MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID(sitelink, dst_idx, ghost_sitelink_fwd, src_idx, (endc[dir]-c+1)/2, dir, geometry);
              }
            }
          }
        }
      }
    }
    // T direction: copy the forward face into the top slab in one piece per parity
    MEMCOPY_GAUGE_FIELDS_BUF_TO_GRID_T(sitelink, ghost_sitelink_fwd, (X[3]+R[3]), 2, dir, geometry)
  for (int dir = 0; dir < 4; dir++) {
    if (commDimPartitioned(dir)) {
      host_free(ghost_sitelink_fwd_sendbuf[dir]);
      host_free(ghost_sitelink_back_sendbuf[dir]);
      host_free(ghost_sitelink_fwd[dir]);
      host_free(ghost_sitelink_back[dir]);
    }
  }
}
template<typename Float>
void do_exchange_cpu_staple(Float* staple, Float** ghost_staple,
                            Float** staple_fwd_sendbuf, Float** staple_back_sendbuf, int* X)
{
  int len = Vsh_t*gaugeSiteSize*sizeof(Float);

  Float* even_staple_back_src = staple;
  Float* odd_staple_back_src = staple + volumeCB*gaugeSiteSize;
  Float* staple_back_dst = staple_back_sendbuf[3];

  if (dims[3] % 2 == 0) {
    memcpy(staple_back_dst, even_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, odd_staple_back_src, len);
  } else {
    memcpy(staple_back_dst, odd_staple_back_src, len);
    memcpy(staple_back_dst + Vsh_t*gaugeSiteSize, even_staple_back_src, len);
  }

  Float* even_staple_fwd_src = staple + (volumeCB - Vsh_t)*gaugeSiteSize;
  Float* odd_staple_fwd_src = staple + volumeCB*gaugeSiteSize + (volumeCB - Vsh_t)*gaugeSiteSize;
  Float* staple_fwd_dst = staple_fwd_sendbuf[3];
  if (dims[3] % 2 == 0) {
    memcpy(staple_fwd_dst, even_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, odd_staple_fwd_src, len);
  } else {
    memcpy(staple_fwd_dst, odd_staple_fwd_src, len);
    memcpy(staple_fwd_dst + Vsh_t*gaugeSiteSize, even_staple_fwd_src, len);
  }
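  /* The staple packing above mirrors the sitelink T-direction logic: the
     same even/odd swap applies when the T extent is odd, but a staple is a
     single field rather than four per-direction link arrays, so no
     per-direction offset into the send buffer is needed. */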
  // exchange the X, Y and Z staple faces with the neighbors
  for (int dir = 0; dir < 4; dir++) {
    // ...
    Float *ghost_staple_back = ghost_staple[dir];
    // ...
  }
}
void exchange_cpu_staple(int *X, void *staple, void **ghost_staple, QudaPrecision gPrecision)
{
  void *staple_fwd_sendbuf[4];
  void *staple_back_sendbuf[4];

  for (int i = 0; i < 4; i++) {
    staple_fwd_sendbuf[i] = safe_malloc(Vs[i]*gaugeSiteSize*gPrecision);
    staple_back_sendbuf[i] = safe_malloc(Vs[i]*gaugeSiteSize*gPrecision);
  }
  if (gPrecision == QUDA_DOUBLE_PRECISION) {
    do_exchange_cpu_staple((double*)staple, (double**)ghost_staple,
                           (double**)staple_fwd_sendbuf, (double**)staple_back_sendbuf, X);
  } else { // single precision
    do_exchange_cpu_staple((float*)staple, (float**)ghost_staple,
                           (float**)staple_fwd_sendbuf, (float**)staple_back_sendbuf, X);
  }
  for (int i = 0; i < 4; i++) {
    host_free(staple_fwd_sendbuf[i]);
    host_free(staple_back_sendbuf[i]);
  }
}
void exchange_gpu_staple_start(int *X, void *_cudaStaple, int dir, int whichway, cudaStream_t *stream)
{
  cudaGaugeField *cudaStaple = (cudaGaugeField*)_cudaStaple;

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volumeCB = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  packGhostStaple(X, even, odd, volumeCB, prec, stride,
                  dir, whichway, fwd_nbr_staple_gpu, back_nbr_staple_gpu,
                  fwd_nbr_staple_sendbuf, back_nbr_staple_sendbuf, stream);
}
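/* The GPU staple exchange is a three-phase pipeline:
     1. exchange_gpu_staple_start(): pack the face on the device
        (packGhostStaple) and start the device-to-host copy into the send
        buffers on the given stream;
     2. exchange_gpu_staple_comms(): synchronize the stream, then post the
        sends and receives;
     3. exchange_gpu_staple_wait(): complete the receives, copy the ghosts
        back toward the device, and unpack them (unpackGhostStaple). */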
void exchange_gpu_staple_comms(int *X, void *_cudaStaple, int dim, int whichway, cudaStream_t *stream)
{
  // make sure the packing kernel and device-to-host copies issued by _start are done
  cudaStreamSynchronize(*stream);

  int len = Vs[dim]*gaugeSiteSize*prec;

  // no direct GPU access from the comms layer: stage through plain host memory
  memcpy(fwd_nbr_staple_sendbuf_cpu[dim], fwd_nbr_staple_sendbuf[dim], len);
  // ...
  memcpy(back_nbr_staple_sendbuf_cpu[dim], back_nbr_staple_sendbuf[dim], len);
  // ... post the sends and receives (comm_start) ...
}
void exchange_gpu_staple_wait(int *X, void *_cudaStaple, int dim, int whichway, cudaStream_t *stream)
{
  cudaGaugeField *cudaStaple = (cudaGaugeField*)_cudaStaple;

  void* even = cudaStaple->Even_p();
  void* odd = cudaStaple->Odd_p();
  int volumeCB = cudaStaple->VolumeCB();
  QudaPrecision prec = cudaStaple->Precision();
  int stride = cudaStaple->Stride();

  int len = Vs[dim]*gaugeSiteSize*prec;
  // backwards ghost: stage the received face back from host memory, then unpack
  memcpy(back_nbr_staple[dim], back_nbr_staple_cpu[dim], len);
  // ...
  unpackGhostStaple(X, even, odd, volumeCB, prec, stride,
                    dim, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
  // ...
  memcpy(fwd_nbr_staple[dim], fwd_nbr_staple_cpu[dim], len);
  // ...
  unpackGhostStaple(X, even, odd, volumeCB, prec, stride,
                    dim, QUDA_FORWARDS, fwd_nbr_staple, back_nbr_staple, stream);
}
void exchange_llfat_cleanup(void)
{
  for (int i = 0; i < 4; i++) {

    if (fwd_nbr_staple_gpu[i]) {
      device_free(fwd_nbr_staple_gpu[i]); fwd_nbr_staple_gpu[i] = NULL;
    }
    if (back_nbr_staple_gpu[i]) {
      device_free(back_nbr_staple_gpu[i]); back_nbr_staple_gpu[i] = NULL;
    }
    if (fwd_nbr_staple_cpu[i]) {
      host_free(fwd_nbr_staple_cpu[i]); fwd_nbr_staple_cpu[i] = NULL;
    }
    if (back_nbr_staple_cpu[i]) {
      host_free(back_nbr_staple_cpu[i]); back_nbr_staple_cpu[i] = NULL;
    }
    if (fwd_nbr_staple_sendbuf_cpu[i]) {
      host_free(fwd_nbr_staple_sendbuf_cpu[i]); fwd_nbr_staple_sendbuf_cpu[i] = NULL;
    }
    if (back_nbr_staple_sendbuf_cpu[i]) {
      host_free(back_nbr_staple_sendbuf_cpu[i]); back_nbr_staple_sendbuf_cpu[i] = NULL;
    }
    if (fwd_nbr_staple[i]) {
      host_free(fwd_nbr_staple[i]); fwd_nbr_staple[i] = NULL;
    }
    if (back_nbr_staple[i]) {
      host_free(back_nbr_staple[i]); back_nbr_staple[i] = NULL;
    }
    if (fwd_nbr_staple_sendbuf[i]) {
      host_free(fwd_nbr_staple_sendbuf[i]); fwd_nbr_staple_sendbuf[i] = NULL;
    }
    if (back_nbr_staple_sendbuf[i]) {
      host_free(back_nbr_staple_sendbuf[i]); back_nbr_staple_sendbuf[i] = NULL;
    }
  }
}
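/* exchange_llfat_cleanup() is intended to run once at shutdown, after the
   last fat-link computation, releasing the device buffers (device_free)
   and the host buffers (host_free) allocated by exchange_llfat_init(). */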