14 #define MAX(a,b) ((a)>(b)?(a):(b))
15 #define ALIGNMENT 4096
19 #if defined(GPU_FATLINK)||defined(GPU_GAUGE_FORCE)|| defined(GPU_FERMION_FORCE) ||defined(GPU_HISQ_FORCE)
23 template <
typename Float>
24 void packGhostAllStaples(
Float *cpuStaple,
Float **cpuGhostBack,
Float**cpuGhostFwd,
int nFace,
int*
X) {
26 int XYZ=X[0]*X[1]*X[2];
27 int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
41 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
44 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
47 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
50 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
62 for(
int ite = 0; ite < 2; ite++){
78 for(
int linkdir=0; linkdir < 1; linkdir ++){
79 Float* even_src = cpuStaple;
95 int even_dst_index = 0;
96 int odd_dst_index = 0;
104 startd = X[
dir] - nFace;
107 for(d = startd; d < endd; d++){
108 for(a = 0; a < A[
dir]; a++){
109 for(b = 0; b < B[
dir]; b++){
110 for(c = 0; c < C[
dir]; c++){
112 int oddness = (a+b+c+d)%2;
114 for(
int i=0;i < 18;i++){
115 even_dst[18*even_dst_index+i] = even_src[18*index + i];
119 for(
int i=0;i < 18;i++){
120 odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
128 assert( even_dst_index == nFace*faceVolumeCB[
dir]);
129 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
141 packGhostAllStaples((
double*)staple, (
double**)cpuGhostStapleBack, (
double**) cpuGhostStapleFwd, nFace, X);
143 packGhostAllStaples((
float*)staple, (
float**)cpuGhostStapleBack, (
float**)cpuGhostStapleFwd, nFace, X);
164 1, X[0], X[1]*X[0], X[2]*X[1]*X[0],
167 int even_dst_idx = 0;
169 char* dst_even =(
char*)buf;
170 char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*
gaugeSiteSize*prec;
171 char* src_even = (
char*)sitelink[nu];
172 char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*
gaugeSiteSize*prec;
174 if( (X[nu]+X[mu]) % 2 == 1){
182 for(
int i=0;i < X[dir2]; i++){
183 for(
int j=0; j < X[dir1]; j++){
184 int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
186 int oddness = ( (X[nu]-1) + 0 + i + j) %2;
190 memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
195 memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
203 if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
204 errorQuda(
"even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
205 even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
215 int dir,
int whichway,
216 void** fwd_nbr_buf_gpu,
void** back_nbr_buf_gpu,
217 void** fwd_nbr_buf,
void** back_nbr_buf,
222 Vs_x = X[1]*X[2]*X[3];
223 Vs_y = X[0]*X[2]*X[3];
224 Vs_z = X[0]*X[1]*X[3];
225 Vs_t = X[0]*X[1]*X[2];
233 gpu_buf = back_nbr_buf_gpu[i];
235 cudaMemcpyAsync(back_nbr_buf[i], gpu_buf, Vs[i]*
gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
237 gpu_buf = fwd_nbr_buf_gpu[i];
239 cudaMemcpyAsync(fwd_nbr_buf[i], gpu_buf, Vs[i]*
gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
243 int Vsh = X[0]*X[1]*X[2]/2;
244 int sizeOfFloatN = 2*
prec;
245 int len = Vsh*sizeOfFloatN;
250 void* dst = ((
char*)back_nbr_buf[3]) + i*len ;
251 void* src = ((
char*)even) + i*stride*sizeOfFloatN;
252 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
256 void* dst = ((
char*)back_nbr_buf[3]) + 9*len + i*len ;
257 void* src = ((
char*)odd) + i*stride*sizeOfFloatN;
258 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
262 void* dst = ((
char*)fwd_nbr_buf[3]) + i*len ;
263 void* src = ((
char*)even) + (Vh-
Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
264 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
268 void* dst = ((
char*)fwd_nbr_buf[3]) + 9*len + i*len ;
269 void* src = ((
char*)odd) + (Vh-
Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
270 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
276 void* dst = ((
char*)back_nbr_buf[3]) + i*len ;
277 void* src = ((
char*)odd) + i*stride*sizeOfFloatN;
278 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
282 void* dst = ((
char*)back_nbr_buf[3]) + 9*len + i*len ;
283 void* src = ((
char*)even) + i*stride*sizeOfFloatN;
284 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
288 void* dst = ((
char*)fwd_nbr_buf[3]) + i*len ;
289 void* src = ((
char*)odd) + (Vh-
Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
290 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
294 void* dst = ((
char*)fwd_nbr_buf[3]) + 9*len + i*len ;
295 void* src = ((
char*)even) + (Vh-
Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
296 cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
308 int dir,
int whichway,
void** fwd_nbr_buf,
void** back_nbr_buf,
309 cudaStream_t* stream)
314 Vsh_x = X[1]*X[2]*X[3]/2;
315 Vsh_y = X[0]*X[2]*X[3]/2;
316 Vsh_z = X[0]*X[1]*X[3]/2;
317 Vsh_t = X[0]*X[1]*X[2]/2;
321 int sizeOfFloatN = 2*
prec;
333 Vsh_x + Vsh_y +
Vsh_z,
336 char* even = ((
char*)_even) + Vh*sizeOfFloatN + 2*tmpint[
dir]*sizeOfFloatN;
337 char* odd = ((
char*)_odd) + Vh*sizeOfFloatN +2*tmpint[
dir]*sizeOfFloatN;
341 for(
int i=0;i < 9; i++){
342 void* dst = even + i*stride*sizeOfFloatN;
343 void* src = ((
char*)back_nbr_buf[dir]) + i*len[
dir] ;
344 cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
347 for(
int i=0;i < 9; i++){
348 void* dst = odd + i*stride*sizeOfFloatN;
349 void* src = ((
char*)back_nbr_buf[dir]) + 9*len[
dir] + i*len[
dir] ;
350 cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
354 for(
int i=0;i < 9; i++){
355 void* dst = even + Vsh[
dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
356 void* src = ((
char*)fwd_nbr_buf[dir]) + i*len[
dir] ;
357 cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
360 for(
int i=0;i < 9; i++){
361 void* dst = odd + Vsh[
dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
362 void* src = ((
char*)fwd_nbr_buf[dir]) + 9*len[
dir] + i*len[
dir] ;
363 cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
377 template <
typename Float>
378 void packGhostAllLinks(
Float **cpuLink,
Float **cpuGhostBack,
Float**cpuGhostFwd,
int dir,
int nFace,
int* X) {
380 int XYZ=X[0]*X[1]*X[2];
382 int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
383 int faceVolumeCB[4]={
393 int A[4], B[4], C[4];
396 A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
399 A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
402 A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
405 A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
417 for(
int ite = 0; ite < 2; ite++){
433 for(
int linkdir=0; linkdir < 4; linkdir ++){
434 Float* even_src = cpuLink[linkdir];
442 if((X[dir] % 2 ==0) || (
commDim(dir) == 1)){
450 int even_dst_index = 0;
451 int odd_dst_index = 0;
459 startd = X[
dir] - nFace;
462 for(d = startd; d < endd; d++){
463 for(a = 0; a < A[
dir]; a++){
464 for(b = 0; b < B[
dir]; b++){
465 for(c = 0; c < C[
dir]; c++){
467 int oddness = (a+b+c+d)%2;
469 for(
int i=0;i < 18;i++){
470 even_dst[18*even_dst_index+i] = even_src[18*index + i];
474 for(
int i=0;i < 18;i++){
475 odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
483 assert( even_dst_index == nFace*faceVolumeCB[dir]);
484 assert( odd_dst_index == nFace*faceVolumeCB[dir]);
496 packGhostAllLinks((
double**)cpuLink, (
double**)cpuGhostBack, (
double**) cpuGhostFwd, dir, nFace, X);
498 packGhostAllLinks((
float**)cpuLink, (
float**)cpuGhostBack, (
float**)cpuGhostFwd, dir, nFace, X);
512 do_loadLinkToGPU(
int* X,
void *even,
void*odd,
void **
cpuGauge,
void** ghost_cpuGauge,
513 void** ghost_cpuGauge_diag,
519 Vh_2d_max =
MAX(Vh_2d_max, X[0]*X[3]/2);
520 Vh_2d_max =
MAX(Vh_2d_max, X[1]*X[2]/2);
521 Vh_2d_max =
MAX(Vh_2d_max, X[1]*X[3]/2);
522 Vh_2d_max =
MAX(Vh_2d_max, X[2]*X[3]/2);
534 int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+
Vsh_t)+4*Vh_2d_max;
541 char *tmp_odd = tmp_even;
547 cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice,
streams[0]);
549 cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
555 errorQuda(
"Multi-GPU for MILC gauge order is not supported");
558 cudaMemcpyAsync(tmp_even, ((
char*)cpuGauge), 4*len, cudaMemcpyHostToDevice,
streams[0]);
560 cudaMemcpy(tmp_even, ((
char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
568 char* dest = tmp_even + i*(len+glen_sum)+len;
569 for(
int dir = 0; dir < 4; dir++){
571 cudaMemcpyAsync(dest, ((
char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice,
streams[0]);
572 cudaMemcpyAsync(dest + glen[dir], ((
char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice,
streams[0]);
574 cudaMemcpy(dest, ((
char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
575 cudaMemcpy(dest + glen[dir], ((
char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
582 for(
int mu = 0; mu < 4; mu++){
587 for(dir1=0; dir1 < 4; dir1 ++){
588 if(dir1 != nu && dir1 != mu){
592 for(dir2=0; dir2 < 4; dir2 ++){
593 if(dir2 != nu && dir2 != mu && dir2 != dir1){
598 cudaMemcpyAsync(dest+ mu *Vh_2d_max*
gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu],
601 cudaMemcpy(dest+ mu *Vh_2d_max*
gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu],
602 X[dir1]*X[dir2]/2*
gaugeSiteSize*prec, cudaMemcpyHostToDevice);
616 cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((
char*)cpuGauge[i]) + Vh*
gaugeSiteSize*prec, len, cudaMemcpyHostToDevice,
streams[0]);
618 cudaMemcpy(tmp_odd + i*(len+glen_sum), ((
char*)cpuGauge[i]) + Vh*
gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
623 cudaMemcpyAsync(tmp_odd , ((
char*)cpuGauge)+4*Vh*
gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice,
streams[0]);
625 cudaMemcpy(tmp_odd, (
char*)cpuGauge+4*Vh*
gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
632 char* dest = tmp_odd + i*(len+glen_sum)+len;
633 for(
int dir = 0; dir < 4; dir++){
635 cudaMemcpyAsync(dest, ((
char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice,
streams[0]);
636 cudaMemcpyAsync(dest + glen[dir], ((
char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir],
637 cudaMemcpyHostToDevice,
streams[0]);
639 cudaMemcpy(dest, ((
char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
640 cudaMemcpy(dest + glen[dir], ((
char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
649 for(
int mu = 0; mu < 4; mu++){
654 for(dir1=0; dir1 < 4; dir1 ++){
655 if(dir1 != nu && dir1 != mu){
659 for(dir2=0; dir2 < 4; dir2 ++){
660 if(dir2 != nu && dir2 != mu && dir2 != dir1){
665 cudaMemcpyAsync(dest+ mu *Vh_2d_max*
gaugeSiteSize*prec,((
char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*
gaugeSiteSize*prec,
669 X[dir1]*X[dir2]/2*
gaugeSiteSize*prec, cudaMemcpyHostToDevice );
678 cudaStreamSynchronize(
streams[0]);
687 if (cudaGauge->Precision() != cpuGauge->Precision()){
688 errorQuda(
"Mismatch between CPU precision and CUDA precision");
693 const int*
Z = cudaGauge->X();
695 int pad = cudaGauge->Pad();
696 int Vsh_x = param->
X[1]*param->
X[2]*param->
X[3]/2;
697 int Vsh_y = param->
X[0]*param->
X[2]*param->
X[3]/2;
698 int Vsh_z = param->
X[0]*param->
X[1]*param->
X[3]/2;
699 int Vsh_t = param->
X[0]*param->
X[1]*param->
X[2]/2;
701 static void* ghost_cpuGauge[4];
702 static void* ghost_cpuGauge_diag[16];
705 static int allocated = 0;
710 for(
int i=0;i < 4; i++) {
725 int ghost_diag_len[16];
726 for(
int nu=0;nu < 4;nu++){
727 for(
int mu=0; mu < 4;mu++){
729 ghost_cpuGauge_diag[nu*4+
mu] = NULL;
733 for(dir1= 0; dir1 < 4; dir1++){
734 if(dir1 !=nu && dir1 != mu){
738 for(dir2=0; dir2 < 4; dir2++){
739 if(dir2 != nu && dir2 != mu && dir2 != dir1){
746 ghost_diag_len[nu*4+
mu] = nbytes;
752 memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
765 do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (
void**)cpuGauge->Gauge_p(),
766 ghost_cpuGauge, ghost_cpuGauge_diag,
767 cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
769 prec, cpuGauge->Order());
774 for(
int i=0;i < 4;i++){
777 for(
int i=0;i <4; i++){
778 for(
int j=0;j <4; j++){
779 if (i != j)
host_free(ghost_cpuGauge_diag[i*4+j]);
789 do_loadLinkToGPU_ex(
const int* X,
void *even,
void *odd,
void**cpuGauge,
796 char *tmp_odd = tmp_even;
800 for(
int i=0; i < 4; i++){
802 cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
804 cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
810 cudaMemcpyAsync(tmp_even, (
char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
812 cudaMemcpy(tmp_even, (
char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
820 for(
int i=0; i < 4; i++){
822 cudaMemcpyAsync(tmp_odd + i*len, ((
char*)cpuGauge[i]) + Vh_ex*
gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
824 cudaMemcpy(tmp_odd + i*len, ((
char*)cpuGauge[i]) + Vh_ex*
gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
829 cudaMemcpyAsync(tmp_odd, ((
char*)cpuGauge) + 4*Vh_ex*
gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
831 cudaMemcpy(tmp_odd, ((
char*)cpuGauge) + 4*Vh_ex*
gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
842 if (cudaGauge->Precision() != cpuGauge->Precision()){
843 errorQuda(
"Mismatch between CPU precision and CUDA precision");
846 const int *
E = cudaGauge->X();
847 int pad = cudaGauge->Pad();
848 do_loadLinkToGPU_ex(E, cudaGauge->Even_p(), cudaGauge->Odd_p(), (
void**)cpuGauge->Gauge_p(),
849 cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
850 prec, cpuGauge->Order());
854 template<
typename FloatN,
typename Float>
860 double *unpackedDataEven = (
double *)
device_malloc(datalen);
861 double *unpackedDataOdd = unpackedDataEven;
867 cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost,
streams[0]);
869 cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
875 cudaMemcpyAsync(cpuGauge + 4*Vh*
gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost,
streams[0]);
877 cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
889 if (cpu_prec != cuda_prec){
890 errorQuda(
"Mismatch between CPU precision and CUDA precision");
894 errorQuda(
"Reconstruct type not supported");
897 int stride = cudaGauge->VolumeCB() + cudaGauge->Pad();
900 do_storeLinkToCPU( (
double*)cpuGauge->Gauge_p(), (double2*) cudaGauge->Even_p(), (double2*)cudaGauge->Odd_p(),
903 do_storeLinkToCPU( (
float*)cpuGauge->Gauge_p(), (float2*) cudaGauge->Even_p(), (float2*)cudaGauge->Odd_p(),
906 errorQuda(
"Half precision not supported");