#define MAX(a,b) ((a)>(b)?(a):(b))
#define ALIGNMENT 4096
#if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_CLOVER_DIRAC)
template <typename Float>
void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack, Float **cpuGhostFwd,
                         int nFace, int *X)
{
  int XYZ = X[0]*X[1]*X[2];
  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
  // ...
  // A, B, C hold the extents of the three directions orthogonal to each face
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
  // ...
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
  // ...
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
  // ...
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
  // ...
  for (int ite = 0; ite < 2; ite++) {   // ite = 0: back ghost faces, ite = 1: forward ghost faces
    // ...
    for (int dir = 0; dir < 4; dir++) {
      // ...
      for (int linkdir = 0; linkdir < 1; linkdir++) {
        Float *even_src = cpuStaple;
        // ...
        if ((X[dir] % 2 == 0) || (commDim(dir) == 1)) {
          // ...
          int even_dst_index = 0;
          int odd_dst_index = 0;
          // ...
          startd = X[dir] - nFace;
          // ...
          for (d = startd; d < endd; d++) {
            for (a = 0; a < A[dir]; a++) {
              for (b = 0; b < B[dir]; b++) {
                for (c = 0; c < C[dir]; c++) {
                  // checkerboard index and parity of site (a,b,c,d)
                  int index = (a*f[dir][0] + b*f[dir][1] + c*f[dir][2] + d*f[dir][3]) >> 1;
                  int oddness = (a+b+c+d) % 2;
                  // ...
                  for (int i = 0; i < 18; i++) {
                    even_dst[18*even_dst_index+i] = even_src[18*index + i];
                  // ...
                  for (int i = 0; i < 18; i++) {
                    odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
  // ...
  assert(even_dst_index == nFace*faceVolumeCB[dir]);
  assert(odd_dst_index == nFace*faceVolumeCB[dir]);
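// The packing loops above rely on the standard even/odd (checkerboard) site layout:
// the full lexicographic index of a site (a,b,c,d) splits into a parity bit and a
// half-volume index.  A minimal, self-contained sketch of that mapping follows; the
// helper below is illustrative only and not part of this file.
struct CheckerboardIndex {
  int cb;      // half-volume ("volumeCB") index, i.e. fullIndex >> 1
  int parity;  // 0 = even sublattice, 1 = odd sublattice
};

inline CheckerboardIndex siteToCheckerboard(int a, int b, int c, int d, const int f[4])
{
  int full = a*f[0] + b*f[1] + c*f[2] + d*f[3];   // lexicographic site index
  return { full >> 1, (a + b + c + d) % 2 };
}
// Example: on a 4^4 lattice with strides f = {1, 4, 16, 64}, site (1,2,0,3) has full
// index 201, checkerboard index 100 and parity 0; the loops above then copy 18 reals
// (a 3x3 complex matrix) from even_src/odd_src starting at offset 18*cb.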
// in pack_ghost_all_staples_cpu: dispatch on precision
packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**)cpuGhostStapleFwd, nFace, X);
// ...
packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
// in pack_gauge_diag(void *buf, int *X, void **sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec):
int mul_factor[4] = {
  1, X[0], X[1]*X[0], X[2]*X[1]*X[0],
};
// ...
int even_dst_idx = 0;
// ...
char *dst_even = (char*)buf;
char *dst_odd  = dst_even + (X[dir1]*X[dir2]/2)*gaugeSiteSize*prec;
char *src_even = (char*)sitelink[nu];
char *src_odd  = src_even + (X[0]*X[1]*X[2]*X[3]/2)*gaugeSiteSize*prec;
// ...
if ((X[nu] + X[mu]) % 2 == 1) {
  // ...
  for (int i = 0; i < X[dir2]; i++) {
    for (int j = 0; j < X[dir1]; j++) {
      int src_idx = ((X[nu]-1)*mul_factor[nu] + 0*mul_factor[mu] + i*mul_factor[dir2] + j*mul_factor[dir1]) >> 1;
      // ...
      int oddness = ((X[nu]-1) + 0 + i + j) % 2;
      // ...
      memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
      // ...
      memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
// ...
if ((even_dst_idx != X[dir1]*X[dir2]/2) || (odd_dst_idx != X[dir1]*X[dir2]/2)) {
  errorQuda("even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
            even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
void packGhostStaple(int *X, void *even, void *odd, int volume, QudaPrecision prec, int stride,
                     int dir, int whichway,
                     void **fwd_nbr_buf_gpu, void **back_nbr_buf_gpu,
                     void **fwd_nbr_buf, void **back_nbr_buf,
                     cudaStream_t *stream)
{
  // ...
  Vs_x = X[1]*X[2]*X[3];
  Vs_y = X[0]*X[2]*X[3];
  Vs_z = X[0]*X[1]*X[3];
  Vs_t = X[0]*X[1]*X[2];
  // ...
  // back face: collect on the device, then copy to the host buffer
  gpu_buf = back_nbr_buf_gpu[i];
  collectGhostStaple(X, even, odd, volumeCB, stride, prec, gpu_buf, i, whichway, stream);
  cudaMemcpyAsync(back_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
  // ...
  // forward face
  gpu_buf = fwd_nbr_buf_gpu[i];
  collectGhostStaple(X, even, odd, volumeCB, stride, prec, gpu_buf, i, whichway, stream);
  cudaMemcpyAsync(fwd_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
  // ...
  // T direction: copy the face plane by plane (9 FloatN components per parity)
  int Vsh = X[0]*X[1]*X[2]/2;
  int sizeOfFloatN = 2*prec;
  int len = Vsh*sizeOfFloatN;
  // ...
  for (int i = 0; i < 9; i++) {   // back T face, copied from the even field
    void *dst = ((char*)back_nbr_buf[3]) + i*len;
    void *src = ((char*)even) + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {   // back T face, copied from the odd field
    void *dst = ((char*)back_nbr_buf[3]) + 9*len + i*len;
    void *src = ((char*)odd) + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {   // forward T face, copied from the even field
    void *dst = ((char*)fwd_nbr_buf[3]) + i*len;
    void *src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {   // forward T face, copied from the odd field
    void *dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len;
    void *src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ... (variant with the even/odd blocks swapped)
  for (int i = 0; i < 9; i++) {
    void *dst = ((char*)back_nbr_buf[3]) + i*len;
    void *src = ((char*)odd) + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {
    void *dst = ((char*)back_nbr_buf[3]) + 9*len + i*len;
    void *src = ((char*)even) + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {
    void *dst = ((char*)fwd_nbr_buf[3]) + i*len;
    void *src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {
    void *dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len;
    void *src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
    cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
  }
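// For the T direction the face is staged plane by plane: a staple carries 18 reals per
// site, stored on the device as 9 "FloatN" (two-real) components separated by `stride`
// sites, so each parity contributes 9 strided copies of len = Vsh*2*prec bytes, with one
// parity's 9 planes followed by the other's in the host buffer.  The helpers below are
// an illustrative sketch of that offset arithmetic, not part of this file.
inline int faceDstOffset(int plane, bool secondParityBlock, int Vsh, int prec)
{
  int sizeOfFloatN = 2*prec;                          // bytes per FloatN element
  int len = Vsh*sizeOfFloatN;                         // bytes per plane of the face
  return (secondParityBlock ? 9*len : 0) + plane*len; // second parity block starts at 9*len
}

inline int faceSrcOffset(int plane, bool forwardFace, int Vh, int Vsh, int stride, int prec)
{
  int sizeOfFloatN = 2*prec;
  int base = forwardFace ? (Vh - Vsh)*sizeOfFloatN : 0;  // forward face reads the last Vsh sites
  return base + plane*stride*sizeOfFloatN;
}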
void unpackGhostStaple(int *X, void *_even, void *_odd, int volume, QudaPrecision prec, int stride,
                       int dir, int whichway, void **fwd_nbr_buf, void **back_nbr_buf,
                       cudaStream_t *stream)
{
  // ...
  Vsh_x = X[1]*X[2]*X[3]/2;
  Vsh_y = X[0]*X[2]*X[3]/2;
  Vsh_z = X[0]*X[1]*X[3]/2;
  Vsh_t = X[0]*X[1]*X[2]/2;
  // ...
  int sizeOfFloatN = 2*prec;
  // ...
  // cumulative ghost-zone offsets per direction
  int tmpint[4] = {
    0,
    Vsh_x,
    Vsh_x + Vsh_y,
    Vsh_x + Vsh_y + Vsh_z,
  };
  // ...
  char *even = ((char*)_even) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;
  char *odd  = ((char*)_odd)  + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;
  // ...
  for (int i = 0; i < 9; i++) {   // back ghost, even field
    void *dst = even + i*stride*sizeOfFloatN;
    void *src = ((char*)back_nbr_buf[dir]) + i*len[dir];
    cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {   // back ghost, odd field
    void *dst = odd + i*stride*sizeOfFloatN;
    void *src = ((char*)back_nbr_buf[dir]) + 9*len[dir] + i*len[dir];
    cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {   // forward ghost, even field
    void *dst = even + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
    void *src = ((char*)fwd_nbr_buf[dir]) + i*len[dir];
    cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
  }
  // ...
  for (int i = 0; i < 9; i++) {   // forward ghost, odd field
    void *dst = odd + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
    void *src = ((char*)fwd_nbr_buf[dir]) + 9*len[dir] + i*len[dir];
    cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
  }
template <typename Float>
void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack, Float **cpuGhostFwd,
                       int dir, int nFace, int *X)
{
  int XYZ = X[0]*X[1]*X[2];
  // ...
  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
  int faceVolumeCB[4] = {
    // ...
  };
  // ...
  int A[4], B[4], C[4];
  // A, B, C hold the extents of the three directions orthogonal to each face
  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
  // ...
  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
  // ...
  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
  // ...
  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
  // ...
  for (int ite = 0; ite < 2; ite++) {   // ite = 0: back ghost faces, ite = 1: forward ghost faces
    // ...
    for (int linkdir = 0; linkdir < 4; linkdir++) {
      Float *even_src = cpuLink[linkdir];
      // ...
      if ((X[dir] % 2 == 0) || (commDim(dir) == 1)) {
        even_dst = dst[dir] + 2*linkdir*nFace*faceVolumeCB[dir]*gaugeSiteSize;
        // ...
      } else {
        // even/odd destinations swapped when X[dir] is odd and the dimension is partitioned
        odd_dst = dst[dir] + 2*linkdir*nFace*faceVolumeCB[dir]*gaugeSiteSize;
        // ...
      }
      // ...
      int even_dst_index = 0;
      int odd_dst_index = 0;
      // ...
      startd = X[dir] - nFace;
      // ...
      for (d = startd; d < endd; d++) {
        for (a = 0; a < A[dir]; a++) {
          for (b = 0; b < B[dir]; b++) {
            for (c = 0; c < C[dir]; c++) {
              int index = (a*f[dir][0] + b*f[dir][1] + c*f[dir][2] + d*f[dir][3]) >> 1;
              int oddness = (a+b+c+d) % 2;
              // ...
              for (int i = 0; i < 18; i++) {
                even_dst[18*even_dst_index+i] = even_src[18*index + i];
              // ...
              for (int i = 0; i < 18; i++) {
                odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
  // ...
  assert(even_dst_index == nFace*faceVolumeCB[dir]);
  assert(odd_dst_index == nFace*faceVolumeCB[dir]);
// in pack_ghost_all_links: dispatch on precision
packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**)cpuGhostFwd, dir, nFace, X);
// ...
packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
// ...
do_loadLinkToGPU(int *X, void *even, void *odd, void **cpuGauge, void **ghost_cpuGauge,
                 void **ghost_cpuGauge_diag, /* ... further parameters ... */)
{
  // ...
  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);
  // ...
  int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;
  // ...
  char *tmp_odd = tmp_even;
  // ...
  // per-direction (QDP-ordered) copy of the even-parity body
  cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
  // ...
  cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
  // ...
  errorQuda("Multi-GPU for MILC gauge order is not supported");
  // ...
  // contiguous (MILC-ordered) copy: all four directions at once
  cudaMemcpyAsync(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice, streams[0]);
  // ...
  cudaMemcpy(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
  // ...
  errorQuda("Unsupported gauge order\n");
  // ...
  char* dest = tmp_even + i*(len+glen_sum) + len;
  for (int dir = 0; dir < 4; dir++) {
    // ...
    cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir]) + i*2*glen[dir], glen[dir],
                    cudaMemcpyHostToDevice, streams[0]);
    cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + i*2*glen[dir],
                    glen[dir], cudaMemcpyHostToDevice, streams[0]);
    // ...
    cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir]) + i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
    cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + i*2*glen[dir],
               glen[dir], cudaMemcpyHostToDevice);
  // ...
  // diagonal (nu-mu corner) ghost zones; dir1, dir2 are the two directions orthogonal to nu and mu
  for (int mu = 0; mu < 4; mu++) {
    // ...
    for (dir1 = 0; dir1 < 4; dir1++) {
      if (dir1 != nu && dir1 != mu) {
        // ...
    for (dir2 = 0; dir2 < 4; dir2++) {
      if (dir2 != nu && dir2 != mu && dir2 != dir1) {
        // ...
    cudaMemcpyAsync(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
                    X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
    // ...
    cudaMemcpy(dest + mu*Vh_2d_max*gaugeSiteSize*prec, ghost_cpuGauge_diag[nu*4+mu],
               X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
  // ...
  cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len,
                  cudaMemcpyHostToDevice, streams[0]);
  // ...
  cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len,
             cudaMemcpyHostToDevice);
  // ...
  cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh*gaugeSiteSize*prec, 4*len,
                  cudaMemcpyHostToDevice, streams[0]);
  // ...
  cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
  // ...
  char* dest = tmp_odd + i*(len+glen_sum) + len;
  for (int dir = 0; dir < 4; dir++) {
    // ...
    cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir]) + glen[dir] + i*2*glen[dir], glen[dir],
                    cudaMemcpyHostToDevice, streams[0]);
    cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + glen[dir] + i*2*glen[dir],
                    glen[dir], cudaMemcpyHostToDevice, streams[0]);
    // ...
    cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir]) + glen[dir] + i*2*glen[dir], glen[dir],
               cudaMemcpyHostToDevice);
    cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir]) + 8*glen[dir] + glen[dir] + i*2*glen[dir],
               glen[dir], cudaMemcpyHostToDevice);
  // ...
  for (int mu = 0; mu < 4; mu++) {
    // ...
    for (dir1 = 0; dir1 < 4; dir1++) {
      if (dir1 != nu && dir1 != mu) {
        // ...
    for (dir2 = 0; dir2 < 4; dir2++) {
      if (dir2 != nu && dir2 != mu && dir2 != dir1) {
        // ...
    cudaMemcpyAsync(dest + mu*Vh_2d_max*gaugeSiteSize*prec,
                    ((char*)ghost_cpuGauge_diag[nu*4+mu]) + X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
                    X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
  // ...
  cudaStreamSynchronize(streams[0]);
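// Staging layout implied by the copies above: inside tmp_even (and tmp_odd) each
// direction i owns a contiguous region of (len + glen_sum) bytes, body first and the
// ghost zones appended after it; that is why the body copy targets
// tmp_even + i*(len+glen_sum) while the ghost copies start at dest = ... + len.
// len and glen_sum themselves are defined in parts of the function not shown here;
// the helpers below are illustrative only and not part of this file.
inline int bodyOffset(int i, int len, int glen_sum)
{
  return i*(len + glen_sum);                   // byte offset of direction i's body
}

inline int ghostStart(int i, int len, int glen_sum)
{
  return bodyOffset(i, len, glen_sum) + len;   // ghost zones begin right after the body
}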
void loadLinkToGPU(cudaGaugeField *cudaGauge, cpuGaugeField *cpuGauge, QudaGaugeParam *param)
{
  if (cudaGauge->Precision() != cpuGauge->Precision()) {
    errorQuda("Mismatch between CPU precision and CUDA precision");
  }
  // ...
  const int *Z = cudaGauge->X();
  // ...
  int pad = cudaGauge->Pad();
  int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
  int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
  int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
  int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;
  // ...
  static void *ghost_cpuGauge[4];
  static void *ghost_cpuGauge_diag[16];
  // ...
  static int allocated = 0;
  // ...
  for (int i = 0; i < 4; i++) {
    // ...
  for (int nu = 0; nu < 4; nu++) {
    for (int mu = 0; mu < 4; mu++) {
      // ...
      ghost_cpuGauge_diag[nu*4+mu] = NULL;
      // ...
      for (dir1 = 0; dir1 < 4; dir1++) {
        if (dir1 != nu && dir1 != mu) {
          // ...
      for (dir2 = 0; dir2 < 4; dir2++) {
        if (dir2 != nu && dir2 != mu && dir2 != dir1) {
          // ...
      memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
  // ...
  do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                   ghost_cpuGauge, ghost_cpuGauge_diag,
                   cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
                   // ...
                   prec, cpuGauge->Order());
  // ...
  for (int i = 0; i < 4; i++) {
    // ...
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
// ...
do_loadLinkToGPU_ex(const int *X, void *even, void *odd, void **cpuGauge, /* ... further parameters ... */)
{
  // ...
  char *tmp_odd = tmp_even;
  // ...
  for (int i = 0; i < 4; i++) {
    // ...
    cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
    // ...
    cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
  // ...
  cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
  // ...
  cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
  // ...
  for (int i = 0; i < 4; i++) {
    // ...
    cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len,
                    cudaMemcpyHostToDevice);
    // ...
    cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len,
               cudaMemcpyHostToDevice);
  // ...
  cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
  // ...
  cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
void loadLinkToGPU_ex(cudaGaugeField *cudaGauge, cpuGaugeField *cpuGauge)
{
  if (cudaGauge->Precision() != cpuGauge->Precision()) {
    errorQuda("Mismatch between CPU precision and CUDA precision");
  }
  // ...
  const int *E = cudaGauge->X();
  int pad = cudaGauge->Pad();
  do_loadLinkToGPU_ex(E, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
                      cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
                      prec, cpuGauge->Order());
template <typename FloatN, typename Float>
static void do_storeLinkToCPU(Float *cpuGauge, FloatN *even, FloatN *odd, /* ... further parameters ... */)
{
  // ...
  double *unpackedDataEven = (double*) device_malloc(datalen);
  double *unpackedDataOdd = unpackedDataEven;   // the same device buffer is reused for both parities
  // ...
  cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
  // ...
  cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
  // ...
  cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost,
                  streams[0]);
  // ...
  cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
void storeLinkToCPU(cpuGaugeField *cpuGauge, cudaGaugeField *cudaGauge, QudaGaugeParam *param)
{
  // ...
  if (cpu_prec != cuda_prec) {
    errorQuda("Mismatch between CPU precision and CUDA precision");
  }
  // ...
  errorQuda("Reconstruct type not supported");
  // ...
  int stride = cudaGauge->VolumeCB() + cudaGauge->Pad();
  // ...
  do_storeLinkToCPU((double*)cpuGauge->Gauge_p(), (double2*)cudaGauge->Even_p(), (double2*)cudaGauge->Odd_p(),
                    cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
  // ...
  do_storeLinkToCPU((float*)cpuGauge->Gauge_p(), (float2*)cudaGauge->Even_p(), (float2*)cudaGauge->Odd_p(),
                    cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
  // ...
  errorQuda("Half precision not supported");