QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fat_force_quda.cpp
Go to the documentation of this file.
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <math.h>
4 #include <string.h>
5 
6 #include <typeinfo>
7 #include <quda.h>
8 #include <quda_internal.h>
9 #include <fat_force_quda.h>
10 #include <face_quda.h>
11 #include <misc_helpers.h>
12 #include <assert.h>
13 
14 #define MAX(a,b) ((a)>(b)?(a):(b))
15 #define ALIGNMENT 4096
16 
17  /********************** Staple code, used by link fattening **************/
18 
19 #if defined(GPU_FATLINK)||defined(GPU_GAUGE_FORCE)|| defined(GPU_FERMION_FORCE) ||defined(GPU_HISQ_FORCE)
20 
21 namespace quda {
22 
23  template <typename Float>
24  void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack,Float**cpuGhostFwd, int nFace, int* X) {
25  int XY=X[0]*X[1];
26  int XYZ=X[0]*X[1]*X[2];
27  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
28  int faceVolumeCB[4]={
29  X[1]*X[2]*X[3]/2,
30  X[0]*X[2]*X[3]/2,
31  X[0]*X[1]*X[3]/2,
32  X[0]*X[1]*X[2]/2
33  };
34 
35  //loop variables: a, b, c with a the most signifcant and c the least significant
36  //A, B, C the maximum value
37  //we need to loop in d as well, d's vlaue dims[dir]-3, dims[dir]-2, dims[dir]-1
38  int A[4], B[4], C[4];
39 
40  //X dimension
41  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
42 
43  //Y dimension
44  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
45 
46  //Z dimension
47  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
48 
49  //T dimension
50  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
51 
52 
53  //multiplication factor to compute index in original cpu memory
54  int f[4][4]={
55  {XYZ, XY, X[0], 1},
56  {XYZ, XY, 1, X[0]},
57  {XYZ, X[0], 1, XY},
58  { XY, X[0], 1, XYZ}
59  };
60 
61 
62  for(int ite = 0; ite < 2; ite++){
63  //ite == 0: back
64  //ite == 1: fwd
65  Float** dst;
66  if (ite == 0){
67  dst = cpuGhostBack;
68  }else{
69  dst = cpuGhostFwd;
70  }
71 
72  //collect back ghost staple
73  for(int dir =0; dir < 4; dir++){
74  int d;
75  int a,b,c;
76 
77  //ther is only one staple in the same location
78  for(int linkdir=0; linkdir < 1; linkdir ++){
79  Float* even_src = cpuStaple;
80  Float* odd_src = cpuStaple + volumeCB*gaugeSiteSize;
81 
82  Float* even_dst;
83  Float* odd_dst;
84 
85  //switching odd and even ghost cpuLink when that dimension size is odd
86  //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
87  if((X[dir] % 2 ==0) || (commDim(dir) == 1)){
88  even_dst = dst[dir];
89  odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
90  }else{
91  odd_dst = dst[dir];
92  even_dst = dst[dir] + nFace*faceVolumeCB[dir]*gaugeSiteSize;
93  }
94 
95  int even_dst_index = 0;
96  int odd_dst_index = 0;
97 
98  int startd;
99  int endd;
100  if(ite == 0){ //back
101  startd = 0;
102  endd= nFace;
103  }else{//fwd
104  startd = X[dir] - nFace;
105  endd =X[dir];
106  }
107  for(d = startd; d < endd; d++){
108  for(a = 0; a < A[dir]; a++){
109  for(b = 0; b < B[dir]; b++){
110  for(c = 0; c < C[dir]; c++){
111  int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
112  int oddness = (a+b+c+d)%2;
113  if (oddness == 0){ //even
114  for(int i=0;i < 18;i++){
115  even_dst[18*even_dst_index+i] = even_src[18*index + i];
116  }
117  even_dst_index++;
118  }else{ //odd
119  for(int i=0;i < 18;i++){
120  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
121  }
122  odd_dst_index++;
123  }
124  }//c
125  }//b
126  }//a
127  }//d
128  assert( even_dst_index == nFace*faceVolumeCB[dir]);
129  assert( odd_dst_index == nFace*faceVolumeCB[dir]);
130  }//linkdir
131 
132  }//dir
133  }//ite
134  }
135 
136 
137  void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void** cpuGhostStapleFwd,
138  int nFace, QudaPrecision precision, int* X) {
139 
140  if (precision == QUDA_DOUBLE_PRECISION) {
141  packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**) cpuGhostStapleFwd, nFace, X);
142  } else {
143  packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
144  }
145 
146  }
147 
148  void pack_gauge_diag(void* buf, int* X, void** sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
149  {
150  /*
151  nu | |
152  |__________|
153  mu
154  *
155  * nu, mu are the directions we are working on
156  * Since we are packing our own data, we need to go to the north-west corner in the diagram
157  * i.e. x[nu] = X[nu]-1, x[mu]=0, and looop throught x[dir1],x[dir2]
158  * in the remaining two directions (dir1/dir2), dir2 is the slowest changing dim when computing
159  * index
160  */
161 
162 
163  int mul_factor[4]={
164  1, X[0], X[1]*X[0], X[2]*X[1]*X[0],
165  };
166 
167  int even_dst_idx = 0;
168  int odd_dst_idx = 0;
169  char* dst_even =(char*)buf;
170  char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*gaugeSiteSize*prec;
171  char* src_even = (char*)sitelink[nu];
172  char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*gaugeSiteSize*prec;
173 
174  if( (X[nu]+X[mu]) % 2 == 1){
175  //oddness will change between me and the diagonal neighbor
176  //switch it now
177  char* tmp = dst_odd;
178  dst_odd = dst_even;
179  dst_even = tmp;
180  }
181 
182  for(int i=0;i < X[dir2]; i++){
183  for(int j=0; j < X[dir1]; j++){
184  int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
185  //int dst_idx = (i*X[dir1]+j) >> 1;
186  int oddness = ( (X[nu]-1) + 0 + i + j) %2;
187 
188  if(oddness==0){
189  for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
190  memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
191  }
192  even_dst_idx++;
193  }else{
194  for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
195  memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
196  }
197  odd_dst_idx++;
198  }//if
199 
200  }//for j
201  }//for i
202 
203  if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
204  errorQuda("even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
205  even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
206  }
207  return ;
208 
209 
210  }
211 
212  void
213  packGhostStaple(int* X, void* even, void* odd, int volume, QudaPrecision prec,
214  int stride,
215  int dir, int whichway,
216  void** fwd_nbr_buf_gpu, void** back_nbr_buf_gpu,
217  void** fwd_nbr_buf, void** back_nbr_buf,
218  cudaStream_t* stream)
219  {
220  int Vs_x, Vs_y, Vs_z, Vs_t;
221 
222  Vs_x = X[1]*X[2]*X[3];
223  Vs_y = X[0]*X[2]*X[3];
224  Vs_z = X[0]*X[1]*X[3];
225  Vs_t = X[0]*X[1]*X[2];
226  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
227 
228  if (dir != 3){ //the code would work for dir=3 as well
229  //even and odd ness switch (if necessary) is taken caren of in collectGhostStaple();
230  void* gpu_buf;
231  int i =dir;
232  if (whichway == QUDA_BACKWARDS){
233  gpu_buf = back_nbr_buf_gpu[i];
234  collectGhostStaple(X, even, odd, volume, prec, gpu_buf, i, whichway, stream);
235  cudaMemcpyAsync(back_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
236  }else{//whichway is QUDA_FORWARDS;
237  gpu_buf = fwd_nbr_buf_gpu[i];
238  collectGhostStaple(X, even, odd, volume, prec, gpu_buf, i, whichway, stream);
239  cudaMemcpyAsync(fwd_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
240  }
241  }else{ //special case for dir=3 since no gather kernel is required
242  int Vh = volume;
243  int Vsh = X[0]*X[1]*X[2]/2;
244  int sizeOfFloatN = 2*prec;
245  int len = Vsh*sizeOfFloatN;
246  int i;
247  if(X[3] %2 == 0){
248  //back,even
249  for(i=0;i < 9; i++){
250  void* dst = ((char*)back_nbr_buf[3]) + i*len ;
251  void* src = ((char*)even) + i*stride*sizeOfFloatN;
252  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
253  }
254  //back, odd
255  for(i=0;i < 9; i++){
256  void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len ;
257  void* src = ((char*)odd) + i*stride*sizeOfFloatN;
258  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
259  }
260  //fwd,even
261  for(i=0;i < 9; i++){
262  void* dst = ((char*)fwd_nbr_buf[3]) + i*len ;
263  void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
264  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
265  }
266  //fwd, odd
267  for(i=0;i < 9; i++){
268  void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len ;
269  void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
270  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
271  }
272  }else{
273  //reverse even and odd position
274  //back,odd
275  for(i=0;i < 9; i++){
276  void* dst = ((char*)back_nbr_buf[3]) + i*len ;
277  void* src = ((char*)odd) + i*stride*sizeOfFloatN;
278  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
279  }
280  //back, even
281  for(i=0;i < 9; i++){
282  void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len ;
283  void* src = ((char*)even) + i*stride*sizeOfFloatN;
284  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
285  }
286  //fwd,odd
287  for(i=0;i < 9; i++){
288  void* dst = ((char*)fwd_nbr_buf[3]) + i*len ;
289  void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
290  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
291  }
292  //fwd, even
293  for(i=0;i < 9; i++){
294  void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len ;
295  void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
296  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
297  }
298 
299  }
300  }
301 
302  }
303 
304 
305  void
306  unpackGhostStaple(int* X, void* _even, void* _odd, int volume, QudaPrecision prec,
307  int stride,
308  int dir, int whichway, void** fwd_nbr_buf, void** back_nbr_buf,
309  cudaStream_t* stream)
310  {
311 
312  int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
313 
314  Vsh_x = X[1]*X[2]*X[3]/2;
315  Vsh_y = X[0]*X[2]*X[3]/2;
316  Vsh_z = X[0]*X[1]*X[3]/2;
317  Vsh_t = X[0]*X[1]*X[2]/2;
318  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};
319 
320  int Vh = volume;
321  int sizeOfFloatN = 2*prec;
322  int len[4] = {
323  Vsh_x*sizeOfFloatN,
324  Vsh_y*sizeOfFloatN,
325  Vsh_z*sizeOfFloatN,
326  Vsh_t*sizeOfFloatN
327  };
328 
329  int tmpint[4] = {
330  0,
331  Vsh_x,
332  Vsh_x + Vsh_y,
333  Vsh_x + Vsh_y + Vsh_z,
334  };
335 
336  char* even = ((char*)_even) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;
337  char* odd = ((char*)_odd) + Vh*sizeOfFloatN +2*tmpint[dir]*sizeOfFloatN;
338 
339  if(whichway == QUDA_BACKWARDS){
340  //back,even
341  for(int i=0;i < 9; i++){
342  void* dst = even + i*stride*sizeOfFloatN;
343  void* src = ((char*)back_nbr_buf[dir]) + i*len[dir] ;
344  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
345  }
346  //back, odd
347  for(int i=0;i < 9; i++){
348  void* dst = odd + i*stride*sizeOfFloatN;
349  void* src = ((char*)back_nbr_buf[dir]) + 9*len[dir] + i*len[dir] ;
350  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
351  }
352  }else { //QUDA_FORWARDS
353  //fwd,even
354  for(int i=0;i < 9; i++){
355  void* dst = even + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
356  void* src = ((char*)fwd_nbr_buf[dir]) + i*len[dir] ;
357  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
358  }
359  //fwd, odd
360  for(int i=0;i < 9; i++){
361  void* dst = odd + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
362  void* src = ((char*)fwd_nbr_buf[dir]) + 9*len[dir] + i*len[dir] ;
363  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
364  }
365  }
366  }
367 
368  /*
369  This is the packing kernel for the multi-dimensional ghost zone in
370  the padded region. This is called by cpuexchangesitelink in
371  FaceBuffer (MPI only), which was called by loadLinkToGPU (defined at
372  the bottom).
373 
374  Not currently included since it will be replaced by Guochun's new
375  routine which uses an enlarged domain instead of a ghost zone.
376  */
377  template <typename Float>
378  void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack,Float**cpuGhostFwd, int dir, int nFace, int* X) {
379  int XY=X[0]*X[1];
380  int XYZ=X[0]*X[1]*X[2];
381 
382  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
383  int faceVolumeCB[4]={
384  X[1]*X[2]*X[3]/2,
385  X[0]*X[2]*X[3]/2,
386  X[0]*X[1]*X[3]/2,
387  X[0]*X[1]*X[2]/2
388  };
389 
390  //loop variables: a, b, c with a the most signifcant and c the least significant
391  //A, B, C the maximum value
392  //we need to loop in d as well, d's vlaue dims[dir]-3, dims[dir]-2, dims[dir]-1
393  int A[4], B[4], C[4];
394 
395  //X dimension
396  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
397 
398  //Y dimension
399  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
400 
401  //Z dimension
402  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
403 
404  //T dimension
405  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
406 
407 
408  //multiplication factor to compute index in original cpu memory
409  int f[4][4]={
410  {XYZ, XY, X[0], 1},
411  {XYZ, XY, 1, X[0]},
412  {XYZ, X[0], 1, XY},
413  { XY, X[0], 1, XYZ}
414  };
415 
416 
417  for(int ite = 0; ite < 2; ite++){
418  //ite == 0: back
419  //ite == 1: fwd
420  Float** dst;
421  if (ite == 0){
422  dst = cpuGhostBack;
423  }else{
424  dst = cpuGhostFwd;
425  }
426 
427  //collect back ghost gauge field
428  //for(int dir =0; dir < 4; dir++){
429  int d;
430  int a,b,c;
431 
432  //we need copy all 4 links in the same location
433  for(int linkdir=0; linkdir < 4; linkdir ++){
434  Float* even_src = cpuLink[linkdir];
435  Float* odd_src = cpuLink[linkdir] + volumeCB*gaugeSiteSize;
436 
437  Float* even_dst;
438  Float* odd_dst;
439 
440  //switching odd and even ghost cpuLink when that dimension size is odd
441  //only switch if X[dir] is odd and the gridsize in that dimension is greater than 1
442  if((X[dir] % 2 ==0) || (commDim(dir) == 1)){
443  even_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
444  odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
445  }else{
446  odd_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
447  even_dst = odd_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
448  }
449 
450  int even_dst_index = 0;
451  int odd_dst_index = 0;
452 
453  int startd;
454  int endd;
455  if(ite == 0){ //back
456  startd = 0;
457  endd= nFace;
458  }else{//fwd
459  startd = X[dir] - nFace;
460  endd =X[dir];
461  }
462  for(d = startd; d < endd; d++){
463  for(a = 0; a < A[dir]; a++){
464  for(b = 0; b < B[dir]; b++){
465  for(c = 0; c < C[dir]; c++){
466  int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
467  int oddness = (a+b+c+d)%2;
468  if (oddness == 0){ //even
469  for(int i=0;i < 18;i++){
470  even_dst[18*even_dst_index+i] = even_src[18*index + i];
471  }
472  even_dst_index++;
473  }else{ //odd
474  for(int i=0;i < 18;i++){
475  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
476  }
477  odd_dst_index++;
478  }
479  }//c
480  }//b
481  }//a
482  }//d
483  assert( even_dst_index == nFace*faceVolumeCB[dir]);
484  assert( odd_dst_index == nFace*faceVolumeCB[dir]);
485  }//linkdir
486 
487  //}//dir
488  }//ite
489  }
490 
491 
492  void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void** cpuGhostFwd,
493  int dir, int nFace, QudaPrecision precision, int *X) {
494 
495  if (precision == QUDA_DOUBLE_PRECISION) {
496  packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**) cpuGhostFwd, dir, nFace, X);
497  } else {
498  packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
499  }
500 
501  }
502 
503  /*
504  Copies the device gauge field to the host.
505  - no reconstruction support
506  - device data is always Float2 ordered
507  - device data is a 1-dimensional array (MILC ordered)
508  - no support for half precision
509  */
510 
511  static void
512  do_loadLinkToGPU(int* X, void *even, void*odd, void **cpuGauge, void** ghost_cpuGauge,
513  void** ghost_cpuGauge_diag,
514  QudaReconstructType reconstruct, int bytes, int Vh, int pad,
515  int Vsh_x, int Vsh_y, int Vsh_z, int Vsh_t,
516  QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
517  {
518  int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
519  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
520  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
521  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
522  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);
523 
524  int i;
525  int len = Vh*gaugeSiteSize*prec;
526 
527 #ifdef MULTI_GPU
528  int glen[4] = {
529  Vsh_x*gaugeSiteSize*prec,
530  Vsh_y*gaugeSiteSize*prec,
531  Vsh_z*gaugeSiteSize*prec,
532  Vsh_t*gaugeSiteSize*prec
533  };
534  int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t)+4*Vh_2d_max;
535 #else
536  int ghostV = 0;
537 #endif
538 
539  int glen_sum = ghostV*gaugeSiteSize*prec;
540  char *tmp_even = (char *) device_malloc(4*(len+glen_sum));
541  char *tmp_odd = tmp_even;
542 
543  //even links
544  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
545  for(i=0;i < 4; i++){
546 #ifdef GPU_DIRECT
547  cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
548 #else
549  cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
550 #endif
551  }
552  } else { //QUDA_MILC_GAUGE_ORDER
553 
554 #ifdef MULTI_GPU
555  errorQuda("Multi-GPU for MILC gauge order is not supported");
556 #endif
557 #ifdef GPU_DIRECT
558  cudaMemcpyAsync(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice, streams[0]);
559 #else
560  cudaMemcpy(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
561 #endif
562  }
563 
564 
565  for(i=0;i < 4;i++){
566 #ifdef MULTI_GPU
567  //dir: the source direction
568  char* dest = tmp_even + i*(len+glen_sum)+len;
569  for(int dir = 0; dir < 4; dir++){
570 #ifdef GPU_DIRECT
571  cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
572  cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
573 #else
574  cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
575  cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
576 #endif
577  dest += 2*glen[dir];
578  }
579  //fill in diag
580  //@nu is @i, mu iterats from 0 to 4 and mu != nu
581  int nu = i;
582  for(int mu = 0; mu < 4; mu++){
583  if(nu == mu ){
584  continue;
585  }
586  int dir1, dir2;
587  for(dir1=0; dir1 < 4; dir1 ++){
588  if(dir1 != nu && dir1 != mu){
589  break;
590  }
591  }
592  for(dir2=0; dir2 < 4; dir2 ++){
593  if(dir2 != nu && dir2 != mu && dir2 != dir1){
594  break;
595  }
596  }
597 #ifdef GPU_DIRECT
598  cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu],
599  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
600 #else
601  cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu],
602  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
603 #endif
604 
605  }
606 
607 #endif
608  }
609 
610  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);
611 
612  //odd links
613  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
614  for(i=0;i < 4; i++){
615 #ifdef GPU_DIRECT
616  cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[0]);
617 #else
618  cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
619 #endif
620  }
621  }else{ //QUDA_MILC_GAUGE_ORDER
622 #ifdef GPU_DIRECT
623  cudaMemcpyAsync(tmp_odd , ((char*)cpuGauge)+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[0]);
624 #else
625  cudaMemcpy(tmp_odd, (char*)cpuGauge+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
626 #endif
627  }
628 
629 
630  for(i=0;i < 4; i++){
631 #ifdef MULTI_GPU
632  char* dest = tmp_odd + i*(len+glen_sum)+len;
633  for(int dir = 0; dir < 4; dir++){
634 #ifdef GPU_DIRECT
635  cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
636  cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir],
637  cudaMemcpyHostToDevice, streams[0]);
638 #else
639  cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
640  cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
641 
642 #endif
643 
644  dest += 2*glen[dir];
645  }
646  //fill in diag
647  //@nu is @i, mu iterats from 0 to 4 and mu != nu
648  int nu = i;
649  for(int mu = 0; mu < 4; mu++){
650  if(nu == mu ){
651  continue;
652  }
653  int dir1, dir2;
654  for(dir1=0; dir1 < 4; dir1 ++){
655  if(dir1 != nu && dir1 != mu){
656  break;
657  }
658  }
659  for(dir2=0; dir2 < 4; dir2 ++){
660  if(dir2 != nu && dir2 != mu && dir2 != dir1){
661  break;
662  }
663  }
664 #ifdef GPU_DIRECT
665  cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
666  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
667 #else
668  cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
669  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice );
670 #endif
671  }
672 
673 
674 #endif
675  }
676  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);
677 
678  cudaStreamSynchronize(streams[0]);
679 
680  device_free(tmp_even);
681 
682  }
683 
684 
685  void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
686  {
687  if (cudaGauge->Precision() != cpuGauge->Precision()){
688  errorQuda("Mismatch between CPU precision and CUDA precision");
689  }
690  QudaPrecision prec = cudaGauge->Precision();
691 
692 #ifdef MULTI_GPU
693  const int* Z = cudaGauge->X();
694 #endif
695  int pad = cudaGauge->Pad();
696  int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
697  int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
698  int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
699  int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;
700 
701  static void* ghost_cpuGauge[4];
702  static void* ghost_cpuGauge_diag[16];
703 
704 #ifdef MULTI_GPU
705  static int allocated = 0;
706  int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};
707 
708  if (!allocated) {
709 
710  for(int i=0;i < 4; i++) {
711  size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;
712 #ifdef GPU_DIRECT
713  ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
714 #else
715  ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
716 #endif
717  }
718 
719  /*
720  * nu | |
721  * |_____|
722  * mu
723  */
724 
725  int ghost_diag_len[16];
726  for(int nu=0;nu < 4;nu++){
727  for(int mu=0; mu < 4;mu++){
728  if(nu == mu){
729  ghost_cpuGauge_diag[nu*4+mu] = NULL;
730  }else{
731  //the other directions
732  int dir1, dir2;
733  for(dir1= 0; dir1 < 4; dir1++){
734  if(dir1 !=nu && dir1 != mu){
735  break;
736  }
737  }
738  for(dir2=0; dir2 < 4; dir2++){
739  if(dir2 != nu && dir2 != mu && dir2 != dir1){
740  break;
741  }
742  }
743  //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
744 
745  size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
746  ghost_diag_len[nu*4+mu] = nbytes;
747 #ifdef GPU_DIRECT
748  ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
749 #else
750  ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
751 #endif
752  memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
753  }
754  }
755  }
756  allocated = 1;
757  }
758 
759  int optflag=1;
760  // driver for for packalllink
761  exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);
762 
763 #endif
764 
765  do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
766  ghost_cpuGauge, ghost_cpuGauge_diag,
767  cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
768  Vsh_x, Vsh_y, Vsh_z, Vsh_t,
769  prec, cpuGauge->Order());
770 
771 #ifdef MULTI_GPU
772  if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
773 
774  for(int i=0;i < 4;i++){
775  host_free(ghost_cpuGauge[i]);
776  }
777  for(int i=0;i <4; i++){
778  for(int j=0;j <4; j++){
779  if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
780  }
781  }
782  allocated = 0;
783  }
784 #endif
785 
786  }
787 
788  static void
789  do_loadLinkToGPU_ex(const int* X, void *even, void *odd, void**cpuGauge,
790  QudaReconstructType reconstruct, int bytes, int Vh_ex, int pad,
791  QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
792  {
793  int len = Vh_ex*gaugeSiteSize*prec;
794 
795  char *tmp_even = (char *) device_malloc(4*len);
796  char *tmp_odd = tmp_even;
797 
798  //even links
799  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
800  for(int i=0; i < 4; i++){
801 #ifdef GPU_DIRECT
802  cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
803 #else
804  cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
805 #endif
806 
807  }
808  } else { //QUDA_MILC_GAUGE_ORDER
809 #ifdef GPU_DIRECT
810  cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
811 #else
812  cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
813 #endif
814  }
815 
816  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh_ex, pad, 0, prec, cpu_order, 0/*default stream*/);
817 
818  //odd links
819  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
820  for(int i=0; i < 4; i++){
821 #ifdef GPU_DIRECT
822  cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
823 #else
824  cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
825 #endif
826  }
827  } else {//QUDA_MILC_GAUGE_ORDER
828 #ifdef GPU_DIRECT
829  cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
830 #else
831  cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
832 #endif
833  }
834  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh_ex, pad, 0, prec, cpu_order, 0 /*default stream*/);
835 
836  device_free(tmp_even);
837  }
838 
839 
840  void loadLinkToGPU_ex(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge)
841  {
842  if (cudaGauge->Precision() != cpuGauge->Precision()){
843  errorQuda("Mismatch between CPU precision and CUDA precision");
844  }
845  QudaPrecision prec = cudaGauge->Precision();
846  const int *E = cudaGauge->X();
847  int pad = cudaGauge->Pad();
848  do_loadLinkToGPU_ex(E, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
849  cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
850  prec, cpuGauge->Order());
851  }
852 
853 
854  template<typename FloatN, typename Float>
855  static void do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd,
856  int bytes, int Vh, int stride, QudaPrecision prec)
857  {
858  int datalen = 4*Vh*gaugeSiteSize*sizeof(Float);
859 
860  double *unpackedDataEven = (double *) device_malloc(datalen);
861  double *unpackedDataOdd = unpackedDataEven;
862 
863  //unpack even data kernel
864  link_format_gpu_to_cpu((void*)unpackedDataEven, (void*)even, Vh, stride, prec, streams[0]);
865 
866 #ifdef GPU_DIRECT
867  cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
868 #else
869  cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
870 #endif
871 
872  //unpack odd data kernel
873  link_format_gpu_to_cpu((void*)unpackedDataOdd, (void*)odd, Vh, stride, prec, streams[0]);
874 #ifdef GPU_DIRECT
875  cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost, streams[0]);
876 #else
877  cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
878 #endif
879 
880  device_free(unpackedDataEven);
881  }
882 
883 
884  void storeLinkToCPU(cpuGaugeField* cpuGauge, cudaGaugeField *cudaGauge, QudaGaugeParam* param)
885  {
888 
889  if (cpu_prec != cuda_prec){
890  errorQuda("Mismatch between CPU precision and CUDA precision");
891  }
892 
893  if (cudaGauge->Reconstruct() != QUDA_RECONSTRUCT_NO){
894  errorQuda("Reconstruct type not supported");
895  }
896 
897  int stride = cudaGauge->VolumeCB() + cudaGauge->Pad();
898 
899  if (cuda_prec == QUDA_DOUBLE_PRECISION){
900  do_storeLinkToCPU( (double*)cpuGauge->Gauge_p(), (double2*) cudaGauge->Even_p(), (double2*)cudaGauge->Odd_p(),
901  cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
902  }else if (cuda_prec == QUDA_SINGLE_PRECISION){
903  do_storeLinkToCPU( (float*)cpuGauge->Gauge_p(), (float2*) cudaGauge->Even_p(), (float2*)cudaGauge->Odd_p(),
904  cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
905  } else {
906  errorQuda("Half precision not supported");
907  }
908  }
909 
910 } // namespace quda
911 
912 
913 #endif
914