QUDA  v0.7.0
A library for QCD on GPUs
fat_force_quda.cpp
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <math.h>
4 #include <string.h>
5 
6 #include <typeinfo>
7 #include <quda.h>
8 #include <quda_internal.h>
9 #include <fat_force_quda.h>
10 #include <face_quda.h>
11 #include <misc_helpers.h>
12 #include <assert.h>
13 
14 #define MAX(a,b) ((a)>(b)?(a):(b))
15 #define ALIGNMENT 4096
16 
17  /********************** Staple code, used by link fattening **************/
18 
19 #if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE)|| defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_CLOVER_DIRAC)
20 
21 namespace quda {
22 
23  template <typename Float>
24  void packGhostAllStaples(Float *cpuStaple, Float **cpuGhostBack,Float**cpuGhostFwd, int nFace, int* X) {
25  int XY=X[0]*X[1];
26  int XYZ=X[0]*X[1]*X[2];
27  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
28  int faceVolumeCB[4]={
29  X[1]*X[2]*X[3]/2,
30  X[0]*X[2]*X[3]/2,
31  X[0]*X[1]*X[3]/2,
32  X[0]*X[1]*X[2]/2
33  };
34 
35  //loop variables: a, b, c with a the most significant and c the least significant
36  //A, B, C are the maximum values
37  //we also need to loop over d: 0 .. nFace-1 for the backward face, X[dir]-nFace .. X[dir]-1 for the forward face
38  int A[4], B[4], C[4];
39 
40  //X dimension
41  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
42 
43  //Y dimension
44  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
45 
46  //Z dimension
47  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
48 
49  //T dimension
50  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
51 
52 
53  //multiplication factor to compute index in original cpu memory
54  int f[4][4]={
55  {XYZ, XY, X[0], 1},
56  {XYZ, XY, 1, X[0]},
57  {XYZ, X[0], 1, XY},
58  { XY, X[0], 1, XYZ}
59  };
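  //example for dir = 0 (the X face): (a,b,c,d) = (t,z,y,x), so
  //a*f[0][0] + b*f[0][1] + c*f[0][2] + d*f[0][3] = t*XYZ + z*XY + y*X[0] + x
  //is the full lattice index; dividing by two gives the checkerboard index used below,
  //and (a+b+c+d) % 2 selects the even or odd sublattice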
60 
61 
62  for(int ite = 0; ite < 2; ite++){
63  //ite == 0: back
64  //ite == 1: fwd
65  Float** dst;
66  if (ite == 0){
67  dst = cpuGhostBack;
68  }else{
69  dst = cpuGhostFwd;
70  }
71 
72  //collect back ghost staple
73  for(int dir =0; dir < 4; dir++){
74  int d;
75  int a,b,c;
76 
77  //there is only one staple at each site
78  for(int linkdir=0; linkdir < 1; linkdir ++){
79  Float* even_src = cpuStaple;
80  Float* odd_src = cpuStaple + volumeCB*gaugeSiteSize;
81 
82  Float* even_dst;
83  Float* odd_dst;
84 
85  //switch the odd and even ghost staple buffers when that dimension's size is odd
86  //only switch if X[dir] is odd and the grid size in that dimension is greater than 1
87  if((X[dir] % 2 ==0) || (commDim(dir) == 1)){
88  even_dst = dst[dir];
89  odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
90  }else{
91  odd_dst = dst[dir];
92  even_dst = dst[dir] + nFace*faceVolumeCB[dir]*gaugeSiteSize;
93  }
94 
95  int even_dst_index = 0;
96  int odd_dst_index = 0;
97 
98  int startd;
99  int endd;
100  if(ite == 0){ //back
101  startd = 0;
102  endd= nFace;
103  }else{//fwd
104  startd = X[dir] - nFace;
105  endd =X[dir];
106  }
107  for(d = startd; d < endd; d++){
108  for(a = 0; a < A[dir]; a++){
109  for(b = 0; b < B[dir]; b++){
110  for(c = 0; c < C[dir]; c++){
111  int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
112  int oddness = (a+b+c+d)%2;
113  if (oddness == 0){ //even
114  for(int i=0;i < 18;i++){
115  even_dst[18*even_dst_index+i] = even_src[18*index + i];
116  }
117  even_dst_index++;
118  }else{ //odd
119  for(int i=0;i < 18;i++){
120  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
121  }
122  odd_dst_index++;
123  }
124  }//c
125  }//b
126  }//a
127  }//d
128  assert( even_dst_index == nFace*faceVolumeCB[dir]);
129  assert( odd_dst_index == nFace*faceVolumeCB[dir]);
130  }//linkdir
131 
132  }//dir
133  }//ite
134  }
135 
136 
137  void pack_ghost_all_staples_cpu(void *staple, void **cpuGhostStapleBack, void** cpuGhostStapleFwd,
138  int nFace, QudaPrecision precision, int* X) {
139 
140  if (precision == QUDA_DOUBLE_PRECISION) {
141  packGhostAllStaples((double*)staple, (double**)cpuGhostStapleBack, (double**) cpuGhostStapleFwd, nFace, X);
142  } else {
143  packGhostAllStaples((float*)staple, (float**)cpuGhostStapleBack, (float**)cpuGhostStapleFwd, nFace, X);
144  }
145 
146  }
147 
148  void pack_gauge_diag(void* buf, int* X, void** sitelink, int nu, int mu, int dir1, int dir2, QudaPrecision prec)
149  {
150  /*
151  *      nu |          |
152  *         |__________|
153  *              mu
154  *
155  * nu and mu are the directions we are working on
156  * Since we are packing our own data, we need to go to the north-west corner in the diagram,
157  * i.e. x[nu] = X[nu]-1, x[mu] = 0, and loop through x[dir1], x[dir2]
158  * in the remaining two directions (dir1/dir2); dir2 is the slowest-changing dimension when
159  * computing the index
160  */
161 
162 
163  int mul_factor[4]={
164  1, X[0], X[1]*X[0], X[2]*X[1]*X[0],
165  };
166 
167  int even_dst_idx = 0;
168  int odd_dst_idx = 0;
169  char* dst_even =(char*)buf;
170  char* dst_odd = dst_even + (X[dir1]*X[dir2]/2)*gaugeSiteSize*prec;
171  char* src_even = (char*)sitelink[nu];
172  char* src_odd = src_even + (X[0]*X[1]*X[2]*X[3]/2)*gaugeSiteSize*prec;
173 
174  if( (X[nu]+X[mu]) % 2 == 1){
175  //oddness will change between me and the diagonal neighbor
176  //switch it now
177  char* tmp = dst_odd;
178  dst_odd = dst_even;
179  dst_even = tmp;
180  }
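  //note: the local parity of the packed sites is ((X[nu]-1) + i + j) % 2; when
  //X[nu]+X[mu] is odd the diagonal neighbor sees these sites with the opposite
  //parity, which is why the even/odd destinations were swapped above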
181 
182  for(int i=0;i < X[dir2]; i++){
183  for(int j=0; j < X[dir1]; j++){
184  int src_idx = ((X[nu]-1)*mul_factor[nu]+ 0*mul_factor[mu]+i*mul_factor[dir2]+j*mul_factor[dir1])>>1;
185  //int dst_idx = (i*X[dir1]+j) >> 1;
186  int oddness = ( (X[nu]-1) + 0 + i + j) %2;
187 
188  if(oddness==0){
189  for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
190  memcpy(&dst_even[(18*even_dst_idx+tmpidx)*prec], &src_even[(18*src_idx + tmpidx)*prec], prec);
191  }
192  even_dst_idx++;
193  }else{
194  for(int tmpidx = 0; tmpidx < gaugeSiteSize; tmpidx++){
195  memcpy(&dst_odd[(18*odd_dst_idx+tmpidx)*prec], &src_odd[(18*src_idx + tmpidx)*prec], prec);
196  }
197  odd_dst_idx++;
198  }//if
199 
200  }//for j
201  }//for i
202 
203  if( (even_dst_idx != X[dir1]*X[dir2]/2)|| (odd_dst_idx != X[dir1]*X[dir2]/2)){
204  errorQuda("even_dst_idx/odd_dst_idx(%d/%d) does not match the value of X[dir1]*X[dir2]/2 (%d)\n",
205  even_dst_idx, odd_dst_idx, X[dir1]*X[dir2]/2);
206  }
207  return ;
208 
209 
210  }
211 
212  void
213  packGhostStaple(int* X, void* even, void* odd, int volumeCB, QudaPrecision prec,
214  int stride,
215  int dir, int whichway,
216  void** fwd_nbr_buf_gpu, void** back_nbr_buf_gpu,
217  void** fwd_nbr_buf, void** back_nbr_buf,
218  cudaStream_t* stream)
219  {
220  int Vs_x, Vs_y, Vs_z, Vs_t;
221 
222  Vs_x = X[1]*X[2]*X[3];
223  Vs_y = X[0]*X[2]*X[3];
224  Vs_z = X[0]*X[1]*X[3];
225  Vs_t = X[0]*X[1]*X[2];
226  int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
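  //Vs[i] is the full (both-parity) face volume of dimension i, so each face
  //transfer below moves Vs[i]*gaugeSiteSize*prec bytes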
227 
228  if (dir != 3){ //the code would work for dir=3 as well
229  //the even/odd switch (if necessary) is taken care of in collectGhostStaple()
230  void* gpu_buf;
231  int i =dir;
232  if (whichway == QUDA_BACKWARDS){
233  gpu_buf = back_nbr_buf_gpu[i];
234  collectGhostStaple(X, even, odd, volumeCB, stride, prec, gpu_buf, i, whichway, stream);
235  cudaMemcpyAsync(back_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
236  }else{//whichway is QUDA_FORWARDS;
237  gpu_buf = fwd_nbr_buf_gpu[i];
238  collectGhostStaple(X, even, odd, volumeCB, stride, prec, gpu_buf, i, whichway, stream);
239  cudaMemcpyAsync(fwd_nbr_buf[i], gpu_buf, Vs[i]*gaugeSiteSize*prec, cudaMemcpyDeviceToHost, *stream);
240  }
241  }else{ //special case for dir=3 since no gather kernel is required
242  int Vh = volumeCB;
243  int Vsh = X[0]*X[1]*X[2]/2;
244  int sizeOfFloatN = 2*prec;
245  int len = Vsh*sizeOfFloatN;
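  //on the device the staple is stored as 9 FloatN (float2/double2) components per site,
  //each component contiguous with length stride; the T-face sites are contiguous in the
  //even/odd arrays (first Vsh sites for the backward face, last Vsh for the forward face),
  //so each face copy below is 9 chunks of len bytes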
246  int i;
247  if(X[3] %2 == 0){
248  //back,even
249  for(i=0;i < 9; i++){
250  void* dst = ((char*)back_nbr_buf[3]) + i*len ;
251  void* src = ((char*)even) + i*stride*sizeOfFloatN;
252  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
253  }
254  //back, odd
255  for(i=0;i < 9; i++){
256  void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len ;
257  void* src = ((char*)odd) + i*stride*sizeOfFloatN;
258  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
259  }
260  //fwd,even
261  for(i=0;i < 9; i++){
262  void* dst = ((char*)fwd_nbr_buf[3]) + i*len ;
263  void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
264  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
265  }
266  //fwd, odd
267  for(i=0;i < 9; i++){
268  void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len ;
269  void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
270  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
271  }
272  }else{
273  //reverse even and odd position
274  //back,odd
275  for(i=0;i < 9; i++){
276  void* dst = ((char*)back_nbr_buf[3]) + i*len ;
277  void* src = ((char*)odd) + i*stride*sizeOfFloatN;
278  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
279  }
280  //back, even
281  for(i=0;i < 9; i++){
282  void* dst = ((char*)back_nbr_buf[3]) + 9*len + i*len ;
283  void* src = ((char*)even) + i*stride*sizeOfFloatN;
284  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
285  }
286  //fwd,odd
287  for(i=0;i < 9; i++){
288  void* dst = ((char*)fwd_nbr_buf[3]) + i*len ;
289  void* src = ((char*)odd) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
290  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
291  }
292  //fwd, even
293  for(i=0;i < 9; i++){
294  void* dst = ((char*)fwd_nbr_buf[3]) + 9*len + i*len ;
295  void* src = ((char*)even) + (Vh-Vsh)*sizeOfFloatN + i*stride*sizeOfFloatN;
296  cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToHost, *stream);
297  }
298 
299  }
300  }
301 
302  }
303 
304 
305  void
306  unpackGhostStaple(int* X, void* _even, void* _odd, int volume, QudaPrecision prec,
307  int stride,
308  int dir, int whichway, void** fwd_nbr_buf, void** back_nbr_buf,
309  cudaStream_t* stream)
310  {
311 
312  int Vsh_x, Vsh_y, Vsh_z, Vsh_t;
313 
314  Vsh_x = X[1]*X[2]*X[3]/2;
315  Vsh_y = X[0]*X[2]*X[3]/2;
316  Vsh_z = X[0]*X[1]*X[3]/2;
317  Vsh_t = X[0]*X[1]*X[2]/2;
318  int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};
319 
320  int Vh = volume;
321  int sizeOfFloatN = 2*prec;
322  int len[4] = {
323  Vsh_x*sizeOfFloatN,
324  Vsh_y*sizeOfFloatN,
325  Vsh_z*sizeOfFloatN,
326  Vsh_t*sizeOfFloatN
327  };
328 
329  int tmpint[4] = {
330  0,
331  Vsh_x,
332  Vsh_x + Vsh_y,
333  Vsh_x + Vsh_y + Vsh_z,
334  };
335 
336  char* even = ((char*)_even) + Vh*sizeOfFloatN + 2*tmpint[dir]*sizeOfFloatN;
337  char* odd = ((char*)_odd) + Vh*sizeOfFloatN +2*tmpint[dir]*sizeOfFloatN;
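  //ghost staples live in the pad region after the Vh body sites, grouped by dimension:
  //tmpint[dir] is the sum of Vsh over the preceding dimensions, and each dimension's
  //region holds 2*Vsh[dir] sites per parity (backward face first, then forward face,
  //hence the extra Vsh[dir]*sizeOfFloatN offset in the QUDA_FORWARDS branch)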
338 
339  if(whichway == QUDA_BACKWARDS){
340  //back,even
341  for(int i=0;i < 9; i++){
342  void* dst = even + i*stride*sizeOfFloatN;
343  void* src = ((char*)back_nbr_buf[dir]) + i*len[dir] ;
344  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
345  }
346  //back, odd
347  for(int i=0;i < 9; i++){
348  void* dst = odd + i*stride*sizeOfFloatN;
349  void* src = ((char*)back_nbr_buf[dir]) + 9*len[dir] + i*len[dir] ;
350  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
351  }
352  }else { //QUDA_FORWARDS
353  //fwd,even
354  for(int i=0;i < 9; i++){
355  void* dst = even + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
356  void* src = ((char*)fwd_nbr_buf[dir]) + i*len[dir] ;
357  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
358  }
359  //fwd, odd
360  for(int i=0;i < 9; i++){
361  void* dst = odd + Vsh[dir]*sizeOfFloatN + i*stride*sizeOfFloatN;
362  void* src = ((char*)fwd_nbr_buf[dir]) + 9*len[dir] + i*len[dir] ;
363  cudaMemcpyAsync(dst, src, len[dir], cudaMemcpyHostToDevice, *stream);
364  }
365  }
366  }
367 
368  /*
369  This is the packing kernel for the multi-dimensional ghost zone in
370  the padded region. This is called by cpuexchangesitelink in
371  FaceBuffer (MPI only), which is called by loadLinkToGPU (defined at
372  the bottom).
373 
374  Not currently included since it will be replaced by Guochun's new
375  routine which uses an enlarged domain instead of a ghost zone.
376  */
377  template <typename Float>
378  void packGhostAllLinks(Float **cpuLink, Float **cpuGhostBack,Float**cpuGhostFwd, int dir, int nFace, int* X) {
379  int XY=X[0]*X[1];
380  int XYZ=X[0]*X[1]*X[2];
381 
382  int volumeCB = X[0]*X[1]*X[2]*X[3]/2;
383  int faceVolumeCB[4]={
384  X[1]*X[2]*X[3]/2,
385  X[0]*X[2]*X[3]/2,
386  X[0]*X[1]*X[3]/2,
387  X[0]*X[1]*X[2]/2
388  };
389 
390  //loop variables: a, b, c with a the most significant and c the least significant
391  //A, B, C are the maximum values
392  //we also need to loop over d: 0 .. nFace-1 for the backward face, X[dir]-nFace .. X[dir]-1 for the forward face
393  int A[4], B[4], C[4];
394 
395  //X dimension
396  A[0] = X[3]; B[0] = X[2]; C[0] = X[1];
397 
398  //Y dimension
399  A[1] = X[3]; B[1] = X[2]; C[1] = X[0];
400 
401  //Z dimension
402  A[2] = X[3]; B[2] = X[1]; C[2] = X[0];
403 
404  //T dimension
405  A[3] = X[2]; B[3] = X[1]; C[3] = X[0];
406 
407 
408  //multiplication factor to compute index in original cpu memory
409  int f[4][4]={
410  {XYZ, XY, X[0], 1},
411  {XYZ, XY, 1, X[0]},
412  {XYZ, X[0], 1, XY},
413  { XY, X[0], 1, XYZ}
414  };
415 
416 
417  for(int ite = 0; ite < 2; ite++){
418  //ite == 0: back
419  //ite == 1: fwd
420  Float** dst;
421  if (ite == 0){
422  dst = cpuGhostBack;
423  }else{
424  dst = cpuGhostFwd;
425  }
426 
427  //collect back ghost gauge field
428  //for(int dir =0; dir < 4; dir++){
429  int d;
430  int a,b,c;
431 
432  //we need to copy all 4 links at the same site
433  for(int linkdir=0; linkdir < 4; linkdir ++){
434  Float* even_src = cpuLink[linkdir];
435  Float* odd_src = cpuLink[linkdir] + volumeCB*gaugeSiteSize;
436 
437  Float* even_dst;
438  Float* odd_dst;
439 
440  //switch the odd and even ghost cpuLink buffers when that dimension's size is odd
441  //only switch if X[dir] is odd and the grid size in that dimension is greater than 1
442  if((X[dir] % 2 ==0) || (commDim(dir) == 1)){
443  even_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
444  odd_dst = even_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
445  }else{
446  odd_dst = dst[dir] + 2*linkdir* nFace *faceVolumeCB[dir]*gaugeSiteSize;
447  even_dst = odd_dst + nFace*faceVolumeCB[dir]*gaugeSiteSize;
448  }
449 
450  int even_dst_index = 0;
451  int odd_dst_index = 0;
452 
453  int startd;
454  int endd;
455  if(ite == 0){ //back
456  startd = 0;
457  endd= nFace;
458  }else{//fwd
459  startd = X[dir] - nFace;
460  endd =X[dir];
461  }
462  for(d = startd; d < endd; d++){
463  for(a = 0; a < A[dir]; a++){
464  for(b = 0; b < B[dir]; b++){
465  for(c = 0; c < C[dir]; c++){
466  int index = ( a*f[dir][0] + b*f[dir][1]+ c*f[dir][2] + d*f[dir][3])>> 1;
467  int oddness = (a+b+c+d)%2;
468  if (oddness == 0){ //even
469  for(int i=0;i < 18;i++){
470  even_dst[18*even_dst_index+i] = even_src[18*index + i];
471  }
472  even_dst_index++;
473  }else{ //odd
474  for(int i=0;i < 18;i++){
475  odd_dst[18*odd_dst_index+i] = odd_src[18*index + i];
476  }
477  odd_dst_index++;
478  }
479  }//c
480  }//b
481  }//a
482  }//d
483  assert( even_dst_index == nFace*faceVolumeCB[dir]);
484  assert( odd_dst_index == nFace*faceVolumeCB[dir]);
485  }//linkdir
486 
487  //}//dir
488  }//ite
489  }
490 
491 
492  void pack_ghost_all_links(void **cpuLink, void **cpuGhostBack, void** cpuGhostFwd,
493  int dir, int nFace, QudaPrecision precision, int *X) {
494 
495  if (precision == QUDA_DOUBLE_PRECISION) {
496  packGhostAllLinks((double**)cpuLink, (double**)cpuGhostBack, (double**) cpuGhostFwd, dir, nFace, X);
497  } else {
498  packGhostAllLinks((float**)cpuLink, (float**)cpuGhostBack, (float**)cpuGhostFwd, dir, nFace, X);
499  }
500 
501  }
502 
503  /*
504  Copies the device gauge field to the host.
505  - no reconstruction support
506  - device data is always Float2 ordered
507  - device data is a 1-dimensional array (MILC ordered)
508  - no support for half precision
509  */
510 
511  static void
512  do_loadLinkToGPU(int* X, void *even, void*odd, void **cpuGauge, void** ghost_cpuGauge,
513  void** ghost_cpuGauge_diag,
514  QudaReconstructType reconstruct, int bytes, int Vh, int pad,
515  int Vsh_x, int Vsh_y, int Vsh_z, int Vsh_t,
516  QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
517  {
518  int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
519  Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
520  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
521  Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
522  Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);
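  //Vh_2d_max is the largest checkerboarded 2-d slice of the local lattice; it is used
  //below as a uniform stride for the diagonal ("edge") ghost blocks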
523 
524  int i;
525  int len = Vh*gaugeSiteSize*prec;
526 
527 #ifdef MULTI_GPU
528  int glen[4] = {
529  Vsh_x*gaugeSiteSize*prec,
530  Vsh_y*gaugeSiteSize*prec,
531  Vsh_z*gaugeSiteSize*prec,
532  Vsh_t*gaugeSiteSize*prec
533  };
534  int ghostV = 2*(Vsh_x+Vsh_y+Vsh_z+Vsh_t)+4*Vh_2d_max;
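  //per link direction the ghost region holds the backward and forward faces of all four
  //dimensions (2*Vsh each) plus four Vh_2d_max-sized slots for the diagonal ghosts
  //(the slot with mu == nu is left unused)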
535 #else
536  int ghostV = 0;
537 #endif
538 
539  int glen_sum = ghostV*gaugeSiteSize*prec;
540  char *tmp_even = (char *) device_malloc(4*(len+glen_sum));
541  char *tmp_odd = tmp_even;
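  //the odd half reuses the same staging buffer: all copies and conversion kernels below
  //are issued in order on streams[0], so the even data is converted before being overwritten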
542 
543  //even links
544  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
545  for(i=0;i < 4; i++){
546 #ifdef GPU_DIRECT
547  cudaMemcpyAsync(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice, streams[0]);
548 #else
549  cudaMemcpy(tmp_even + i*(len+glen_sum), cpuGauge[i], len, cudaMemcpyHostToDevice);
550 #endif
551  }
552  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
553 
554 #ifdef MULTI_GPU
555  errorQuda("Multi-GPU for MILC gauge order is not supported");
556 #endif
557 #ifdef GPU_DIRECT
558  cudaMemcpyAsync(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice, streams[0]);
559 #else
560  cudaMemcpy(tmp_even, ((char*)cpuGauge), 4*len, cudaMemcpyHostToDevice);
561 #endif
562  } else {
563  errorQuda("Unsupported gauge order\n");
564  }
565 
566 
567  for(i=0;i < 4;i++){
568 #ifdef MULTI_GPU
569  //dir: the source direction
570  char* dest = tmp_even + i*(len+glen_sum)+len;
571  for(int dir = 0; dir < 4; dir++){
572 #ifdef GPU_DIRECT
573  cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
574  cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
575 #else
576  cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
577  cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
578 #endif
579  dest += 2*glen[dir];
580  }
581  //fill in diag
582  //nu is i; mu iterates from 0 to 3 with mu != nu
583  int nu = i;
584  for(int mu = 0; mu < 4; mu++){
585  if(nu == mu ){
586  continue;
587  }
588  int dir1, dir2;
589  for(dir1=0; dir1 < 4; dir1 ++){
590  if(dir1 != nu && dir1 != mu){
591  break;
592  }
593  }
594  for(dir2=0; dir2 < 4; dir2 ++){
595  if(dir2 != nu && dir2 != mu && dir2 != dir1){
596  break;
597  }
598  }
599 #ifdef GPU_DIRECT
600  cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu],
601  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
602 #else
603  cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,ghost_cpuGauge_diag[nu*4+mu],
604  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice);
605 #endif
606 
607  }
608 
609 #endif
610  }
611 
612  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);
613 
614  //odd links
615  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
616  for(i=0;i < 4; i++){
617 #ifdef GPU_DIRECT
618  cudaMemcpyAsync(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice, streams[0]);
619 #else
620  cudaMemcpy(tmp_odd + i*(len+glen_sum), ((char*)cpuGauge[i]) + Vh*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
621 #endif
622  }
623  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
624 #ifdef GPU_DIRECT
625  cudaMemcpyAsync(tmp_odd , ((char*)cpuGauge)+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice, streams[0]);
626 #else
627  cudaMemcpy(tmp_odd, (char*)cpuGauge+4*Vh*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
628 #endif
629  } else {
630  errorQuda("Unsupported gauge order\n");
631  }
632 
633 
634  for(i=0;i < 4; i++){
635 #ifdef MULTI_GPU
636  char* dest = tmp_odd + i*(len+glen_sum)+len;
637  for(int dir = 0; dir < 4; dir++){
638 #ifdef GPU_DIRECT
639  cudaMemcpyAsync(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice, streams[0]);
640  cudaMemcpyAsync(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir],
641  cudaMemcpyHostToDevice, streams[0]);
642 #else
643  cudaMemcpy(dest, ((char*)ghost_cpuGauge[dir])+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
644  cudaMemcpy(dest + glen[dir], ((char*)ghost_cpuGauge[dir])+8*glen[dir]+glen[dir] +i*2*glen[dir], glen[dir], cudaMemcpyHostToDevice);
645 
646 #endif
647 
648  dest += 2*glen[dir];
649  }
650  //fill in diag
651  //nu is i; mu iterates from 0 to 3 with mu != nu
652  int nu = i;
653  for(int mu = 0; mu < 4; mu++){
654  if(nu == mu ){
655  continue;
656  }
657  int dir1, dir2;
658  for(dir1=0; dir1 < 4; dir1 ++){
659  if(dir1 != nu && dir1 != mu){
660  break;
661  }
662  }
663  for(dir2=0; dir2 < 4; dir2 ++){
664  if(dir2 != nu && dir2 != mu && dir2 != dir1){
665  break;
666  }
667  }
668 #ifdef GPU_DIRECT
669  cudaMemcpyAsync(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
670  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice, streams[0]);
671 #else
672  cudaMemcpy(dest+ mu *Vh_2d_max*gaugeSiteSize*prec,((char*)ghost_cpuGauge_diag[nu*4+mu])+X[dir1]*X[dir2]/2*gaugeSiteSize*prec,
673  X[dir1]*X[dir2]/2*gaugeSiteSize*prec, cudaMemcpyHostToDevice );
674 #endif
675  }
676 
677 
678 #endif
679  }
680  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh, pad, ghostV, prec, cpu_order, streams[0]);
681 
682  cudaStreamSynchronize(streams[0]);
683 
684  device_free(tmp_even);
685 
686  }
687 
688 
689  void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
690  {
691  if (cudaGauge->Precision() != cpuGauge->Precision()){
692  errorQuda("Mismatch between CPU precision and CUDA precision");
693  }
694  QudaPrecision prec = cudaGauge->Precision();
695 
696 #ifdef MULTI_GPU
697  const int* Z = cudaGauge->X();
698 #endif
699  int pad = cudaGauge->Pad();
700  int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
701  int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
702  int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
703  int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;
704 
705  static void* ghost_cpuGauge[4];
706  static void* ghost_cpuGauge_diag[16];
707 
708 #ifdef MULTI_GPU
709  static int allocated = 0;
710  int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};
711 
712  if (!allocated) {
713 
714  for(int i=0;i < 4; i++) {
715  size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;
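  //per dimension: 4 link directions x (backward + forward face) x Vs[i] sites,
  //with Vs[i] = 2*Vsh_i covering both parities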
716 #ifdef GPU_DIRECT
717  ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
718 #else
719  ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
720 #endif
721  }
722 
723  /*
724  * nu | |
725  * |_____|
726  * mu
727  */
728 
729  for(int nu=0;nu < 4;nu++){
730  for(int mu=0; mu < 4;mu++){
731  if(nu == mu){
732  ghost_cpuGauge_diag[nu*4+mu] = NULL;
733  }else{
734  //the other directions
735  int dir1, dir2;
736  for(dir1= 0; dir1 < 4; dir1++){
737  if(dir1 !=nu && dir1 != mu){
738  break;
739  }
740  }
741  for(dir2=0; dir2 < 4; dir2++){
742  if(dir2 != nu && dir2 != mu && dir2 != dir1){
743  break;
744  }
745  }
746  //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);
747 
748  size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
749 #ifdef GPU_DIRECT
750  ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
751 #else
752  ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
753 #endif
754  memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
755  }
756  }
757  }
758  allocated = 1;
759  }
760 
761  int optflag=1;
762  // driver for packing all the ghost links
763  exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);
764 
765 #endif
766 
767  do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
768  ghost_cpuGauge, ghost_cpuGauge_diag,
769  cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
770  Vsh_x, Vsh_y, Vsh_z, Vsh_t,
771  prec, cpuGauge->Order());
772 
773 #ifdef MULTI_GPU
774  if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
775 
776  for(int i=0;i < 4;i++){
777  host_free(ghost_cpuGauge[i]);
778  }
779  for(int i=0;i <4; i++){
780  for(int j=0;j <4; j++){
781  if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
782  }
783  }
784  allocated = 0;
785  }
786 #endif
787 
788  }
789 
790  static void
791  do_loadLinkToGPU_ex(const int* X, void *even, void *odd, void**cpuGauge,
792  QudaReconstructType reconstruct, int bytes, int Vh_ex, int pad,
793  QudaPrecision prec, QudaGaugeFieldOrder cpu_order)
794  {
795  int len = Vh_ex*gaugeSiteSize*prec;
796 
797  char *tmp_even = (char *) device_malloc(4*len);
798  char *tmp_odd = tmp_even;
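  //as in do_loadLinkToGPU, the odd half reuses the even staging buffer after the even
  //links have been converted by link_format_cpu_to_gpu below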
799 
800  //even links
801  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
802  for(int i=0; i < 4; i++){
803 #ifdef GPU_DIRECT
804  cudaMemcpyAsync(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
805 #else
806  cudaMemcpy(tmp_even + i*len, cpuGauge[i], len, cudaMemcpyHostToDevice);
807 #endif
808 
809  }
810  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) { //[parity][dim][volumecb][row][col]
811 #ifdef GPU_DIRECT
812  cudaMemcpyAsync(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
813 #else
814  cudaMemcpy(tmp_even, (char*)cpuGauge, 4*len, cudaMemcpyHostToDevice);
815 #endif
816  } else {
817 
818  // TIFR [mu][parity][volumecb][col][row]
819 
820  errorQuda("Unsupported gauge order");
821  }
822 
823  link_format_cpu_to_gpu((void*)even, (void*)tmp_even, reconstruct, Vh_ex, pad, 0, prec, cpu_order, 0/*default stream*/);
824 
825  //odd links
826  if(cpu_order == QUDA_QDP_GAUGE_ORDER){
827  for(int i=0; i < 4; i++){
828 #ifdef GPU_DIRECT
829  cudaMemcpyAsync(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
830 #else
831  cudaMemcpy(tmp_odd + i*len, ((char*)cpuGauge[i]) + Vh_ex*gaugeSiteSize*prec, len, cudaMemcpyHostToDevice);
832 #endif
833  }
834  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
835 #ifdef GPU_DIRECT
836  cudaMemcpyAsync(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
837 #else
838  cudaMemcpy(tmp_odd, ((char*)cpuGauge) + 4*Vh_ex*gaugeSiteSize*prec, 4*len, cudaMemcpyHostToDevice);
839 #endif
840  } else {
841  errorQuda("Unsupported gauge order");
842  }
843  link_format_cpu_to_gpu((void*)odd, (void*)tmp_odd, reconstruct, Vh_ex, pad, 0, prec, cpu_order, 0 /*default stream*/);
844 
845  device_free(tmp_even);
846  }
847 
848 
849  void loadLinkToGPU_ex(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge)
850  {
851  if (cudaGauge->Precision() != cpuGauge->Precision()){
852  errorQuda("Mismatch between CPU precision and CUDA precision");
853  }
854  QudaPrecision prec = cudaGauge->Precision();
855  const int *E = cudaGauge->X();
856  int pad = cudaGauge->Pad();
857  do_loadLinkToGPU_ex(E, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(),
858  cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad,
859  prec, cpuGauge->Order());
860  }
861 
862 
863  template<typename FloatN, typename Float>
864  static void do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd,
865  int bytes, int Vh, int stride, QudaPrecision prec)
866  {
867  int datalen = 4*Vh*gaugeSiteSize*sizeof(Float);
868 
869  double *unpackedDataEven = (double *) device_malloc(datalen);
870  double *unpackedDataOdd = unpackedDataEven;
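  //the odd half reuses the same unpacking buffer; the even device-to-host copy is ordered
  //(on streams[0], or synchronously) before the odd unpacking kernel overwrites it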
871 
872  //unpack even data kernel
873  link_format_gpu_to_cpu((void*)unpackedDataEven, (void*)even, Vh, stride, prec, streams[0]);
874 
875 #ifdef GPU_DIRECT
876  cudaMemcpyAsync(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost, streams[0]);
877 #else
878  cudaMemcpy(cpuGauge, unpackedDataEven, datalen, cudaMemcpyDeviceToHost);
879 #endif
880 
881  //unpack odd data kernel
882  link_format_gpu_to_cpu((void*)unpackedDataOdd, (void*)odd, Vh, stride, prec, streams[0]);
883 #ifdef GPU_DIRECT
884  cudaMemcpyAsync(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost, streams[0]);
885 #else
886  cudaMemcpy(cpuGauge + 4*Vh*gaugeSiteSize, unpackedDataOdd, datalen, cudaMemcpyDeviceToHost);
887 #endif
888 
889  device_free(unpackedDataEven);
890  }
891 
892 
893  void storeLinkToCPU(cpuGaugeField* cpuGauge, cudaGaugeField *cudaGauge, QudaGaugeParam* param)
894  {
895  QudaPrecision cpu_prec = cpuGauge->Precision();
896  QudaPrecision cuda_prec = cudaGauge->Precision();
897 
898  if (cpu_prec != cuda_prec){
899  errorQuda("Mismatch between CPU precision and CUDA precision");
900  }
901 
902  if (cudaGauge->Reconstruct() != QUDA_RECONSTRUCT_NO){
903  errorQuda("Reconstruct type not supported");
904  }
905 
906  int stride = cudaGauge->VolumeCB() + cudaGauge->Pad();
907 
908  if (cuda_prec == QUDA_DOUBLE_PRECISION){
909  do_storeLinkToCPU( (double*)cpuGauge->Gauge_p(), (double2*) cudaGauge->Even_p(), (double2*)cudaGauge->Odd_p(),
910  cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
911  }else if (cuda_prec == QUDA_SINGLE_PRECISION){
912  do_storeLinkToCPU( (float*)cpuGauge->Gauge_p(), (float2*) cudaGauge->Even_p(), (float2*)cudaGauge->Odd_p(),
913  cudaGauge->Bytes(), cudaGauge->VolumeCB(), stride, cuda_prec);
914  } else {
915  errorQuda("Half precision not supported");
916  }
917  }
918 
919 } // namespace quda
920 
921 
922 #endif
923 