QUDA  v0.7.0
A library for QCD on GPUs
misc_helpers.cu
#include <quda_internal.h>
#include <misc_helpers.h>

#define gaugeSiteSize 18
#define BLOCKSIZE 64

/*
 * MILC order, CPU->GPU
 *
 * This function converts links from the CPU format into the GPU format so
 * that GPU memory accesses are coalesced.  It only converts half (even or
 * odd) of the links, so converting the entire gauge field requires calling
 * this function twice.
 *
 * Without loss of generality, the parity is assumed to be even.
 * The actual CPU data layout is
 *   [a0 a1 ... a17][b0 b1 ... b17][c ...][d ...][a18 a19 ... a35] ...
 *    X link         Y link         Z link  T link  X link (next site)
 * where a0->a17 is the X link of the first site,
 *       b0->b17 is the Y link of the first site,
 *       c0->c17 is the Z link of the first site,
 *       d0->d17 is the T link of the first site,
 *       a18->a35 is the X link of the second site, etc.
 *
 * The GPU data layout is
 *   [a0 a1][a18 a19] ... [pad][a2 a3][a20 a21] ... [b0 b1][b18 b19] ...
 *    X links                                        Y links, then Z, T links
 *
 * N: number of FloatN elements in one gauge link
 *   9 for QUDA_RECONSTRUCT_NO, SP/DP
 *   6 for QUDA_RECONSTRUCT_12, DP
 *   3 for QUDA_RECONSTRUCT_12, SP
 */

namespace quda {

  template<int N, typename FloatN, typename Float2>
  __global__ void
  do_link_format_cpu_to_gpu(FloatN* dst, Float2* src,
                            int reconstruct,
                            int Vh, int pad, int ghostV, size_t threads)
  {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int thread0_tid = blockIdx.x * blockDim.x;
    __shared__ FloatN buf[N*BLOCKSIZE];

    int dir;
    int j;
    int stride = Vh + pad;
    for(dir = 0; dir < 4; dir++){
#ifdef MULTI_GPU
      Float2* src_start = src + dir*9*(Vh+ghostV) + thread0_tid*9;
#else
      Float2* src_start = src + dir*9*(Vh) + thread0_tid*9;
#endif
      for(j = 0; j < 9; j++){
        if(thread0_tid*9 + j*blockDim.x + threadIdx.x >= 9*threads) break;
        if(N == 9){
          ((Float2*)buf)[j*blockDim.x + threadIdx.x] = src_start[j*blockDim.x + threadIdx.x];
        }else{
          int idx = j*blockDim.x + threadIdx.x;
          int modval = idx % 9;
          int divval = idx / 9;
          if(modval < 6){
            ((Float2*)buf)[divval*6 + modval] = src_start[idx];
          }
        }
      }

      __syncthreads();
      if(tid < threads){
        FloatN* dst_start = (FloatN*)(dst + dir*N*stride);
        for(j = 0; j < N; j++){
          dst_start[tid + j*stride] = buf[N*threadIdx.x + j];
        }
      }
      __syncthreads();
    } // dir
  }
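
  /* For reference only: a minimal host-side sketch (an illustration, not part of
   * QUDA) of the net index mapping performed by do_link_format_cpu_to_gpu for the
   * QUDA_RECONSTRUCT_NO case (N = 9 Float2 per link).  Here V stands for Vh (plus
   * ghostV when MULTI_GPU is defined) and stride for Vh + pad, as in the kernel
   * above.  The function name is hypothetical. */
  template <typename Float2>
  static void link_format_cpu_to_gpu_reference(Float2* dst, const Float2* src,
                                               int V, int stride)
  {
    for (int dir = 0; dir < 4; dir++)        // direction-major source layout
      for (int i = 0; i < V; i++)            // site index within this parity
        for (int j = 0; j < 9; j++)          // 9 Float2 per link
          dst[dir*9*stride + j*stride + i] = src[dir*9*V + i*9 + j];
  }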


  /*
   * N: number of FloatN elements in one gauge link
   *   9 for QUDA_RECONSTRUCT_NO, SP/DP
   *   6 for QUDA_RECONSTRUCT_12, DP
   *   3 for QUDA_RECONSTRUCT_12, SP
   *
   * FloatN: float2/double2
   * Float:  float/double
   *
   * This is the reverse of the process performed by do_link_format_gpu_to_cpu().
   */

  template<int N, typename FloatN, typename Float2>
  __global__ void
  do_link_format_cpu_to_gpu_milc(FloatN* dst, Float2* src,
                                 int reconstruct,
                                 int Vh, int pad, int ghostV, size_t threads)
  {
    __shared__ FloatN buf[N*BLOCKSIZE];
    int block_idx = blockIdx.x*blockDim.x/4;
    int local_idx = 16*(threadIdx.x/64) + threadIdx.x%16;
    int pos_idx = blockIdx.x*blockDim.x/4 + 16*(threadIdx.x/64) + threadIdx.x%16;
    int mydir = (threadIdx.x >> 4) % 4;
    int j;
    int stride = Vh + pad;

    for(j = 0; j < 9; j++){
      if(block_idx*9*4 + j*blockDim.x + threadIdx.x >= 9*threads) break;
      if(N == 9){
        ((Float2*)buf)[j*blockDim.x + threadIdx.x] = src[block_idx*9*4 + j*blockDim.x + threadIdx.x];
      }else{
        int idx = j*blockDim.x + threadIdx.x;
        int modval = idx % 9;
        int divval = idx / 9;
        if(modval < 6){
          ((Float2*)buf)[divval*6 + modval] = src[block_idx*9*4 + idx];
        }
      }
    }

    __syncthreads();

    if(pos_idx >= threads/4) return;

    for(j = 0; j < N; j++){
      if(N == 9){
        dst[pos_idx + mydir*N*stride + j*stride] = buf[local_idx*4*9 + mydir*9 + j];
      }else{
        dst[pos_idx + mydir*N*stride + j*stride] = buf[local_idx*4*N + mydir*N + j];
      }
    }
  }
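
  /* For reference only: a host-side sketch (an illustration, not part of QUDA) of
   * the mapping performed by do_link_format_cpu_to_gpu_milc for QUDA_RECONSTRUCT_NO
   * (9 Float2 per link).  The source is site-major MILC order, the destination the
   * strided GPU order; V stands for Vh (plus ghostV with MULTI_GPU) and stride for
   * Vh + pad.  The function name is hypothetical. */
  template <typename Float2>
  static void link_format_cpu_to_gpu_milc_reference(Float2* dst, const Float2* src,
                                                    int V, int stride)
  {
    for (int i = 0; i < V; i++)              // site index within this parity
      for (int dir = 0; dir < 4; dir++)      // x,y,z,t links stored per site
        for (int j = 0; j < 9; j++)          // 9 Float2 per link
          dst[dir*9*stride + j*stride + i] = src[i*4*9 + dir*9 + j];
  }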

  void
  link_format_cpu_to_gpu(void* dst, void* src,
                         int reconstruct, int Vh, int pad,
                         int ghostV,
                         QudaPrecision prec, QudaGaugeFieldOrder cpu_order,
                         cudaStream_t stream)
  {
    dim3 blockDim(BLOCKSIZE);

    if(cpu_order == QUDA_QDP_GAUGE_ORDER){
#ifdef MULTI_GPU
      size_t threads = Vh + ghostV;
#else
      size_t threads = Vh;
#endif
      dim3 gridDim((threads + BLOCKSIZE - 1)/BLOCKSIZE);

      switch (prec){
      case QUDA_DOUBLE_PRECISION:
        switch (reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu<9><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu<6><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported\n");
        }
        break;

      case QUDA_SINGLE_PRECISION:
        switch (reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu<9><<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu<3><<<gridDim, blockDim, 0, stream>>>((float4*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported\n");
        }
        break;

      default:
        errorQuda("ERROR: half precision is not supported in %s\n", __FUNCTION__);
      }
    } else if (cpu_order == QUDA_MILC_GAUGE_ORDER){
#ifdef MULTI_GPU
      int threads = 4*(Vh + ghostV);
#else
      int threads = 4*Vh;
#endif
      dim3 gridDim((threads + BLOCKSIZE - 1)/BLOCKSIZE);

      switch (prec){
      case QUDA_DOUBLE_PRECISION:
        switch (reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu_milc<9><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu_milc<6><<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported\n");
        }
        break;

      case QUDA_SINGLE_PRECISION:
        switch (reconstruct){
        case QUDA_RECONSTRUCT_NO:
          do_link_format_cpu_to_gpu_milc<9><<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        case QUDA_RECONSTRUCT_12:
          do_link_format_cpu_to_gpu_milc<3><<<gridDim, blockDim, 0, stream>>>((float4*)dst, (float2*)src, reconstruct, Vh, pad, ghostV, threads);
          break;
        default:
          errorQuda("reconstruct type not supported\n");
        }
        break;

      default:
        errorQuda("ERROR: half precision is not supported in %s\n", __FUNCTION__);
      }

    }else{
      errorQuda("ERROR: invalid cpu ordering (%d)\n", cpu_order);
    }

    return;
  }

  /*
   * src format: the normal link format on the GPU, with stride size @stride;
   *             each link is stored as 9 double2's
   * dst format: an array of links where the x,y,z,t links of the same site are
   *             stored next to each other; this format is used as the destination
   *             of the fat-link computation on the CPU
   *
   * Without loss of generality, the parity is assumed to be even.
   * The actual GPU data layout is
   *   [a0 a1][a18 a19] ... [pad][a2 a3][a20 a21] ... [b0 b1][b18 b19] ...
   *    X links                                        Y links, then Z, T links
   * The temporary data staged in GPU shared memory, and the CPU data layout, are
   *   [a0 a1 ... a17][b0 b1 ... b17][c0 c1 ... c17][d0 d1 ... d17][a18 a19 ... a35] ...
   *   |<--------------------------- site 0 ------------------------->|<--- site 1 --->
   *
   * In the loading phase the indices of the threads in the first block are as
   * follows (each half warp works on one direction; block size assumed to be 64):
   *
   *   threadIdx.x    pos_idx    mydir
   *    0 .. 15       0 .. 15      0
   *   16 .. 31       0 .. 15      1
   *   32 .. 47       0 .. 15      2
   *   48 .. 63       0 .. 15      3
   */

  template<typename FloatN>
  __global__ void
  do_link_format_gpu_to_cpu(FloatN* dst, FloatN* src,
                            int Vh, int stride)
  {
    __shared__ FloatN buf[gaugeSiteSize/2*BLOCKSIZE];

    int j;

    int block_idx = blockIdx.x*blockDim.x/4;
    int local_idx = 16*(threadIdx.x/64) + threadIdx.x%16;
    int pos_idx = blockIdx.x*blockDim.x/4 + 16*(threadIdx.x/64) + threadIdx.x%16;
    int mydir = (threadIdx.x >> 4) % 4;
    for(j = 0; j < 9; j++){
      buf[local_idx*4*9 + mydir*9 + j] = src[pos_idx + mydir*9*stride + j*stride];
    }
    __syncthreads();

    for(j = 0; j < 9; j++){
      dst[block_idx*9*4 + j*blockDim.x + threadIdx.x] = buf[j*blockDim.x + threadIdx.x];
    }
  }
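
  /* For reference only: a host-side sketch (not part of QUDA, hypothetical name) of
   * the net effect of do_link_format_gpu_to_cpu: the strided GPU layout is gathered
   * back into the site-major layout described in the comment above.  This is the
   * inverse of the cpu_to_gpu_milc mapping. */
  template <typename FloatN>
  static void link_format_gpu_to_cpu_reference(FloatN* dst, const FloatN* src,
                                               int Vh, int stride)
  {
    for (int i = 0; i < Vh; i++)             // site index within this parity
      for (int dir = 0; dir < 4; dir++)      // x,y,z,t links stored per site
        for (int j = 0; j < 9; j++)          // 9 FloatN per link
          dst[i*4*9 + dir*9 + j] = src[dir*9*stride + j*stride + i];
  }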


  void
  link_format_gpu_to_cpu(void* dst, void* src,
                         int Vh, int stride, QudaPrecision prec, cudaStream_t stream)
  {
    dim3 blockDim(BLOCKSIZE);
    dim3 gridDim(4*Vh/blockDim.x); // every 4 threads process one site's x,y,z,t links
    // 4*Vh must be a multiple of BLOCKSIZE or the kernel does not work
    if ((4*Vh) % blockDim.x != 0){
      errorQuda("ERROR: 4*Vh(%d) is not a multiple of blocksize(%d), exiting\n", 4*Vh, blockDim.x);
    }
    if(prec == QUDA_DOUBLE_PRECISION){
      do_link_format_gpu_to_cpu<<<gridDim, blockDim, 0, stream>>>((double2*)dst, (double2*)src, Vh, stride);
    }else if(prec == QUDA_SINGLE_PRECISION){
      do_link_format_gpu_to_cpu<<<gridDim, blockDim, 0, stream>>>((float2*)dst, (float2*)src, Vh, stride);
    }else{
      printf("ERROR: half precision is not supported in %s\n", __FUNCTION__);
      exit(1);
    }
  }
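
  /* A minimal usage sketch (hypothetical buffer names, not code taken from QUDA):
   * copy one parity of a gauge field from the strided GPU layout into a site-major
   * staging buffer on the device.  d_milc must hold 4*Vh links of gaugeSiteSize
   * real numbers, and 4*Vh must be a multiple of BLOCKSIZE (64). */
  static void link_format_gpu_to_cpu_example(void* d_milc, void* d_gauge_even,
                                             int Vh, int stride, cudaStream_t stream)
  {
    link_format_gpu_to_cpu(d_milc, d_gauge_even, Vh, stride,
                           QUDA_DOUBLE_PRECISION, stream);
  }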

#define READ_ST_STAPLE(staple, idx, mystride)   \
  Float2 P0 = staple[idx + 0*mystride];         \
  Float2 P1 = staple[idx + 1*mystride];         \
  Float2 P2 = staple[idx + 2*mystride];         \
  Float2 P3 = staple[idx + 3*mystride];         \
  Float2 P4 = staple[idx + 4*mystride];         \
  Float2 P5 = staple[idx + 5*mystride];         \
  Float2 P6 = staple[idx + 6*mystride];         \
  Float2 P7 = staple[idx + 7*mystride];         \
  Float2 P8 = staple[idx + 8*mystride];

#define WRITE_ST_STAPLE(staple, idx, mystride)  \
  staple[idx + 0*mystride] = P0;                \
  staple[idx + 1*mystride] = P1;                \
  staple[idx + 2*mystride] = P2;                \
  staple[idx + 3*mystride] = P3;                \
  staple[idx + 4*mystride] = P4;                \
  staple[idx + 5*mystride] = P5;                \
  staple[idx + 6*mystride] = P6;                \
  staple[idx + 7*mystride] = P7;                \
  staple[idx + 8*mystride] = P8;


  struct GhostStapleParam {
    const int in_stride;
    int X[4];
    GhostStapleParam(const int in_stride, const int X[4]) :
      in_stride(in_stride) {
      for (int i = 0; i < 4; i++) this->X[i] = X[i];
    }
  };

  template<int dir, int whichway, typename Float2>
  __global__ void
  collectGhostStapleKernel(Float2* out, Float2* in, int parity, GhostStapleParam param)
  {
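    /* Decompose the checkerboard site index sid into full-lattice coordinates
     * (x1,x2,x3,x4) = (x,y,z,t) for the given parity, so that sites on the
     * requested boundary face can be identified below. */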
    int sid = blockIdx.x*blockDim.x + threadIdx.x;
    int z1 = sid / (param.X[0] >> 1);
    int x1h = sid - z1*(param.X[0] >> 1);
    int z2 = z1 / param.X[1];
    int x2 = z1 - z2*param.X[1];
    int x4 = z2 / param.X[2];
    int x3 = z2 - x4*param.X[2];
    int x1odd = (x2 + x3 + x4 + parity) & 1;
    int x1 = 2*x1h + x1odd;

    READ_ST_STAPLE(in, sid, param.in_stride);
    int ghost_face_idx;

    if (dir == 0 && whichway == QUDA_BACKWARDS){
      if (x1 < 1){
        ghost_face_idx = (x4*(param.X[2]*param.X[1]) + x3*param.X[1] + x2) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[1]/2);
      }
    }

    if (dir == 0 && whichway == QUDA_FORWARDS){
      if (x1 >= param.X[0] - 1){
        ghost_face_idx = (x4*(param.X[2]*param.X[1]) + x3*param.X[1] + x2) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[1]/2);
      }
    }

    if (dir == 1 && whichway == QUDA_BACKWARDS){
      if (x2 < 1){
        ghost_face_idx = (x4*param.X[2]*param.X[0] + x3*param.X[0] + x1) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[0]/2);
      }
    }

    if (dir == 1 && whichway == QUDA_FORWARDS){
      if (x2 >= param.X[1] - 1){
        ghost_face_idx = (x4*param.X[2]*param.X[0] + x3*param.X[0] + x1) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[2]*param.X[0]/2);
      }
    }

    if (dir == 2 && whichway == QUDA_BACKWARDS){
      if (x3 < 1){
        ghost_face_idx = (x4*param.X[1]*param.X[0] + x2*param.X[0] + x1) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[1]*param.X[0]/2);
      }
    }

    if (dir == 2 && whichway == QUDA_FORWARDS){
      if (x3 >= param.X[2] - 1){
        ghost_face_idx = (x4*param.X[1]*param.X[0] + x2*param.X[0] + x1) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[3]*param.X[1]*param.X[0]/2);
      }
    }

    if (dir == 3 && whichway == QUDA_BACKWARDS){
      if (x4 < 1){
        ghost_face_idx = (x3*param.X[1]*param.X[0] + x2*param.X[0] + x1) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[2]*param.X[1]*param.X[0]/2);
      }
    }

    if (dir == 3 && whichway == QUDA_FORWARDS){
      if (x4 >= param.X[3] - 1){
        ghost_face_idx = (x3*param.X[1]*param.X[0] + x2*param.X[0] + x1) >> 1;
        WRITE_ST_STAPLE(out, ghost_face_idx, param.X[2]*param.X[1]*param.X[0]/2);
      }
    }
  }


  //@dir can be 0, 1, 2, 3 (X,Y,Z,T directions)
  //@whichway can be QUDA_FORWARDS or QUDA_BACKWARDS
  void
  collectGhostStaple(int* X, void* even, void* odd, int volumeCB, int stride, QudaPrecision precision,
                     void* ghost_staple_gpu,
                     int dir, int whichway, cudaStream_t* stream)
  {
    int Vsh_x, Vsh_y, Vsh_z, Vsh_t;

    Vsh_x = X[1]*X[2]*X[3]/2;
    Vsh_y = X[0]*X[2]*X[3]/2;
    Vsh_z = X[0]*X[1]*X[3]/2;
    Vsh_t = X[0]*X[1]*X[2]/2;

    dim3 gridDim(volumeCB/BLOCKSIZE, 1, 1);
    dim3 blockDim(BLOCKSIZE, 1, 1);
    int Vsh[4] = {Vsh_x, Vsh_y, Vsh_z, Vsh_t};

    void* gpu_buf_even = ghost_staple_gpu;
    void* gpu_buf_odd = ((char*)ghost_staple_gpu) + Vsh[dir]*gaugeSiteSize*precision;
    if (X[dir] % 2 == 1){ // need to switch even/odd
      gpu_buf_odd = ghost_staple_gpu;
      gpu_buf_even = ((char*)ghost_staple_gpu) + Vsh[dir]*gaugeSiteSize*precision;
    }

    int even_parity = 0;
    int odd_parity = 1;
    GhostStapleParam param(stride, X);

    if (precision == QUDA_DOUBLE_PRECISION){
      switch(dir){
      case 0:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;

      case 1:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;

      case 2:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;

      case 3:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_even, (double2*)even, even_parity, param);
          collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((double2*)gpu_buf_odd, (double2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;
      }
    }else if(precision == QUDA_SINGLE_PRECISION){
      switch(dir){
      case 0:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<0, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<0, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;

      case 1:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<1, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<1, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;

      case 2:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<2, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<2, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;

      case 3:
        switch(whichway){
        case QUDA_BACKWARDS:
          collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<3, QUDA_BACKWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        case QUDA_FORWARDS:
          collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_even, (float2*)even, even_parity, param);
          collectGhostStapleKernel<3, QUDA_FORWARDS><<<gridDim, blockDim, 0, *stream>>>((float2*)gpu_buf_odd, (float2*)odd, odd_parity, param);
          break;
        default:
          errorQuda("Invalid whichway");
          break;
        }
        break;
      }
    }else{
      printf("ERROR: invalid precision for %s\n", __FUNCTION__);
      exit(1);
    }

  }
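
  /* A minimal usage sketch (hypothetical names, not code taken from QUDA): gather
   * the backward X-face of a staple field, both parities, into a caller-provided
   * device buffer.  The buffer must hold 2*Vsh_x links, i.e. X[1]*X[2]*X[3] sites
   * of gaugeSiteSize doubles; volumeCB must be a multiple of BLOCKSIZE (64). */
  static void collectGhostStapleExample(int* X, void* staple_even, void* staple_odd,
                                        int volumeCB, int stride,
                                        void* ghost_buf, cudaStream_t* stream)
  {
    collectGhostStaple(X, staple_even, staple_odd, volumeCB, stride,
                       QUDA_DOUBLE_PRECISION, ghost_buf,
                       0 /* X direction */, QUDA_BACKWARDS, stream);
  }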

} // namespace quda

#undef gaugeSiteSize
#undef BLOCKSIZE