QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_paths_force_quda.cu
Go to the documentation of this file.
1 #include <read_gauge.h>
2 #include <gauge_field.h>
3 #include <hisq_force_quda.h>
4 #include <hw_quda.h>
5 #include <hisq_force_macros.h>
6 #include<utility>
7 
8 
9 //DEBUG : control compile
10 #define COMPILE_HISQ_DP_18
11 #define COMPILE_HISQ_DP_12
12 #define COMPILE_HISQ_SP_18
13 #define COMPILE_HISQ_SP_12
14 
15 // Disable texture read for now. Need to revisit this.
16 #define HISQ_SITE_MATRIX_LOAD_TEX 1
17 #define HISQ_NEW_OPROD_LOAD_TEX 1
18 
19 namespace quda {
20  namespace fermion_force {
21 
22  typedef struct hisq_kernel_param_s{
23  unsigned long threads;
24  int D1, D2,D3, D4, D1h;
25  int base_idx[4];
26  int ghostDim[4];
28 
29  texture<int4, 1> newOprod0TexDouble;
30  texture<int4, 1> newOprod1TexDouble;
31  texture<float2, 1, cudaReadModeElementType> newOprod0TexSingle;
32  texture<float2, 1, cudaReadModeElementType> newOprod1TexSingle;
33 
34 
36  {
37  static int hisq_force_init_cuda_flag = 0;
38 
39  if (hisq_force_init_cuda_flag){
40  return;
41  }
42  hisq_force_init_cuda_flag=1;
43 
44  int Vh = param->X[0]*param->X[1]*param->X[2]*param->X[3]/2;
45 
46  fat_force_const_t hf_h;
47 #ifdef MULTI_GPU
48  int Vh_ex = (param->X[0]+4)*(param->X[1]+4)*(param->X[2]+4)*(param->X[3]+4)/2;
49  hf_h.site_ga_stride = Vh_ex + param->site_ga_pad;;
51 #else
52  hf_h.site_ga_stride = Vh + param->site_ga_pad;
53  hf_h.color_matrix_stride = Vh;
54 #endif
55  hf_h.mom_ga_stride = Vh + param->mom_ga_pad;
56 
57  cudaMemcpyToSymbol(hf, &hf_h, sizeof(fat_force_const_t));
58 
60  }
61 
62 
63 
64 
65 
66  // struct for holding the fattening path coefficients
67  template<class Real>
69  {
70  Real one;
71  Real three;
72  Real five;
73  Real seven;
74  Real naik;
75  Real lepage;
76  };
77 
78 
79  inline __device__ float2 operator*(float a, const float2 & b)
80  {
81  return make_float2(a*b.x,a*b.y);
82  }
83 
84  inline __device__ double2 operator*(double a, const double2 & b)
85  {
86  return make_double2(a*b.x,a*b.y);
87  }
88 
89  inline __device__ const float2 & operator+=(float2 & a, const float2 & b)
90  {
91  a.x += b.x;
92  a.y += b.y;
93  return a;
94  }
95 
96  inline __device__ const double2 & operator+=(double2 & a, const double2 & b)
97  {
98  a.x += b.x;
99  a.y += b.y;
100  return a;
101  }
102 
103  inline __device__ const float4 & operator+=(float4 & a, const float4 & b)
104  {
105  a.x += b.x;
106  a.y += b.y;
107  a.z += b.z;
108  a.w += b.w;
109  return a;
110  }
112  // Replication of code
113  // This structure is already defined in
114  // unitarize_utilities.h
116  template<class T>
117  struct RealTypeId;
118 
119  template<>
120  struct RealTypeId<float2>
121  {
122  typedef float Type;
123  };
125  template<>
126  struct RealTypeId<double2>
127  {
128  typedef double Type;
129  };
130 
132  template<class T>
133  inline __device__
135  {
136 #define CONJ_INDEX(i,j) j*3 + i
138  T tmp;
139  mat[CONJ_INDEX(0,0)] = conj(mat[0]);
140  mat[CONJ_INDEX(1,1)] = conj(mat[4]);
141  mat[CONJ_INDEX(2,2)] = conj(mat[8]);
142  tmp = conj(mat[1]);
143  mat[CONJ_INDEX(1,0)] = conj(mat[3]);
144  mat[CONJ_INDEX(0,1)] = tmp;
145  tmp = conj(mat[2]);
146  mat[CONJ_INDEX(2,0)] = conj(mat[6]);
147  mat[CONJ_INDEX(0,2)] = tmp;
148  tmp = conj(mat[5]);
149  mat[CONJ_INDEX(2,1)] = conj(mat[7]);
150  mat[CONJ_INDEX(1,2)] = tmp;
151 
152 #undef CONJ_INDEX
153  return;
154  }
155 
156 
157  template<int N, class T>
158  inline __device__
159  void loadMatrixFromField(const T* const field_even, const T* const field_odd,
160  int dir, int idx, T* const mat, int oddness, int stride)
161  {
162  const T* const field = (oddness)?field_odd:field_even;
163  for(int i = 0;i < N ;i++){
164  mat[i] = field[idx + dir*N*stride + i*stride];
165  }
166  return;
167  }
168 
169  template<class T>
170  inline __device__
171  void loadMatrixFromField(const T* const field_even, const T* const field_odd,
172  int dir, int idx, T* const mat, int oddness, int stride)
173  {
174  loadMatrixFromField<9> (field_even, field_odd, dir, idx, mat, oddness, stride);
175  return;
176  }
177 
178 
179 
180  inline __device__
181  void loadMatrixFromField(const float4* const field_even, const float4* const field_odd,
182  int dir, int idx, float2* const mat, int oddness, int stride)
183  {
184  const float4* const field = oddness?field_odd: field_even;
185  float4 tmp;
186  tmp = field[idx + dir*stride*3];
187  mat[0] = make_float2(tmp.x, tmp.y);
188  mat[1] = make_float2(tmp.z, tmp.w);
189  tmp = field[idx + dir*stride*3 + stride];
190  mat[2] = make_float2(tmp.x, tmp.y);
191  mat[3] = make_float2(tmp.z, tmp.w);
192  tmp = field[idx + dir*stride*3 + 2*stride];
193  mat[4] = make_float2(tmp.x, tmp.y);
194  mat[5] = make_float2(tmp.z, tmp.w);
195  return;
196  }
197 
198  template<class T>
199  inline __device__
200  void loadMatrixFromField(const T* const field_even, const T* const field_odd, int idx, T* const mat, int oddness, int stride)
201  {
202  const T* const field = (oddness)?field_odd:field_even;
203  mat[0] = field[idx];
204  mat[1] = field[idx + stride];
205  mat[2] = field[idx + stride*2];
206  mat[3] = field[idx + stride*3];
207  mat[4] = field[idx + stride*4];
208  mat[5] = field[idx + stride*5];
209  mat[6] = field[idx + stride*6];
210  mat[7] = field[idx + stride*7];
211  mat[8] = field[idx + stride*8];
212 
213  return;
214  }
215 
216 
217 #define addMatrixToNewOprod(mat, dir, idx, coeff, field_even, field_odd, oddness) do { \
218  RealA* const field = (oddness)?field_odd: field_even; \
219  RealA value[9]; \
220  value[0] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9); \
221  value[1] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + hf.color_matrix_stride); \
222  value[2] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 2*hf.color_matrix_stride); \
223  value[3] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 3*hf.color_matrix_stride); \
224  value[4] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 4*hf.color_matrix_stride); \
225  value[5] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 5*hf.color_matrix_stride); \
226  value[6] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 6*hf.color_matrix_stride); \
227  value[7] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 7*hf.color_matrix_stride); \
228  value[8] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 8*hf.color_matrix_stride); \
229  field[idx + dir*hf.color_matrix_stride*9] = value[0] + coeff*mat[0]; \
230  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride] = value[1] + coeff*mat[1]; \
231  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*2] = value[2] + coeff*mat[2]; \
232  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*3] = value[3] + coeff*mat[3]; \
233  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*4] = value[4] + coeff*mat[4]; \
234  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*5] = value[5] + coeff*mat[5]; \
235  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*6] = value[6] + coeff*mat[6]; \
236  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*7] = value[7] + coeff*mat[7]; \
237  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*8] = value[8] + coeff*mat[8]; \
238  }while(0)
239 
240 
241 
242  // only works if Promote<T,U>::Type = T
243 
244  template<class T, class U>
245  inline __device__
246  void addMatrixToField(const T* const mat, int dir, int idx, U coeff,
247  T* const field_even, T* const field_odd, int oddness)
248  {
249  T* const field = (oddness)?field_odd: field_even;
250  field[idx + dir*hf.color_matrix_stride*9] += coeff*mat[0];
251  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride] += coeff*mat[1];
252  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*2] += coeff*mat[2];
253  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*3] += coeff*mat[3];
254  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*4] += coeff*mat[4];
255  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*5] += coeff*mat[5];
256  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*6] += coeff*mat[6];
257  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*7] += coeff*mat[7];
258  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*8] += coeff*mat[8];
259 
260  return;
261  }
262 
263 
264  template<class T, class U>
265  inline __device__
266  void addMatrixToField(const T* const mat, int idx, U coeff, T* const field_even,
267  T* const field_odd, int oddness)
268  {
269  T* const field = (oddness)?field_odd: field_even;
270  field[idx ] += coeff*mat[0];
271  field[idx + hf.color_matrix_stride] += coeff*mat[1];
272  field[idx + hf.color_matrix_stride*2] += coeff*mat[2];
273  field[idx + hf.color_matrix_stride*3] += coeff*mat[3];
274  field[idx + hf.color_matrix_stride*4] += coeff*mat[4];
275  field[idx + hf.color_matrix_stride*5] += coeff*mat[5];
276  field[idx + hf.color_matrix_stride*6] += coeff*mat[6];
277  field[idx + hf.color_matrix_stride*7] += coeff*mat[7];
278  field[idx + hf.color_matrix_stride*8] += coeff*mat[8];
279 
280  return;
281  }
282 
283  template<class T, class U>
284  inline __device__
285  void addMatrixToField_test(const T* const mat, int idx, U coeff, T* const field_even,
286  T* const field_odd, int oddness)
287  {
288  T* const field = (oddness)?field_odd: field_even;
289  //T oldvalue=field[idx];
290  field[idx ] += coeff*mat[0];
291  field[idx + hf.color_matrix_stride] += coeff*mat[1];
292  field[idx + hf.color_matrix_stride*2] += coeff*mat[2];
293  field[idx + hf.color_matrix_stride*3] += coeff*mat[3];
294  field[idx + hf.color_matrix_stride*4] += coeff*mat[4];
295  field[idx + hf.color_matrix_stride*5] += coeff*mat[5];
296  field[idx + hf.color_matrix_stride*6] += coeff*mat[6];
297  field[idx + hf.color_matrix_stride*7] += coeff*mat[7];
298  field[idx + hf.color_matrix_stride*8] += coeff*mat[8];
299 
300 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
301  printf("value is coeff(%f) * mat[0].x(%f)=%f\n", coeff, mat[0].x, field[idx].x);
302 #endif
303  return;
304  }
305 
306  template<class T>
307  inline __device__
308  void storeMatrixToField(const T* const mat, int dir, int idx, T* const field_even, T* const field_odd, int oddness)
309  {
310  T* const field = (oddness)?field_odd: field_even;
311  field[idx + dir*hf.color_matrix_stride*9] = mat[0];
312  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride] = mat[1];
313  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*2] = mat[2];
314  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*3] = mat[3];
315  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*4] = mat[4];
316  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*5] = mat[5];
317  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*6] = mat[6];
318  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*7] = mat[7];
319  field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*8] = mat[8];
320 
321  return;
322  }
323 
324 
325  template<class T>
326  inline __device__
327  void storeMatrixToField(const T* const mat, int idx, T* const field_even, T* const field_odd, int oddness)
328  {
329  T* const field = (oddness)?field_odd: field_even;
330  field[idx] = mat[0];
331  field[idx + hf.color_matrix_stride] = mat[1];
332  field[idx + hf.color_matrix_stride*2] = mat[2];
333  field[idx + hf.color_matrix_stride*3] = mat[3];
334  field[idx + hf.color_matrix_stride*4] = mat[4];
335  field[idx + hf.color_matrix_stride*5] = mat[5];
336  field[idx + hf.color_matrix_stride*6] = mat[6];
337  field[idx + hf.color_matrix_stride*7] = mat[7];
338  field[idx + hf.color_matrix_stride*8] = mat[8];
339 
340  return;
341  }
342 
343 
344  template<class T, class U>
345  inline __device__
346  void storeMatrixToMomentumField(const T* const mat, int dir, int idx, U coeff,
347  T* const mom_even, T* const mom_odd, int oddness)
348  {
349  T* const mom_field = (oddness)?mom_odd:mom_even;
350  T temp2;
351  temp2.x = (mat[1].x - mat[3].x)*0.5*coeff;
352  temp2.y = (mat[1].y + mat[3].y)*0.5*coeff;
353  mom_field[idx + dir*hf.mom_ga_stride*5] = temp2;
354 
355  temp2.x = (mat[2].x - mat[6].x)*0.5*coeff;
356  temp2.y = (mat[2].y + mat[6].y)*0.5*coeff;
357  mom_field[idx + dir*hf.mom_ga_stride*5 + hf.mom_ga_stride] = temp2;
358 
359  temp2.x = (mat[5].x - mat[7].x)*0.5*coeff;
360  temp2.y = (mat[5].y + mat[7].y)*0.5*coeff;
361  mom_field[idx + dir*hf.mom_ga_stride*5 + hf.mom_ga_stride*2] = temp2;
362 
363  const typename RealTypeId<T>::Type temp = (mat[0].y + mat[4].y + mat[8].y)*0.3333333333333333333333333;
364  temp2.x = (mat[0].y-temp)*coeff;
365  temp2.y = (mat[4].y-temp)*coeff;
366  mom_field[idx + dir*hf.mom_ga_stride*5 + hf.mom_ga_stride*3] = temp2;
367 
368  temp2.x = (mat[8].y - temp)*coeff;
369  temp2.y = 0.0;
370  mom_field[idx + dir*hf.mom_ga_stride*5 + hf.mom_ga_stride*4] = temp2;
371 
372  return;
373  }
374 
375  // Struct to determine the coefficient sign at compile time
376  template<int pos_dir, int odd_lattice>
377  struct CoeffSign
378  {
379  static const int result = -1;
380  };
381 
382  template<>
383  struct CoeffSign<0,1>
384  {
385  static const int result = -1;
386  };
387 
388  template<>
389  struct CoeffSign<0,0>
390  {
391  static const int result = 1;
392  };
393 
394  template<>
395  struct CoeffSign<1,1>
396  {
397  static const int result = 1;
398  };
399 
400  template<int odd_lattice>
401  struct Sign
402  {
403  static const int result = 1;
404  };
405 
406  template<>
407  struct Sign<1>
408  {
409  static const int result = -1;
410  };
411 
412  template<class RealX>
413  struct ArrayLength
414  {
415  static const int result=9;
416  };
417 
418  template<>
419  struct ArrayLength<float4>
420  {
421  static const int result=5;
422  };
423 
424 
425 
426 
427 
428  // reconstructSign doesn't do anything right now,
429  // but it will, soon.
430  template<typename T>
431  __device__ void reconstructSign(int* const sign, int dir, const T i[4]){
432 
433 
434  *sign=1;
435 
436  switch(dir){
437  case XUP:
438  if( (i[3]&1)==1) *sign=-1;
439  break;
440 
441  case YUP:
442  if( ((i[3]+i[0])&1) == 1) *sign=-1;
443  break;
444 
445  case ZUP:
446  if( ((i[3]+i[0]+i[1])&1) == 1) *sign=-1;
447  break;
448 
449  case TUP:
450 #ifdef MULTI_GPU
451  if( (i[3] == X4+1 && PtNm1)
452  || (i[3] == 1 && Pt0)) {
453  *sign=-1;
454  }
455 #else
456  if(i[3] == X4m1) *sign=-1;
457 #endif
458  break;
459 
460  default:
461 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
462  printf("Error: invalid dir\n");
463 #endif
464  break;
465  }
466  return;
467  }
468 
469 
470 
471 
472 
473 
474  template<class RealA, int oddBit>
475  __global__ void
476  do_one_link_term_kernel(const RealA* const oprodEven, const RealA* const oprodOdd,
477  int sig, typename RealTypeId<RealA>::Type coeff,
478  RealA* const outputEven, RealA* const outputOdd, const int threads)
479  {
480  int sid = blockIdx.x * blockDim.x + threadIdx.x;
481  if (sid >= threads) return;
482 #ifdef MULTI_GPU
483  int x[4];
484  int z1 = sid/X1h;
485  int x1h = sid - z1*X1h;
486  int z2 = z1/X2;
487  x[1] = z1 - z2*X2;
488  x[3] = z2/X3;
489  x[2] = z2 - x[3]*X3;
490  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
491  x[0] = 2*x1h + x1odd;
492  //int X = 2*sid + x1odd;
493 
494  int new_sid = ( (x[3]+2)*E3E2E1+(x[2]+2)*E2E1+(x[1]+2)*E1+(x[0]+2))>>1 ;
495 #else
496  int new_sid = sid;
497 #endif
499  if(GOES_FORWARDS(sig)){
500  loadMatrixFromField(oprodEven, oprodOdd, sig, new_sid, COLOR_MAT_W, oddBit, hf.color_matrix_stride);
501  addMatrixToField(COLOR_MAT_W, sig, new_sid, coeff, outputEven, outputOdd, oddBit);
502  }
503  return;
504  }
505 
506 
507 #define DD_CONCAT(n,r) n ## r ## kernel
508 
509 #define HISQ_KERNEL_NAME(a,b) DD_CONCAT(a,b)
510  //precision: 0 is for double, 1 is for single
511 
512 #define NEWOPROD_EVEN_TEX newOprod0TexDouble
513 #define NEWOPROD_ODD_TEX newOprod1TexDouble
514 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
515 #define LOAD_TEX_ENTRY(tex, field, idx) READ_DOUBLE2_TEXTURE(tex, field, idx)
516 #else
517 #define LOAD_TEX_ENTRY(tex, field, idx) field[idx]
518 #endif
519 
520  //double precision, recon=18
521 #define PRECISION 0
522 #define RECON 18
523 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
524 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_18_DOUBLE_TEX((oddness)?siteLink1TexDouble:siteLink0TexDouble, (oddness)?linkOdd:linkEven, dir, idx, var, hf.site_ga_stride)
525 #else
526 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
527 #endif
528 #define COMPUTE_LINK_SIGN(sign, dir, x)
529 #define RECONSTRUCT_SITE_LINK(var, sign)
530 #include "hisq_paths_force_core.h"
531 #undef PRECISION
532 #undef RECON
533 #undef HISQ_LOAD_LINK
534 #undef COMPUTE_LINK_SIGN
535 #undef RECONSTRUCT_SITE_LINK
536 
537  //double precision, recon=12
538 #define PRECISION 0
539 #define RECON 12
540 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
541 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_12_DOUBLE_TEX((oddness)?siteLink1TexDouble:siteLink0TexDouble, (oddness)?linkOdd:linkEven,dir, idx, var, hf.site_ga_stride)
542 #else
543 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField<6>(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
544 #endif
545 #define COMPUTE_LINK_SIGN(sign, dir, x) reconstructSign(sign, dir, x)
546 #define RECONSTRUCT_SITE_LINK(var, sign) FF_RECONSTRUCT_LINK_12(var, sign)
547 #include "hisq_paths_force_core.h"
548 #undef PRECISION
549 #undef RECON
550 #undef HISQ_LOAD_LINK
551 #undef COMPUTE_LINK_SIGN
552 #undef RECONSTRUCT_SITE_LINK
553 #undef NEWOPROD_EVEN_TEX
554 #undef NEWOPROD_ODD_TEX
555 #undef LOAD_TEX_ENTRY
556 
557 
558 #define NEWOPROD_EVEN_TEX newOprod0TexSingle
559 #define NEWOPROD_ODD_TEX newOprod1TexSingle
560 
561 #if (HISQ_NEW_OPROD_LOAD_TEX==1)
562 #define LOAD_TEX_ENTRY(tex, field, idx) tex1Dfetch(tex,idx)
563 #else
564 #define LOAD_TEX_ENTRY(tex, field, idx) field[idx]
565 #endif
566 
567  //single precision, recon=18
568 #define PRECISION 1
569 #define RECON 18
570 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
571 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_18_SINGLE_TEX((oddness)?siteLink1TexSingle:siteLink0TexSingle, dir, idx, var, hf.site_ga_stride)
572 #else
573 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
574 #endif
575 #define COMPUTE_LINK_SIGN(sign, dir, x)
576 #define RECONSTRUCT_SITE_LINK(var, sign)
577 #include "hisq_paths_force_core.h"
578 #undef PRECISION
579 #undef RECON
580 #undef HISQ_LOAD_LINK
581 #undef COMPUTE_LINK_SIGN
582 #undef RECONSTRUCT_SITE_LINK
583 
584  //single precision, recon=12
585 #define PRECISION 1
586 #define RECON 12
587 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
588 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_12_SINGLE_TEX((oddness)?siteLink1TexSingle_recon:siteLink0TexSingle_recon, dir, idx, var, hf.site_ga_stride)
589 #else
590 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
591 #endif
592 #define COMPUTE_LINK_SIGN(sign, dir, x) reconstructSign(sign, dir, x)
593 #define RECONSTRUCT_SITE_LINK(var, sign) FF_RECONSTRUCT_LINK_12(var, sign)
594 #include "hisq_paths_force_core.h"
595 #undef PRECISION
596 #undef RECON
597 #undef HISQ_LOAD_LINK
598 #undef COMPUTE_LINK_SIGN
599 #undef RECONSTRUCT_SITE_LINK
600 #undef NEWOPROD_EVEN_TEX
601 #undef NEWOPROD_ODD_TEX
602 #undef LOAD_TEX_ENTRY
603 
604 
605 
606 
607  template<class RealA, class RealB>
608  class MiddleLink : public Tunable {
609 
610  private:
611  const cudaGaugeField &link;
612  const cudaGaugeField &oprod;
613  const cudaGaugeField &Qprev;
614  const int sig;
615  const int mu;
616  const typename RealTypeId<RealA>::Type &coeff;
617  cudaGaugeField &Pmu;
618  cudaGaugeField &P3;
619  cudaGaugeField &Qmu;
620  cudaGaugeField &newOprod;
621  const hisq_kernel_param_t &kparam;
622 
623  int sharedBytesPerThread() const { return 0; }
624  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
625 
626 
627  // don't tune the grid dimension
628  bool advanceGridDim(TuneParam &param) const { return false; }
629 
630  // generalize Tunable::advanceBlockDim() to also set gridDim, with extra checking to ensure that gridDim isn't too large for the device
631  bool advanceBlockDim(TuneParam &param) const
632  {
633  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
634  const unsigned int max_blocks = deviceProp.maxGridSize[0];
635  const unsigned int max_shared = 16384; // FIXME: use deviceProp.sharedMemPerBlock;
636  const int step = deviceProp.warpSize;
637  bool ret;
638  param.block.x += step;
639  if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
640  param.block = dim3((kparam.threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough, given the limit on gridDim
641  param.block.x = ((param.block.x+step-1) / step) * step; // round up to the nearest "step"
642  if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
643  ret = false;
644  } else {
645  ret = true;
646  }
647  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
648  return ret;
649  }
650 
651  public:
653  const cudaGaugeField &oprod,
654  const cudaGaugeField &Qprev,
655  int sig, int mu,
656  const typename RealTypeId<RealA>::Type &coeff,
657  cudaGaugeField &Pmu, // write only
658  cudaGaugeField &P3, // write only
659  cudaGaugeField &Qmu,
660  cudaGaugeField &newOprod,
661  const hisq_kernel_param_t &kparam) :
662  link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
663  coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
664  { ; }
665  // need alternative constructor to hack around null pointer passing
667  const cudaGaugeField &oprod,
668  int sig, int mu,
669  const typename RealTypeId<RealA>::Type &coeff,
670  cudaGaugeField &Pmu, // write only
671  cudaGaugeField &P3, // write only
672  cudaGaugeField &Qmu,
673  cudaGaugeField &newOprod,
674  const hisq_kernel_param_t &kparam) :
675  link(link), oprod(oprod), Qprev(link), sig(sig), mu(mu),
676  coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
677  { ; }
678  virtual ~MiddleLink() { ; }
679 
680  TuneKey tuneKey() const {
681  std::stringstream vol, aux;
682  vol << kparam.D1 << "x";
683  vol << kparam.D2 << "x";
684  vol << kparam.D3 << "x";
685  vol << kparam.D4;
686  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
687  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
688  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
689  }
690 
691 
692 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
693  ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
694  (typeA*)Qprev_even, (typeA*)Qprev_odd, \
695  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
696  sig, mu, coeff, \
697  (typeA*)Pmu.Even_p(), (typeA*)Pmu.Odd_p(), \
698  (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
699  (typeA*)Qmu.Even_p(), (typeA*)Qmu.Odd_p(), \
700  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
701 
702 
703 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
704  if(oddness_change ==0 ){ \
705  if(sizeof(RealA) == sizeof(float2)){ \
706  if(recon == QUDA_RECONSTRUCT_NO){ \
707  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
708  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
709  }else{ \
710  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
711  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
712  } \
713  }else{ \
714  if(recon == QUDA_RECONSTRUCT_NO){ \
715  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
716  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
717  }else{ \
718  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
719  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
720  } \
721  } \
722  }else{ \
723  if(sizeof(RealA) == sizeof(float2)){ \
724  if(recon == QUDA_RECONSTRUCT_NO){ \
725  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
726  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
727  }else{ \
728  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
729  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
730  } \
731  }else{ \
732  if(recon == QUDA_RECONSTRUCT_NO){ \
733  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
734  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
735  }else{ \
736  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
737  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
738  } \
739  } \
740  }
741 
742 
743 
744  void apply(const cudaStream_t &stream) {
745  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
746  QudaReconstructType recon = link.Reconstruct();
747  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
748  + kparam.base_idx[2] + kparam.base_idx[3])&1;
749 
750  const void *Qprev_even = (&Qprev == &link) ? NULL : Qprev.Even_p();
751  const void *Qprev_odd = (&Qprev == &link) ? NULL : Qprev.Odd_p();
752 
753 
754  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
756  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
758  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
760  }else{
762  }
763  }
764 
765 #undef CALL_ARGUMENTS
766 #undef CALL_MIDDLE_LINK_KERNEL
767 
768  void preTune() {
769  Pmu.backup();
770  P3.backup();
771  Qmu.backup();
772  newOprod.backup();
773  }
774 
775  void postTune() {
776  Pmu.restore();
777  P3.restore();
778  Qmu.restore();
779  newOprod.restore();
780  }
781 
782  virtual void initTuneParam(TuneParam &param) const
783  {
784  const unsigned int max_threads = deviceProp.maxThreadsDim[0];
785  const unsigned int max_blocks = deviceProp.maxGridSize[0];
786  const int step = deviceProp.warpSize;
787  param.block = dim3((kparam.threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough, given the limit on gridDim
788  param.block.x = ((param.block.x+step-1) / step) * step; // round up to the nearest "step"
789  if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
790  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
791  param.shared_bytes = sharedBytesPerThread()*param.block.x > sharedBytesPerBlock(param) ?
792  sharedBytesPerThread()*param.block.x : sharedBytesPerBlock(param);
793  }
794 
796  void defaultTuneParam(TuneParam &param) const {
797  initTuneParam(param);
798  }
799 
800  long long flops() const { return 0; }
801  };
802 
803 
804  template<class RealA, class RealB>
805  class LepageMiddleLink : public Tunable {
806 
807  private:
808  const cudaGaugeField &link;
809  const cudaGaugeField &oprod;
810  const cudaGaugeField &Qprev;
811  const int sig;
812  const int mu;
813  const typename RealTypeId<RealA>::Type &coeff;
814  cudaGaugeField &P3; // write only
815  cudaGaugeField &newOprod;
816  const hisq_kernel_param_t &kparam;
817 
818  int sharedBytesPerThread() const { return 0; }
819  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
820 
821  // don't tune the grid dimension
822  bool advanceGridDim(TuneParam &param) const { return false; }
823  bool advanceBlockDim(TuneParam &param) const {
824  bool rtn = Tunable::advanceBlockDim(param);
825  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
826  return rtn;
827  }
828 
829  public:
831  const cudaGaugeField &oprod,
832  const cudaGaugeField &Qprev,
833  int sig, int mu,
834  const typename RealTypeId<RealA>::Type &coeff,
835  cudaGaugeField &P3, cudaGaugeField &newOprod,
836  const hisq_kernel_param_t &kparam) :
837  link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
838  coeff(coeff), P3(P3), newOprod(newOprod), kparam(kparam)
839  { ; }
840  virtual ~LepageMiddleLink() { ; }
841 
842  TuneKey tuneKey() const {
843  std::stringstream vol, aux;
844  vol << kparam.D1 << "x";
845  vol << kparam.D2 << "x";
846  vol << kparam.D3 << "x";
847  vol << kparam.D4;
848  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
849  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
850  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
851  }
852 
853 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
854  ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
855  (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
856  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
857  sig, mu, coeff, \
858  (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
859  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
860  kparam)
861 
862  #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
863  if(oddness_change == 0){ \
864  if(sizeof(RealA) == sizeof(float2)){ \
865  if(recon == QUDA_RECONSTRUCT_NO){ \
866  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
867  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
868  }else{ \
869  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
870  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
871  } \
872  }else{ \
873  if(recon == QUDA_RECONSTRUCT_NO){ \
874  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
875  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
876  }else{ \
877  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
878  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
879  } \
880  } \
881  }else{ \
882  if(sizeof(RealA) == sizeof(float2)){ \
883  if(recon == QUDA_RECONSTRUCT_NO){ \
884  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
885  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
886  }else{ \
887  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
888  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
889  } \
890  }else{ \
891  if(recon == QUDA_RECONSTRUCT_NO){ \
892  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
893  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
894  }else{ \
895  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
896  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
897  } \
898  } \
899  }
900 
901  void apply(const cudaStream_t &stream) {
902  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
903  QudaReconstructType recon = link.Reconstruct();
904  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
905  + kparam.base_idx[2] + kparam.base_idx[3])&1;
906 
907 
908  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
910  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
912  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
914  }else{
916  }
917 
918  }
919 
920 #undef CALL_ARGUMENTS
921 #undef CALL_MIDDLE_LINK_KERNEL
922 
923  void preTune() {
924  P3.backup();
925  newOprod.backup();
926  }
927 
928  void postTune() {
929  P3.restore();
930  newOprod.restore();
931  }
932 
933  virtual void initTuneParam(TuneParam &param) const
934  {
935  Tunable::initTuneParam(param);
936  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
937  }
938 
940  void defaultTuneParam(TuneParam &param) const
941  {
943  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
944  }
945 
946  long long flops() const { return 0; }
947  };
948 
949  template<class RealA, class RealB>
950  class SideLink : public Tunable {
951 
952  private:
953  const cudaGaugeField &link;
954  const cudaGaugeField &P3;
955  const cudaGaugeField &oprod;
956  const int sig;
957  const int mu;
958  const typename RealTypeId<RealA>::Type &coeff;
959  const typename RealTypeId<RealA>::Type &accumu_coeff;
960  cudaGaugeField &shortP;
961  cudaGaugeField &newOprod;
962  const hisq_kernel_param_t &kparam;
963 
964  int sharedBytesPerThread() const { return 0; }
965  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
966 
967  // don't tune the grid dimension
968  bool advanceGridDim(TuneParam &param) const { return false; }
969  bool advanceBlockDim(TuneParam &param) const {
970  bool rtn = Tunable::advanceBlockDim(param);
971  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
972  return rtn;
973  }
974 
975  public:
976  SideLink(const cudaGaugeField &link,
977  const cudaGaugeField &P3,
978  const cudaGaugeField &oprod,
979  int sig, int mu,
980  const typename RealTypeId<RealA>::Type &coeff,
981  const typename RealTypeId<RealA>::Type &accumu_coeff,
982  cudaGaugeField &shortP,
983  cudaGaugeField &newOprod,
984  const hisq_kernel_param_t &kparam) :
985  link(link), P3(P3), oprod(oprod),
986  sig(sig), mu(mu), coeff(coeff), accumu_coeff(accumu_coeff),
987  shortP(shortP), newOprod(newOprod), kparam(kparam)
988  { ; }
989  virtual ~SideLink() { ; }
990 
991  TuneKey tuneKey() const {
992  std::stringstream vol, aux;
993  vol << kparam.D1 << "x";
994  vol << kparam.D2 << "x";
995  vol << kparam.D3 << "x";
996  vol << kparam.D4;
997  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
998  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
999  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1000  }
1001 
1002 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1003  ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
1004  (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1005  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1006  sig, mu, \
1007  coeff, \
1008  (typename RealTypeId<typeA>::Type) accumu_coeff, \
1009  (typeA*)shortP.Even_p(), (typeA*)shortP.Odd_p(), \
1010  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
1011  kparam)
1012 
1013 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1014  if(oddness_change == 0){ \
1015  if(sizeof(RealA) == sizeof(float2)){ \
1016  if(recon == QUDA_RECONSTRUCT_NO){ \
1017  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1018  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1019  }else{ \
1020  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1021  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1022  } \
1023  }else{ \
1024  if(recon == QUDA_RECONSTRUCT_NO){ \
1025  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1026  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1027  }else{ \
1028  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1029  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1030  } \
1031  } \
1032  }else{ \
1033  if(sizeof(RealA) == sizeof(float2)){ \
1034  if(recon == QUDA_RECONSTRUCT_NO){ \
1035  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1036  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1037  }else{ \
1038  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1039  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1040  } \
1041  }else{ \
1042  if(recon == QUDA_RECONSTRUCT_NO){ \
1043  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1044  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1045  }else{ \
1046  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1047  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1048  } \
1049  } \
1050  }
1051 
1052  void apply(const cudaStream_t &stream) {
1053  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1054  QudaReconstructType recon = link.Reconstruct();
1055  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1056  + kparam.base_idx[2] + kparam.base_idx[3])&1;
1057 
1058  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1060  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1061  CALL_SIDE_LINK_KERNEL(1,0);
1062  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1064  }else{
1065  CALL_SIDE_LINK_KERNEL(0,0);
1066  }
1067  }
1068 
1069 #undef CALL_SIDE_LINK_KERNEL
1070 #undef CALL_ARGUMENTS
1072  void preTune() {
1073  shortP.backup();
1074  newOprod.backup();
1075  }
1076 
1077  void postTune() {
1078  shortP.restore();
1079  newOprod.restore();
1080  }
1081 
1082  virtual void initTuneParam(TuneParam &param) const
1083  {
1084  Tunable::initTuneParam(param);
1085  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1086  }
1087 
1089  void defaultTuneParam(TuneParam &param) const
1090  {
1092  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1093  }
1094 
1095  long long flops() const { return 0; }
1096  };
1097 
1098 
1099  template<class RealA, class RealB>
1100  class SideLinkShort : public Tunable {
1101 
1102  private:
1103  const cudaGaugeField &link;
1104  const cudaGaugeField &P3;
1105  const int sig;
1106  const int mu;
1107  const typename RealTypeId<RealA>::Type &coeff;
1108  cudaGaugeField &newOprod;
1109  const hisq_kernel_param_t &kparam;
1110 
1111  int sharedBytesPerThread() const { return 0; }
1112  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1113 
1114  // don't tune the grid dimension
1115  bool advanceGridDim(TuneParam &param) const { return false; }
1116  bool advanceBlockDim(TuneParam &param) const {
1117  bool rtn = Tunable::advanceBlockDim(param);
1118  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1119  return rtn;
1120  }
1121 
1122  public:
1123  SideLinkShort(const cudaGaugeField &link, const cudaGaugeField &P3, int sig, int mu,
1124  const typename RealTypeId<RealA>::Type &coeff, cudaGaugeField &newOprod,
1125  const hisq_kernel_param_t &kparam) :
1126  link(link), P3(P3), sig(sig), mu(mu), coeff(coeff), newOprod(newOprod), kparam(kparam)
1127  { ; }
1128  virtual ~SideLinkShort() { ; }
1129 
1130  TuneKey tuneKey() const {
1131  std::stringstream vol, aux;
1132  vol << kparam.D1 << "x";
1133  vol << kparam.D2 << "x";
1134  vol << kparam.D3 << "x";
1135  vol << kparam.D4;
1136  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
1137  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
1138  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1139  }
1140 
1141 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1142  ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
1143  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1144  sig, mu, (typename RealTypeId<typeA>::Type) coeff, \
1145  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1146 
1148 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1149  if(oddness_change == 0){ \
1150  if(sizeof(RealA) == sizeof(float2)){ \
1151  if(recon == QUDA_RECONSTRUCT_NO){ \
1152  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1153  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1154  }else{ \
1155  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1156  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1157  } \
1158  }else{ \
1159  if(recon == QUDA_RECONSTRUCT_NO){ \
1160  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1161  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1162  }else{ \
1163  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1164  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1165  } \
1166  } \
1167  }else{ \
1168  if(sizeof(RealA) == sizeof(float2)){ \
1169  if(recon == QUDA_RECONSTRUCT_NO){ \
1170  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1171  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1172  }else{ \
1173  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1174  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1175  } \
1176  }else{ \
1177  if(recon == QUDA_RECONSTRUCT_NO){ \
1178  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1179  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1180  }else{ \
1181  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1182  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1183  } \
1184  } \
1185  }
1186 
1187  void apply(const cudaStream_t &stream) {
1188  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1189  QudaReconstructType recon = link.Reconstruct();
1190  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1191  + kparam.base_idx[2] + kparam.base_idx[3])&1;
1192 
1193  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1194  CALL_SIDE_LINK_KERNEL(1,1);
1195  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1196  CALL_SIDE_LINK_KERNEL(1,0);
1197 
1198  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1199  CALL_SIDE_LINK_KERNEL(0,1);
1200  }else{
1201  CALL_SIDE_LINK_KERNEL(0,0);
1202  }
1203  }
1204 
1205 #undef CALL_SIDE_LINK_KERNEL
1206 #undef CALL_ARGUMENTS
1207 
1208 
1209  void preTune() {
1210  newOprod.backup();
1211  }
1212 
1213  void postTune() {
1214  newOprod.restore();
1215  }
1216 
1217  virtual void initTuneParam(TuneParam &param) const
1218  {
1219  Tunable::initTuneParam(param);
1220  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1221  }
1222 
1224  void defaultTuneParam(TuneParam &param) const
1225  {
1227  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1228  }
1229 
1230  long long flops() const { return 0; }
1231  };
1232 
1233  template<class RealA, class RealB>
1234  class AllLink : public Tunable {
1235 
1236  private:
1237  const cudaGaugeField &link;
1238  const cudaGaugeField &oprod;
1239  const cudaGaugeField &Qprev;
1240  const int sig;
1241  const int mu;
1242  const typename RealTypeId<RealA>::Type &coeff;
1243  const typename RealTypeId<RealA>::Type &accumu_coeff;
1244  cudaGaugeField &shortP;
1245  cudaGaugeField &newOprod;
1246  const hisq_kernel_param_t &kparam;
1247 
1248  int sharedBytesPerThread() const { return 0; }
1249  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1250 
1251  // don't tune the grid dimension
1252  bool advanceGridDim(TuneParam &param) const { return false; }
1253  bool advanceBlockDim(TuneParam &param) const {
1254  bool rtn = Tunable::advanceBlockDim(param);
1255  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1256  return rtn;
1257  }
1258 
1259  public:
1260  AllLink(const cudaGaugeField &link,
1261  const cudaGaugeField &oprod,
1262  const cudaGaugeField &Qprev,
1263  int sig, int mu,
1264  const typename RealTypeId<RealA>::Type &coeff,
1265  const typename RealTypeId<RealA>::Type &accumu_coeff,
1266  cudaGaugeField &shortP, cudaGaugeField &newOprod,
1267  const hisq_kernel_param_t &kparam) :
1268  link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
1269  coeff(coeff), accumu_coeff(accumu_coeff), shortP(shortP),
1270  newOprod(newOprod), kparam(kparam)
1271  { ; }
1272  virtual ~AllLink() { ; }
1273 
1274  TuneKey tuneKey() const {
1275  std::stringstream vol, aux;
1276  vol << kparam.D1 << "x";
1277  vol << kparam.D2 << "x";
1278  vol << kparam.D3 << "x";
1279  vol << kparam.D4;
1280  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
1281  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
1282  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1283  }
1284 
1285 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1286  ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1287  (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
1288  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), sig, mu, \
1289  (typename RealTypeId<typeA>::Type)coeff, \
1290  (typename RealTypeId<typeA>::Type)accumu_coeff, \
1291  (typeA*)shortP.Even_p(),(typeA*)shortP.Odd_p(), \
1292  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1293 
1294 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1295  if(oddness_change == 0){ \
1296  if(sizeof(RealA) == sizeof(float2)){ \
1297  if(recon == QUDA_RECONSTRUCT_NO){ \
1298  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1299  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1300  }else{ \
1301  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1302  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1303  } \
1304  }else{ \
1305  if(recon == QUDA_RECONSTRUCT_NO){ \
1306  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1307  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1308  }else{ \
1309  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1310  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1311  } \
1312  } \
1313  }else{ \
1314  if(sizeof(RealA) == sizeof(float2)){ \
1315  if(recon == QUDA_RECONSTRUCT_NO){ \
1316  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1317  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1318  }else{ \
1319  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1320  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1321  } \
1322  }else{ \
1323  if(recon == QUDA_RECONSTRUCT_NO){ \
1324  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1325  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1326  }else{ \
1327  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1328  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1329  } \
1330  } \
1331  }
1332  void apply(const cudaStream_t &stream) {
1333  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1334  QudaReconstructType recon = link.Reconstruct();
1335  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1336  + kparam.base_idx[2] + kparam.base_idx[3])&1;
1337 
1338  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1339  CALL_ALL_LINK_KERNEL(1, 1);
1340  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1341  CALL_ALL_LINK_KERNEL(1, 0);
1342  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1343  CALL_ALL_LINK_KERNEL(0, 1);
1344  }else{
1345  CALL_ALL_LINK_KERNEL(0, 0);
1346  }
1347 
1348  return;
1349  }
1350 
1351 #undef CALL_ARGUMENTS
1352 #undef CALL_ALL_LINK_KERNEL
1353 
1354  void preTune() {
1355  shortP.backup();
1356  newOprod.backup();
1357  }
1358 
1359  void postTune() {
1360  shortP.restore();
1361  newOprod.restore();
1362  }
1363 
1364  virtual void initTuneParam(TuneParam &param) const
1365  {
1366  Tunable::initTuneParam(param);
1367  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1368  }
1369 
1371  void defaultTuneParam(TuneParam &param) const
1372  {
1374  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1375  }
1376 
1377  long long flops() const { return 0; }
1378  };
1379 
1380 
1381  template<class RealA, class RealB>
1382  class OneLinkTerm : public Tunable {
1383 
1384  private:
1385  const cudaGaugeField &oprod;
1386  const int sig;
1387  const typename RealTypeId<RealA>::Type &coeff;
1388  cudaGaugeField &ForceMatrix;
1389  const int* X;
1390 
1391  int sharedBytesPerThread() const { return 0; }
1392  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1393 
1394  // don't tune the grid dimension
1395  bool advanceGridDim(TuneParam &param) const { return false; }
1396  bool advanceBlockDim(TuneParam &param) const {
1397  bool rtn = Tunable::advanceBlockDim(param);
1398  int threads = X[0]*X[1]*X[2]*X[3]/2;
1399 
1400  param.grid = dim3((threads + param.block.x-1)/param.block.x, 1, 1);
1401  return rtn;
1402  }
1403 
1404  public:
1405  OneLinkTerm(const cudaGaugeField &oprod, int sig,
1406  const typename RealTypeId<RealA>::Type &coeff,
1407  cudaGaugeField &ForceMatrix, const int* _X) :
1408  oprod(oprod), sig(sig), coeff(coeff), ForceMatrix(ForceMatrix), X(_X)
1409  { ; }
1410 
1411  virtual ~OneLinkTerm() { ; }
1412 
1413  TuneKey tuneKey() const {
1414  std::stringstream vol, aux;
1415  vol << X[0] << "x";
1416  vol << X[1] << "x";
1417  vol << X[2] << "x";
1418  vol << X[3];
1419  int threads = X[0]*X[1]*X[2]*X[3]/2;
1420  aux << "threads=" << threads << ",prec=" << oprod.Precision();
1421  aux << ",sig=" << sig << ",coeff=" << coeff;
1422  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1423  }
1424 
1425  void apply(const cudaStream_t &stream) {
1426  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1427 
1428  int threads = X[0]*X[1]*X[2]*X[3]/2;
1429 
1430  if(GOES_FORWARDS(sig)){
1431  do_one_link_term_kernel<RealA,0><<<tp.grid,tp.block>>>(static_cast<const RealA*>(oprod.Even_p()),
1432  static_cast<const RealA*>(oprod.Odd_p()),
1433  sig, coeff,
1434  static_cast<RealA*>(ForceMatrix.Even_p()),
1435  static_cast<RealA*>(ForceMatrix.Odd_p()),
1436  threads);
1437  do_one_link_term_kernel<RealA,1><<<tp.grid,tp.block>>>(static_cast<const RealA*>(oprod.Even_p()),
1438  static_cast<const RealA*>(oprod.Odd_p()),
1439  sig, coeff,
1440  static_cast<RealA*>(ForceMatrix.Even_p()),
1441  static_cast<RealA*>(ForceMatrix.Odd_p()),
1442  threads);
1443 
1444  }
1445  }
1446 
1447  void preTune() {
1448  ForceMatrix.backup();
1449  }
1450 
1451  void postTune() {
1452  ForceMatrix.restore();
1453  }
1454 
1455  virtual void initTuneParam(TuneParam &param) const
1456  {
1457  Tunable::initTuneParam(param);
1458 
1459  int threads = X[0]*X[1]*X[2]*X[3]/2;
1460  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
1461  }
1462 
1464  void defaultTuneParam(TuneParam &param) const
1465  {
1467  int threads = X[0]*X[1]*X[2]*X[3]/2;
1468  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
1469  }
1470 
1471  long long flops() const { return 0; }
1472  };
1473 
1474 
1475  template<class RealA, class RealB>
1476  class LongLinkTerm : public Tunable {
1477 
1478  private:
1479  const cudaGaugeField &link;
1480  const cudaGaugeField &naikOprod;
1481  const int sig;
1482  const typename RealTypeId<RealA>::Type &naik_coeff;
1483  cudaGaugeField &output;
1484  const int * X;
1485  const hisq_kernel_param_t &kparam;
1486 
1487  int sharedBytesPerThread() const { return 0; }
1488  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1489 
1490  // don't tune the grid dimension
1491  bool advanceGridDim(TuneParam &param) const { return false; }
1492  bool advanceBlockDim(TuneParam &param) const {
1493  bool rtn = Tunable::advanceBlockDim(param);
1494  int threads = X[0]*X[1]*X[2]*X[3]/2;
1495  param.grid = dim3((threads + param.block.x-1)/param.block.x, 1, 1);
1496  return rtn;
1497  }
1498 
1499  public:
1500  LongLinkTerm(const cudaGaugeField &link, const cudaGaugeField &naikOprod,
1501  int sig, const typename RealTypeId<RealA>::Type &naik_coeff,
1502  cudaGaugeField &output, const int* _X, const hisq_kernel_param_t &kparam) :
1503  link(link), naikOprod(naikOprod), sig(sig), naik_coeff(naik_coeff), output(output),
1504  X(_X), kparam(kparam)
1505  { ; }
1506 
1507  virtual ~LongLinkTerm() { ; }
1508 
1509  TuneKey tuneKey() const {
1510  std::stringstream vol, aux;
1511  vol << X[0] << "x";
1512  vol << X[1] << "x";
1513  vol << X[2] << "x";
1514  vol << X[3];
1515  int threads = X[0]*X[1]*X[2]*X[3]/2;
1516  aux << "threads=" << threads << ",prec=" << link.Precision();
1517  aux << ",sig=" << sig;
1518  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1519  }
1520 
1521 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid,tp.block>>> \
1522  ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1523  (typeA*)naikOprod.Even_p(), (typeA*)naikOprod.Odd_p(), \
1524  sig, naik_coeff, \
1525  (typeA*)output.Even_p(), (typeA*)output.Odd_p(), \
1526  kparam);
1527 
1528  void apply(const cudaStream_t &stream) {
1529  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1530  QudaReconstructType recon = link.Reconstruct();
1531 
1532  if(GOES_BACKWARDS(sig)) errorQuda("sig does not go forward\n");
1533  if(sizeof(RealA) == sizeof(float2)){
1534  if(recon == QUDA_RECONSTRUCT_NO){
1535  do_longlink_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
1536  do_longlink_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
1537  }else{
1538  do_longlink_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
1539  do_longlink_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
1540  }
1541  }else{
1542  if(recon == QUDA_RECONSTRUCT_NO){
1543  do_longlink_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1544  do_longlink_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1545  }else{
1546  do_longlink_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1547  do_longlink_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1548  }
1549  }
1550  }
1551 
1552 #undef CALL_ARGUMENTS
1553 
1554  void preTune() {
1555  output.backup();
1556  }
1557 
1558  void postTune() {
1559  output.restore();
1560  }
1561 
1562  virtual void initTuneParam(TuneParam &param) const
1563  {
1564  Tunable::initTuneParam(param);
1565  int threads = X[0]*X[1]*X[2]*X[3]/2;
1566  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
1567  }
1568 
1570  void defaultTuneParam(TuneParam &param) const
1571  {
1573  int threads = X[0]*X[1]*X[2]*X[3]/2;
1574  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
1575  }
1576 
1577  long long flops() const { return 0; }
1578  };
1579 
1580 
1581 
1582 
1583  template<class RealA, class RealB>
1584  class CompleteForce : public Tunable {
1585 
1586  private:
1587  const cudaGaugeField &link;
1588  const cudaGaugeField &oprod;
1589  const int sig;
1590  cudaGaugeField &mom;
1591  const int* X;
1592 
1593  int sharedBytesPerThread() const { return 0; }
1594  int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1595 
1596  // don't tune the grid dimension
1597  bool advanceGridDim(TuneParam &param) const { return false; }
1598  bool advanceBlockDim(TuneParam &param) const {
1599  bool rtn = Tunable::advanceBlockDim(param);
1600  int threads = X[0]*X[1]*X[2]*X[3]/2;
1601  param.grid = dim3((threads + param.block.x-1)/param.block.x, 1, 1);
1602  return rtn;
1603  }
1604 
1605  public:
1606  CompleteForce(const cudaGaugeField &link, const cudaGaugeField &oprod,
1607  int sig, cudaGaugeField &mom, const int* _X) :
1608  link(link), oprod(oprod), sig(sig), mom(mom), X(_X)
1609  { ; }
1610 
1611  virtual ~CompleteForce() { ; }
1612 
1613  TuneKey tuneKey() const {
1614  std::stringstream vol, aux;
1615  vol << X[0] << "x";
1616  vol << X[1] << "x";
1617  vol << X[2] << "x";
1618  vol << X[3];
1619  int threads = X[0]*X[1]*X[2]*X[3]/2;
1620  aux << "threads=" << threads << ",prec=" << link.Precision() << ",sig=" << sig;
1621  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
1622  }
1623 
1624 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1625  ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1626  (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1627  sig, \
1628  (typeA*)mom.Even_p(), (typeA*)mom.Odd_p(), \
1629  X[0] * X[1] * X[2] * X[3]/2);
1630 
1631  void apply(const cudaStream_t &stream) {
1632  TuneParam tp = tuneLaunch(*this, dslashTuning, verbosity);
1633  QudaReconstructType recon = link.Reconstruct();;
1634 
1635  if(sizeof(RealA) == sizeof(float2)){
1636  if(recon == QUDA_RECONSTRUCT_NO){
1637  do_complete_force_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
1638  do_complete_force_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
1639  }else{
1640  do_complete_force_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
1641  do_complete_force_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
1642  }
1643  }else{
1644  if(recon == QUDA_RECONSTRUCT_NO){
1645  do_complete_force_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1646  do_complete_force_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1647  }else{
1648  do_complete_force_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1649  do_complete_force_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1650  }
1651  }
1652  }
1653 
1654 #undef CALL_ARGUMENTS
1655 
1656  void preTune() {
1657  mom.backup();
1658  }
1659 
1660  void postTune() {
1661  mom.restore();
1662  }
1663 
1664  virtual void initTuneParam(TuneParam &param) const
1665  {
1666  Tunable::initTuneParam(param);
1667  int threads = X[0]*X[1]*X[2]*X[3]/2;
1668  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
1669  }
1670 
1672  void defaultTuneParam(TuneParam &param) const
1673  {
1675  int threads = X[0]*X[1]*X[2]*X[3]/2;
1676  param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
1677  }
1678 
1679  long long flops() const { return 0; }
1680  };
1681 
1682 
1683  static void
1684  bind_tex_link(const cudaGaugeField& link, const cudaGaugeField& newOprod)
1685  {
1686  if(link.Precision() == QUDA_DOUBLE_PRECISION){
1687  cudaBindTexture(0, siteLink0TexDouble, link.Even_p(), link.Bytes()/2);
1688  cudaBindTexture(0, siteLink1TexDouble, link.Odd_p(), link.Bytes()/2);
1689 
1690  cudaBindTexture(0, newOprod0TexDouble, newOprod.Even_p(), newOprod.Bytes()/2);
1691  cudaBindTexture(0, newOprod1TexDouble, newOprod.Odd_p(), newOprod.Bytes()/2);
1692  }else{
1693  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
1694  cudaBindTexture(0, siteLink0TexSingle, link.Even_p(), link.Bytes()/2);
1695  cudaBindTexture(0, siteLink1TexSingle, link.Odd_p(), link.Bytes()/2);
1696  }else{
1697  cudaBindTexture(0, siteLink0TexSingle_recon, link.Even_p(), link.Bytes()/2);
1698  cudaBindTexture(0, siteLink1TexSingle_recon, link.Odd_p(), link.Bytes()/2);
1699  }
1700  cudaBindTexture(0, newOprod0TexSingle, newOprod.Even_p(), newOprod.Bytes()/2);
1701  cudaBindTexture(0, newOprod1TexSingle, newOprod.Odd_p(), newOprod.Bytes()/2);
1702 
1703  }
1704  }
1705 
1706  static void
1707  unbind_tex_link(const cudaGaugeField& link, const cudaGaugeField& newOprod)
1708  {
1709  if(link.Precision() == QUDA_DOUBLE_PRECISION){
1710  cudaUnbindTexture(siteLink0TexDouble);
1711  cudaUnbindTexture(siteLink1TexDouble);
1712  cudaUnbindTexture(newOprod0TexDouble);
1713  cudaUnbindTexture(newOprod1TexDouble);
1714  }else{
1715  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
1716  cudaUnbindTexture(siteLink0TexSingle);
1717  cudaUnbindTexture(siteLink1TexSingle);
1718  }else{
1719  cudaUnbindTexture(siteLink0TexSingle_recon);
1720  cudaUnbindTexture(siteLink1TexSingle_recon);
1721  }
1722  cudaUnbindTexture(newOprod0TexSingle);
1723  cudaUnbindTexture(newOprod1TexSingle);
1724  }
1725  }
1726 
1727  template<class Real, class RealA, class RealB>
1728  static void
1729  do_hisq_staples_force_cuda( PathCoefficients<Real> act_path_coeff,
1730  const QudaGaugeParam& param,
1731  const cudaGaugeField &oprod,
1732  const cudaGaugeField &link,
1733  cudaGaugeField &Pmu,
1734  cudaGaugeField &P3,
1735  cudaGaugeField &P5,
1736  cudaGaugeField &Pnumu,
1737  cudaGaugeField &Qmu,
1738  cudaGaugeField &Qnumu,
1739  cudaGaugeField &newOprod)
1740  {
1741 
1742  Real coeff;
1743  Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
1744  Real mLepage, mFiveSt, mThreeSt;
1745 
1746  OneLink = act_path_coeff.one;
1747  ThreeSt = act_path_coeff.three; mThreeSt = -ThreeSt;
1748  FiveSt = act_path_coeff.five; mFiveSt = -FiveSt;
1749  SevenSt = act_path_coeff.seven;
1750  Lepage = act_path_coeff.lepage; mLepage = -Lepage;
1751 
1752 
1753 
1754  for(int sig=0; sig<8; ++sig){
1755  if(GOES_FORWARDS(sig)){
1756  OneLinkTerm<RealA, RealB> oneLink(oprod, sig, OneLink, newOprod, param.X);
1757  oneLink.apply(0);
1758  checkCudaError();
1759  } // GOES_FORWARDS(sig)
1760  }
1761 
1762 
1763  int ghostDim[4]={
1764  commDimPartitioned(0),
1765  commDimPartitioned(1),
1766  commDimPartitioned(2),
1768  };
1769 
1770  hisq_kernel_param_t kparam_1g, kparam_2g;
1771 
1772 
1773 #ifdef MULTI_GPU
1774  kparam_1g.D1 = commDimPartitioned(0)?(param.X[0]+2):(param.X[0]);
1775  kparam_1g.D2 = commDimPartitioned(1)?(param.X[1]+2):(param.X[1]);
1776  kparam_1g.D3 = commDimPartitioned(2)?(param.X[2]+2):(param.X[2]);
1777  kparam_1g.D4 = commDimPartitioned(3)?(param.X[3]+2):(param.X[3]);
1778  kparam_1g.D1h = kparam_1g.D1/2;
1779  kparam_1g.base_idx[0]=commDimPartitioned(0)?1:2;
1780  kparam_1g.base_idx[1]=commDimPartitioned(1)?1:2;
1781  kparam_1g.base_idx[2]=commDimPartitioned(2)?1:2;
1782  kparam_1g.base_idx[3]=commDimPartitioned(3)?1:2;
1783  kparam_1g.threads = kparam_1g.D1*kparam_1g.D2*kparam_1g.D3*kparam_1g.D4/2;
1784 
1785  kparam_2g.D1 = commDimPartitioned(0)?(param.X[0]+4):(param.X[0]);
1786  kparam_2g.D2 = commDimPartitioned(1)?(param.X[1]+4):(param.X[1]);
1787  kparam_2g.D3 = commDimPartitioned(2)?(param.X[2]+4):(param.X[2]);
1788  kparam_2g.D4 = commDimPartitioned(3)?(param.X[3]+4):(param.X[3]);
1789  kparam_2g.D1h = kparam_2g.D1/2;
1790  kparam_2g.base_idx[0]=commDimPartitioned(0)?0:2;
1791  kparam_2g.base_idx[1]=commDimPartitioned(1)?0:2;
1792  kparam_2g.base_idx[2]=commDimPartitioned(2)?0:2;
1793  kparam_2g.base_idx[3]=commDimPartitioned(3)?0:2;
1794  kparam_2g.threads = kparam_2g.D1*kparam_2g.D2*kparam_2g.D3*kparam_2g.D4/2;
1795 
1796 
1797  for(int i=0;i < 4; i++){
1798  kparam_1g.ghostDim[i] = kparam_2g.ghostDim[i]=kparam_1g.ghostDim[i]=kparam_2g.ghostDim[i] = ghostDim[i];
1799  }
1800 #else
1802  kparam.D1 = param.X[0];
1803  kparam.D2 = param.X[1];
1804  kparam.D3 = param.X[2];
1805  kparam.D4 = param.X[3];
1806  kparam.D1h = param.X[0]/2;
1807  kparam.threads=param.X[0]*param.X[1]*param.X[2]*param.X[3]/2;
1808  kparam.base_idx[0]=0;
1809  kparam.base_idx[1]=0;
1810  kparam.base_idx[2]=0;
1811  kparam.base_idx[3]=0;
1812  kparam_2g = kparam_1g = kparam;
1813 
1814 #endif
1815  for(int sig=0; sig<8; sig++){
1816  for(int mu=0; mu<8; mu++){
1817  if ( (mu == sig) || (mu == OPP_DIR(sig))){
1818  continue;
1819  }
1820  //3-link
1821  //Kernel A: middle link
1822 
1823  MiddleLink<RealA,RealB> middleLink( link, oprod, // read only
1824  sig, mu, mThreeSt,
1825  Pmu, P3, Qmu, // write only
1826  newOprod, kparam_2g);
1827  middleLink.apply(0);
1828  checkCudaError();
1829 
1830  for(int nu=0; nu < 8; nu++){
1831  if (nu == sig || nu == OPP_DIR(sig)
1832  || nu == mu || nu == OPP_DIR(mu)){
1833  continue;
1834  }
1835  //5-link: middle link
1836  //Kernel B
1837  MiddleLink<RealA,RealB> middleLink( link, Pmu, Qmu, // read only
1838  sig, nu, FiveSt,
1839  Pnumu, P5, Qnumu, // write only
1840  newOprod, kparam_1g);
1841  middleLink.apply(0);
1842  checkCudaError();
1843 
1844  for(int rho = 0; rho < 8; rho++){
1845  if (rho == sig || rho == OPP_DIR(sig)
1846  || rho == mu || rho == OPP_DIR(mu)
1847  || rho == nu || rho == OPP_DIR(nu)){
1848  continue;
1849  }
1850  //7-link: middle link and side link
1851  if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;
1852  AllLink<RealA,RealB> allLink(link, Pnumu, Qnumu, sig, rho, SevenSt, coeff,
1853  P5, newOprod, kparam_1g);
1854 
1855  allLink.apply(0);
1856  checkCudaError();
1857 
1858  //return;
1859  }//rho
1860 
1861  //5-link: side link
1862  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1863  SideLink<RealA,RealB> sideLink(link, P5, Qmu, //read only
1864  sig, nu, mFiveSt, coeff,
1865  P3, // write only
1866  newOprod, kparam_1g);
1867  sideLink.apply(0);
1868  checkCudaError();
1869 
1870  } //nu
1871 
1872  //lepage
1873  if(Lepage != 0.){
1874  LepageMiddleLink<RealA,RealB>
1875  lepageMiddleLink ( link, Pmu, Qmu, // read only
1876  sig, mu, Lepage,
1877  P5, // write only
1878  newOprod, kparam_2g);
1879  lepageMiddleLink.apply(0);
1880  checkCudaError();
1881 
1882  if(ThreeSt != 0)coeff = Lepage/ThreeSt ; else coeff = 0;
1883 
1884  SideLink<RealA, RealB> sideLink(link, P5, Qmu, // read only
1885  sig, mu, mLepage, coeff,
1886  P3, //write only
1887  newOprod, kparam_2g);
1888 
1889  sideLink.apply(0);
1890  checkCudaError();
1891 
1892  } // Lepage != 0.0
1893 
1894  //3-link side link
1895  SideLinkShort<RealA,RealB> sideLinkShort(link, P3, // read only
1896  sig, mu, ThreeSt,
1897  newOprod, kparam_1g);
1898  sideLinkShort.apply(0);
1899  checkCudaError();
1900 
1901  }//mu
1902  }//sig
1903 
1904 
1905  return;
1906  } // do_hisq_staples_force_cuda
1907 
1908 
1909 #undef Pmu
1910 #undef Pnumu
1911 #undef P3
1912 #undef P5
1913 #undef Qmu
1914 #undef Qnumu
1915 
1916 
1918  const cudaGaugeField &oprod,
1919  const cudaGaugeField &link,
1920  cudaGaugeField* force)
1921  {
1922  bind_tex_link(link, oprod);
1923 
1924  for(int sig=0; sig<4; sig++){
1925  if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
1926  CompleteForce<double2,double2> completeForce(link, oprod, sig, *force, param.X);
1927  completeForce.apply(0);
1928  checkCudaError();
1929  }else if(param.cuda_prec == QUDA_SINGLE_PRECISION){
1930  CompleteForce<float2,float2> completeForce(link, oprod, sig, *force, param.X);
1931  completeForce.apply(0);
1932  checkCudaError();
1933  }else{
1934  errorQuda("Unsupported precision");
1935  }
1936  } // loop over directions
1937 
1938  unbind_tex_link(link, oprod);
1939  return;
1940  }
1941 
1942 
1943  void hisqLongLinkForceCuda(double coeff,
1944  const QudaGaugeParam &param,
1945  const cudaGaugeField &oldOprod,
1946  const cudaGaugeField &link,
1947  cudaGaugeField *newOprod)
1948  {
1949  bind_tex_link(link, *newOprod);
1950  const int volume = param.X[0]*param.X[1]*param.X[2]*param.X[3];
1952  for(int i =0;i < 4;i++){
1953  kparam.ghostDim[i] = commDimPartitioned(i);
1954  }
1955  kparam.threads = volume/2;
1956 
1957  for(int sig=0; sig<4; ++sig){
1958  if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
1959  LongLinkTerm<double2,double2> longLink(link, oldOprod, sig, coeff, *newOprod, param.X, kparam);
1960  longLink.apply(0);
1961  checkCudaError();
1962  }else if(param.cuda_prec == QUDA_SINGLE_PRECISION){
1963  LongLinkTerm<float2,float2> longLink(link, oldOprod, sig, static_cast<float>(coeff),
1964  *newOprod, param.X, kparam);
1965  longLink.apply(0);
1966  checkCudaError();
1967  }else{
1968  errorQuda("Unsupported precision");
1969  }
1970  } // loop over directions
1971  unbind_tex_link(link, *newOprod);
1972  return;
1973  }
1974 
1975 
1976 
1977 
1978 
1979  void
1980  hisqStaplesForceCuda(const double path_coeff_array[6],
1981  const QudaGaugeParam &param,
1982  const cudaGaugeField &oprod,
1983  const cudaGaugeField &link,
1984  cudaGaugeField* newOprod)
1985  {
1986 
1987 #ifdef MULTI_GPU
1988  int X[4] = {
1989  param.X[0]+4, param.X[1]+4, param.X[2]+4, param.X[3]+4
1990  };
1991 #else
1992  int X[4] = {
1993  param.X[0], param.X[1], param.X[2], param.X[3]
1994  };
1995 #endif
1996 
1997  // create color matrix fields with zero padding
1998  int pad = 0;
2000 
2001  cudaGaugeField Pmu(gauge_param);
2002  cudaGaugeField P3(gauge_param);
2003  cudaGaugeField P5(gauge_param);
2004  cudaGaugeField Pnumu(gauge_param);
2005  cudaGaugeField Qmu(gauge_param);
2006  cudaGaugeField Qnumu(gauge_param);
2007 
2008  bind_tex_link(link, *newOprod);
2009 
2010  cudaEvent_t start, end;
2011 
2012  cudaEventCreate(&start);
2013  cudaEventCreate(&end);
2014 
2015  cudaEventRecord(start);
2016  if (param.cuda_prec == QUDA_DOUBLE_PRECISION){
2017 
2018  PathCoefficients<double> act_path_coeff;
2019  act_path_coeff.one = path_coeff_array[0];
2020  act_path_coeff.naik = path_coeff_array[1];
2021  act_path_coeff.three = path_coeff_array[2];
2022  act_path_coeff.five = path_coeff_array[3];
2023  act_path_coeff.seven = path_coeff_array[4];
2024  act_path_coeff.lepage = path_coeff_array[5];
2025  do_hisq_staples_force_cuda<double,double2,double2>( act_path_coeff,
2026  param,
2027  oprod,
2028  link,
2029  Pmu,
2030  P3,
2031  P5,
2032  Pnumu,
2033  Qmu,
2034  Qnumu,
2035  *newOprod);
2036 
2037 
2038  }else if(param.cuda_prec == QUDA_SINGLE_PRECISION){
2039  PathCoefficients<float> act_path_coeff;
2040  act_path_coeff.one = path_coeff_array[0];
2041  act_path_coeff.naik = path_coeff_array[1];
2042  act_path_coeff.three = path_coeff_array[2];
2043  act_path_coeff.five = path_coeff_array[3];
2044  act_path_coeff.seven = path_coeff_array[4];
2045  act_path_coeff.lepage = path_coeff_array[5];
2046 
2047  do_hisq_staples_force_cuda<float,float2,float2>( act_path_coeff,
2048  param,
2049  oprod,
2050  link,
2051  Pmu,
2052  P3,
2053  P5,
2054  Pnumu,
2055  Qmu,
2056  Qnumu,
2057  *newOprod);
2058  }else{
2059  errorQuda("Unsupported precision");
2060  }
2061 
2062 
2063  cudaEventRecord(end);
2064  cudaEventSynchronize(end);
2065  float runtime;
2066  cudaEventElapsedTime(&runtime, start, end);
2067 
2068  unbind_tex_link(link, *newOprod);
2069 
2070  return;
2071  }
2072 
2073  } // namespace fermion_force
2074 } // namespace quda