QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_paths_force_quda.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <lattice_field.h>
3 #include <read_gauge.h>
4 #include <gauge_field.h>
5 #include <ks_improved_force.h>
6 #include <hw_quda.h>
7 #include <hisq_force_macros.h>
8 #include <utility>
9 #include <quda_matrix.h>
10 #include <force_common.h>
11 #include <tune_quda.h>
12 #include <color_spinor_field.h>
13 
14 #include <face_quda.h>
15 
16 #ifdef GPU_HISQ_FORCE
17 
18 //DEBUG : control compile
19 #define COMPILE_HISQ_DP_18
20 #define COMPILE_HISQ_DP_12
21 #define COMPILE_HISQ_SP_18
22 #define COMPILE_HISQ_SP_12
23 
24 // Disable texture read for now. Need to revisit this.
25 #define HISQ_SITE_MATRIX_LOAD_TEX 1
26 #define HISQ_NEW_OPROD_LOAD_TEX 1
27 
28 #ifdef USE_TEXTURE_OBJECTS
29 #define TEX1DFETCH(type, tex, idx) tex1Dfetch<type>((tex), idx)
30 #else
31 #define TEX1DFETCH(type, tex, idx) tex1Dfetch((tex), idx)
32 #endif
33 
34 
35 #if (__COMPUTE_CAPABILITY__ >= 130)
36 
37  template<typename Tex>
38 static __inline__ __device__ double fetch_double(Tex t, int i)
39 {
40  int2 v = TEX1DFETCH(int2, t, i);
41  return __hiloint2double(v.y, v.x);
42 }
43 
44  template <typename Tex>
45 static __inline__ __device__ double2 fetch_double2(Tex t, int i)
46 {
47  int4 v = TEX1DFETCH(int4, t, i);
48  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
49 }
50 
51 static __inline__ __device__ double2 fetch_double2_old(texture<int4, 1> t, int i)
52 {
53  int4 v = tex1Dfetch(t,i);
54  return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
55 }
56 
57 #endif //__COMPUTE_CAPABILITY__ >= 130
58 
59 
60 
61 
62 
63 namespace quda {
64  namespace fermion_force {
65 
66  struct hisq_kernel_param_t{
67  unsigned long threads;
68  int X[4];
69  int D[4];
70  int D1h;
71  int base_idx[4];
72  int ghostDim[4];
73  int color_matrix_stride;
74  int thin_link_stride;
75  int momentum_stride;
76 
77  void setStride(const QudaGaugeParam& param){
78  int half_volume = param.X[0]*param.X[1]*param.X[2]*param.X[3]/2;
79 #ifdef MULTI_GPU
80  int extended_half_volume = (param.X[0]+4)*(param.X[1]+4)*(param.X[2]+4)*(param.X[3]+4)/2;
81  thin_link_stride = extended_half_volume + param.site_ga_pad;
82  color_matrix_stride = extended_half_volume;
83 #else
84  thin_link_stride = half_volume + param.site_ga_pad;
85  color_matrix_stride = half_volume;
86 #endif
87  momentum_stride = half_volume + param.mom_ga_pad;
88  }
89  };
90 
91 
92  //Double precision for site link
93  texture<int4, 1> thinLink0TexDouble;
94  texture<int4, 1> thinLink1TexDouble;
95 
96  //Single precision for site link
97  texture<float2, 1, cudaReadModeElementType> thinLink0TexSingle;
98  texture<float2, 1, cudaReadModeElementType> thinLink1TexSingle;
99 
100  texture<float4, 1, cudaReadModeElementType> thinLink0TexSingle_recon;
101  texture<float4, 1, cudaReadModeElementType> thinLink1TexSingle_recon;
102 
103 
104  texture<int4, 1> newOprod0TexDouble;
105  texture<int4, 1> newOprod1TexDouble;
106  texture<float2, 1, cudaReadModeElementType> newOprod0TexSingle;
107  texture<float2, 1, cudaReadModeElementType> newOprod1TexSingle;
108 
109 
110 
111  inline __device__ __host__ int linkIndex(int x[], int dx[], const int X[4]) {
112  int y[4];
113  for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
114  int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
115  return idx;
116  }
117 
118  inline __device__ __host__ void updateCoords(int x[], int dir, int shift, const int X[4], const int partitioned){
119 #ifdef MULTI_GPU
120  if(shift == 1){
121  x[dir] = (partitioned || (x[dir] != X[dir]+1)) ? x[dir]+1 : 2;
122  }else if(shift == -1){
123  x[dir] = (partitioned || (x[dir] != 2)) ? x[dir]-1 : X[dir]+1;
124  }
125 #else
126  x[dir] = (x[dir]+shift + X[dir])%X[dir];
127 #endif
128  return;
129  }
130 
131 
132  __device__ __host__ inline void getCoords(int x[4], int cb_index, const int X[4], int parity)
133  {
134  x[3] = cb_index/(X[2]*X[1]*X[0]/2);
135  x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
136  x[1] = (cb_index/(X[0]/2)) % X[1];
137  x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);
138 
139  return;
140  }
141 
142 
143  __device__ __host__ inline int posDir(int dir){
144  return (dir >= 4) ? 7-dir : dir;
145  }
146 
147 
148  //struct for holding the fattening path coefficients
149  template<class Real>
150  struct PathCoefficients
151  {
152  Real one;
153  Real three;
154  Real five;
155  Real seven;
156  Real naik;
157  Real lepage;
158  };
159 
160 
161  inline __device__ float2 operator*(float a, const float2 & b)
162  {
163  return make_float2(a*b.x,a*b.y);
164  }
165 
166  inline __device__ double2 operator*(double a, const double2 & b)
167  {
168  return make_double2(a*b.x,a*b.y);
169  }
170 
171  inline __device__ const float2 & operator+=(float2 & a, const float2 & b)
172  {
173  a.x += b.x;
174  a.y += b.y;
175  return a;
176  }
177 
178  inline __device__ const double2 & operator+=(double2 & a, const double2 & b)
179  {
180  a.x += b.x;
181  a.y += b.y;
182  return a;
183  }
184 
185  inline __device__ const float4 & operator+=(float4 & a, const float4 & b)
186  {
187  a.x += b.x;
188  a.y += b.y;
189  a.z += b.z;
190  a.w += b.w;
191  return a;
192  }
193 
194  // Replication of code
195  // This structure is already defined in
196  // unitarize_utilities.h
197 
198  template<class T>
199  struct RealTypeId;
200 
201  template<>
202  struct RealTypeId<float2>
203  {
204  typedef float Type;
205  };
206 
207  template<>
208  struct RealTypeId<double2>
209  {
210  typedef double Type;
211  };
212 
213 
214  template<class T>
215  inline __device__
216  void adjointMatrix(T* mat)
217  {
218 #define CONJ_INDEX(i,j) j*3 + i
219 
220  T tmp;
221  mat[CONJ_INDEX(0,0)] = Conj(mat[0]);
222  mat[CONJ_INDEX(1,1)] = Conj(mat[4]);
223  mat[CONJ_INDEX(2,2)] = Conj(mat[8]);
224  tmp = Conj(mat[1]);
225  mat[CONJ_INDEX(1,0)] = Conj(mat[3]);
226  mat[CONJ_INDEX(0,1)] = tmp;
227  tmp = Conj(mat[2]);
228  mat[CONJ_INDEX(2,0)] = Conj(mat[6]);
229  mat[CONJ_INDEX(0,2)] = tmp;
230  tmp = Conj(mat[5]);
231  mat[CONJ_INDEX(2,1)] = Conj(mat[7]);
232  mat[CONJ_INDEX(1,2)] = tmp;
233 
234 #undef CONJ_INDEX
235  return;
236  }
237 
238 
239  template<int N, class T>
240  inline __device__
241  void loadMatrixFromField(const T* const field_even, const T* const field_odd,
242  int dir, int idx, T* const mat, int oddness, int stride)
243  {
244  const T* const field = (oddness)?field_odd:field_even;
245  for(int i = 0;i < N ;i++){
246  mat[i] = field[idx + dir*N*stride + i*stride];
247  }
248  return;
249  }
250 
251  template<class T>
252  inline __device__
253  void loadMatrixFromField(const T* const field_even, const T* const field_odd,
254  int dir, int idx, T* const mat, int oddness, int stride)
255  {
256  loadMatrixFromField<9> (field_even, field_odd, dir, idx, mat, oddness, stride);
257  return;
258  }
259 
260 
261 
262  inline __device__
263  void loadMatrixFromField(const float4* const field_even, const float4* const field_odd,
264  int dir, int idx, float2* const mat, int oddness, int stride)
265  {
266  const float4* const field = oddness?field_odd: field_even;
267  float4 tmp;
268  tmp = field[idx + dir*stride*3];
269  mat[0] = make_float2(tmp.x, tmp.y);
270  mat[1] = make_float2(tmp.z, tmp.w);
271  tmp = field[idx + dir*stride*3 + stride];
272  mat[2] = make_float2(tmp.x, tmp.y);
273  mat[3] = make_float2(tmp.z, tmp.w);
274  tmp = field[idx + dir*stride*3 + 2*stride];
275  mat[4] = make_float2(tmp.x, tmp.y);
276  mat[5] = make_float2(tmp.z, tmp.w);
277  return;
278  }
279 
280  template<class T>
281  inline __device__
282  void loadMatrixFromField(const T* const field_even, const T* const field_odd, int idx, T* const mat, int oddness, int stride)
283  {
284  const T* const field = (oddness)?field_odd:field_even;
285  mat[0] = field[idx];
286  mat[1] = field[idx + stride];
287  mat[2] = field[idx + stride*2];
288  mat[3] = field[idx + stride*3];
289  mat[4] = field[idx + stride*4];
290  mat[5] = field[idx + stride*5];
291  mat[6] = field[idx + stride*6];
292  mat[7] = field[idx + stride*7];
293  mat[8] = field[idx + stride*8];
294 
295  return;
296  }
297 
298  template<class U>
299  inline __device__
300  void addMatrixToNewOprod(const double2* const mat, int dir, int idx, U coeff,
301  double2* const field_even, double2* const field_odd, int oddness, int stride){
302  double2* const field = (oddness)?field_odd: field_even;
303  double2 value[9];
304 
305 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
306  value[0] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9);
307  value[1] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + stride);
308  value[2] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 2*stride);
309  value[3] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 3*stride);
310  value[4] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 4*stride);
311  value[5] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 5*stride);
312  value[6] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 6*stride);
313  value[7] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 7*stride);
314  value[8] = READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 8*stride);
315 #else
316  for(int i=0; i<9; ++i) value[i] = field[i];
317 #endif
318 
319  field[idx + dir*stride*9] = value[0] + coeff*mat[0];
320  field[idx + dir*stride*9 + stride] = value[1] + coeff*mat[1];
321  field[idx + dir*stride*9 + stride*2] = value[2] + coeff*mat[2];
322  field[idx + dir*stride*9 + stride*3] = value[3] + coeff*mat[3];
323  field[idx + dir*stride*9 + stride*4] = value[4] + coeff*mat[4];
324  field[idx + dir*stride*9 + stride*5] = value[5] + coeff*mat[5];
325  field[idx + dir*stride*9 + stride*6] = value[6] + coeff*mat[6];
326  field[idx + dir*stride*9 + stride*7] = value[7] + coeff*mat[7];
327  field[idx + dir*stride*9 + stride*8] = value[8] + coeff*mat[8];
328 
329  return;
330  }
331 
332 
333  template<class U>
334  inline __device__
335  void addMatrixToNewOprod(const float2* const mat, int dir, int idx, U coeff,
336  float2* const field_even, float2* const field_odd, int oddness, int stride){
337  float2* const field = (oddness)?field_odd: field_even;
338  float2 value[9];
339 
340 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
341  value[0] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9);
342  value[1] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + stride);
343  value[2] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 2*stride);
344  value[3] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 3*stride);
345  value[4] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 4*stride);
346  value[5] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 5*stride);
347  value[6] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 6*stride);
348  value[7] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 7*stride);
349  value[8] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 8*stride);
350 #else
351  for(int i=0; i<9; ++i) value[i] = field[i];
352 #endif
353  field[idx + dir*stride*9] = value[0] + coeff*mat[0];
354  field[idx + dir*stride*9 + stride] = value[1] + coeff*mat[1];
355  field[idx + dir*stride*9 + stride*2] = value[2] + coeff*mat[2];
356  field[idx + dir*stride*9 + stride*3] = value[3] + coeff*mat[3];
357  field[idx + dir*stride*9 + stride*4] = value[4] + coeff*mat[4];
358  field[idx + dir*stride*9 + stride*5] = value[5] + coeff*mat[5];
359  field[idx + dir*stride*9 + stride*6] = value[6] + coeff*mat[6];
360  field[idx + dir*stride*9 + stride*7] = value[7] + coeff*mat[7];
361  field[idx + dir*stride*9 + stride*8] = value[8] + coeff*mat[8];
362 
363  return;
364  }
365 
366 
367  // only works if Promote<T,U>::Type = T
368 
369  template<class T, class U>
370  inline __device__
371  void addMatrixToField(const T* const mat, int dir, int idx, U coeff,
372  T* const field_even, T* const field_odd, int oddness, int stride)
373  {
374  T* const field = (oddness)?field_odd: field_even;
375  field[idx + dir*stride*9] += coeff*mat[0];
376  field[idx + dir*stride*9 + stride] += coeff*mat[1];
377  field[idx + dir*stride*9 + stride*2] += coeff*mat[2];
378  field[idx + dir*stride*9 + stride*3] += coeff*mat[3];
379  field[idx + dir*stride*9 + stride*4] += coeff*mat[4];
380  field[idx + dir*stride*9 + stride*5] += coeff*mat[5];
381  field[idx + dir*stride*9 + stride*6] += coeff*mat[6];
382  field[idx + dir*stride*9 + stride*7] += coeff*mat[7];
383  field[idx + dir*stride*9 + stride*8] += coeff*mat[8];
384 
385  return;
386  }
387 
388 
389  template<class T, class U>
390  inline __device__
391  void addMatrixToField(const T* const mat, int idx, U coeff, T* const field_even,
392  T* const field_odd, int oddness, int stride)
393  {
394  T* const field = (oddness)?field_odd: field_even;
395  field[idx ] += coeff*mat[0];
396  field[idx + stride] += coeff*mat[1];
397  field[idx + stride*2] += coeff*mat[2];
398  field[idx + stride*3] += coeff*mat[3];
399  field[idx + stride*4] += coeff*mat[4];
400  field[idx + stride*5] += coeff*mat[5];
401  field[idx + stride*6] += coeff*mat[6];
402  field[idx + stride*7] += coeff*mat[7];
403  field[idx + stride*8] += coeff*mat[8];
404 
405  return;
406  }
407 
408  template<class T, class U>
409  inline __device__
410  void addMatrixToField_test(const T* const mat, int idx, U coeff, T* const field_even,
411  T* const field_odd, int oddness, int stride)
412  {
413  T* const field = (oddness)?field_odd: field_even;
414  //T oldvalue=field[idx];
415  field[idx ] += coeff*mat[0];
416  field[idx + stride] += coeff*mat[1];
417  field[idx + stride*2] += coeff*mat[2];
418  field[idx + stride*3] += coeff*mat[3];
419  field[idx + stride*4] += coeff*mat[4];
420  field[idx + stride*5] += coeff*mat[5];
421  field[idx + stride*6] += coeff*mat[6];
422  field[idx + stride*7] += coeff*mat[7];
423  field[idx + stride*8] += coeff*mat[8];
424 
425 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
426  printf("value is coeff(%f) * mat[0].x(%f)=%f\n", coeff, mat[0].x, field[idx].x);
427 #endif
428  return;
429  }
430 
431  template<class T>
432  inline __device__
433  void storeMatrixToField(const T* const mat, int dir, int idx, T* const field_even, T* const field_odd, int oddness, int stride)
434  {
435  T* const field = (oddness)?field_odd: field_even;
436  field[idx + dir*stride*9] = mat[0];
437  field[idx + dir*stride*9 + stride] = mat[1];
438  field[idx + dir*stride*9 + stride*2] = mat[2];
439  field[idx + dir*stride*9 + stride*3] = mat[3];
440  field[idx + dir*stride*9 + stride*4] = mat[4];
441  field[idx + dir*stride*9 + stride*5] = mat[5];
442  field[idx + dir*stride*9 + stride*6] = mat[6];
443  field[idx + dir*stride*9 + stride*7] = mat[7];
444  field[idx + dir*stride*9 + stride*8] = mat[8];
445 
446  return;
447  }
448 
449 
450  template<class T>
451  inline __device__
452  void storeMatrixToField(const T* const mat, int idx, T* const field_even, T* const field_odd, int oddness, int stride)
453  {
454  T* const field = (oddness)?field_odd: field_even;
455  field[idx] = mat[0];
456  field[idx + stride] = mat[1];
457  field[idx + stride*2] = mat[2];
458  field[idx + stride*3] = mat[3];
459  field[idx + stride*4] = mat[4];
460  field[idx + stride*5] = mat[5];
461  field[idx + stride*6] = mat[6];
462  field[idx + stride*7] = mat[7];
463  field[idx + stride*8] = mat[8];
464 
465  return;
466  }
467 
468 
469  template<class T, class U>
470  inline __device__
471  void storeMatrixToMomentumField(const T* const mat, int dir, int idx, U coeff,
472  T* const mom_even, T* const mom_odd, int oddness, int stride)
473  {
474  T* const mom_field = (oddness)?mom_odd:mom_even;
475  T temp2;
476  temp2.x = (mat[1].x - mat[3].x)*0.5*coeff;
477  temp2.y = (mat[1].y + mat[3].y)*0.5*coeff;
478  mom_field[idx + dir*stride*5] = temp2;
479 
480  temp2.x = (mat[2].x - mat[6].x)*0.5*coeff;
481  temp2.y = (mat[2].y + mat[6].y)*0.5*coeff;
482  mom_field[idx + dir*stride*5 + stride] = temp2;
483 
484  temp2.x = (mat[5].x - mat[7].x)*0.5*coeff;
485  temp2.y = (mat[5].y + mat[7].y)*0.5*coeff;
486  mom_field[idx + dir*stride*5 + stride*2] = temp2;
487 
488  const typename RealTypeId<T>::Type temp = (mat[0].y + mat[4].y + mat[8].y)*0.3333333333333333333333333;
489  temp2.x = (mat[0].y-temp)*coeff;
490  temp2.y = (mat[4].y-temp)*coeff;
491  mom_field[idx + dir*stride*5 + stride*3] = temp2;
492 
493  temp2.x = (mat[8].y - temp)*coeff;
494  temp2.y = 0.0;
495  mom_field[idx + dir*stride*5 + stride*4] = temp2;
496 
497  return;
498  }
499 
500  // Struct to determine the coefficient sign at compile time
501  template<int pos_dir, int odd_lattice>
502  struct CoeffSign
503  {
504  static const int result = -1;
505  };
506 
507  template<>
508  struct CoeffSign<0,1>
509  {
510  static const int result = -1;
511  };
512 
513  template<>
514  struct CoeffSign<0,0>
515  {
516  static const int result = 1;
517  };
518 
519  template<>
520  struct CoeffSign<1,1>
521  {
522  static const int result = 1;
523  };
524 
525  template<int odd_lattice>
526  struct Sign
527  {
528  static const int result = 1;
529  };
530 
531  template<>
532  struct Sign<1>
533  {
534  static const int result = -1;
535  };
536 
537  template<class RealX>
538  struct ArrayLength
539  {
540  static const int result=9;
541  };
542 
543  template<>
544  struct ArrayLength<float4>
545  {
546  static const int result=5;
547  };
548 
549 
550 
551 
552  // Flops: four matrix additions per lattice site = 72 Flops per lattice site
553  template<class RealA, int oddBit>
554  __global__ void
555  do_one_link_term_kernel(const RealA* const oprodEven, const RealA* const oprodOdd,
556  typename RealTypeId<RealA>::Type coeff,
557  RealA* const outputEven, RealA* const outputOdd, hisq_kernel_param_t kparam)
558  {
559  int sid = blockIdx.x * blockDim.x + threadIdx.x;
560  if (sid >= kparam.threads) return;
561 #ifdef MULTI_GPU
562  int dx[4] = {0,0,0,0};
563  int x[4];
564  getCoords(x, sid, kparam.X, oddBit);
565  int E[4] = {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
566  for(int dir=0; dir<4; ++dir) x[dir] += 2;
567  int new_sid = linkIndex(x,dx,E);
568 #else
569  int new_sid = sid;
570 #endif
571  for(int sig=0; sig<4; ++sig){
572  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
573  loadMatrixFromField(oprodEven, oprodOdd, sig, new_sid, COLOR_MAT_W, oddBit, kparam.color_matrix_stride);
574  addMatrixToField(COLOR_MAT_W, sig, new_sid, coeff, outputEven, outputOdd, oddBit, kparam.color_matrix_stride);
575  }
576  return;
577  }
578 
579 
580  template<int N>
581  __device__ void loadLink(const double2* const linkEven, const double2* const linkOdd, int dir, int idx, double2* const var, int oddness, int stride){
582 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
583  HISQ_LOAD_MATRIX_18_DOUBLE_TEX((oddness)?thinLink1TexDouble:thinLink0TexDouble, (oddness)?linkOdd:linkEven, dir, idx, var, stride);
584 #else
585  loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, stride);
586 #endif
587  }
588 
589  template<>
590  void loadLink<12>(const double2* const linkEven, const double2* const linkOdd, int dir, int idx, double2* const var, int oddness, int stride){
591 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
592  HISQ_LOAD_MATRIX_12_DOUBLE_TEX((oddness)?thinLink1TexDouble:thinLink0TexDouble, (oddness)?linkOdd:linkEven,dir, idx, var, stride);
593 #else
594  loadMatrixFromField<6>(linkEven, linkOdd, dir, idx, var, oddness, stride);
595 #endif
596  }
597 
598  template<int N>
599  __device__ void loadLink(const float4* const linkEven, const float4* const linkOdd, int dir, int idx, float2* const var, int oddness, int stride){
600 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
601  HISQ_LOAD_MATRIX_12_SINGLE_TEX((oddness)?thinLink1TexSingle_recon:thinLink0TexSingle_recon, dir, idx, var, stride);
602 #else
603  loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, stride);
604 #endif
605  }
606 
607  template<int N>
608  __device__ void loadLink(const float2* const linkEven, const float2* const linkOdd, int dir, int idx, float2* const var , int oddness, int stride){
609 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
610  HISQ_LOAD_MATRIX_18_SINGLE_TEX((oddness)?thinLink1TexSingle:thinLink0TexSingle, dir, idx, var, stride);
611 #else
612  loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, stride);
613 #endif
614  }
615 
616 
617 
618 #define DD_CONCAT(n,r) n ## r ## kernel
619 
620 #define HISQ_KERNEL_NAME(a,b) DD_CONCAT(a,b)
621  //precision: 0 is for double, 1 is for single
622 
623  //double precision, recon=18
624 #define PRECISION 0
625 #define RECON 18
626 #include "hisq_paths_force_core.h"
627 #undef PRECISION
628 #undef RECON
629 
630  //double precision, recon=12
631 #define PRECISION 0
632 #define RECON 12
633 #include "hisq_paths_force_core.h"
634 #undef PRECISION
635 #undef RECON
636 
637  //single precision, recon=18
638 #define PRECISION 1
639 #define RECON 18
640 #include "hisq_paths_force_core.h"
641 #undef PRECISION
642 #undef RECON
643 
644  //single precision, recon=12
645 #define PRECISION 1
646 #define RECON 12
647 #include "hisq_paths_force_core.h"
648 #undef PRECISION
649 #undef RECON
650 
651 
652 
653 
654 
655  template<class RealA, class RealB>
656  class MiddleLink : public Tunable {
657 
658  private:
659  const cudaGaugeField &link;
660  const cudaGaugeField &oprod;
661  const cudaGaugeField &Qprev;
662  const int sig;
663  const int mu;
664  const typename RealTypeId<RealA>::Type &coeff;
665  cudaGaugeField &Pmu;
666  cudaGaugeField &P3;
667  cudaGaugeField &Qmu;
668  cudaGaugeField &newOprod;
669  const hisq_kernel_param_t &kparam;
670 
671  unsigned int sharedBytesPerThread() const { return 0; }
672  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
673 
674 
675  // don't tune the grid dimension
676  bool tuneGridDim() const { return false; }
677  unsigned int minThreads() const { return kparam.threads; }
678 
679  public:
680  MiddleLink(const cudaGaugeField &link,
681  const cudaGaugeField &oprod,
682  const cudaGaugeField &Qprev,
683  int sig, int mu,
684  const typename RealTypeId<RealA>::Type &coeff,
685  cudaGaugeField &Pmu, // write only
686  cudaGaugeField &P3, // write only
687  cudaGaugeField &Qmu,
688  cudaGaugeField &newOprod,
689  const hisq_kernel_param_t &kparam) :
690  link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
691  coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
692  { ; }
693  // need alternative constructor to hack around null pointer passing
694  MiddleLink(const cudaGaugeField &link,
695  const cudaGaugeField &oprod,
696  int sig, int mu,
697  const typename RealTypeId<RealA>::Type &coeff,
698  cudaGaugeField &Pmu, // write only
699  cudaGaugeField &P3, // write only
700  cudaGaugeField &Qmu,
701  cudaGaugeField &newOprod,
702  const hisq_kernel_param_t &kparam) :
703  link(link), oprod(oprod), Qprev(link), sig(sig), mu(mu),
704  coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
705  { ; }
706  virtual ~MiddleLink() { ; }
707 
708  TuneKey tuneKey() const {
709  std::stringstream vol, aux;
710  vol << kparam.D[0] << "x";
711  vol << kparam.D[1] << "x";
712  vol << kparam.D[2] << "x";
713  vol << kparam.D[3];
714  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
715  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
716  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
717  }
718 
719 
720 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
721  ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
722  (typeA*)Qprev_even, (typeA*)Qprev_odd, \
723  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
724  sig, mu, coeff, \
725  (typeA*)Pmu.Even_p(), (typeA*)Pmu.Odd_p(), \
726  (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
727  (typeA*)Qmu.Even_p(), (typeA*)Qmu.Odd_p(), \
728  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
729 
730 
731 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
732  if(oddness_change ==0 ){ \
733  if(sizeof(RealA) == sizeof(float2)){ \
734  if(recon == QUDA_RECONSTRUCT_NO){ \
735  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
736  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
737  }else{ \
738  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
739  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
740  } \
741  }else{ \
742  if(recon == QUDA_RECONSTRUCT_NO){ \
743  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
744  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
745  }else{ \
746  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
747  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
748  } \
749  } \
750  }else{ \
751  if(sizeof(RealA) == sizeof(float2)){ \
752  if(recon == QUDA_RECONSTRUCT_NO){ \
753  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
754  do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
755  }else{ \
756  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
757  do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
758  } \
759  }else{ \
760  if(recon == QUDA_RECONSTRUCT_NO){ \
761  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
762  do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
763  }else{ \
764  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
765  do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
766  } \
767  } \
768  }
769 
770 
771 
772  void apply(const cudaStream_t &stream) {
773  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
774  QudaReconstructType recon = link.Reconstruct();
775  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
776  + kparam.base_idx[2] + kparam.base_idx[3])&1;
777 
778  const void *Qprev_even = (&Qprev == &link) ? NULL : Qprev.Even_p();
779  const void *Qprev_odd = (&Qprev == &link) ? NULL : Qprev.Odd_p();
780 
781 
782  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
783  CALL_MIDDLE_LINK_KERNEL(1,1);
784  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
785  CALL_MIDDLE_LINK_KERNEL(1,0);
786  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
787  CALL_MIDDLE_LINK_KERNEL(0,1);
788  }else{
789  CALL_MIDDLE_LINK_KERNEL(0,0);
790  }
791  }
792 
793 #undef CALL_ARGUMENTS
794 #undef CALL_MIDDLE_LINK_KERNEL
795 
796  void preTune() {
797  Pmu.backup();
798  P3.backup();
799  Qmu.backup();
800  newOprod.backup();
801  }
802 
803  void postTune() {
804  Pmu.restore();
805  P3.restore();
806  Qmu.restore();
807  newOprod.restore();
808  }
809 
810  long long flops() const { return 0; }
811  };
812 
813 
814  template<class RealA, class RealB>
815  class LepageMiddleLink : public Tunable {
816 
817  private:
818  const cudaGaugeField &link;
819  const cudaGaugeField &oprod;
820  const cudaGaugeField &Qprev;
821  const int sig;
822  const int mu;
823  const typename RealTypeId<RealA>::Type &coeff;
824  cudaGaugeField &P3; // write only
825  cudaGaugeField &newOprod;
826  const hisq_kernel_param_t &kparam;
827 
828  unsigned int sharedBytesPerThread() const { return 0; }
829  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
830 
831  // don't tune the grid dimension
832  bool tuneGridDim() const { return false; }
833  unsigned int minThreads() const { return kparam.threads; }
834 
835  public:
836  LepageMiddleLink(const cudaGaugeField &link,
837  const cudaGaugeField &oprod,
838  const cudaGaugeField &Qprev,
839  int sig, int mu,
840  const typename RealTypeId<RealA>::Type &coeff,
841  cudaGaugeField &P3, cudaGaugeField &newOprod,
842  const hisq_kernel_param_t &kparam) :
843  link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
844  coeff(coeff), P3(P3), newOprod(newOprod), kparam(kparam)
845  { ; }
846  virtual ~LepageMiddleLink() { ; }
847 
848  TuneKey tuneKey() const {
849  std::stringstream vol, aux;
850  vol << kparam.D[0] << "x";
851  vol << kparam.D[1] << "x";
852  vol << kparam.D[2] << "x";
853  vol << kparam.D[3];
854  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
855  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
856  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
857  }
858 
859 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
860  ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
861  (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
862  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
863  sig, mu, coeff, \
864  (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
865  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
866  kparam)
867 
868 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
869  if(oddness_change == 0){ \
870  if(sizeof(RealA) == sizeof(float2)){ \
871  if(recon == QUDA_RECONSTRUCT_NO){ \
872  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
873  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
874  }else{ \
875  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
876  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
877  } \
878  }else{ \
879  if(recon == QUDA_RECONSTRUCT_NO){ \
880  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
881  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
882  }else{ \
883  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
884  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
885  } \
886  } \
887  }else{ \
888  if(sizeof(RealA) == sizeof(float2)){ \
889  if(recon == QUDA_RECONSTRUCT_NO){ \
890  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
891  do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
892  }else{ \
893  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
894  do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
895  } \
896  }else{ \
897  if(recon == QUDA_RECONSTRUCT_NO){ \
898  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
899  do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
900  }else{ \
901  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
902  do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
903  } \
904  } \
905  }
906 
907  void apply(const cudaStream_t &stream) {
908  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
909  QudaReconstructType recon = link.Reconstruct();
910  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
911  + kparam.base_idx[2] + kparam.base_idx[3])&1;
912 
913  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
914  CALL_MIDDLE_LINK_KERNEL(1,1);
915  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
916  CALL_MIDDLE_LINK_KERNEL(1,0);
917  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
918  CALL_MIDDLE_LINK_KERNEL(0,1);
919  }else{
920  CALL_MIDDLE_LINK_KERNEL(0,0);
921  }
922 
923  }
924 
925 #undef CALL_ARGUMENTS
926 #undef CALL_MIDDLE_LINK_KERNEL
927 
928  void preTune() {
929  P3.backup();
930  newOprod.backup();
931  }
932 
933  void postTune() {
934  P3.restore();
935  newOprod.restore();
936  }
937 
938  long long flops() const {
939  if(GOES_FORWARDS(sig)) return 810*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3];
940  return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*396;
941  }
942  };
943 
944  template<class RealA, class RealB>
945  class SideLink : public Tunable {
946 
947  private:
948  const cudaGaugeField &link;
949  const cudaGaugeField &P3;
950  const cudaGaugeField &oprod;
951  const int sig;
952  const int mu;
953  const typename RealTypeId<RealA>::Type &coeff;
954  const typename RealTypeId<RealA>::Type &accumu_coeff;
955  cudaGaugeField &shortP;
956  cudaGaugeField &newOprod;
957  const hisq_kernel_param_t &kparam;
958 
959  unsigned int sharedBytesPerThread() const { return 0; }
960  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
961 
962  // don't tune the grid dimension
963  bool tuneGridDim() const { return false; }
964  unsigned int minThreads() const { return kparam.threads; }
965 
966  public:
967  SideLink(const cudaGaugeField &link,
968  const cudaGaugeField &P3,
969  const cudaGaugeField &oprod,
970  int sig, int mu,
971  const typename RealTypeId<RealA>::Type &coeff,
972  const typename RealTypeId<RealA>::Type &accumu_coeff,
973  cudaGaugeField &shortP,
974  cudaGaugeField &newOprod,
975  const hisq_kernel_param_t &kparam) :
976  link(link), P3(P3), oprod(oprod),
977  sig(sig), mu(mu), coeff(coeff), accumu_coeff(accumu_coeff),
978  shortP(shortP), newOprod(newOprod), kparam(kparam)
979  { ; }
980  virtual ~SideLink() { ; }
981 
982  TuneKey tuneKey() const {
983  std::stringstream vol, aux;
984  vol << kparam.D[0] << "x";
985  vol << kparam.D[1] << "x";
986  vol << kparam.D[2] << "x";
987  vol << kparam.D[3];
988  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
989  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
990  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
991  }
992 
993 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
994  ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
995  (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
996  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
997  sig, mu, \
998  coeff, \
999  (typename RealTypeId<typeA>::Type) accumu_coeff, \
1000  (typeA*)shortP.Even_p(), (typeA*)shortP.Odd_p(), \
1001  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
1002  kparam)
1003 
1004 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1005  if(oddness_change == 0){ \
1006  if(sizeof(RealA) == sizeof(float2)){ \
1007  if(recon == QUDA_RECONSTRUCT_NO){ \
1008  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1009  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1010  }else{ \
1011  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1012  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1013  } \
1014  }else{ \
1015  if(recon == QUDA_RECONSTRUCT_NO){ \
1016  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1017  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1018  }else{ \
1019  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1020  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1021  } \
1022  } \
1023  }else{ \
1024  if(sizeof(RealA) == sizeof(float2)){ \
1025  if(recon == QUDA_RECONSTRUCT_NO){ \
1026  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1027  do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1028  }else{ \
1029  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1030  do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1031  } \
1032  }else{ \
1033  if(recon == QUDA_RECONSTRUCT_NO){ \
1034  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1035  do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1036  }else{ \
1037  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1038  do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1039  } \
1040  } \
1041  }
1042 
1043  void apply(const cudaStream_t &stream) {
1044  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1045  QudaReconstructType recon = link.Reconstruct();
1046  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1047  + kparam.base_idx[2] + kparam.base_idx[3])&1;
1048 
1049  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1050  CALL_SIDE_LINK_KERNEL(1,1);
1051  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1052  CALL_SIDE_LINK_KERNEL(1,0);
1053  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1054  CALL_SIDE_LINK_KERNEL(0,1);
1055  }else{
1056  CALL_SIDE_LINK_KERNEL(0,0);
1057  }
1058  }
1059 
1060 #undef CALL_SIDE_LINK_KERNEL
1061 #undef CALL_ARGUMENTS
1062 
1063  void preTune() {
1064  shortP.backup();
1065  newOprod.backup();
1066  }
1067 
1068  void postTune() {
1069  shortP.restore();
1070  newOprod.restore();
1071  }
1072 
1073  long long flops() const { return 0; }
1074  };
1075 
1076 
1077  template<class RealA, class RealB>
1078  class SideLinkShort : public Tunable {
1079 
1080  private:
1081  const cudaGaugeField &link;
1082  const cudaGaugeField &P3;
1083  const int sig;
1084  const int mu;
1085  const typename RealTypeId<RealA>::Type &coeff;
1086  cudaGaugeField &newOprod;
1087  const hisq_kernel_param_t &kparam;
1088 
1089  unsigned int sharedBytesPerThread() const { return 0; }
1090  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1091 
1092  // don't tune the grid dimension
1093  bool tuneGridDim() const { return false; }
1094  unsigned int minThreads() const { return kparam.threads; }
1095 
1096  public:
1097  SideLinkShort(const cudaGaugeField &link, const cudaGaugeField &P3, int sig, int mu,
1098  const typename RealTypeId<RealA>::Type &coeff, cudaGaugeField &newOprod,
1099  const hisq_kernel_param_t &kparam) :
1100  link(link), P3(P3), sig(sig), mu(mu), coeff(coeff), newOprod(newOprod), kparam(kparam)
1101  { ; }
1102  virtual ~SideLinkShort() { ; }
1103 
1104  TuneKey tuneKey() const {
1105  std::stringstream vol, aux;
1106  vol << kparam.D[0] << "x";
1107  vol << kparam.D[1] << "x";
1108  vol << kparam.D[2] << "x";
1109  vol << kparam.D[3];
1110  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
1111  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
1112  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
1113  }
1114 
1115 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1116  ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
1117  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1118  sig, mu, (typename RealTypeId<typeA>::Type) coeff, \
1119  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1120 
1121 
1122 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1123  if(oddness_change == 0){ \
1124  if(sizeof(RealA) == sizeof(float2)){ \
1125  if(recon == QUDA_RECONSTRUCT_NO){ \
1126  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1127  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1128  }else{ \
1129  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1130  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1131  } \
1132  }else{ \
1133  if(recon == QUDA_RECONSTRUCT_NO){ \
1134  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1135  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1136  }else{ \
1137  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1138  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1139  } \
1140  } \
1141  }else{ \
1142  if(sizeof(RealA) == sizeof(float2)){ \
1143  if(recon == QUDA_RECONSTRUCT_NO){ \
1144  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1145  do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1146  }else{ \
1147  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1148  do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1149  } \
1150  }else{ \
1151  if(recon == QUDA_RECONSTRUCT_NO){ \
1152  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1153  do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1154  }else{ \
1155  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1156  do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1157  } \
1158  } \
1159  }
1160 
1161  void apply(const cudaStream_t &stream) {
1162  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1163  QudaReconstructType recon = link.Reconstruct();
1164  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1165  + kparam.base_idx[2] + kparam.base_idx[3])&1;
1166 
1167  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1168  CALL_SIDE_LINK_KERNEL(1,1);
1169  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1170  CALL_SIDE_LINK_KERNEL(1,0);
1171 
1172  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1173  CALL_SIDE_LINK_KERNEL(0,1);
1174  }else{
1175  CALL_SIDE_LINK_KERNEL(0,0);
1176  }
1177  }
1178 
1179 #undef CALL_SIDE_LINK_KERNEL
1180 #undef CALL_ARGUMENTS
1181 
1182 
1183  void preTune() {
1184  newOprod.backup();
1185  }
1186 
1187  void postTune() {
1188  newOprod.restore();
1189  }
1190 
1191  long long flops() const { return 0; }
1192  };
1193 
1194  template<class RealA, class RealB>
1195  class AllLink : public Tunable {
1196 
1197  private:
1198  const cudaGaugeField &link;
1199  const cudaGaugeField &oprod;
1200  const cudaGaugeField &Qprev;
1201  const int sig;
1202  const int mu;
1203  const typename RealTypeId<RealA>::Type &coeff;
1204  const typename RealTypeId<RealA>::Type &accumu_coeff;
1205  cudaGaugeField &shortP;
1206  cudaGaugeField &newOprod;
1207  const hisq_kernel_param_t &kparam;
1208 
1209  unsigned int sharedBytesPerThread() const { return 0; }
1210  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1211 
1212  // don't tune the grid dimension
1213  bool tuneGridDim() const { return false; }
1214  unsigned int minThreads() const { return kparam.threads; }
1215 
1216  public:
1217  AllLink(const cudaGaugeField &link,
1218  const cudaGaugeField &oprod,
1219  const cudaGaugeField &Qprev,
1220  int sig, int mu,
1221  const typename RealTypeId<RealA>::Type &coeff,
1222  const typename RealTypeId<RealA>::Type &accumu_coeff,
1223  cudaGaugeField &shortP, cudaGaugeField &newOprod,
1224  const hisq_kernel_param_t &kparam) :
1225  link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
1226  coeff(coeff), accumu_coeff(accumu_coeff), shortP(shortP),
1227  newOprod(newOprod), kparam(kparam)
1228  { ; }
1229  virtual ~AllLink() { ; }
1230 
1231  TuneKey tuneKey() const {
1232  std::stringstream vol, aux;
1233  vol << kparam.D[0] << "x";
1234  vol << kparam.D[1] << "x";
1235  vol << kparam.D[2] << "x";
1236  vol << kparam.D[3];
1237  aux << "threads=" << kparam.threads << ",prec=" << link.Precision();
1238  aux << ",recon=" << link.Reconstruct() << ",sig=" << sig << ",mu=" << mu;
1239  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
1240  }
1241 
1242 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1243  ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1244  (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
1245  (typeB*)link.Even_p(), (typeB*)link.Odd_p(), sig, mu, \
1246  (typename RealTypeId<typeA>::Type)coeff, \
1247  (typename RealTypeId<typeA>::Type)accumu_coeff, \
1248  (typeA*)shortP.Even_p(),(typeA*)shortP.Odd_p(), \
1249  (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1250 
1251 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1252  if(oddness_change == 0){ \
1253  if(sizeof(RealA) == sizeof(float2)){ \
1254  if(recon == QUDA_RECONSTRUCT_NO){ \
1255  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1256  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1257  }else{ \
1258  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1259  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1260  } \
1261  }else{ \
1262  if(recon == QUDA_RECONSTRUCT_NO){ \
1263  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1264  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1265  }else{ \
1266  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1267  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1268  } \
1269  } \
1270  }else{ \
1271  if(sizeof(RealA) == sizeof(float2)){ \
1272  if(recon == QUDA_RECONSTRUCT_NO){ \
1273  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1274  do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1275  }else{ \
1276  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1277  do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1278  } \
1279  }else{ \
1280  if(recon == QUDA_RECONSTRUCT_NO){ \
1281  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1282  do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1283  }else{ \
1284  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1285  do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1286  } \
1287  } \
1288  }
1289  void apply(const cudaStream_t &stream) {
1290  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1291  QudaReconstructType recon = link.Reconstruct();
1292  int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1293  + kparam.base_idx[2] + kparam.base_idx[3])&1;
1294 
1295  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1296  CALL_ALL_LINK_KERNEL(1, 1);
1297  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1298  CALL_ALL_LINK_KERNEL(1, 0);
1299  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1300  CALL_ALL_LINK_KERNEL(0, 1);
1301  }else{
1302  CALL_ALL_LINK_KERNEL(0, 0);
1303  }
1304 
1305  return;
1306  }
1307 
1308 #undef CALL_ARGUMENTS
1309 #undef CALL_ALL_LINK_KERNEL
1310 
1311  void preTune() {
1312  shortP.backup();
1313  newOprod.backup();
1314  }
1315 
1316  void postTune() {
1317  shortP.restore();
1318  newOprod.restore();
1319  }
1320 
1321  virtual void initTuneParam(TuneParam &param) const
1322  {
1323  Tunable::initTuneParam(param);
1324  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1325  }
1326 
1328  void defaultTuneParam(TuneParam &param) const
1329  {
1331  param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1332  }
1333 
1334  long long flops() const {
1335  if(GOES_FORWARDS(sig)) return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*1242;
1336 
1337  return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*828;
1338  }
1339  };
1340 
1341 
1342  template<class RealA, class RealB>
1343  class OneLinkTerm : public Tunable {
1344 
1345  private:
1346  const cudaGaugeField &oprod;
1347  const typename RealTypeId<RealA>::Type &coeff;
1348  cudaGaugeField &ForceMatrix;
1349  int X[4];
1350  hisq_kernel_param_t kparam;
1351 
1352  unsigned int sharedBytesPerThread() const { return 0; }
1353  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1354 
1355  // don't tune the grid dimension
1356  bool tuneGridDim() const { return false; }
1357  unsigned int minThreads() const { return X[0]*X[1]*X[2]*X[3]/2; }
1358 
1359  public:
1360  OneLinkTerm(const cudaGaugeField &oprod,
1361  const typename RealTypeId<RealA>::Type &coeff,
1362  cudaGaugeField &ForceMatrix, const QudaGaugeParam& param) :
1363  oprod(oprod), coeff(coeff), ForceMatrix(ForceMatrix)
1364  {
1365  for(int dir=0; dir<4; ++dir) X[dir] = param.X[dir];
1366 
1367  kparam.threads = X[0]*X[1]*X[2]*X[3]/2;
1368  for(int dir=0; dir<4; ++dir){
1369  kparam.X[dir] = X[dir];
1370  }
1371  kparam.setStride(param);
1372  }
1373 
1374  virtual ~OneLinkTerm() { ; }
1375 
1376  TuneKey tuneKey() const {
1377  std::stringstream vol, aux;
1378  vol << X[0] << "x";
1379  vol << X[1] << "x";
1380  vol << X[2] << "x";
1381  vol << X[3];
1382  int threads = X[0]*X[1]*X[2]*X[3]/2;
1383  aux << "threads=" << threads << ",prec=" << oprod.Precision();
1384  aux << ",coeff=" << coeff;
1385  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
1386  }
1387 
1388  void apply(const cudaStream_t &stream) {
1389  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1390 
1391 
1392 
1393 
1394  do_one_link_term_kernel<RealA,0><<<tp.grid,tp.block>>>(static_cast<const RealA*>(oprod.Even_p()),
1395  static_cast<const RealA*>(oprod.Odd_p()),
1396  coeff,
1397  static_cast<RealA*>(ForceMatrix.Even_p()),
1398  static_cast<RealA*>(ForceMatrix.Odd_p()),
1399  kparam);
1400  do_one_link_term_kernel<RealA,1><<<tp.grid,tp.block>>>(static_cast<const RealA*>(oprod.Even_p()),
1401  static_cast<const RealA*>(oprod.Odd_p()),
1402  coeff,
1403  static_cast<RealA*>(ForceMatrix.Even_p()),
1404  static_cast<RealA*>(ForceMatrix.Odd_p()),
1405  kparam);
1406 
1407  }
1408 
1409  void preTune() {
1410  ForceMatrix.backup();
1411  }
1412 
1413  void postTune() {
1414  ForceMatrix.restore();
1415  }
1416 
1417  long long flops() const {
1418  return 72*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3];
1419  }
1420  };
1421 
1422 
1423  template<class RealA, class RealB>
1424  class LongLinkTerm : public Tunable {
1425 
1426  private:
1427  const cudaGaugeField &link;
1428  const cudaGaugeField &naikOprod;
1429  const typename RealTypeId<RealA>::Type naik_coeff;
1430  cudaGaugeField &output;
1431  int X[4];
1432  const hisq_kernel_param_t &kparam;
1433 
1434  unsigned int sharedBytesPerThread() const { return 0; }
1435  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1436 
1437  // don't tune the grid dimension
1438  bool tuneGridDim() const { return false; }
1439  unsigned int minThreads() const { return X[0]*X[1]*X[2]*X[3]/2; }
1440 
1441  public:
1442  LongLinkTerm(const cudaGaugeField &link, const cudaGaugeField &naikOprod,
1443  const typename RealTypeId<RealA>::Type &naik_coeff,
1444  cudaGaugeField &output, const hisq_kernel_param_t &kparam) :
1445  link(link), naikOprod(naikOprod), naik_coeff(naik_coeff), output(output),
1446  kparam(kparam)
1447  { for(int dir=0; dir<4; ++dir) X[dir] = kparam.X[dir]; }
1448 
1449  virtual ~LongLinkTerm() { ; }
1450 
1451  TuneKey tuneKey() const {
1452  std::stringstream vol, aux;
1453  vol << X[0] << "x";
1454  vol << X[1] << "x";
1455  vol << X[2] << "x";
1456  vol << X[3];
1457  int threads = X[0]*X[1]*X[2]*X[3]/2;
1458  aux << "threads=" << threads << ",prec=" << link.Precision();
1459  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
1460  }
1461 
1462 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid,tp.block>>> \
1463  ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1464  (typeA*)naikOprod.Even_p(), (typeA*)naikOprod.Odd_p(), \
1465  naik_coeff, \
1466  (typeA*)output.Even_p(), (typeA*)output.Odd_p(), \
1467  kparam);
1468 
1469  void apply(const cudaStream_t &stream) {
1470  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1471  QudaReconstructType recon = link.Reconstruct();
1472 
1473  if(sizeof(RealA) == sizeof(float2)){
1474  if(recon == QUDA_RECONSTRUCT_NO){
1475  do_longlink_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
1476  do_longlink_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
1477  }else{
1478  do_longlink_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
1479  do_longlink_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
1480  }
1481  }else{
1482  if(recon == QUDA_RECONSTRUCT_NO){
1483  do_longlink_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1484  do_longlink_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1485  }else{
1486  do_longlink_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1487  do_longlink_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1488  }
1489  }
1490  }
1491 
1492 #undef CALL_ARGUMENTS
1493 
1494  void preTune() {
1495  output.backup();
1496  }
1497 
1498  void postTune() {
1499  output.restore();
1500  }
1501 
1502  long long flops() const { return 4968*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]; }
1503  };
1504 
1505 
1506 
1507 
1508  template<class RealA, class RealB>
1509  class CompleteForce : public Tunable {
1510 
1511  private:
1512  const cudaGaugeField &link;
1513  const cudaGaugeField &oprod;
1514  cudaGaugeField &mom;
1515  int X[4];
1516  hisq_kernel_param_t kparam;
1517 
1518  unsigned int sharedBytesPerThread() const { return 0; }
1519  unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
1520 
1521  // don't tune the grid dimension
1522  bool tuneGridDim() const { return false; }
1523  unsigned int minThreads() const { return X[0]*X[1]*X[2]*X[3]/2; }
1524 
1525  public:
1526  CompleteForce(const cudaGaugeField &link, const cudaGaugeField &oprod,
1527  cudaGaugeField &mom, const QudaGaugeParam &param) :
1528  link(link), oprod(oprod), mom(mom)
1529  {
1530 
1531  for(int dir=0; dir<4; ++dir){
1532  X[dir] = param.X[dir];
1533  kparam.X[dir] = X[dir];
1534  }
1535  kparam.threads = X[0]*X[1]*X[2]*X[3]/2;
1536  kparam.setStride(param);
1537  }
1538 
1539  virtual ~CompleteForce() { ; }
1540 
1541  TuneKey tuneKey() const {
1542  std::stringstream vol, aux;
1543  vol << X[0] << "x";
1544  vol << X[1] << "x";
1545  vol << X[2] << "x";
1546  vol << X[3];
1547  int threads = X[0]*X[1]*X[2]*X[3]/2;
1548  aux << "threads=" << threads << ",prec=" << link.Precision();
1549  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux.str().c_str());
1550  }
1551 
1552 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1553  ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1554  (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1555  (typeA*)mom.Even_p(), (typeA*)mom.Odd_p(), \
1556  kparam);
1557 
1558  void apply(const cudaStream_t &stream) {
1559  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1560  QudaReconstructType recon = link.Reconstruct();;
1561 
1562 
1563 
1564 
1565  if(sizeof(RealA) == sizeof(float2)){
1566  if(recon == QUDA_RECONSTRUCT_NO){
1567  do_complete_force_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
1568  do_complete_force_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
1569  }else{
1570  do_complete_force_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
1571  do_complete_force_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
1572  }
1573  }else{
1574  if(recon == QUDA_RECONSTRUCT_NO){
1575  do_complete_force_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1576  do_complete_force_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1577  }else{
1578  do_complete_force_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1579  do_complete_force_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1580  }
1581  }
1582  }
1583 
1584 #undef CALL_ARGUMENTS
1585 
1586  void preTune() {
1587  mom.backup();
1588  }
1589 
1590  void postTune() {
1591  mom.restore();
1592  }
1593 
1594  long long flops() const {
1595  return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*792;
1596  }
1597  };
1598 
1599 
1600  static void
1601  bind_tex_link(const cudaGaugeField& link, const cudaGaugeField& newOprod)
1602  {
1603  if(link.Precision() == QUDA_DOUBLE_PRECISION){
1604  cudaBindTexture(0, thinLink0TexDouble, link.Even_p(), link.Bytes()/2);
1605  cudaBindTexture(0, thinLink1TexDouble, link.Odd_p(), link.Bytes()/2);
1606 
1607  cudaBindTexture(0, newOprod0TexDouble, newOprod.Even_p(), newOprod.Bytes()/2);
1608  cudaBindTexture(0, newOprod1TexDouble, newOprod.Odd_p(), newOprod.Bytes()/2);
1609  }else{
1610  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
1611  cudaBindTexture(0, thinLink0TexSingle, link.Even_p(), link.Bytes()/2);
1612  cudaBindTexture(0, thinLink1TexSingle, link.Odd_p(), link.Bytes()/2);
1613  }else{
1614  cudaBindTexture(0, thinLink0TexSingle_recon, link.Even_p(), link.Bytes()/2);
1615  cudaBindTexture(0, thinLink1TexSingle_recon, link.Odd_p(), link.Bytes()/2);
1616  }
1617  cudaBindTexture(0, newOprod0TexSingle, newOprod.Even_p(), newOprod.Bytes()/2);
1618  cudaBindTexture(0, newOprod1TexSingle, newOprod.Odd_p(), newOprod.Bytes()/2);
1619 
1620  }
1621  }
1622 
1623  static void
1624  unbind_tex_link(const cudaGaugeField& link, const cudaGaugeField& newOprod)
1625  {
1626  if(link.Precision() == QUDA_DOUBLE_PRECISION){
1627  cudaUnbindTexture(thinLink0TexDouble);
1628  cudaUnbindTexture(thinLink1TexDouble);
1629  cudaUnbindTexture(newOprod0TexDouble);
1630  cudaUnbindTexture(newOprod1TexDouble);
1631  }else{
1632  if(link.Reconstruct() == QUDA_RECONSTRUCT_NO){
1633  cudaUnbindTexture(thinLink0TexSingle);
1634  cudaUnbindTexture(thinLink1TexSingle);
1635  }else{
1636  cudaUnbindTexture(thinLink0TexSingle_recon);
1637  cudaUnbindTexture(thinLink1TexSingle_recon);
1638  }
1639  cudaUnbindTexture(newOprod0TexSingle);
1640  cudaUnbindTexture(newOprod1TexSingle);
1641  }
1642  }
1643 
1644  template<class Real, class RealA, class RealB>
1645  static void
1646  do_hisq_staples_force_cuda( PathCoefficients<Real> act_path_coeff,
1647  const QudaGaugeParam& param,
1648  const cudaGaugeField &oprod,
1649  const cudaGaugeField &link,
1650  cudaGaugeField &Pmu,
1651  cudaGaugeField &P3,
1652  cudaGaugeField &P5,
1653  cudaGaugeField &Pnumu,
1654  cudaGaugeField &Qmu,
1655  cudaGaugeField &Qnumu,
1656  cudaGaugeField &newOprod)
1657  {
1658 
1659  Real coeff;
1660  Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
1661  Real mLepage, mFiveSt, mThreeSt;
1662 
1663  OneLink = act_path_coeff.one;
1664  ThreeSt = act_path_coeff.three; mThreeSt = -ThreeSt;
1665  FiveSt = act_path_coeff.five; mFiveSt = -FiveSt;
1666  SevenSt = act_path_coeff.seven;
1667  Lepage = act_path_coeff.lepage; mLepage = -Lepage;
1668 
1669 
1670 
1671  OneLinkTerm<RealA, RealB> oneLink(oprod, OneLink, newOprod, param);
1672  oneLink.apply(0);
1673  checkCudaError();
1674 
1675 
1676  int ghostDim[4]={
1677  commDimPartitioned(0),
1678  commDimPartitioned(1),
1679  commDimPartitioned(2),
1681  };
1682 
1683  hisq_kernel_param_t kparam_1g, kparam_2g;
1684 
1685  for(int dir=0; dir<4; ++dir){
1686  kparam_1g.X[dir] = param.X[dir];
1687  kparam_2g.X[dir] = param.X[dir];
1688  }
1689 
1690  kparam_1g.setStride(param);
1691  kparam_2g.setStride(param);
1692 
1693 
1694 #ifdef MULTI_GPU
1695  kparam_1g.D[0] = commDimPartitioned(0)?(param.X[0]+2):(param.X[0]);
1696  kparam_1g.D[1] = commDimPartitioned(1)?(param.X[1]+2):(param.X[1]);
1697  kparam_1g.D[2] = commDimPartitioned(2)?(param.X[2]+2):(param.X[2]);
1698  kparam_1g.D[3] = commDimPartitioned(3)?(param.X[3]+2):(param.X[3]);
1699  kparam_1g.D1h = kparam_1g.D[0]/2;
1700  kparam_1g.base_idx[0]=commDimPartitioned(0)?1:2;
1701  kparam_1g.base_idx[1]=commDimPartitioned(1)?1:2;
1702  kparam_1g.base_idx[2]=commDimPartitioned(2)?1:2;
1703  kparam_1g.base_idx[3]=commDimPartitioned(3)?1:2;
1704  kparam_1g.threads = kparam_1g.D[0]*kparam_1g.D[1]*kparam_1g.D[2]*kparam_1g.D[3]/2;
1705 
1706  kparam_2g.D[0] = commDimPartitioned(0)?(param.X[0]+4):(param.X[0]);
1707  kparam_2g.D[1] = commDimPartitioned(1)?(param.X[1]+4):(param.X[1]);
1708  kparam_2g.D[2] = commDimPartitioned(2)?(param.X[2]+4):(param.X[2]);
1709  kparam_2g.D[3] = commDimPartitioned(3)?(param.X[3]+4):(param.X[3]);
1710  kparam_2g.D1h = kparam_2g.D[0]/2;
1711  kparam_2g.base_idx[0]=commDimPartitioned(0)?0:2;
1712  kparam_2g.base_idx[1]=commDimPartitioned(1)?0:2;
1713  kparam_2g.base_idx[2]=commDimPartitioned(2)?0:2;
1714  kparam_2g.base_idx[3]=commDimPartitioned(3)?0:2;
1715  kparam_2g.threads = kparam_2g.D[0]*kparam_2g.D[1]*kparam_2g.D[2]*kparam_2g.D[3]/2;
1716 
1717 
1718  for(int i=0;i < 4; i++){
1719  kparam_1g.ghostDim[i] = kparam_2g.ghostDim[i]=kparam_1g.ghostDim[i]=kparam_2g.ghostDim[i] = ghostDim[i];
1720  }
1721 #else
1722  hisq_kernel_param_t kparam;
1723  kparam.D[0] = param.X[0];
1724  kparam.D[1] = param.X[1];
1725  kparam.D[2] = param.X[2];
1726  kparam.D[3] = param.X[3];
1727  kparam.D1h = param.X[0]/2;
1728  kparam.threads=param.X[0]*param.X[1]*param.X[2]*param.X[3]/2;
1729  kparam.base_idx[0]=0;
1730  kparam.base_idx[1]=0;
1731  kparam.base_idx[2]=0;
1732  kparam.base_idx[3]=0;
1733  kparam_2g.threads = kparam_1g.threads = kparam.threads;
1734 
1735  for(int i=0; i<4; ++i){
1736  kparam_2g.D[i] = kparam_1g.D[i] = kparam.D[i];
1737  kparam_2g.D1h = kparam_1g.D1h = kparam.D1h;
1738  kparam_2g.base_idx[i] = kparam_1g.base_idx[i] = 0;
1739  kparam_2g.ghostDim[i] = kparam_1g.ghostDim[i] = 0;
1740  }
1741 #endif
1742  for(int sig=0; sig<8; sig++){
1743  for(int mu=0; mu<8; mu++){
1744  if ( (mu == sig) || (mu == OPP_DIR(sig))){
1745  continue;
1746  }
1747  //3-link
1748  //Kernel A: middle link
1749 
1750  MiddleLink<RealA,RealB> middleLink( link, oprod, // read only
1751  sig, mu, mThreeSt,
1752  Pmu, P3, Qmu, // write only
1753  newOprod, kparam_2g);
1754  middleLink.apply(0);
1755  checkCudaError();
1756 
1757  for(int nu=0; nu < 8; nu++){
1758  if (nu == sig || nu == OPP_DIR(sig)
1759  || nu == mu || nu == OPP_DIR(mu)){
1760  continue;
1761  }
1762  //5-link: middle link
1763  //Kernel B
1764  MiddleLink<RealA,RealB> middleLink( link, Pmu, Qmu, // read only
1765  sig, nu, FiveSt,
1766  Pnumu, P5, Qnumu, // write only
1767  newOprod, kparam_1g);
1768  middleLink.apply(0);
1769  checkCudaError();
1770 
1771  for(int rho = 0; rho < 8; rho++){
1772  if (rho == sig || rho == OPP_DIR(sig)
1773  || rho == mu || rho == OPP_DIR(mu)
1774  || rho == nu || rho == OPP_DIR(nu)){
1775  continue;
1776  }
1777 
1778  //7-link: middle link and side link
1779  if(FiveSt != 0)coeff = SevenSt/FiveSt; else coeff = 0;
1780  AllLink<RealA,RealB> allLink(link, Pnumu, Qnumu, sig, rho, SevenSt, coeff,
1781  P5, newOprod, kparam_1g);
1782 
1783  allLink.apply(0);
1784  checkCudaError();
1785 
1786  //return;
1787  }//rho
1788 
1789  //5-link: side link
1790  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1791  SideLink<RealA,RealB> sideLink(link, P5, Qmu, //read only
1792  sig, nu, mFiveSt, coeff,
1793  P3, // write only
1794  newOprod, kparam_1g);
1795  sideLink.apply(0);
1796  checkCudaError();
1797 
1798  } //nu
1799 
1800  //lepage
1801  if(Lepage != 0.){
1802  LepageMiddleLink<RealA,RealB>
1803  lepageMiddleLink ( link, Pmu, Qmu, // read only
1804  sig, mu, Lepage,
1805  P5, // write only
1806  newOprod, kparam_2g);
1807  lepageMiddleLink.apply(0);
1808  checkCudaError();
1809 
1810  if(ThreeSt != 0)coeff = Lepage/ThreeSt ; else coeff = 0;
1811 
1812  SideLink<RealA, RealB> sideLink(link, P5, Qmu, // read only
1813  sig, mu, mLepage, coeff,
1814  P3, //write only
1815  newOprod, kparam_2g);
1816 
1817  sideLink.apply(0);
1818  checkCudaError();
1819 
1820  } // Lepage != 0.0
1821 
1822  //3-link side link
1823  SideLinkShort<RealA,RealB> sideLinkShort(link, P3, // read only
1824  sig, mu, ThreeSt,
1825  newOprod, kparam_1g);
1826  sideLinkShort.apply(0);
1827  checkCudaError();
1828 
1829  }//mu
1830  }//sig
1831 
1832  return;
1833  } // do_hisq_staples_force_cuda
1834 
1835 
1836 #undef Pmu
1837 #undef Pnumu
1838 #undef P3
1839 #undef P5
1840 #undef Qmu
1841 #undef Qnumu
1842 
1843 
1844  void hisqCompleteForceCuda(const QudaGaugeParam &param,
1845  const cudaGaugeField &oprod,
1846  const cudaGaugeField &link,
1847  cudaGaugeField* force,
1848  long long* flops)
1849  {
1850  bind_tex_link(link, oprod);
1851 
1852  if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
1853  CompleteForce<double2,double2> completeForce(link, oprod, *force, param);
1854  completeForce.apply(0);
1855  if(flops) *flops = completeForce.flops();
1856  checkCudaError();
1857  }else if(param.cuda_prec == QUDA_SINGLE_PRECISION){
1858  CompleteForce<float2,float2> completeForce(link, oprod, *force, param);
1859  completeForce.apply(0);
1860  if(flops) *flops = completeForce.flops();
1861  checkCudaError();
1862  }else{
1863  errorQuda("Unsupported precision");
1864  }
1865 
1866 
1867  unbind_tex_link(link, oprod);
1868  return;
1869  }
1870 
1871 
1872  void hisqLongLinkForceCuda(double coeff,
1873  const QudaGaugeParam &param,
1874  const cudaGaugeField &oldOprod,
1875  const cudaGaugeField &link,
1876  cudaGaugeField *newOprod,
1877  long long* flops)
1878  {
1879  bind_tex_link(link, *newOprod);
1880  const int volume = param.X[0]*param.X[1]*param.X[2]*param.X[3];
1881  hisq_kernel_param_t kparam;
1882  for(int i=0; i<4; i++){
1883  kparam.X[i] = param.X[i];
1884  kparam.ghostDim[i] = commDimPartitioned(i);
1885  }
1886  kparam.threads = volume/2;
1887  kparam.setStride(param);
1888 
1889  if(param.cuda_prec == QUDA_DOUBLE_PRECISION){
1890  LongLinkTerm<double2,double2> longLink(link, oldOprod, coeff, *newOprod, kparam);
1891  longLink.apply(0);
1892  if(flops) (*flops) = longLink.flops();
1893  checkCudaError();
1894  }else if(param.cuda_prec == QUDA_SINGLE_PRECISION){
1895  LongLinkTerm<float2,float2> longLink(link, oldOprod, static_cast<float>(coeff), *newOprod, kparam);
1896  longLink.apply(0);
1897  if(flops) (*flops) = longLink.flops();
1898  checkCudaError();
1899  }else{
1900  errorQuda("Unsupported precision");
1901  }
1902  unbind_tex_link(link, *newOprod);
1903  return;
1904  }
1905 
1906 
1907 
1908 
1909 
1910  void
1911  hisqStaplesForceCuda(const double path_coeff_array[6],
1912  const QudaGaugeParam &param,
1913  const cudaGaugeField &oprod,
1914  const cudaGaugeField &link,
1915  cudaGaugeField* newOprod,
1916  long long* flops)
1917  {
1918 
1919 #ifdef MULTI_GPU
1920  int X[4] = {
1921  param.X[0]+4, param.X[1]+4, param.X[2]+4, param.X[3]+4
1922  };
1923 #else
1924  int X[4] = {
1925  param.X[0], param.X[1], param.X[2], param.X[3]
1926  };
1927 #endif
1928 
1929  // create color matrix fields with zero padding
1930  int pad = 0;
1931  GaugeFieldParam gauge_param(X, param.cuda_prec, QUDA_RECONSTRUCT_NO, pad, QUDA_SCALAR_GEOMETRY);
1932 
1933  gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
1934  gauge_param.siteSubset = QUDA_FULL_SITE_SUBSET;
1936  cudaGaugeField Pmu(gauge_param);
1937  cudaGaugeField P3(gauge_param);
1938  cudaGaugeField P5(gauge_param);
1939  cudaGaugeField Pnumu(gauge_param);
1940  cudaGaugeField Qmu(gauge_param);
1941  cudaGaugeField Qnumu(gauge_param);
1942 
1943  bind_tex_link(link, *newOprod);
1944 
1945  cudaEvent_t start, end;
1946 
1947  cudaEventCreate(&start);
1948  cudaEventCreate(&end);
1949 
1950  cudaEventRecord(start);
1951  if (param.cuda_prec == QUDA_DOUBLE_PRECISION){
1952 
1953  PathCoefficients<double> act_path_coeff;
1954  act_path_coeff.one = path_coeff_array[0];
1955  act_path_coeff.naik = path_coeff_array[1];
1956  act_path_coeff.three = path_coeff_array[2];
1957  act_path_coeff.five = path_coeff_array[3];
1958  act_path_coeff.seven = path_coeff_array[4];
1959  act_path_coeff.lepage = path_coeff_array[5];
1960  do_hisq_staples_force_cuda<double,double2,double2>( act_path_coeff,
1961  param,
1962  oprod,
1963  link,
1964  Pmu,
1965  P3,
1966  P5,
1967  Pnumu,
1968  Qmu,
1969  Qnumu,
1970  *newOprod);
1971 
1972 
1973  }else if(param.cuda_prec == QUDA_SINGLE_PRECISION){
1974  PathCoefficients<float> act_path_coeff;
1975  act_path_coeff.one = path_coeff_array[0];
1976  act_path_coeff.naik = path_coeff_array[1];
1977  act_path_coeff.three = path_coeff_array[2];
1978  act_path_coeff.five = path_coeff_array[3];
1979  act_path_coeff.seven = path_coeff_array[4];
1980  act_path_coeff.lepage = path_coeff_array[5];
1981 
1982  do_hisq_staples_force_cuda<float,float2,float2>( act_path_coeff,
1983  param,
1984  oprod,
1985  link,
1986  Pmu,
1987  P3,
1988  P5,
1989  Pnumu,
1990  Qmu,
1991  Qnumu,
1992  *newOprod);
1993  }else{
1994  errorQuda("Unsupported precision");
1995  }
1996 
1997 
1998  cudaEventRecord(end);
1999  cudaEventSynchronize(end);
2000  float runtime;
2001  cudaEventElapsedTime(&runtime, start, end);
2002 
2003  if(flops){
2004  int volume = param.X[0]*param.X[1]*param.X[2]*param.X[3];
2005  // Middle Link, side link, short side link, AllLink, OneLink
2006  *flops = (134784 + 24192 + 103680 + 864 + 397440 + 72);
2007 
2008  if(path_coeff_array[5] != 0.) *flops += 28944; // Lepage contribution
2009  *flops *= volume;
2010  }
2011 
2012  unbind_tex_link(link, *newOprod);
2013 
2014  cudaEventDestroy(start);
2015  cudaEventDestroy(end);
2016 
2017  return;
2018  }
2019 
2020  } // namespace fermion_force
2021 } // namespace quda
2022 
2023 #endif // GPU_HISQ_FORCE
QudaGaugeParam gauge_param
Definition: dslash_test.cpp:37
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
int commDimPartitioned(int dir)
int y[4]
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
#define HISQ_LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride)
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)
cudaStream_t * stream
__global__ void const FloatN FloatM FloatM Float Float int threads
Definition: llfat_core.h:1099
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
#define HISQ_LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride)
#define HISQ_LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:175
#define Qmu
__inline__ __device__ double fetch_double(texture< int2, 1 > t, int i)
Definition: texture.h:88
#define OPP_DIR(dir)
Definition: force_common.h:16
QudaGaugeParam param
Definition: pack_test.cpp:17
int E[4]
__device__ __host__ Cmplx Conj(const Cmplx &a)
Definition: quda_matrix.h:267
__global__ void const RealB *const const RealA *const oprodEven
cudaColorSpinorField * tmp
addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride)
int site_ga_pad
Definition: quda.h:55
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const outputEven
void hisqCompleteForceCuda(const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
__constant__ double coeff
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const RealA *const outputOdd
int new_sid
updateCoords(y, mymu,(mu_positive?-1:1), kparam.X, kparam.ghostDim[mymu])
#define P5
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
Definition: texture.h:90
QudaPrecision cuda_prec
Definition: quda.h:42
int X[4]
Definition: quda.h:29
#define READ_DOUBLE2_TEXTURE(x_tex, x, i)
Definition: force_common.h:143
int x[4]
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:199
#define Pnumu
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
#define P3
int dx[4]
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam &param, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)
#define GOES_BACKWARDS(dir)
Definition: force_common.h:18
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
Definition: complex_quda.h:692
enum QudaReconstructType_s QudaReconstructType
int mom_ga_pad
Definition: quda.h:59
#define Qnumu
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
Definition: float_vector.h:83
#define Pmu
#define D1h
Definition: llfat_core.h:16
#define GOES_FORWARDS(dir)
Definition: force_common.h:17
#define HISQ_LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
#define checkCudaError()
Definition: util_quda.h:110
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
QudaTune getTuning()
Definition: util_quda.cpp:32
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)
const QudaParity parity
Definition: dslash_test.cpp:29
int oddBit
void end()
#define TEX1DFETCH(type, tex, idx)
__global__ void const RealA *const oprodOdd
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)