QUDA v0.7.0
A library for QCD on GPUs
llfat_quda.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>

#include <quda_internal.h>
#include <gauge_field.h>
#include <clover_field.h>
#include <llfat_quda.h>
#include <read_gauge.h>
#include <force_common.h>
#include <dslash_quda.h>

namespace quda {

#ifdef GPU_FATLINK

  namespace fatlink {
#include <dslash_constants.h>
#include <dslash_textures.h>
  } // namespace fatlink

  using namespace fatlink;

#if (__COMPUTE_CAPABILITY__ >= 200)
#define SITE_MATRIX_LOAD_TEX 1
#define MULINK_LOAD_TEX 1
#define FATLINK_LOAD_TEX 1
#else
#define SITE_MATRIX_LOAD_TEX 0
#define MULINK_LOAD_TEX 1
#define FATLINK_LOAD_TEX 1
#endif

#define BLOCK_DIM 64

#define WRITE_FAT_MATRIX(gauge, dir, idx) do { \
    gauge[idx + dir*9*fl.fat_ga_stride] = FAT0; \
    gauge[idx + (dir*9+1) * fl.fat_ga_stride] = FAT1; \
    gauge[idx + (dir*9+2) * fl.fat_ga_stride] = FAT2; \
    gauge[idx + (dir*9+3) * fl.fat_ga_stride] = FAT3; \
    gauge[idx + (dir*9+4) * fl.fat_ga_stride] = FAT4; \
    gauge[idx + (dir*9+5) * fl.fat_ga_stride] = FAT5; \
    gauge[idx + (dir*9+6) * fl.fat_ga_stride] = FAT6; \
    gauge[idx + (dir*9+7) * fl.fat_ga_stride] = FAT7; \
    gauge[idx + (dir*9+8) * fl.fat_ga_stride] = FAT8; } while(0)
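
  /* Illustrative note (not part of the build): for dir = 1 the macro above
     expands to nine strided stores, gauge[idx + 9*fl.fat_ga_stride] = FAT0;
     ... gauge[idx + 17*fl.fat_ga_stride] = FAT8; i.e. each complex entry of
     the SU(3) matrix occupies its own stride-separated plane, so consecutive
     threads (consecutive idx) issue coalesced writes. */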

#define WRITE_GAUGE_MATRIX_FLOAT2(gauge, MAT, dir, idx, stride) { \
    gauge[idx + dir*9*stride] = MAT##0; \
    gauge[idx + (dir*9 + 1)*stride] = MAT##1; \
    gauge[idx + (dir*9 + 2)*stride] = MAT##2; \
    gauge[idx + (dir*9 + 3)*stride] = MAT##3; \
    gauge[idx + (dir*9 + 4)*stride] = MAT##4; \
    gauge[idx + (dir*9 + 5)*stride] = MAT##5; \
    gauge[idx + (dir*9 + 6)*stride] = MAT##6; \
    gauge[idx + (dir*9 + 7)*stride] = MAT##7; \
    gauge[idx + (dir*9 + 8)*stride] = MAT##8; \
  }

#define WRITE_GAUGE_MATRIX_FLOAT4(gauge, MAT, dir, idx, stride) { \
    gauge[idx + dir*3*stride] = MAT##0; \
    gauge[idx + (dir*3 + 1)*stride] = MAT##1; \
    gauge[idx + (dir*3 + 2)*stride] = MAT##2; \
  }

#define WRITE_STAPLE_MATRIX(gauge, idx) \
  gauge[idx] = STAPLE0; \
  gauge[idx + fl.staple_stride] = STAPLE1; \
  gauge[idx + 2*fl.staple_stride] = STAPLE2; \
  gauge[idx + 3*fl.staple_stride] = STAPLE3; \
  gauge[idx + 4*fl.staple_stride] = STAPLE4; \
  gauge[idx + 5*fl.staple_stride] = STAPLE5; \
  gauge[idx + 6*fl.staple_stride] = STAPLE6; \
  gauge[idx + 7*fl.staple_stride] = STAPLE7; \
  gauge[idx + 8*fl.staple_stride] = STAPLE8;

#define SCALAR_MULT_SU3_MATRIX(a, b, c) \
  c##00_re = a*b##00_re; \
  c##00_im = a*b##00_im; \
  c##01_re = a*b##01_re; \
  c##01_im = a*b##01_im; \
  c##02_re = a*b##02_re; \
  c##02_im = a*b##02_im; \
  c##10_re = a*b##10_re; \
  c##10_im = a*b##10_im; \
  c##11_re = a*b##11_re; \
  c##11_im = a*b##11_im; \
  c##12_re = a*b##12_re; \
  c##12_im = a*b##12_im; \
  c##20_re = a*b##20_re; \
  c##20_im = a*b##20_im; \
  c##21_re = a*b##21_re; \
  c##21_im = a*b##21_im; \
  c##22_re = a*b##22_re; \
  c##22_im = a*b##22_im;
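
  /* Usage sketch with hypothetical operand names: given a matrix held in
     registers A00_re ... A22_im (as declared by the LOAD macros below),
     SCALAR_MULT_SU3_MATRIX(0.5, A, B) expands to the 18 componentwise
     assignments B00_re = 0.5*A00_re; ... B22_im = 0.5*A22_im; operating
     entirely in registers. */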

  /*
  #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
    float2 var##0 = gauge[idx + dir*6*stride]; \
    float2 var##1 = gauge[idx + dir*6*stride + stride]; \
    float2 var##2 = gauge[idx + dir*6*stride + 2*stride]; \
    float2 var##3 = gauge[idx + dir*6*stride + 3*stride]; \
    float2 var##4 = gauge[idx + dir*6*stride + 4*stride]; \
    float2 var##5 = gauge[idx + dir*6*stride + 5*stride]; \
    float2 var##6, var##7, var##8;

  #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
    float2 var##0 = tex1Dfetch(gauge, idx + dir*6*stride); \
    float2 var##1 = tex1Dfetch(gauge, idx + dir*6*stride + stride); \
    float2 var##2 = tex1Dfetch(gauge, idx + dir*6*stride + 2*stride); \
    float2 var##3 = tex1Dfetch(gauge, idx + dir*6*stride + 3*stride); \
    float2 var##4 = tex1Dfetch(gauge, idx + dir*6*stride + 4*stride); \
    float2 var##5 = tex1Dfetch(gauge, idx + dir*6*stride + 5*stride); \
    float2 var##6, var##7, var##8;
  */

#define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
  float4 var##0 = gauge[idx + dir*3*stride]; \
  float4 var##1 = gauge[idx + dir*3*stride + stride]; \
  float4 var##2 = gauge[idx + dir*3*stride + 2*stride]; \
  float4 var##3, var##4;

#define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
  float4 var##0 = tex1Dfetch(gauge, idx + dir*3*stride); \
  float4 var##1 = tex1Dfetch(gauge, idx + dir*3*stride + stride); \
  float4 var##2 = tex1Dfetch(gauge, idx + dir*3*stride + 2*stride); \
  float4 var##3, var##4;
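
  /* Note on the 12-float ("reconstruct-12") loads above: only the first two
     rows of the SU(3) link are fetched (three float4 = 12 reals); var##3 and
     var##4 are declared uninitialized as scratch, and the third row is later
     rebuilt from unitarity by RECONSTRUCT_LINK_12 (see RECONSTRUCT_SITE_LINK
     below). */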

#define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
  float2 var##0 = gauge[idx + dir*9*stride]; \
  float2 var##1 = gauge[idx + dir*9*stride + stride]; \
  float2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \
  float2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \
  float2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \
  float2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \
  float2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \
  float2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \
  float2 var##8 = gauge[idx + dir*9*stride + 8*stride];

#define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
  float2 var##0 = tex1Dfetch(gauge, idx + dir*9*stride); \
  float2 var##1 = tex1Dfetch(gauge, idx + dir*9*stride + stride); \
  float2 var##2 = tex1Dfetch(gauge, idx + dir*9*stride + 2*stride); \
  float2 var##3 = tex1Dfetch(gauge, idx + dir*9*stride + 3*stride); \
  float2 var##4 = tex1Dfetch(gauge, idx + dir*9*stride + 4*stride); \
  float2 var##5 = tex1Dfetch(gauge, idx + dir*9*stride + 5*stride); \
  float2 var##6 = tex1Dfetch(gauge, idx + dir*9*stride + 6*stride); \
  float2 var##7 = tex1Dfetch(gauge, idx + dir*9*stride + 7*stride); \
  float2 var##8 = tex1Dfetch(gauge, idx + dir*9*stride + 8*stride);

#define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \
  double2 var##0 = gauge[idx + dir*9*stride]; \
  double2 var##1 = gauge[idx + dir*9*stride + stride]; \
  double2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \
  double2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \
  double2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \
  double2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \
  double2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \
  double2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \
  double2 var##8 = gauge[idx + dir*9*stride + 8*stride];

#define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
  double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride); \
  double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + stride); \
  double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 2*stride); \
  double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 3*stride); \
  double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 4*stride); \
  double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 5*stride); \
  double2 var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 6*stride); \
  double2 var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 7*stride); \
  double2 var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 8*stride);

#define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \
  double2 var##0 = gauge[idx + dir*6*stride]; \
  double2 var##1 = gauge[idx + dir*6*stride + stride]; \
  double2 var##2 = gauge[idx + dir*6*stride + 2*stride]; \
  double2 var##3 = gauge[idx + dir*6*stride + 3*stride]; \
  double2 var##4 = gauge[idx + dir*6*stride + 4*stride]; \
  double2 var##5 = gauge[idx + dir*6*stride + 5*stride]; \
  double2 var##6, var##7, var##8;

#define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
  double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride); \
  double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + stride); \
  double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 2*stride); \
  double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 3*stride); \
  double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 4*stride); \
  double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 5*stride); \
  double2 var##6, var##7, var##8;

#define LLFAT_ADD_SU3_MATRIX(ma, mb, mc) \
  mc##00_re = ma##00_re + mb##00_re; \
  mc##00_im = ma##00_im + mb##00_im; \
  mc##01_re = ma##01_re + mb##01_re; \
  mc##01_im = ma##01_im + mb##01_im; \
  mc##02_re = ma##02_re + mb##02_re; \
  mc##02_im = ma##02_im + mb##02_im; \
  mc##10_re = ma##10_re + mb##10_re; \
  mc##10_im = ma##10_im + mb##10_im; \
  mc##11_re = ma##11_re + mb##11_re; \
  mc##11_im = ma##11_im + mb##11_im; \
  mc##12_re = ma##12_re + mb##12_re; \
  mc##12_im = ma##12_im + mb##12_im; \
  mc##20_re = ma##20_re + mb##20_re; \
  mc##20_im = ma##20_im + mb##20_im; \
  mc##21_re = ma##21_re + mb##21_re; \
  mc##21_im = ma##21_im + mb##21_im; \
  mc##22_re = ma##22_re + mb##22_re; \
  mc##22_im = ma##22_im + mb##22_im;

  __constant__ int dir1_array[16];
  __constant__ int dir2_array[16];

  unsigned long staple_bytes = 0;

  void llfat_init_cuda(QudaGaugeParam* param)
  {
    static int llfat_init_cuda_flag = 0;
    if (llfat_init_cuda_flag){
      return;
    }

    llfat_init_cuda_flag = 1;

    int Vh = param->X[0]*param->X[1]*param->X[2]*param->X[3]/2;

    fat_force_const_t fl_h;

    fl_h.site_ga_stride = param->site_ga_pad + Vh;
    fl_h.staple_stride = param->staple_pad + Vh;
    fl_h.fat_ga_stride = param->llfat_ga_pad + Vh;
    fl_h.long_ga_stride = param->llfat_ga_pad + Vh; // temporary

    cudaMemcpyToSymbol(fl, &fl_h, sizeof(fat_force_const_t));

    int dir1[16];
    int dir2[16];
    for(int nu = 0; nu < 4; nu++)
      for(int mu = 0; mu < 4; mu++){
        if(nu == mu) continue;
        int d1, d2;
        // find the first direction distinct from both nu and mu
        for(d1 = 0; d1 < 4; d1++){
          if(d1 != nu && d1 != mu){
            break;
          }
        }
        dir1[nu*4+mu] = d1;

        // and the remaining direction, distinct from nu, mu and d1
        for(d2 = 0; d2 < 4; d2++){
          if(d2 != nu && d2 != mu && d2 != d1){
            break;
          }
        }

        dir2[nu*4+mu] = d2;
      }

    cudaMemcpyToSymbol(dir1_array, &dir1, sizeof(dir1));
    cudaMemcpyToSymbol(dir2_array, &dir2, sizeof(dir2));

    checkCudaError();
  }
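
  /* Example of the table just built: for (nu,mu) = (0,1) the entry at index
     nu*4+mu = 1 becomes dir1[1] = 2 and dir2[1] = 3, i.e. the two lattice
     directions orthogonal to both nu and mu, which the staple kernels use
     when building the multi-link paths. */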

  void llfat_init_cuda_ex(QudaGaugeParam* param_ex)
  {
    static int llfat_init_cuda_flag = 0;

    if (llfat_init_cuda_flag){
      return;
    }

    llfat_init_cuda_flag = 1;

    int Vh_ex = param_ex->X[0]*param_ex->X[1]*param_ex->X[2]*param_ex->X[3]/2;
    int Vh = (param_ex->X[0]-4)*(param_ex->X[1]-4)*(param_ex->X[2]-4)*(param_ex->X[3]-4)/2;

    fat_force_const_t fl_h;
    fl_h.site_ga_stride = param_ex->site_ga_pad + Vh_ex;
    fl_h.staple_stride = param_ex->staple_pad + Vh_ex;
    fl_h.fat_ga_stride = param_ex->llfat_ga_pad + Vh;
    fl_h.long_ga_stride = param_ex->llfat_ga_pad + Vh;
    cudaMemcpyToSymbol(fl, &fl_h, sizeof(fat_force_const_t));
  }
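
  /* Here the extended ("_ex") lattice carries a two-site border in every
     direction, so X_ex[i] = X[i] + 4. For example, a local 16^4 volume has
     Vh = 16^4/2 = 32768 while Vh_ex = 20^4/2 = 80000: the site links and
     staples are strided over the extended half-volume, while the fat-link
     output is strided over the interior half-volume only. */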


#define LLFAT_CONCAT(a,b) a##b##Kernel
#define LLFAT_CONCAT_EX(a,b) a##b##Kernel_ex
#define LLFAT_KERNEL(a,b) LLFAT_CONCAT(a,b)
#define LLFAT_KERNEL_EX(a,b) LLFAT_CONCAT_EX(a,b)
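
  /* Token-pasting sketch: LLFAT_KERNEL(do_siteComputeGenStapleParity, 18)
     expands via LLFAT_CONCAT to do_siteComputeGenStapleParity18Kernel, and
     LLFAT_KERNEL_EX appends the _ex suffix instead; presumably this is how
     each inclusion of llfat_core.h below stamps out one kernel per
     precision/reconstruct combination. */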

  //precision: 0 is for double, 1 is for single

  //single precision, common macro
#define PRECISION 1
#define Float float
#define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, FAT, fl.fat_ga_stride)
#if (MULINK_LOAD_TEX == 1)
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink1TexSingle:muLink0TexSingle), dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink0TexSingle:muLink1TexSingle), dir, idx, var, fl.staple_stride)
#else
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_even, dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_odd, dir, idx, var, fl.staple_stride)
#endif

#if (FATLINK_LOAD_TEX == 1)
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge1TexSingle:fatGauge0TexSingle), dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge0TexSingle:fatGauge1TexSingle), dir, idx, FAT, fl.fat_ga_stride)
#else
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
#endif

  //single precision, 12-reconstruct
#define DECLARE_VAR_SIGN short sign = 1
#define SITELINK0TEX siteLink0TexSingle_recon
#define SITELINK1TEX siteLink1TexSingle_recon
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)

#define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var);
#define FloatN float4
#define FloatM float2
#define RECONSTRUCT 12
#define sd_data float_12_sd_data
#include "llfat_core.h"
#undef DECLARE_VAR_SIGN
#undef SITELINK0TEX
#undef SITELINK1TEX
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#undef FloatN
#undef FloatM
#undef RECONSTRUCT
#undef sd_data

  //single precision, 18-reconstruct
#define SITELINK0TEX siteLink0TexSingle_norecon
#define SITELINK1TEX siteLink1TexSingle_norecon
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var)
#define FloatN float2
#define FloatM float2
#define RECONSTRUCT 18
#define sd_data float_18_sd_data
#include "llfat_core.h"
#undef SITELINK0TEX
#undef SITELINK1TEX
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#undef FloatN
#undef FloatM
#undef RECONSTRUCT
#undef sd_data

#undef PRECISION
#undef Float
#undef LOAD_FAT_MATRIX
#undef LOAD_EVEN_MULINK_MATRIX
#undef LOAD_ODD_MULINK_MATRIX
#undef LOAD_EVEN_FAT_MATRIX
#undef LOAD_ODD_FAT_MATRIX


  //double precision, common macro
#define PRECISION 0
#define Float double
#define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, FAT, fl.fat_ga_stride)
#if (MULINK_LOAD_TEX == 1)
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink1TexDouble:muLink0TexDouble), mulink_even, dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink0TexDouble:muLink1TexDouble), mulink_odd, dir, idx, var, fl.staple_stride)
#else
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(mulink_even, dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(mulink_odd, dir, idx, var, fl.staple_stride)
#endif

#if (FATLINK_LOAD_TEX == 1)
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge1TexDouble:fatGauge0TexDouble), fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge0TexDouble:fatGauge1TexDouble), fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
#else
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
#endif

  //double precision, 18-reconstruct
#define SITELINK0TEX siteLink0TexDouble
#define SITELINK1TEX siteLink1TexDouble
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var)
#define FloatN double2
#define FloatM double2
#define RECONSTRUCT 18
#define sd_data double_18_sd_data
#include "llfat_core.h"
#undef SITELINK0TEX
#undef SITELINK1TEX
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#undef FloatN
#undef FloatM
#undef RECONSTRUCT
#undef sd_data


#if 1
  //double precision, 12-reconstruct
#define SITELINK0TEX siteLink0TexDouble
#define SITELINK1TEX siteLink1TexDouble
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var);
#define FloatN double2
#define FloatM double2
#define RECONSTRUCT 12
#define sd_data double_12_sd_data
#include "llfat_core.h"
#undef SITELINK0TEX
#undef SITELINK1TEX
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#undef FloatN
#undef FloatM
#undef RECONSTRUCT
#undef sd_data
#endif

#undef PRECISION
#undef Float
#undef LOAD_FAT_MATRIX
#undef LOAD_EVEN_MULINK_MATRIX
#undef LOAD_ODD_MULINK_MATRIX
#undef LOAD_EVEN_FAT_MATRIX
#undef LOAD_ODD_FAT_MATRIX

#undef LLFAT_CONCAT
#undef LLFAT_KERNEL

#define UNBIND_ALL_TEXTURE do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaUnbindTexture(siteLink0TexDouble); \
      cudaUnbindTexture(siteLink1TexDouble); \
      cudaUnbindTexture(fatGauge0TexDouble); \
      cudaUnbindTexture(fatGauge1TexDouble); \
      cudaUnbindTexture(muLink0TexDouble); \
      cudaUnbindTexture(muLink1TexDouble); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaUnbindTexture(siteLink0TexSingle_norecon); \
        cudaUnbindTexture(siteLink1TexSingle_norecon); \
      }else{ \
        cudaUnbindTexture(siteLink0TexSingle_recon); \
        cudaUnbindTexture(siteLink1TexSingle_recon); \
      } \
      cudaUnbindTexture(fatGauge0TexSingle); \
      cudaUnbindTexture(fatGauge1TexSingle); \
      cudaUnbindTexture(muLink0TexSingle); \
      cudaUnbindTexture(muLink1TexSingle); \
    } \
  }while(0)

#define UNBIND_SITE_AND_FAT_LINK do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaUnbindTexture(siteLink0TexDouble); \
      cudaUnbindTexture(siteLink1TexDouble); \
      cudaUnbindTexture(fatGauge0TexDouble); \
      cudaUnbindTexture(fatGauge1TexDouble); \
    }else { \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaUnbindTexture(siteLink0TexSingle_norecon); \
        cudaUnbindTexture(siteLink1TexSingle_norecon); \
      }else{ \
        cudaUnbindTexture(siteLink0TexSingle_recon); \
        cudaUnbindTexture(siteLink1TexSingle_recon); \
      } \
      cudaUnbindTexture(fatGauge0TexSingle); \
      cudaUnbindTexture(fatGauge1TexSingle); \
    } \
  }while(0)


#define BIND_MU_LINK() do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes); \
      cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes); \
    }else{ \
      cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes); \
      cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes); \
    } \
  }while(0)

#define UNBIND_MU_LINK() do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaUnbindTexture(muLink0TexDouble); \
      cudaUnbindTexture(muLink1TexDouble); \
    }else{ \
      cudaUnbindTexture(muLink0TexSingle); \
      cudaUnbindTexture(muLink1TexSingle); \
    } \
  }while(0)


#define BIND_SITE_AND_FAT_LINK do { \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      }else{ \
        cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      } \
      cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    } \
  }while(0)

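  /* Note: like the LOAD/WRITE macros above, these bind/unbind helpers are not
     hygienic: they expect names such as prec, cudaSiteLink, cudaFatLink,
     mulink_even, mulink_odd and staple_bytes to be in scope at the expansion
     site, which is how the kernel-launch functions below use them. */
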
#define BIND_SITE_AND_FAT_LINK_REVERSE do { \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      }else{ \
        cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      } \
      cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    } \
  }while(0)


#define ENUMERATE_FUNCS(mu,nu) switch(mu) { \
  case 0: \
    switch(nu){ \
    case 0: \
      printf("ERROR: invalid direction combination\n"); exit(1); \
      break; \
    case 1: \
      CALL_FUNCTION(0,1); \
      break; \
    case 2: \
      CALL_FUNCTION(0,2); \
      break; \
    case 3: \
      CALL_FUNCTION(0,3); \
      break; \
    } \
    break; \
  case 1: \
    switch(nu){ \
    case 0: \
      CALL_FUNCTION(1,0); \
      break; \
    case 1: \
      printf("ERROR: invalid direction combination\n"); exit(1); \
      break; \
    case 2: \
      CALL_FUNCTION(1,2); \
      break; \
    case 3: \
      CALL_FUNCTION(1,3); \
      break; \
    } \
    break; \
  case 2: \
    switch(nu){ \
    case 0: \
      CALL_FUNCTION(2,0); \
      break; \
    case 1: \
      CALL_FUNCTION(2,1); \
      break; \
    case 2: \
      printf("ERROR: invalid direction combination\n"); exit(1); \
      break; \
    case 3: \
      CALL_FUNCTION(2,3); \
      break; \
    } \
    break; \
  case 3: \
    switch(nu){ \
    case 0: \
      CALL_FUNCTION(3,0); \
      break; \
    case 1: \
      CALL_FUNCTION(3,1); \
      break; \
    case 2: \
      CALL_FUNCTION(3,2); \
      break; \
    case 3: \
      printf("ERROR: invalid direction combination\n"); exit(1); \
      break; \
    } \
    break; \
  }
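
  /* ENUMERATE_FUNCS dispatches the runtime pair (mu,nu) onto compile-time
     template arguments: each CALL_FUNCTION(m,n) instantiation launches a
     kernel templated on <m,n>, so the callers below simply define
     CALL_FUNCTION and invoke ENUMERATE_FUNCS(mu,nu) with mu,nu in 0..3 and
     mu != nu. */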

#define ENUMERATE_FUNCS_SAVE(mu,nu, save_staple) if(save_staple){ \
    switch(mu) { \
    case 0: \
      switch(nu){ \
      case 0: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      case 1: \
        CALL_FUNCTION(0,1,1); \
        break; \
      case 2: \
        CALL_FUNCTION(0,2,1); \
        break; \
      case 3: \
        CALL_FUNCTION(0,3,1); \
        break; \
      } \
      break; \
    case 1: \
      switch(nu){ \
      case 0: \
        CALL_FUNCTION(1,0,1); \
        break; \
      case 1: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      case 2: \
        CALL_FUNCTION(1,2,1); \
        break; \
      case 3: \
        CALL_FUNCTION(1,3,1); \
        break; \
      } \
      break; \
    case 2: \
      switch(nu){ \
      case 0: \
        CALL_FUNCTION(2,0,1); \
        break; \
      case 1: \
        CALL_FUNCTION(2,1,1); \
        break; \
      case 2: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      case 3: \
        CALL_FUNCTION(2,3,1); \
        break; \
      } \
      break; \
    case 3: \
      switch(nu){ \
      case 0: \
        CALL_FUNCTION(3,0,1); \
        break; \
      case 1: \
        CALL_FUNCTION(3,1,1); \
        break; \
      case 2: \
        CALL_FUNCTION(3,2,1); \
        break; \
      case 3: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      } \
      break; \
    } \
  }else{ \
    switch(mu) { \
    case 0: \
      switch(nu){ \
      case 0: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      case 1: \
        CALL_FUNCTION(0,1,0); \
        break; \
      case 2: \
        CALL_FUNCTION(0,2,0); \
        break; \
      case 3: \
        CALL_FUNCTION(0,3,0); \
        break; \
      } \
      break; \
    case 1: \
      switch(nu){ \
      case 0: \
        CALL_FUNCTION(1,0,0); \
        break; \
      case 1: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      case 2: \
        CALL_FUNCTION(1,2,0); \
        break; \
      case 3: \
        CALL_FUNCTION(1,3,0); \
        break; \
      } \
      break; \
    case 2: \
      switch(nu){ \
      case 0: \
        CALL_FUNCTION(2,0,0); \
        break; \
      case 1: \
        CALL_FUNCTION(2,1,0); \
        break; \
      case 2: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      case 3: \
        CALL_FUNCTION(2,3,0); \
        break; \
      } \
      break; \
    case 3: \
      switch(nu){ \
      case 0: \
        CALL_FUNCTION(3,0,0); \
        break; \
      case 1: \
        CALL_FUNCTION(3,1,0); \
        break; \
      case 2: \
        CALL_FUNCTION(3,2,0); \
        break; \
      case 3: \
        printf("ERROR: invalid direction combination\n"); exit(1); \
        break; \
      } \
      break; \
    } \
  }

  void computeLongLinkCuda(void* outEven, void* outOdd,
                           const void* const inEven, const void* const inOdd,
                           double coeff, QudaReconstructType recon, QudaPrecision prec,
                           dim3 halfGridDim, llfat_kernel_param_t kparam)
  {
    dim3 blockDim = kparam.blockDim;

    if(prec == QUDA_DOUBLE_PRECISION){
      if(recon == QUDA_RECONSTRUCT_NO){

        computeLongLinkParity18Kernel<0>
          <<<halfGridDim, blockDim>>>((double2*)outEven, (double2*)inEven, (double2*)inOdd, coeff, kparam);
        computeLongLinkParity18Kernel<1>
          <<<halfGridDim, blockDim>>>((double2*)outOdd, (double2*)inOdd, (double2*)inEven, coeff, kparam);

      }else if(recon == QUDA_RECONSTRUCT_12){

        computeLongLinkParity12Kernel<0>
          <<<halfGridDim, blockDim>>>((double2*)outEven, (double2*)inEven, (double2*)inOdd, coeff, kparam);
        computeLongLinkParity12Kernel<1>
          <<<halfGridDim, blockDim>>>((double2*)outOdd, (double2*)inOdd, (double2*)inEven, coeff, kparam);

      }else{
        errorQuda("Reconstruct %d is not supported\n", recon);
      }
    }else if(prec == QUDA_SINGLE_PRECISION){
      if(recon == QUDA_RECONSTRUCT_NO){

        computeLongLinkParity18Kernel<0>
          <<<halfGridDim, blockDim>>>((float2*)outEven, (float2*)inEven, (float2*)inOdd, (float)coeff, kparam);
        computeLongLinkParity18Kernel<1>
          <<<halfGridDim, blockDim>>>((float2*)outOdd, (float2*)inOdd, (float2*)inEven, (float)coeff, kparam);

      }else if(recon == QUDA_RECONSTRUCT_12){

        computeLongLinkParity12Kernel<0>
          <<<halfGridDim, blockDim>>>((float4*)outEven, (float4*)inEven, (float4*)inOdd, (float)coeff, kparam);
        computeLongLinkParity12Kernel<1>
          <<<halfGridDim, blockDim>>>((float4*)outOdd, (float4*)inOdd, (float4*)inEven, (float)coeff, kparam);

      }else{
        errorQuda("Reconstruct %d is not supported\n", recon);
      }
    }else{
      errorQuda("Unsupported precision %d\n", prec);
    }
    return;
  }
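
  /* Minimal usage sketch (hypothetical buffers and launch parameters; assumes
     llfat_init_cuda() has already uploaded the stride constants and that the
     coefficient index matches the caller's action convention):

       // dim3 halfGrid((Vh + BLOCK_DIM - 1)/BLOCK_DIM, 1, 1);
       // llfat_kernel_param_t kp;
       // kp.blockDim = dim3(BLOCK_DIM, 1, 1);
       // computeLongLinkCuda(longlink_even, longlink_odd,
       //                     sitelink_even, sitelink_odd,
       //                     act_path_coeff[1], QUDA_RECONSTRUCT_12,
       //                     QUDA_SINGLE_PRECISION, halfGrid, kp);
  */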


  void siteComputeGenStapleParityKernel(void* staple_even, void* staple_odd,
                                        const void* sitelink_even, const void* sitelink_odd,
                                        void* fatlink_even, void* fatlink_odd,
                                        int mu, int nu, double mycoeff,
                                        QudaReconstructType recon, QudaPrecision prec,
                                        dim3 halfGridDim, llfat_kernel_param_t kparam,
                                        cudaStream_t* stream)
  {

    //compute even and odd

#define CALL_FUNCTION(mu, nu) \
    if (prec == QUDA_DOUBLE_PRECISION){ \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                  (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                  (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                  (double)mycoeff, kparam); \
        do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                  (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                  (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                  (double)mycoeff, kparam); \
      }else{ \
        do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                  (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                  (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                  (double)mycoeff, kparam); \
        do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                  (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                  (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                  (double)mycoeff, kparam); \
      } \
    }else { \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                  (float2*)sitelink_even, (float2*)sitelink_odd, \
                                                  (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                  (float)mycoeff, kparam); \
        do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                  (float2*)sitelink_odd, (float2*)sitelink_even, \
                                                  (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                  (float)mycoeff, kparam); \
      }else{ \
        do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                  (float4*)sitelink_even, (float4*)sitelink_odd, \
                                                  (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                  (float)mycoeff, kparam); \
        do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                  (float4*)sitelink_odd, (float4*)sitelink_even, \
                                                  (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                  (float)mycoeff, kparam); \
      } \
    }


    dim3 blockDim(BLOCK_DIM, 1, 1);
    ENUMERATE_FUNCS(mu,nu);

#undef CALL_FUNCTION


  }


  void
  computeGenStapleFieldParityKernel(void* staple_even, void* staple_odd,
                                    const void* sitelink_even, const void* sitelink_odd,
                                    void* fatlink_even, void* fatlink_odd,
                                    const void* mulink_even, const void* mulink_odd,
                                    int mu, int nu, int save_staple,
                                    double mycoeff,
                                    QudaReconstructType recon, QudaPrecision prec,
                                    dim3 halfGridDim, llfat_kernel_param_t kparam,
                                    cudaStream_t* stream)
  {

#define CALL_FUNCTION(mu, nu, save_staple) \
    if (prec == QUDA_DOUBLE_PRECISION){ \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                  (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                  (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                  (const double2*)mulink_even, (const double2*)mulink_odd, \
                                                  (double)mycoeff, kparam); \
        do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                  (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                  (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                  (const double2*)mulink_odd, (const double2*)mulink_even, \
                                                  (double)mycoeff, kparam); \
      }else{ \
        do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                  (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                  (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                  (const double2*)mulink_even, (const double2*)mulink_odd, \
                                                  (double)mycoeff, kparam); \
        do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                  (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                  (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                  (const double2*)mulink_odd, (const double2*)mulink_even, \
                                                  (double)mycoeff, kparam); \
      } \
    }else{ \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                  (float2*)sitelink_even, (float2*)sitelink_odd, \
                                                  (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                  (float2*)mulink_even, (float2*)mulink_odd, \
                                                  (float)mycoeff, kparam); \
        do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                  (float2*)sitelink_odd, (float2*)sitelink_even, \
                                                  (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                  (float2*)mulink_odd, (float2*)mulink_even, \
                                                  (float)mycoeff, kparam); \
      }else{ \
        do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                  (float4*)sitelink_even, (float4*)sitelink_odd, \
                                                  (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                  (float2*)mulink_even, (float2*)mulink_odd, \
                                                  (float)mycoeff, kparam); \
        do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                  (float4*)sitelink_odd, (float4*)sitelink_even, \
                                                  (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                  (float2*)mulink_odd, (float2*)mulink_even, \
                                                  (float)mycoeff, kparam); \
      } \
    }

    BIND_MU_LINK();
    dim3 blockDim(BLOCK_DIM, 1, 1);
    ENUMERATE_FUNCS_SAVE(mu,nu,save_staple);

    UNBIND_MU_LINK();

#undef CALL_FUNCTION

  }


  void siteComputeGenStapleParityKernel_ex(void* staple_even, void* staple_odd,
                                           const void* sitelink_even, const void* sitelink_odd,
                                           void* fatlink_even, void* fatlink_odd,
                                           int mu, int nu, double mycoeff,
                                           QudaReconstructType recon, QudaPrecision prec,
                                           llfat_kernel_param_t kparam)
  {

    //compute even and odd
    dim3 blockDim = kparam.blockDim;
    dim3 halfGridDim = kparam.halfGridDim;
    int sbytes_dp = blockDim.x*5*sizeof(double2);
    int sbytes_sp = blockDim.x*5*sizeof(float2);

#define CALL_FUNCTION(mu, nu) \
    if (prec == QUDA_DOUBLE_PRECISION){ \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                                 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                 (double)mycoeff, kparam); \
        do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                                 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                 (double)mycoeff, kparam); \
      }else{ \
        do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                                 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                 (double)mycoeff, kparam); \
        do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                                 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                 (double)mycoeff, kparam); \
      } \
    }else { \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                                 (float2*)sitelink_even, (float2*)sitelink_odd, \
                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                 (float)mycoeff, kparam); \
        do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                                 (float2*)sitelink_odd, (float2*)sitelink_even, \
                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                 (float)mycoeff, kparam); \
      }else{ \
        do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                                 (float4*)sitelink_even, (float4*)sitelink_odd, \
                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                 (float)mycoeff, kparam); \
        do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                                 (float4*)sitelink_odd, (float4*)sitelink_even, \
                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                 (float)mycoeff, kparam); \
      } \
    }


    ENUMERATE_FUNCS(mu,nu);

#undef CALL_FUNCTION


  }



  void
  computeGenStapleFieldParityKernel_ex(void* staple_even, void* staple_odd,
                                       const void* sitelink_even, const void* sitelink_odd,
                                       void* fatlink_even, void* fatlink_odd,
                                       const void* mulink_even, const void* mulink_odd,
                                       int mu, int nu, int save_staple,
                                       double mycoeff,
                                       QudaReconstructType recon, QudaPrecision prec,
                                       llfat_kernel_param_t kparam)
  {

    dim3 blockDim = kparam.blockDim;
    dim3 halfGridDim = kparam.halfGridDim;

    int sbytes_dp = blockDim.x*5*sizeof(double2);
    int sbytes_sp = blockDim.x*5*sizeof(float2);

#define CALL_FUNCTION(mu, nu, save_staple) \
    if (prec == QUDA_DOUBLE_PRECISION){ \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                                 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                 (const double2*)mulink_even, (const double2*)mulink_odd, \
                                                 (double)mycoeff, kparam); \
        do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                                 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                 (const double2*)mulink_odd, (const double2*)mulink_even, \
                                                 (double)mycoeff, kparam); \
      }else{ \
        do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                                 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                 (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                 (const double2*)mulink_even, (const double2*)mulink_odd, \
                                                 (double)mycoeff, kparam); \
        do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                                 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                 (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                 (const double2*)mulink_odd, (const double2*)mulink_even, \
                                                 (double)mycoeff, kparam); \
      } \
    }else{ \
      if(recon == QUDA_RECONSTRUCT_NO){ \
        do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                                 (float2*)sitelink_even, (float2*)sitelink_odd, \
                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                 (float2*)mulink_even, (float2*)mulink_odd, \
                                                 (float)mycoeff, kparam); \
        do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                                 (float2*)sitelink_odd, (float2*)sitelink_even, \
                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                 (float2*)mulink_odd, (float2*)mulink_even, \
                                                 (float)mycoeff, kparam); \
      }else{ \
        do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                                 (float4*)sitelink_even, (float4*)sitelink_odd, \
                                                 (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                 (float2*)mulink_even, (float2*)mulink_odd, \
                                                 (float)mycoeff, kparam); \
        do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \
          <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                                 (float4*)sitelink_odd, (float4*)sitelink_even, \
                                                 (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                 (float2*)mulink_odd, (float2*)mulink_even, \
                                                 (float)mycoeff, kparam); \
      } \
    }

    BIND_MU_LINK();
    ENUMERATE_FUNCS_SAVE(mu,nu,save_staple);

    UNBIND_MU_LINK();

#undef CALL_FUNCTION

  }

#endif

  void llfatOneLinkKernel(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink,
                          cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
                          QudaGaugeParam* param, double* act_path_coeff)
  {
#ifdef GPU_FATLINK
    QudaPrecision prec = cudaSiteLink.Precision();
    QudaReconstructType recon = cudaSiteLink.Reconstruct();

    BIND_SITE_AND_FAT_LINK;
    int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
    dim3 gridDim((volume + BLOCK_DIM - 1)/BLOCK_DIM, 1, 1);
    dim3 blockDim(BLOCK_DIM, 1, 1);

    staple_bytes = cudaStaple.Bytes();

    if(prec == QUDA_DOUBLE_PRECISION){
      if(recon == QUDA_RECONSTRUCT_NO){
        llfatOneLink18Kernel<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                    (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                    (double)act_path_coeff[0], (double)act_path_coeff[5], volume);
      }else{

        llfatOneLink12Kernel<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                    (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                    (double)act_path_coeff[0], (double)act_path_coeff[5], volume);

      }
    }else{ //single precision
      if(recon == QUDA_RECONSTRUCT_NO){
        llfatOneLink18Kernel<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(),
                                                    (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                    (float)act_path_coeff[0], (float)act_path_coeff[5], volume);
      }else{
        llfatOneLink12Kernel<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(),
                                                    (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                    (float)act_path_coeff[0], (float)act_path_coeff[5], volume);
      }
    }
#else
    errorQuda("Fat-link construction has not been built");
#endif
  }


  void llfatOneLinkKernel_ex(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink,
                             cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
                             QudaGaugeParam* param, double* act_path_coeff,
                             llfat_kernel_param_t kparam)
  {
#ifdef GPU_FATLINK
    QudaPrecision prec = cudaSiteLink.Precision();
    QudaReconstructType recon = cudaSiteLink.Reconstruct();

    BIND_SITE_AND_FAT_LINK;

    dim3 gridDim;
    dim3 blockDim = kparam.blockDim;
    gridDim.x = 2*kparam.halfGridDim.x;
    gridDim.y = 1;
    gridDim.z = 1;
    staple_bytes = cudaStaple.Bytes();

    if(prec == QUDA_DOUBLE_PRECISION){
      if(recon == QUDA_RECONSTRUCT_NO){
        llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                       (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                       (double)act_path_coeff[0], (double)act_path_coeff[5], kparam);
      }else{

        llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                       (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                       (double)act_path_coeff[0], (double)act_path_coeff[5], kparam);

      }
    }else{ //single precision
      if(recon == QUDA_RECONSTRUCT_NO){
        llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(),
                                                       (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                       (float)act_path_coeff[0], (float)act_path_coeff[5], kparam);
      }else{
        llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(),
                                                       (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                       (float)act_path_coeff[0], (float)act_path_coeff[5], kparam);
      }
    }
#else
    errorQuda("Fat-link construction has not been built");
#endif

  }

#undef BLOCK_DIM

} // namespace quda