2 #include <cuda_runtime.h>
13 #if (__COMPUTE_CAPABILITY__ >= 200)
14 #define SITE_MATRIX_LOAD_TEX 1
15 #define MULINK_LOAD_TEX 1
16 #define FATLINK_LOAD_TEX 1
18 #define SITE_MATRIX_LOAD_TEX 0
19 #define MULINK_LOAD_TEX 1
20 #define FATLINK_LOAD_TEX 1
25 #define WRITE_FAT_MATRIX(gauge, dir, idx)do { \
26 gauge[idx + dir*9*fl.fat_ga_stride] = FAT0; \
27 gauge[idx + (dir*9+1) * fl.fat_ga_stride] = FAT1; \
28 gauge[idx + (dir*9+2) * fl.fat_ga_stride] = FAT2; \
29 gauge[idx + (dir*9+3) * fl.fat_ga_stride] = FAT3; \
30 gauge[idx + (dir*9+4) * fl.fat_ga_stride] = FAT4; \
31 gauge[idx + (dir*9+5) * fl.fat_ga_stride] = FAT5; \
32 gauge[idx + (dir*9+6) * fl.fat_ga_stride] = FAT6; \
33 gauge[idx + (dir*9+7) * fl.fat_ga_stride] = FAT7; \
34 gauge[idx + (dir*9+8) * fl.fat_ga_stride] = FAT8;} while(0)
37 #define WRITE_STAPLE_MATRIX(gauge, idx) \
38 gauge[idx] = STAPLE0; \
39 gauge[idx + fl.staple_stride] = STAPLE1; \
40 gauge[idx + 2*fl.staple_stride] = STAPLE2; \
41 gauge[idx + 3*fl.staple_stride] = STAPLE3; \
42 gauge[idx + 4*fl.staple_stride] = STAPLE4; \
43 gauge[idx + 5*fl.staple_stride] = STAPLE5; \
44 gauge[idx + 6*fl.staple_stride] = STAPLE6; \
45 gauge[idx + 7*fl.staple_stride] = STAPLE7; \
46 gauge[idx + 8*fl.staple_stride] = STAPLE8;
49 #define SCALAR_MULT_SU3_MATRIX(a, b, c) \
50 c##00_re = a*b##00_re; \
51 c##00_im = a*b##00_im; \
52 c##01_re = a*b##01_re; \
53 c##01_im = a*b##01_im; \
54 c##02_re = a*b##02_re; \
55 c##02_im = a*b##02_im; \
56 c##10_re = a*b##10_re; \
57 c##10_im = a*b##10_im; \
58 c##11_re = a*b##11_re; \
59 c##11_im = a*b##11_im; \
60 c##12_re = a*b##12_re; \
61 c##12_im = a*b##12_im; \
62 c##20_re = a*b##20_re; \
63 c##20_im = a*b##20_im; \
64 c##21_re = a*b##21_re; \
65 c##21_im = a*b##21_im; \
66 c##22_re = a*b##22_re; \
67 c##22_im = a*b##22_im; \
88 #define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
89 float4 var##0 = gauge[idx + dir*3*stride]; \
90 float4 var##1 = gauge[idx + dir*3*stride + stride]; \
91 float4 var##2 = gauge[idx + dir*3*stride + 2*stride]; \
92 float4 var##3, var##4;
94 #define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
95 float4 var##0 = tex1Dfetch(gauge, idx + dir*3*stride); \
96 float4 var##1 = tex1Dfetch(gauge, idx + dir*3*stride + stride); \
97 float4 var##2 = tex1Dfetch(gauge, idx + dir*3*stride + 2*stride); \
98 float4 var##3, var##4;
100 #define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
101 float2 var##0 = gauge[idx + dir*9*stride]; \
102 float2 var##1 = gauge[idx + dir*9*stride + stride]; \
103 float2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \
104 float2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \
105 float2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \
106 float2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \
107 float2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \
108 float2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \
109 float2 var##8 = gauge[idx + dir*9*stride + 8*stride];
112 #define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
113 float2 var##0 = tex1Dfetch(gauge, idx + dir*9*stride); \
114 float2 var##1 = tex1Dfetch(gauge, idx + dir*9*stride + stride); \
115 float2 var##2 = tex1Dfetch(gauge, idx + dir*9*stride + 2*stride); \
116 float2 var##3 = tex1Dfetch(gauge, idx + dir*9*stride + 3*stride); \
117 float2 var##4 = tex1Dfetch(gauge, idx + dir*9*stride + 4*stride); \
118 float2 var##5 = tex1Dfetch(gauge, idx + dir*9*stride + 5*stride); \
119 float2 var##6 = tex1Dfetch(gauge, idx + dir*9*stride + 6*stride); \
120 float2 var##7 = tex1Dfetch(gauge, idx + dir*9*stride + 7*stride); \
121 float2 var##8 = tex1Dfetch(gauge, idx + dir*9*stride + 8*stride);
125 #define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \
126 double2 var##0 = gauge[idx + dir*9*stride]; \
127 double2 var##1 = gauge[idx + dir*9*stride + stride]; \
128 double2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \
129 double2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \
130 double2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \
131 double2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \
132 double2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \
133 double2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \
134 double2 var##8 = gauge[idx + dir*9*stride + 8*stride];
137 #define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
138 double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride); \
139 double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + stride); \
140 double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 2*stride); \
141 double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 3*stride); \
142 double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 4*stride); \
143 double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 5*stride); \
144 double2 var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 6*stride); \
145 double2 var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 7*stride); \
146 double2 var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 8*stride);
149 #define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \
150 double2 var##0 = gauge[idx + dir*6*stride]; \
151 double2 var##1 = gauge[idx + dir*6*stride + stride]; \
152 double2 var##2 = gauge[idx + dir*6*stride + 2*stride]; \
153 double2 var##3 = gauge[idx + dir*6*stride + 3*stride]; \
154 double2 var##4 = gauge[idx + dir*6*stride + 4*stride]; \
155 double2 var##5 = gauge[idx + dir*6*stride + 5*stride]; \
156 double2 var##6, var##7, var##8;
159 #define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
160 double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride); \
161 double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + stride); \
162 double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 2*stride); \
163 double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 3*stride); \
164 double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 4*stride); \
165 double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 5*stride); \
166 double2 var##6, var##7, var##8;
168 #define LLFAT_ADD_SU3_MATRIX(ma, mb, mc) \
169 mc##00_re = ma##00_re + mb##00_re; \
170 mc##00_im = ma##00_im + mb##00_im; \
171 mc##01_re = ma##01_re + mb##01_re; \
172 mc##01_im = ma##01_im + mb##01_im; \
173 mc##02_re = ma##02_re + mb##02_re; \
174 mc##02_im = ma##02_im + mb##02_im; \
175 mc##10_re = ma##10_re + mb##10_re; \
176 mc##10_im = ma##10_im + mb##10_im; \
177 mc##11_re = ma##11_re + mb##11_re; \
178 mc##11_im = ma##11_im + mb##11_im; \
179 mc##12_re = ma##12_re + mb##12_re; \
180 mc##12_im = ma##12_im + mb##12_im; \
181 mc##20_re = ma##20_re + mb##20_re; \
182 mc##20_im = ma##20_im + mb##20_im; \
183 mc##21_re = ma##21_re + mb##21_re; \
184 mc##21_im = ma##21_im + mb##21_im; \
185 mc##22_re = ma##22_re + mb##22_re; \
186 mc##22_im = ma##22_im + mb##22_im;
199 static int llfat_init_cuda_flag = 0;
200 if (llfat_init_cuda_flag){
204 llfat_init_cuda_flag = 1;
206 int Vh = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3]/2;
218 for(
int nu =0; nu < 4; nu++)
220 if(nu ==
mu)
continue;
222 for(d1=0; d1 < 4; d1 ++){
223 if(d1 != nu && d1 !=
mu){
229 for(d2=0; d2 < 4; d2 ++){
230 if(d2 != nu && d2 !=
mu && d2 != d1){
248 static int llfat_init_cuda_flag = 0;
249 if (llfat_init_cuda_flag){
253 llfat_init_cuda_flag = 1;
255 int Vh_ex = param_ex->
X[0]*param_ex->
X[1]*param_ex->
X[2]*param_ex->
X[3]/2;
256 int Vh = (param_ex->
X[0]-4)*(param_ex->
X[1]-4)*(param_ex->
X[2]-4)*(param_ex->
X[3]-4)/2;
268 #define LLFAT_CONCAT(a,b) a##b##Kernel
269 #define LLFAT_CONCAT_EX(a,b) a##b##Kernel_ex
270 #define LLFAT_KERNEL(a,b) LLFAT_CONCAT(a,b)
271 #define LLFAT_KERNEL_EX(a,b) LLFAT_CONCAT_EX(a,b)
278 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, FAT, fl.fat_ga_stride)
279 #if (MULINK_LOAD_TEX == 1)
280 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink1TexSingle:muLink0TexSingle), dir, idx, var, fl.staple_stride)
281 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink0TexSingle:muLink1TexSingle), dir, idx, var, fl.staple_stride)
283 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_even, dir, idx, var, fl.staple_stride)
284 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_odd, dir, idx, var, fl.staple_stride)
287 #if (FATLINK_LOAD_TEX == 1)
288 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge1TexSingle:fatGauge0TexSingle), dir, idx, FAT, fl.fat_ga_stride);
289 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge0TexSingle:fatGauge1TexSingle), dir, idx, FAT, fl.fat_ga_stride);
291 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
292 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
297 #define DECLARE_VAR_SIGN short sign=1
298 #define SITELINK0TEX siteLink0TexSingle_recon
299 #define SITELINK1TEX siteLink1TexSingle_recon
300 #if (SITE_MATRIX_LOAD_TEX == 1)
301 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, fl.site_ga_stride)
302 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, fl.site_ga_stride)
304 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
305 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
307 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
309 #define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var);
310 #define FloatN float4
311 #define FloatM float2
312 #define RECONSTRUCT 12
313 #define sd_data float_12_sd_data
315 #undef DECLARE_VAR_SIGN
318 #undef LOAD_EVEN_SITE_MATRIX
319 #undef LOAD_ODD_SITE_MATRIX
320 #undef LOAD_SITE_MATRIX
321 #undef RECONSTRUCT_SITE_LINK
328 #define SITELINK0TEX siteLink0TexSingle_norecon
329 #define SITELINK1TEX siteLink1TexSingle_norecon
330 #if (SITE_MATRIX_LOAD_TEX == 1)
331 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, fl.site_ga_stride)
332 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, fl.site_ga_stride)
334 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
335 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
337 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
338 #define RECONSTRUCT_SITE_LINK(sign, var)
339 #define FloatN float2
340 #define FloatM float2
341 #define RECONSTRUCT 18
342 #define sd_data float_18_sd_data
346 #undef LOAD_EVEN_SITE_MATRIX
347 #undef LOAD_ODD_SITE_MATRIX
348 #undef LOAD_SITE_MATRIX
349 #undef RECONSTRUCT_SITE_LINK
358 #undef LOAD_FAT_MATRIX
359 #undef LOAD_EVEN_MULINK_MATRIX
360 #undef LOAD_ODD_MULINK_MATRIX
361 #undef LOAD_EVEN_FAT_MATRIX
362 #undef LOAD_ODD_FAT_MATRIX
368 #define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, FAT, fl.fat_ga_stride)
369 #if (MULINK_LOAD_TEX == 1)
370 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(odd_bit?muLink1TexDouble:muLink0TexDouble), mulink_even, dir, idx, var, fl.staple_stride)
371 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink0TexDouble:muLink1TexDouble), mulink_odd, dir, idx, var, fl.staple_stride)
373 #define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_even, dir, idx, var, fl.staple_stride)
374 #define LOAD_ODD_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE(mulink_odd, dir, idx, var, fl.staple_stride)
377 #if (FATLINK_LOAD_TEX == 1)
378 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge1TexDouble:fatGauge0TexDouble), fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
379 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge0TexDouble:fatGauge1TexDouble), fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
381 #define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
382 #define LOAD_ODD_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
386 #define SITELINK0TEX siteLink0TexDouble
387 #define SITELINK1TEX siteLink1TexDouble
388 #if (SITE_MATRIX_LOAD_TEX == 1)
389 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, fl.site_ga_stride)
390 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, fl.site_ga_stride)
392 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
393 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
395 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
396 #define RECONSTRUCT_SITE_LINK(sign, var)
397 #define FloatN double2
398 #define FloatM double2
399 #define RECONSTRUCT 18
400 #define sd_data double_18_sd_data
404 #undef LOAD_EVEN_SITE_MATRIX
405 #undef LOAD_ODD_SITE_MATRIX
406 #undef LOAD_SITE_MATRIX
407 #undef RECONSTRUCT_SITE_LINK
417 #define SITELINK0TEX siteLink0TexDouble
418 #define SITELINK1TEX siteLink1TexDouble
419 #if (SITE_MATRIX_LOAD_TEX == 1)
420 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, fl.site_ga_stride)
421 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, fl.site_ga_stride)
423 #define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
424 #define LOAD_ODD_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
426 #define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
427 #define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var);
428 #define FloatN double2
429 #define FloatM double2
430 #define RECONSTRUCT 12
431 #define sd_data double_12_sd_data
435 #undef LOAD_EVEN_SITE_MATRIX
436 #undef LOAD_ODD_SITE_MATRIX
437 #undef LOAD_SITE_MATRIX
438 #undef RECONSTRUCT_SITE_LINK
447 #undef LOAD_FAT_MATRIX
448 #undef LOAD_EVEN_MULINK_MATRIX
449 #undef LOAD_ODD_MULINK_MATRIX
450 #undef LOAD_EVEN_FAT_MATRIX
451 #undef LOAD_ODD_FAT_MATRIX
456 #define UNBIND_ALL_TEXTURE do{ \
457 if(prec ==QUDA_DOUBLE_PRECISION){ \
458 cudaUnbindTexture(siteLink0TexDouble); \
459 cudaUnbindTexture(siteLink1TexDouble); \
460 cudaUnbindTexture(fatGauge0TexDouble); \
461 cudaUnbindTexture(fatGauge1TexDouble); \
462 cudaUnbindTexture(muLink0TexDouble); \
463 cudaUnbindTexture(muLink1TexDouble); \
465 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \
466 cudaUnbindTexture(siteLink0TexSingle_norecon); \
467 cudaUnbindTexture(siteLink1TexSingle_norecon); \
469 cudaUnbindTexture(siteLink0TexSingle_recon); \
470 cudaUnbindTexture(siteLink1TexSingle_recon); \
472 cudaUnbindTexture(fatGauge0TexSingle); \
473 cudaUnbindTexture(fatGauge1TexSingle); \
474 cudaUnbindTexture(muLink0TexSingle); \
475 cudaUnbindTexture(muLink1TexSingle); \
479 #define UNBIND_SITE_AND_FAT_LINK do{ \
480 if(prec == QUDA_DOUBLE_PRECISION){ \
481 cudaUnbindTexture(siteLink0TexDouble); \
482 cudaUnbindTexture(siteLink1TexDouble); \
483 cudaUnbindTexture(fatGauge0TexDouble); \
484 cudaUnbindTexture(fatGauge1TexDouble); \
486 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \
487 cudaUnbindTexture(siteLink0TexSingle_norecon); \
488 cudaUnbindTexture(siteLink1TexSingle_norecon); \
490 cudaUnbindTexture(siteLink0TexSingle_recon); \
491 cudaUnbindTexture(siteLink1TexSingle_recon); \
493 cudaUnbindTexture(fatGauge0TexSingle); \
494 cudaUnbindTexture(fatGauge1TexSingle); \
499 #define BIND_MU_LINK() do{ \
500 if(prec == QUDA_DOUBLE_PRECISION){ \
501 cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes); \
502 cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes); \
504 cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes); \
505 cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes); \
509 #define UNBIND_MU_LINK() do{ \
510 if(prec == QUDA_DOUBLE_PRECISION){ \
511 cudaUnbindTexture(muLink0TexSingle); \
512 cudaUnbindTexture(muLink1TexSingle); \
514 cudaUnbindTexture(muLink0TexDouble); \
515 cudaUnbindTexture(muLink1TexDouble); \
520 #define BIND_SITE_AND_FAT_LINK do { \
521 if(prec == QUDA_DOUBLE_PRECISION){ \
522 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
523 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
524 cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
525 cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
527 if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
528 cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
529 cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
531 cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
532 cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
534 cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
535 cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
539 #define BIND_MU_LINK() do{ \
540 if(prec == QUDA_DOUBLE_PRECISION){ \
541 cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes); \
542 cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes); \
544 cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes); \
545 cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes); \
549 #define UNBIND_MU_LINK() do{ \
550 if(prec == QUDA_DOUBLE_PRECISION){ \
551 cudaUnbindTexture(muLink0TexSingle); \
552 cudaUnbindTexture(muLink1TexSingle); \
554 cudaUnbindTexture(muLink0TexDouble); \
555 cudaUnbindTexture(muLink1TexDouble); \
559 #define BIND_SITE_AND_FAT_LINK_REVERSE do { \
560 if(prec == QUDA_DOUBLE_PRECISION){ \
561 cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.even, cudaSiteLink.bytes); \
562 cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.odd, cudaSiteLink.bytes); \
563 cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.even, cudaFatLink.bytes); \
564 cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.odd, cudaFatLink.bytes); \
566 if(cudaSiteLink.reconstruct == QUDA_RECONSTRUCT_NO){ \
567 cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.even, cudaSiteLink.bytes); \
568 cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.odd, cudaSiteLink.bytes); \
570 cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.even, cudaSiteLink.bytes); \
571 cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.odd, cudaSiteLink.bytes); \
573 cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.even, cudaFatLink.bytes); \
574 cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.odd, cudaFatLink.bytes); \
580 #define ENUMERATE_FUNCS(mu,nu) switch(mu) { \
584 printf("ERROR: invalid direction combination\n"); exit(1); \
587 CALL_FUNCTION(0,1); \
590 CALL_FUNCTION(0,2); \
593 CALL_FUNCTION(0,3); \
600 CALL_FUNCTION(1,0); \
603 printf("ERROR: invalid direction combination\n"); exit(1); \
606 CALL_FUNCTION(1,2); \
609 CALL_FUNCTION(1,3); \
616 CALL_FUNCTION(2,0); \
619 CALL_FUNCTION(2,1); \
622 printf("ERROR: invalid direction combination\n"); exit(1); \
625 CALL_FUNCTION(2,3); \
632 CALL_FUNCTION(3,0); \
635 CALL_FUNCTION(3,1); \
638 CALL_FUNCTION(3,2); \
641 printf("ERROR: invalid direction combination\n"); exit(1); \
647 #define ENUMERATE_FUNCS_SAVE(mu,nu, save_staple) if(save_staple){ \
652 printf("ERROR: invalid direction combination\n"); exit(1); \
655 CALL_FUNCTION(0,1,1); \
658 CALL_FUNCTION(0,2,1); \
661 CALL_FUNCTION(0,3,1); \
668 CALL_FUNCTION(1,0,1); \
671 printf("ERROR: invalid direction combination\n"); exit(1); \
674 CALL_FUNCTION(1,2,1); \
677 CALL_FUNCTION(1,3,1); \
684 CALL_FUNCTION(2,0,1); \
687 CALL_FUNCTION(2,1,1); \
690 printf("ERROR: invalid direction combination\n"); exit(1); \
693 CALL_FUNCTION(2,3,1); \
700 CALL_FUNCTION(3,0,1); \
703 CALL_FUNCTION(3,1,1); \
706 CALL_FUNCTION(3,2,1); \
709 printf("ERROR: invalid direction combination\n"); exit(1); \
719 printf("ERROR: invalid direction combination\n"); exit(1); \
722 CALL_FUNCTION(0,1,0); \
725 CALL_FUNCTION(0,2,0); \
728 CALL_FUNCTION(0,3,0); \
735 CALL_FUNCTION(1,0,0); \
738 printf("ERROR: invalid direction combination\n"); exit(1); \
741 CALL_FUNCTION(1,2,0); \
744 CALL_FUNCTION(1,3,0); \
751 CALL_FUNCTION(2,0,0); \
754 CALL_FUNCTION(2,1,0); \
757 printf("ERROR: invalid direction combination\n"); exit(1); \
760 CALL_FUNCTION(2,3,0); \
767 CALL_FUNCTION(3,0,0); \
770 CALL_FUNCTION(3,1,0); \
773 CALL_FUNCTION(3,2,0); \
776 printf("ERROR: invalid direction combination\n"); exit(1); \
794 #define CALL_FUNCTION(mu, nu) \
795 if (prec == QUDA_DOUBLE_PRECISION){ \
796 if(recon == QUDA_RECONSTRUCT_NO){ \
797 do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \
798 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
799 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
800 (double2*)fatlink_even, (double2*)fatlink_odd, \
801 (double)mycoeff, kparam); \
802 do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \
803 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
804 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
805 (double2*)fatlink_odd, (double2*)fatlink_even, \
806 (double)mycoeff, kparam); \
808 do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \
809 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
810 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
811 (double2*)fatlink_even, (double2*)fatlink_odd, \
812 (double)mycoeff, kparam); \
813 do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \
814 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
815 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
816 (double2*)fatlink_odd, (double2*)fatlink_even, \
817 (double)mycoeff, kparam); \
820 if(recon == QUDA_RECONSTRUCT_NO){ \
821 do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \
822 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
823 (const float2*)sitelink_even, (const float2*)sitelink_odd, \
824 (float2*)fatlink_even, (float2*)fatlink_odd, \
825 (float)mycoeff, kparam); \
826 do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \
827 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
828 (const float2*)sitelink_odd, (const float2*)sitelink_even, \
829 (float2*)fatlink_odd, (float2*)fatlink_even, \
830 (float)mycoeff, kparam); \
832 do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \
833 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
834 (const float4*)sitelink_even, (const float4*)sitelink_odd, \
835 (float2*)fatlink_even, (float2*)fatlink_odd, \
836 (float)mycoeff, kparam); \
837 do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \
838 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
839 (const float4*)sitelink_odd, (const float4*)sitelink_even, \
840 (float2*)fatlink_odd, (float2*)fatlink_even, \
841 (float)mycoeff, kparam); \
860 int mu,
int nu,
int save_staple,
867 #define CALL_FUNCTION(mu, nu, save_staple) \
868 if (prec == QUDA_DOUBLE_PRECISION){ \
869 if(recon == QUDA_RECONSTRUCT_NO){ \
870 do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
871 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
872 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
873 (double2*)fatlink_even, (double2*)fatlink_odd, \
874 (const double2*)mulink_even, (const double2*)mulink_odd, \
875 (double)mycoeff, kparam); \
876 do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
877 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
878 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
879 (double2*)fatlink_odd, (double2*)fatlink_even, \
880 (const double2*)mulink_odd, (const double2*)mulink_even, \
881 (double)mycoeff, kparam); \
883 do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
884 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
885 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
886 (double2*)fatlink_even, (double2*)fatlink_odd, \
887 (const double2*)mulink_even, (const double2*)mulink_odd, \
888 (double)mycoeff, kparam); \
889 do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
890 <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
891 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
892 (double2*)fatlink_odd, (double2*)fatlink_even, \
893 (const double2*)mulink_odd, (const double2*)mulink_even, \
894 (double)mycoeff, kparam); \
897 if(recon == QUDA_RECONSTRUCT_NO){ \
898 do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
899 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
900 (const float2*)sitelink_even, (const float2*)sitelink_odd, \
901 (float2*)fatlink_even, (float2*)fatlink_odd, \
902 (const float2*)mulink_even, (const float2*)mulink_odd, \
903 (float)mycoeff, kparam); \
904 do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
905 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
906 (const float2*)sitelink_odd, (const float2*)sitelink_even, \
907 (float2*)fatlink_odd, (float2*)fatlink_even, \
908 (const float2*)mulink_odd, (const float2*)mulink_even, \
909 (float)mycoeff, kparam); \
911 do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
912 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
913 (const float4*)sitelink_even, (const float4*)sitelink_odd, \
914 (float2*)fatlink_even, (float2*)fatlink_odd, \
915 (const float2*)mulink_even, (const float2*)mulink_odd, \
916 (float)mycoeff, kparam); \
917 do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
918 <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
919 (const float4*)sitelink_odd, (const float4*)sitelink_even, \
920 (float2*)fatlink_odd, (float2*)fatlink_even, \
921 (const float2*)mulink_odd, (const float2*)mulink_even, \
922 (float)mycoeff, kparam); \
948 int sbytes_dp = blockDim.x*5*
sizeof(double2);
949 int sbytes_sp = blockDim.x*5*
sizeof(float2);
951 #define CALL_FUNCTION(mu, nu) \
952 if (prec == QUDA_DOUBLE_PRECISION){ \
953 if(recon == QUDA_RECONSTRUCT_NO){ \
954 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \
955 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
956 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
957 (double2*)fatlink_even, (double2*)fatlink_odd, \
958 (double)mycoeff, kparam); \
959 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \
960 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
961 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
962 (double2*)fatlink_odd, (double2*)fatlink_even, \
963 (double)mycoeff, kparam); \
965 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \
966 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
967 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
968 (double2*)fatlink_even, (double2*)fatlink_odd, \
969 (double)mycoeff, kparam); \
970 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \
971 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
972 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
973 (double2*)fatlink_odd, (double2*)fatlink_even, \
974 (double)mycoeff, kparam); \
977 if(recon == QUDA_RECONSTRUCT_NO){ \
978 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \
979 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
980 (const float2*)sitelink_even, (const float2*)sitelink_odd, \
981 (float2*)fatlink_even, (float2*)fatlink_odd, \
982 (float)mycoeff, kparam); \
983 do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \
984 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
985 (const float2*)sitelink_odd, (const float2*)sitelink_even, \
986 (float2*)fatlink_odd, (float2*)fatlink_even, \
987 (float)mycoeff, kparam); \
989 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \
990 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
991 (const float4*)sitelink_even, (const float4*)sitelink_odd, \
992 (float2*)fatlink_even, (float2*)fatlink_odd, \
993 (float)mycoeff, kparam); \
994 do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \
995 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
996 (const float4*)sitelink_odd, (const float4*)sitelink_even, \
997 (float2*)fatlink_odd, (float2*)fatlink_even, \
998 (float)mycoeff, kparam); \
1005 #undef CALL_FUNCTION
1017 int mu,
int nu,
int save_staple,
1026 int sbytes_dp = blockDim.x*5*
sizeof(double2);
1027 int sbytes_sp = blockDim.x*5*
sizeof(float2);
1029 #define CALL_FUNCTION(mu, nu, save_staple) \
1030 if (prec == QUDA_DOUBLE_PRECISION){ \
1031 if(recon == QUDA_RECONSTRUCT_NO){ \
1032 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \
1033 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
1034 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
1035 (double2*)fatlink_even, (double2*)fatlink_odd, \
1036 (const double2*)mulink_even, (const double2*)mulink_odd, \
1037 (double)mycoeff, kparam); \
1038 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \
1039 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
1040 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
1041 (double2*)fatlink_odd, (double2*)fatlink_even, \
1042 (const double2*)mulink_odd, (const double2*)mulink_even, \
1043 (double)mycoeff, kparam); \
1045 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \
1046 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
1047 (const double2*)sitelink_even, (const double2*)sitelink_odd, \
1048 (double2*)fatlink_even, (double2*)fatlink_odd, \
1049 (const double2*)mulink_even, (const double2*)mulink_odd, \
1050 (double)mycoeff, kparam); \
1051 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \
1052 <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
1053 (const double2*)sitelink_odd, (const double2*)sitelink_even, \
1054 (double2*)fatlink_odd, (double2*)fatlink_even, \
1055 (const double2*)mulink_odd, (const double2*)mulink_even, \
1056 (double)mycoeff, kparam); \
1059 if(recon == QUDA_RECONSTRUCT_NO){ \
1060 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \
1061 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
1062 (const float2*)sitelink_even, (const float2*)sitelink_odd, \
1063 (float2*)fatlink_even, (float2*)fatlink_odd, \
1064 (const float2*)mulink_even, (const float2*)mulink_odd, \
1065 (float)mycoeff, kparam); \
1066 do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \
1067 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
1068 (const float2*)sitelink_odd, (const float2*)sitelink_even, \
1069 (float2*)fatlink_odd, (float2*)fatlink_even, \
1070 (const float2*)mulink_odd, (const float2*)mulink_even, \
1071 (float)mycoeff, kparam); \
1073 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \
1074 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
1075 (const float4*)sitelink_even, (const float4*)sitelink_odd, \
1076 (float2*)fatlink_even, (float2*)fatlink_odd, \
1077 (const float2*)mulink_even, (const float2*)mulink_odd, \
1078 (float)mycoeff, kparam); \
1079 do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \
1080 <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
1081 (const float4*)sitelink_odd, (const float4*)sitelink_even, \
1082 (float2*)fatlink_odd, (float2*)fatlink_even, \
1083 (const float2*)mulink_odd, (const float2*)mulink_even, \
1084 (float)mycoeff, kparam); \
1093 #undef CALL_FUNCTION
1108 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
1116 llfatOneLink18Kernel<<<gridDim, blockDim>>>((
const double2*)cudaSiteLink.
Even_p(), (
const double2*)cudaSiteLink.
Odd_p(),
1117 (double2*)cudaFatLink.
Even_p(), (double2*)cudaFatLink.
Odd_p(),
1118 (double)act_path_coeff[0], (
double)act_path_coeff[5]);
1121 llfatOneLink12Kernel<<<gridDim, blockDim>>>((
const double2*)cudaSiteLink.
Even_p(), (
const double2*)cudaSiteLink.
Odd_p(),
1122 (double2*)cudaFatLink.
Even_p(), (double2*)cudaFatLink.
Odd_p(),
1123 (double)act_path_coeff[0], (
double)act_path_coeff[5]);
1128 llfatOneLink18Kernel<<<gridDim, blockDim>>>((
const float2*)cudaSiteLink.
Even_p(), (
const float2*)cudaSiteLink.
Odd_p(),
1129 (float2*)cudaFatLink.
Even_p(), (float2*)cudaFatLink.
Odd_p(),
1130 (float)act_path_coeff[0], (
float)act_path_coeff[5]);
1132 llfatOneLink12Kernel<<<gridDim, blockDim>>>((
const float4*)cudaSiteLink.
Even_p(), (
const float4*)cudaSiteLink.
Odd_p(),
1133 (float2*)cudaFatLink.
Even_p(), (float2*)cudaFatLink.
Odd_p(),
1134 (float)act_path_coeff[0], (
float)act_path_coeff[5]);
1160 llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((
const double2*)cudaSiteLink.
Even_p(), (
const double2*)cudaSiteLink.
Odd_p(),
1161 (double2*)cudaFatLink.
Even_p(), (double2*)cudaFatLink.
Odd_p(),
1162 (double)act_path_coeff[0], (
double)act_path_coeff[5],
kparam);
1165 llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((
const double2*)cudaSiteLink.
Even_p(), (
const double2*)cudaSiteLink.
Odd_p(),
1166 (double2*)cudaFatLink.
Even_p(), (double2*)cudaFatLink.
Odd_p(),
1167 (double)act_path_coeff[0], (
double)act_path_coeff[5],
kparam);
1172 llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((
const float2*)cudaSiteLink.
Even_p(), (
const float2*)cudaSiteLink.
Odd_p(),
1173 (float2*)cudaFatLink.
Even_p(), (float2*)cudaFatLink.
Odd_p(),
1174 (float)act_path_coeff[0], (
float)act_path_coeff[5],
kparam);
1176 llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((
const float4*)cudaSiteLink.
Even_p(), (
const float4*)cudaSiteLink.
Odd_p(),
1177 (float2*)cudaFatLink.
Even_p(), (float2*)cudaFatLink.
Odd_p(),
1178 (float)act_path_coeff[0], (
float)act_path_coeff[5],
kparam);