#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#if (__COMPUTE_CAPABILITY__ >= 200)
#define SITE_MATRIX_LOAD_TEX 1
#define MULINK_LOAD_TEX 1
#define FATLINK_LOAD_TEX 1
#else
#define SITE_MATRIX_LOAD_TEX 0
#define MULINK_LOAD_TEX 1
#define FATLINK_LOAD_TEX 1
#endif
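/* These flags choose, per link type, whether the LOAD_* macros below read
   through tex1Dfetch (value 1) or through plain global-memory loads (value 0).
   Why only SITE_MATRIX_LOAD_TEX flips across the sm_20 boundary is not stated
   in the file; treat any rationale (e.g. L1-cached loads on Fermi) as an
   assumption of this note. */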
#define WRITE_FAT_MATRIX(gauge, dir, idx) do { \
    gauge[idx + dir*9*fl.fat_ga_stride]     = FAT0; \
    gauge[idx + (dir*9+1)*fl.fat_ga_stride] = FAT1; \
    gauge[idx + (dir*9+2)*fl.fat_ga_stride] = FAT2; \
    gauge[idx + (dir*9+3)*fl.fat_ga_stride] = FAT3; \
    gauge[idx + (dir*9+4)*fl.fat_ga_stride] = FAT4; \
    gauge[idx + (dir*9+5)*fl.fat_ga_stride] = FAT5; \
    gauge[idx + (dir*9+6)*fl.fat_ga_stride] = FAT6; \
    gauge[idx + (dir*9+7)*fl.fat_ga_stride] = FAT7; \
    gauge[idx + (dir*9+8)*fl.fat_ga_stride] = FAT8; } while(0)
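/* Illustrative usage (not from the original file): inside a kernel that has
   declared the registers FAT0..FAT8 via one of the LOAD_*_FAT_MATRIX macros,
   the nine float2/double2 components of the fat link in direction `dir` are
   written back with stride fl.fat_ga_stride between components:

     LOAD_EVEN_FAT_MATRIX(dir, idx);
     // ... accumulate staple contributions into FAT0..FAT8 ...
     WRITE_FAT_MATRIX(fatlink_even, dir, idx);
*/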
#define WRITE_GAUGE_MATRIX_FLOAT2(gauge, MAT, dir, idx, stride) { \
    gauge[idx + dir*9*stride]       = MAT##0; \
    gauge[idx + (dir*9 + 1)*stride] = MAT##1; \
    gauge[idx + (dir*9 + 2)*stride] = MAT##2; \
    gauge[idx + (dir*9 + 3)*stride] = MAT##3; \
    gauge[idx + (dir*9 + 4)*stride] = MAT##4; \
    gauge[idx + (dir*9 + 5)*stride] = MAT##5; \
    gauge[idx + (dir*9 + 6)*stride] = MAT##6; \
    gauge[idx + (dir*9 + 7)*stride] = MAT##7; \
    gauge[idx + (dir*9 + 8)*stride] = MAT##8; }
#define WRITE_GAUGE_MATRIX_FLOAT4(gauge, MAT, dir, idx, stride) { \
    gauge[idx + dir*3*stride]       = MAT##0; \
    gauge[idx + (dir*3 + 1)*stride] = MAT##1; \
    gauge[idx + (dir*3 + 2)*stride] = MAT##2; }
#define WRITE_STAPLE_MATRIX(gauge, idx) \
    gauge[idx]                      = STAPLE0; \
    gauge[idx + fl.staple_stride]   = STAPLE1; \
    gauge[idx + 2*fl.staple_stride] = STAPLE2; \
    gauge[idx + 3*fl.staple_stride] = STAPLE3; \
    gauge[idx + 4*fl.staple_stride] = STAPLE4; \
    gauge[idx + 5*fl.staple_stride] = STAPLE5; \
    gauge[idx + 6*fl.staple_stride] = STAPLE6; \
    gauge[idx + 7*fl.staple_stride] = STAPLE7; \
    gauge[idx + 8*fl.staple_stride] = STAPLE8;
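/* Note: unlike WRITE_FAT_MATRIX above, this macro is not wrapped in
   do { ... } while(0), so it expands to nine separate statements. Invoke it
   only where a multi-statement expansion is safe, e.g. inside braces:

     if (save_staple) { WRITE_STAPLE_MATRIX(staple_even, idx); }
*/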
#define SCALAR_MULT_SU3_MATRIX(a, b, c) \
    c##00_re = a*b##00_re; \
    c##00_im = a*b##00_im; \
    c##01_re = a*b##01_re; \
    c##01_im = a*b##01_im; \
    c##02_re = a*b##02_re; \
    c##02_im = a*b##02_im; \
    c##10_re = a*b##10_re; \
    c##10_im = a*b##10_im; \
    c##11_re = a*b##11_re; \
    c##11_im = a*b##11_im; \
    c##12_re = a*b##12_re; \
    c##12_im = a*b##12_im; \
    c##20_re = a*b##20_re; \
    c##20_im = a*b##20_im; \
    c##21_re = a*b##21_re; \
    c##21_im = a*b##21_im; \
    c##22_re = a*b##22_re; \
    c##22_im = a*b##22_im;
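/* Illustrative usage (assumed register naming, following the token-pasting
   convention of the macro): with an SU(3) matrix held in registers named
   STAPLE00_re ... STAPLE22_im,

     SCALAR_MULT_SU3_MATRIX(coeff, STAPLE, STAPLE);

   scales all 18 real components in place; the row/column pair and the
   _re/_im suffix are pasted onto the b and c prefixes. */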
#define LOAD_MATRIX_12_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
    float4 var##0 = gauge[idx + dir*3*stride]; \
    float4 var##1 = gauge[idx + dir*3*stride + stride]; \
    float4 var##2 = gauge[idx + dir*3*stride + 2*stride]; \
    float4 var##3, var##4;
#define LOAD_MATRIX_12_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
    float4 var##0 = tex1Dfetch(gauge, idx + dir*3*stride); \
    float4 var##1 = tex1Dfetch(gauge, idx + dir*3*stride + stride); \
    float4 var##2 = tex1Dfetch(gauge, idx + dir*3*stride + 2*stride); \
    float4 var##3, var##4;
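/* Note on the 12-real format: a reconstruct-12 link stores only the first two
   rows of the SU(3) matrix (12 reals, i.e. three float4 loads); var##3 and
   var##4 are declared uninitialized and are filled in by RECONSTRUCT_LINK_12,
   which rebuilds the third row from the first two using unitarity (the
   conjugated cross product of the first two rows). */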
#define LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, var, stride) \
    float2 var##0 = gauge[idx + dir*9*stride]; \
    float2 var##1 = gauge[idx + dir*9*stride + stride]; \
    float2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \
    float2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \
    float2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \
    float2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \
    float2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \
    float2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \
    float2 var##8 = gauge[idx + dir*9*stride + 8*stride];
#define LOAD_MATRIX_18_SINGLE_TEX_DECLARE(gauge, dir, idx, var, stride) \
    float2 var##0 = tex1Dfetch(gauge, idx + dir*9*stride); \
    float2 var##1 = tex1Dfetch(gauge, idx + dir*9*stride + stride); \
    float2 var##2 = tex1Dfetch(gauge, idx + dir*9*stride + 2*stride); \
    float2 var##3 = tex1Dfetch(gauge, idx + dir*9*stride + 3*stride); \
    float2 var##4 = tex1Dfetch(gauge, idx + dir*9*stride + 4*stride); \
    float2 var##5 = tex1Dfetch(gauge, idx + dir*9*stride + 5*stride); \
    float2 var##6 = tex1Dfetch(gauge, idx + dir*9*stride + 6*stride); \
    float2 var##7 = tex1Dfetch(gauge, idx + dir*9*stride + 7*stride); \
    float2 var##8 = tex1Dfetch(gauge, idx + dir*9*stride + 8*stride);
#define LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \
    double2 var##0 = gauge[idx + dir*9*stride]; \
    double2 var##1 = gauge[idx + dir*9*stride + stride]; \
    double2 var##2 = gauge[idx + dir*9*stride + 2*stride]; \
    double2 var##3 = gauge[idx + dir*9*stride + 3*stride]; \
    double2 var##4 = gauge[idx + dir*9*stride + 4*stride]; \
    double2 var##5 = gauge[idx + dir*9*stride + 5*stride]; \
    double2 var##6 = gauge[idx + dir*9*stride + 6*stride]; \
    double2 var##7 = gauge[idx + dir*9*stride + 7*stride]; \
    double2 var##8 = gauge[idx + dir*9*stride + 8*stride];
#define LOAD_MATRIX_18_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
    double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride); \
    double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + stride); \
    double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 2*stride); \
    double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 3*stride); \
    double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 4*stride); \
    double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 5*stride); \
    double2 var##6 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 6*stride); \
    double2 var##7 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 7*stride); \
    double2 var##8 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*9*stride + 8*stride);
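/* Note: CUDA textures cannot return double2 directly, which is why the double
   TEX variants take both the texture reference and the raw pointer. The
   READ_DOUBLE2_TEXTURE helper (defined elsewhere) presumably fetches the data
   as integer words and reassembles each double (e.g. via __hiloint2double),
   or falls back to the pointer, depending on build options. */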
#define LOAD_MATRIX_12_DOUBLE_DECLARE(gauge, dir, idx, var, stride) \
    double2 var##0 = gauge[idx + dir*6*stride]; \
    double2 var##1 = gauge[idx + dir*6*stride + stride]; \
    double2 var##2 = gauge[idx + dir*6*stride + 2*stride]; \
    double2 var##3 = gauge[idx + dir*6*stride + 3*stride]; \
    double2 var##4 = gauge[idx + dir*6*stride + 4*stride]; \
    double2 var##5 = gauge[idx + dir*6*stride + 5*stride]; \
    double2 var##6, var##7, var##8;
#define LOAD_MATRIX_12_DOUBLE_TEX_DECLARE(gauge_tex, gauge, dir, idx, var, stride) \
    double2 var##0 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride); \
    double2 var##1 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + stride); \
    double2 var##2 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 2*stride); \
    double2 var##3 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 3*stride); \
    double2 var##4 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 4*stride); \
    double2 var##5 = READ_DOUBLE2_TEXTURE(gauge_tex, gauge, idx + dir*6*stride + 5*stride); \
    double2 var##6, var##7, var##8;
#define LLFAT_ADD_SU3_MATRIX(ma, mb, mc) \
    mc##00_re = ma##00_re + mb##00_re; \
    mc##00_im = ma##00_im + mb##00_im; \
    mc##01_re = ma##01_re + mb##01_re; \
    mc##01_im = ma##01_im + mb##01_im; \
    mc##02_re = ma##02_re + mb##02_re; \
    mc##02_im = ma##02_im + mb##02_im; \
    mc##10_re = ma##10_re + mb##10_re; \
    mc##10_im = ma##10_im + mb##10_im; \
    mc##11_re = ma##11_re + mb##11_re; \
    mc##11_im = ma##11_im + mb##11_im; \
    mc##12_re = ma##12_re + mb##12_re; \
    mc##12_im = ma##12_im + mb##12_im; \
    mc##20_re = ma##20_re + mb##20_re; \
    mc##20_im = ma##20_im + mb##20_im; \
    mc##21_re = ma##21_re + mb##21_re; \
    mc##21_im = ma##21_im + mb##21_im; \
    mc##22_re = ma##22_re + mb##22_re; \
    mc##22_im = ma##22_im + mb##22_im;
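/* Illustrative usage (assumed register naming): accumulate a staple term into
   the fat-link accumulator,

     LLFAT_ADD_SU3_MATRIX(FAT, STAPLE, FAT);   // mc = ma + mb over 18 reals

   Any prefix works as long as the components prefix00_re ... prefix22_im are
   in scope. */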
__constant__ fat_force_const_t fl;
__constant__ int dir1_array[16];
__constant__ int dir2_array[16];

unsigned long staple_bytes = 0;
void llfat_init_cuda(QudaGaugeParam* param)
{
  static int llfat_init_cuda_flag = 0;
  if (llfat_init_cuda_flag){
    return;
  }
  llfat_init_cuda_flag = 1;

  int Vh = param->X[0]*param->X[1]*param->X[2]*param->X[3]/2;
  // Vh presumably feeds the stride members of the fat_force_const_t fl
  // constant (setup not shown in this fragment).
  int dir1[16], dir2[16];
  for(int nu = 0; nu < 4; nu++)
    for(int mu = 0; mu < 4; mu++){
      if(nu == mu) continue;
      int d1, d2;
      // d1: first direction orthogonal to both nu and mu
      for(d1 = 0; d1 < 4; d1++){
        if(d1 != nu && d1 != mu){
          break;
        }
      }
      dir1[nu*4+mu] = d1;
      // d2: the remaining direction, orthogonal to nu, mu and d1
      for(d2 = 0; d2 < 4; d2++){
        if(d2 != nu && d2 != mu && d2 != d1){
          break;
        }
      }
      dir2[nu*4+mu] = d2;
    }

  cudaMemcpyToSymbol(dir1_array, &dir1, sizeof(dir1));
  cudaMemcpyToSymbol(dir2_array, &dir2, sizeof(dir2));
}
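/* Sketch of how a kernel consumes these tables (assumed usage): for a staple
   in the (mu,nu) plane, the two remaining lattice directions are looked up as

     int d1 = dir1_array[nu*4+mu];
     int d2 = dir2_array[nu*4+mu];

   avoiding the orthogonal-direction search inside the kernel. */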
void llfat_init_cuda_ex(QudaGaugeParam* param_ex)
{
  static int llfat_init_cuda_flag = 0;
  if (llfat_init_cuda_flag){
    return;
  }
  llfat_init_cuda_flag = 1;

  int Vh_ex = param_ex->X[0]*param_ex->X[1]*param_ex->X[2]*param_ex->X[3]/2;
  int Vh = (param_ex->X[0]-4)*(param_ex->X[1]-4)*(param_ex->X[2]-4)*(param_ex->X[3]-4)/2;
  // Vh_ex and Vh presumably feed the constant-memory setup for the extended
  // (2-site halo per side) volume; those copies are not shown in this fragment.
}
#define LLFAT_CONCAT(a,b)    a##b##Kernel
#define LLFAT_CONCAT_EX(a,b) a##b##Kernel_ex
#define LLFAT_KERNEL(a,b)    LLFAT_CONCAT(a,b)
#define LLFAT_KERNEL_EX(a,b) LLFAT_CONCAT_EX(a,b)
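/* Expansion example: the extra LLFAT_KERNEL indirection forces its arguments
   to be macro-expanded before token pasting, so with RECONSTRUCT defined as
   12,

     LLFAT_KERNEL(do_siteComputeGenStapleParity, RECONSTRUCT)

   expands to do_siteComputeGenStapleParity12Kernel, matching the kernel names
   launched below. */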
#define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(gauge, dir, idx, FAT, fl.fat_ga_stride)

#if (MULINK_LOAD_TEX == 1)
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink1TexSingle:muLink0TexSingle), dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var)  LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?muLink0TexSingle:muLink1TexSingle), dir, idx, var, fl.staple_stride)
#else
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(mulink_even, dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var)  LOAD_MATRIX_18_SINGLE_DECLARE(mulink_odd, dir, idx, var, fl.staple_stride)
#endif

#if (FATLINK_LOAD_TEX == 1)
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge1TexSingle:fatGauge0TexSingle), dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?fatGauge0TexSingle:fatGauge1TexSingle), dir, idx, FAT, fl.fat_ga_stride)
#else
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_SINGLE_DECLARE(fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
#endif
#define DECLARE_VAR_SIGN short sign = 1
#define SITELINK0TEX siteLink0TexSingle_recon
#define SITELINK1TEX siteLink1TexSingle_recon
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_12_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_12_SINGLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_SINGLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var);
#define FloatN float4
#define FloatM float2
#define RECONSTRUCT 12
#define sd_data float_12_sd_data
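/* The defines above configure one instantiation of the shared kernel bodies:
   the original file #includes the generic kernel source at this point (a
   multiple-inclusion pattern, compiling the same code once per
   precision/reconstruct combination), then tears the configuration down with
   the #undefs below so the next block can rebind FloatN, RECONSTRUCT, and the
   LOAD_* macros. */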
#undef DECLARE_VAR_SIGN
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#define SITELINK0TEX siteLink0TexSingle_norecon
#define SITELINK1TEX siteLink1TexSingle_norecon
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_18_SINGLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_18_SINGLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_SINGLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var)
#define FloatN float2
#define FloatM float2
#define RECONSTRUCT 18
#define sd_data float_18_sd_data
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#undef LOAD_FAT_MATRIX
#undef LOAD_EVEN_MULINK_MATRIX
#undef LOAD_ODD_MULINK_MATRIX
#undef LOAD_EVEN_FAT_MATRIX
#undef LOAD_ODD_FAT_MATRIX
#define LOAD_FAT_MATRIX(gauge, dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(gauge, dir, idx, FAT, fl.fat_ga_stride)

#if (MULINK_LOAD_TEX == 1)
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink1TexDouble:muLink0TexDouble), mulink_even, dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var)  LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?muLink0TexDouble:muLink1TexDouble), mulink_odd, dir, idx, var, fl.staple_stride)
#else
#define LOAD_EVEN_MULINK_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(mulink_even, dir, idx, var, fl.staple_stride)
#define LOAD_ODD_MULINK_MATRIX(dir, idx, var)  LOAD_MATRIX_18_DOUBLE_DECLARE(mulink_odd, dir, idx, var, fl.staple_stride)
#endif

#if (FATLINK_LOAD_TEX == 1)
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge1TexDouble:fatGauge0TexDouble), fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?fatGauge0TexDouble:fatGauge1TexDouble), fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
#else
#define LOAD_EVEN_FAT_MATRIX(dir, idx) LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_even, dir, idx, FAT, fl.fat_ga_stride)
#define LOAD_ODD_FAT_MATRIX(dir, idx)  LOAD_MATRIX_18_DOUBLE_DECLARE(fatlink_odd, dir, idx, FAT, fl.fat_ga_stride)
#endif
#define SITELINK0TEX siteLink0TexDouble
#define SITELINK1TEX siteLink1TexDouble
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_18_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_18_DOUBLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var)
#define FloatN double2
#define FloatM double2
#define RECONSTRUCT 18
#define sd_data double_18_sd_data
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#define SITELINK0TEX siteLink0TexDouble
#define SITELINK1TEX siteLink1TexDouble
#if (SITE_MATRIX_LOAD_TEX == 1)
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK1TEX:SITELINK0TEX), sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_12_DOUBLE_TEX_DECLARE((odd_bit?SITELINK0TEX:SITELINK1TEX), sitelink_odd, dir, idx, var, fl.site_ga_stride)
#else
#define LOAD_EVEN_SITE_MATRIX(dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_even, dir, idx, var, fl.site_ga_stride)
#define LOAD_ODD_SITE_MATRIX(dir, idx, var)  LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink_odd, dir, idx, var, fl.site_ga_stride)
#endif
#define LOAD_SITE_MATRIX(sitelink, dir, idx, var) LOAD_MATRIX_12_DOUBLE_DECLARE(sitelink, dir, idx, var, fl.site_ga_stride)
#define RECONSTRUCT_SITE_LINK(sign, var) RECONSTRUCT_LINK_12(sign, var);
#define FloatN double2
#define FloatM double2
#define RECONSTRUCT 12
#define sd_data double_12_sd_data
#undef LOAD_EVEN_SITE_MATRIX
#undef LOAD_ODD_SITE_MATRIX
#undef LOAD_SITE_MATRIX
#undef RECONSTRUCT_SITE_LINK
#undef LOAD_FAT_MATRIX
#undef LOAD_EVEN_MULINK_MATRIX
#undef LOAD_ODD_MULINK_MATRIX
#undef LOAD_EVEN_FAT_MATRIX
#undef LOAD_ODD_FAT_MATRIX
#define UNBIND_ALL_TEXTURE do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaUnbindTexture(siteLink0TexDouble); \
      cudaUnbindTexture(siteLink1TexDouble); \
      cudaUnbindTexture(fatGauge0TexDouble); \
      cudaUnbindTexture(fatGauge1TexDouble); \
      cudaUnbindTexture(muLink0TexDouble); \
      cudaUnbindTexture(muLink1TexDouble); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaUnbindTexture(siteLink0TexSingle_norecon); \
        cudaUnbindTexture(siteLink1TexSingle_norecon); \
      }else{ \
        cudaUnbindTexture(siteLink0TexSingle_recon); \
        cudaUnbindTexture(siteLink1TexSingle_recon); \
      } \
      cudaUnbindTexture(fatGauge0TexSingle); \
      cudaUnbindTexture(fatGauge1TexSingle); \
      cudaUnbindTexture(muLink0TexSingle); \
      cudaUnbindTexture(muLink1TexSingle); \
    } \
  }while(0)
#define UNBIND_SITE_AND_FAT_LINK do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaUnbindTexture(siteLink0TexDouble); \
      cudaUnbindTexture(siteLink1TexDouble); \
      cudaUnbindTexture(fatGauge0TexDouble); \
      cudaUnbindTexture(fatGauge1TexDouble); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaUnbindTexture(siteLink0TexSingle_norecon); \
        cudaUnbindTexture(siteLink1TexSingle_norecon); \
      }else{ \
        cudaUnbindTexture(siteLink0TexSingle_recon); \
        cudaUnbindTexture(siteLink1TexSingle_recon); \
      } \
      cudaUnbindTexture(fatGauge0TexSingle); \
      cudaUnbindTexture(fatGauge1TexSingle); \
    } \
  }while(0)
#define BIND_MU_LINK() do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaBindTexture(0, muLink0TexDouble, mulink_even, staple_bytes); \
      cudaBindTexture(0, muLink1TexDouble, mulink_odd, staple_bytes); \
    }else{ \
      cudaBindTexture(0, muLink0TexSingle, mulink_even, staple_bytes); \
      cudaBindTexture(0, muLink1TexSingle, mulink_odd, staple_bytes); \
    } \
  }while(0)
#define UNBIND_MU_LINK() do{ \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaUnbindTexture(muLink0TexDouble); \
      cudaUnbindTexture(muLink1TexDouble); \
    }else{ \
      cudaUnbindTexture(muLink0TexSingle); \
      cudaUnbindTexture(muLink1TexSingle); \
    } \
  }while(0)
#define BIND_SITE_AND_FAT_LINK do { \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      }else{ \
        cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      } \
      cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    } \
  }while(0)
#define BIND_SITE_AND_FAT_LINK_REVERSE do { \
    if(prec == QUDA_DOUBLE_PRECISION){ \
      cudaBindTexture(0, siteLink1TexDouble, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, siteLink0TexDouble, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      cudaBindTexture(0, fatGauge1TexDouble, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge0TexDouble, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    }else{ \
      if(cudaSiteLink.Reconstruct() == QUDA_RECONSTRUCT_NO){ \
        cudaBindTexture(0, siteLink1TexSingle_norecon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink0TexSingle_norecon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      }else{ \
        cudaBindTexture(0, siteLink1TexSingle_recon, cudaSiteLink.Even_p(), cudaSiteLink.Bytes()); \
        cudaBindTexture(0, siteLink0TexSingle_recon, cudaSiteLink.Odd_p(), cudaSiteLink.Bytes()); \
      } \
      cudaBindTexture(0, fatGauge1TexSingle, cudaFatLink.Even_p(), cudaFatLink.Bytes()); \
      cudaBindTexture(0, fatGauge0TexSingle, cudaFatLink.Odd_p(), cudaFatLink.Bytes()); \
    } \
  }while(0)
#define ENUMERATE_FUNCS(mu,nu) switch(mu) { \
  case 0: switch(nu){ \
    case 0: printf("ERROR: invalid direction combination\n"); exit(1); break; \
    case 1: CALL_FUNCTION(0,1); break; \
    case 2: CALL_FUNCTION(0,2); break; \
    case 3: CALL_FUNCTION(0,3); break; \
    } break; \
  case 1: switch(nu){ \
    case 0: CALL_FUNCTION(1,0); break; \
    case 1: printf("ERROR: invalid direction combination\n"); exit(1); break; \
    case 2: CALL_FUNCTION(1,2); break; \
    case 3: CALL_FUNCTION(1,3); break; \
    } break; \
  case 2: switch(nu){ \
    case 0: CALL_FUNCTION(2,0); break; \
    case 1: CALL_FUNCTION(2,1); break; \
    case 2: printf("ERROR: invalid direction combination\n"); exit(1); break; \
    case 3: CALL_FUNCTION(2,3); break; \
    } break; \
  case 3: switch(nu){ \
    case 0: CALL_FUNCTION(3,0); break; \
    case 1: CALL_FUNCTION(3,1); break; \
    case 2: CALL_FUNCTION(3,2); break; \
    case 3: printf("ERROR: invalid direction combination\n"); exit(1); break; \
    } break; \
  }
#define ENUMERATE_FUNCS_SAVE(mu,nu, save_staple) if(save_staple){ \
    switch(mu) { \
    case 0: switch(nu){ \
      case 0: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      case 1: CALL_FUNCTION(0,1,1); break; \
      case 2: CALL_FUNCTION(0,2,1); break; \
      case 3: CALL_FUNCTION(0,3,1); break; \
      } break; \
    case 1: switch(nu){ \
      case 0: CALL_FUNCTION(1,0,1); break; \
      case 1: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      case 2: CALL_FUNCTION(1,2,1); break; \
      case 3: CALL_FUNCTION(1,3,1); break; \
      } break; \
    case 2: switch(nu){ \
      case 0: CALL_FUNCTION(2,0,1); break; \
      case 1: CALL_FUNCTION(2,1,1); break; \
      case 2: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      case 3: CALL_FUNCTION(2,3,1); break; \
      } break; \
    case 3: switch(nu){ \
      case 0: CALL_FUNCTION(3,0,1); break; \
      case 1: CALL_FUNCTION(3,1,1); break; \
      case 2: CALL_FUNCTION(3,2,1); break; \
      case 3: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      } break; \
    } \
  }else{ \
    switch(mu) { \
    case 0: switch(nu){ \
      case 0: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      case 1: CALL_FUNCTION(0,1,0); break; \
      case 2: CALL_FUNCTION(0,2,0); break; \
      case 3: CALL_FUNCTION(0,3,0); break; \
      } break; \
    case 1: switch(nu){ \
      case 0: CALL_FUNCTION(1,0,0); break; \
      case 1: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      case 2: CALL_FUNCTION(1,2,0); break; \
      case 3: CALL_FUNCTION(1,3,0); break; \
      } break; \
    case 2: switch(nu){ \
      case 0: CALL_FUNCTION(2,0,0); break; \
      case 1: CALL_FUNCTION(2,1,0); break; \
      case 2: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      case 3: CALL_FUNCTION(2,3,0); break; \
      } break; \
    case 3: switch(nu){ \
      case 0: CALL_FUNCTION(3,0,0); break; \
      case 1: CALL_FUNCTION(3,1,0); break; \
      case 2: CALL_FUNCTION(3,2,0); break; \
      case 3: printf("ERROR: invalid direction combination\n"); exit(1); break; \
      } break; \
    } \
  }
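/* These switch ladders are a runtime-to-compile-time dispatch: mu and nu
   arrive as ordinary ints, but the kernels take them as template parameters
   so the staple geometry is resolved at compile time. CALL_FUNCTION is
   (re)defined just before each ENUMERATE_FUNCS* use to name the kernel pair
   to launch for a given (mu,nu) plane. */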
void computeLongLinkCuda(void* outEven, void* outOdd,
                         const void* const inEven, const void* const inOdd,
                         double coeff, QudaReconstructType recon, QudaPrecision prec,
                         dim3 halfGridDim, llfat_kernel_param_t kparam)
{
  dim3 blockDim = kparam.blockDim;

  if(prec == QUDA_DOUBLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      computeLongLinkParity18Kernel<0>
        <<<halfGridDim, blockDim>>>((double2*)outEven, (double2*)inEven, (double2*)inOdd, coeff, kparam);
      computeLongLinkParity18Kernel<1>
        <<<halfGridDim, blockDim>>>((double2*)outOdd, (double2*)inOdd, (double2*)inEven, coeff, kparam);
    }else if(recon == QUDA_RECONSTRUCT_12){
      computeLongLinkParity12Kernel<0>
        <<<halfGridDim, blockDim>>>((double2*)outEven, (double2*)inEven, (double2*)inOdd, coeff, kparam);
      computeLongLinkParity12Kernel<1>
        <<<halfGridDim, blockDim>>>((double2*)outOdd, (double2*)inOdd, (double2*)inEven, coeff, kparam);
    }else{
      errorQuda("Reconstruct %d is not supported\n", recon);
    }
  }else if(prec == QUDA_SINGLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      computeLongLinkParity18Kernel<0>
        <<<halfGridDim, blockDim>>>((float2*)outEven, (float2*)inEven, (float2*)inOdd, (float)coeff, kparam);
      computeLongLinkParity18Kernel<1>
        <<<halfGridDim, blockDim>>>((float2*)outOdd, (float2*)inOdd, (float2*)inEven, (float)coeff, kparam);
    }else if(recon == QUDA_RECONSTRUCT_12){
      computeLongLinkParity12Kernel<0>
        <<<halfGridDim, blockDim>>>((float4*)outEven, (float4*)inEven, (float4*)inOdd, (float)coeff, kparam);
      computeLongLinkParity12Kernel<1>
        <<<halfGridDim, blockDim>>>((float4*)outOdd, (float4*)inOdd, (float4*)inEven, (float)coeff, kparam);
    }else{
      errorQuda("Reconstruct %d is not supported\n", recon);
    }
  }else{
    errorQuda("Unsupported precision %d\n", prec);
  }
}
#define CALL_FUNCTION(mu, nu) \
  if (prec == QUDA_DOUBLE_PRECISION){ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                (double)mycoeff, kparam); \
      do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                (double)mycoeff, kparam); \
    }else{ \
      do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                (double)mycoeff, kparam); \
      do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                (double)mycoeff, kparam); \
    } \
  }else{ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_siteComputeGenStapleParity18Kernel<mu,nu, 0> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                (float2*)sitelink_even, (float2*)sitelink_odd, \
                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                (float)mycoeff, kparam); \
      do_siteComputeGenStapleParity18Kernel<mu,nu, 1> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                (float2*)sitelink_odd, (float2*)sitelink_even, \
                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                (float)mycoeff, kparam); \
    }else{ \
      do_siteComputeGenStapleParity12Kernel<mu,nu, 0> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                (float4*)sitelink_even, (float4*)sitelink_odd, \
                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                (float)mycoeff, kparam); \
      do_siteComputeGenStapleParity12Kernel<mu,nu, 1> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                (float4*)sitelink_odd, (float4*)sitelink_even, \
                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                (float)mycoeff, kparam); \
    } \
  }
void siteComputeGenStapleParityKernel(void* staple_even, void* staple_odd,
                                      const void* sitelink_even, const void* sitelink_odd,
                                      void* fatlink_even, void* fatlink_odd,
                                      int mu, int nu, double mycoeff,
                                      QudaReconstructType recon, QudaPrecision prec,
                                      dim3 halfGridDim, llfat_kernel_param_t kparam,
                                      cudaStream_t* stream)
{
  dim3 blockDim = kparam.blockDim;  // block size taken from kparam, as in the _ex variants

  ENUMERATE_FUNCS(mu,nu);
}

#undef CALL_FUNCTION
void computeGenStapleFieldParityKernel(void* staple_even, void* staple_odd,
                                       const void* sitelink_even, const void* sitelink_odd,
                                       void* fatlink_even, void* fatlink_odd,
                                       const void* mulink_even, const void* mulink_odd,
                                       int mu, int nu, int save_staple,
                                       double mycoeff,
                                       QudaReconstructType recon, QudaPrecision prec,
                                       dim3 halfGridDim, llfat_kernel_param_t kparam,
                                       cudaStream_t* stream)
{
  dim3 blockDim = kparam.blockDim;  // block size taken from kparam, as in the _ex variants
#define CALL_FUNCTION(mu, nu, save_staple) \
  if (prec == QUDA_DOUBLE_PRECISION){ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                (const double2*)mulink_even, (const double2*)mulink_odd, \
                                                (double)mycoeff, kparam); \
      do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                (const double2*)mulink_odd, (const double2*)mulink_even, \
                                                (double)mycoeff, kparam); \
    }else{ \
      do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_even, (double2*)staple_odd, \
                                                (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                                (double2*)fatlink_even, (double2*)fatlink_odd, \
                                                (const double2*)mulink_even, (const double2*)mulink_odd, \
                                                (double)mycoeff, kparam); \
      do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((double2*)staple_odd, (double2*)staple_even, \
                                                (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                                (double2*)fatlink_odd, (double2*)fatlink_even, \
                                                (const double2*)mulink_odd, (const double2*)mulink_even, \
                                                (double)mycoeff, kparam); \
    } \
  }else{ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_computeGenStapleFieldParity18Kernel<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                (float2*)sitelink_even, (float2*)sitelink_odd, \
                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                (float2*)mulink_even, (float2*)mulink_odd, \
                                                (float)mycoeff, kparam); \
      do_computeGenStapleFieldParity18Kernel<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                (float2*)sitelink_odd, (float2*)sitelink_even, \
                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                (float2*)mulink_odd, (float2*)mulink_even, \
                                                (float)mycoeff, kparam); \
    }else{ \
      do_computeGenStapleFieldParity12Kernel<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_even, (float2*)staple_odd, \
                                                (float4*)sitelink_even, (float4*)sitelink_odd, \
                                                (float2*)fatlink_even, (float2*)fatlink_odd, \
                                                (float2*)mulink_even, (float2*)mulink_odd, \
                                                (float)mycoeff, kparam); \
      do_computeGenStapleFieldParity12Kernel<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, 0, *stream>>>((float2*)staple_odd, (float2*)staple_even, \
                                                (float4*)sitelink_odd, (float4*)sitelink_even, \
                                                (float2*)fatlink_odd, (float2*)fatlink_even, \
                                                (float2*)mulink_odd, (float2*)mulink_even, \
                                                (float)mycoeff, kparam); \
    } \
  }
  ENUMERATE_FUNCS_SAVE(mu,nu,save_staple);
}

#undef CALL_FUNCTION
void siteComputeGenStapleParityKernel_ex(void* staple_even, void* staple_odd,
                                         const void* sitelink_even, const void* sitelink_odd,
                                         void* fatlink_even, void* fatlink_odd,
                                         int mu, int nu, double mycoeff,
                                         QudaReconstructType recon, QudaPrecision prec,
                                         llfat_kernel_param_t kparam)
{
  dim3 blockDim = kparam.blockDim;
  dim3 halfGridDim = kparam.halfGridDim;

  int sbytes_dp = blockDim.x*5*sizeof(double2);
  int sbytes_sp = blockDim.x*5*sizeof(float2);
#define CALL_FUNCTION(mu, nu) \
  if (prec == QUDA_DOUBLE_PRECISION){ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                               (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                               (double2*)fatlink_even, (double2*)fatlink_odd, \
                                               (double)mycoeff, kparam); \
      do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                               (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                               (double2*)fatlink_odd, (double2*)fatlink_even, \
                                               (double)mycoeff, kparam); \
    }else{ \
      do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                               (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                               (double2*)fatlink_even, (double2*)fatlink_odd, \
                                               (double)mycoeff, kparam); \
      do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                               (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                               (double2*)fatlink_odd, (double2*)fatlink_even, \
                                               (double)mycoeff, kparam); \
    } \
  }else{ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 0> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                               (float2*)sitelink_even, (float2*)sitelink_odd, \
                                               (float2*)fatlink_even, (float2*)fatlink_odd, \
                                               (float)mycoeff, kparam); \
      do_siteComputeGenStapleParity18Kernel_ex<mu,nu, 1> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                               (float2*)sitelink_odd, (float2*)sitelink_even, \
                                               (float2*)fatlink_odd, (float2*)fatlink_even, \
                                               (float)mycoeff, kparam); \
    }else{ \
      do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 0> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                               (float4*)sitelink_even, (float4*)sitelink_odd, \
                                               (float2*)fatlink_even, (float2*)fatlink_odd, \
                                               (float)mycoeff, kparam); \
      do_siteComputeGenStapleParity12Kernel_ex<mu,nu, 1> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                               (float4*)sitelink_odd, (float4*)sitelink_even, \
                                               (float2*)fatlink_odd, (float2*)fatlink_even, \
                                               (float)mycoeff, kparam); \
    } \
  }
  ENUMERATE_FUNCS(mu,nu);
}

#undef CALL_FUNCTION
void computeGenStapleFieldParityKernel_ex(void* staple_even, void* staple_odd,
                                          const void* sitelink_even, const void* sitelink_odd,
                                          void* fatlink_even, void* fatlink_odd,
                                          const void* mulink_even, const void* mulink_odd,
                                          int mu, int nu, int save_staple,
                                          double mycoeff,
                                          QudaReconstructType recon, QudaPrecision prec,
                                          llfat_kernel_param_t kparam)
{
  dim3 blockDim = kparam.blockDim;
  dim3 halfGridDim = kparam.halfGridDim;

  int sbytes_dp = blockDim.x*5*sizeof(double2);
  int sbytes_sp = blockDim.x*5*sizeof(float2);
#define CALL_FUNCTION(mu, nu, save_staple) \
  if (prec == QUDA_DOUBLE_PRECISION){ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                               (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                               (double2*)fatlink_even, (double2*)fatlink_odd, \
                                               (const double2*)mulink_even, (const double2*)mulink_odd, \
                                               (double)mycoeff, kparam); \
      do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                               (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                               (double2*)fatlink_odd, (double2*)fatlink_even, \
                                               (const double2*)mulink_odd, (const double2*)mulink_even, \
                                               (double)mycoeff, kparam); \
    }else{ \
      do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_even, (double2*)staple_odd, \
                                               (const double2*)sitelink_even, (const double2*)sitelink_odd, \
                                               (double2*)fatlink_even, (double2*)fatlink_odd, \
                                               (const double2*)mulink_even, (const double2*)mulink_odd, \
                                               (double)mycoeff, kparam); \
      do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, sbytes_dp>>>((double2*)staple_odd, (double2*)staple_even, \
                                               (const double2*)sitelink_odd, (const double2*)sitelink_even, \
                                               (double2*)fatlink_odd, (double2*)fatlink_even, \
                                               (const double2*)mulink_odd, (const double2*)mulink_even, \
                                               (double)mycoeff, kparam); \
    } \
  }else{ \
    if(recon == QUDA_RECONSTRUCT_NO){ \
      do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                               (float2*)sitelink_even, (float2*)sitelink_odd, \
                                               (float2*)fatlink_even, (float2*)fatlink_odd, \
                                               (float2*)mulink_even, (float2*)mulink_odd, \
                                               (float)mycoeff, kparam); \
      do_computeGenStapleFieldParity18Kernel_ex<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                               (float2*)sitelink_odd, (float2*)sitelink_even, \
                                               (float2*)fatlink_odd, (float2*)fatlink_even, \
                                               (float2*)mulink_odd, (float2*)mulink_even, \
                                               (float)mycoeff, kparam); \
    }else{ \
      do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 0, save_staple> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_even, (float2*)staple_odd, \
                                               (float4*)sitelink_even, (float4*)sitelink_odd, \
                                               (float2*)fatlink_even, (float2*)fatlink_odd, \
                                               (float2*)mulink_even, (float2*)mulink_odd, \
                                               (float)mycoeff, kparam); \
      do_computeGenStapleFieldParity12Kernel_ex<mu,nu, 1, save_staple> \
        <<<halfGridDim, blockDim, sbytes_sp>>>((float2*)staple_odd, (float2*)staple_even, \
                                               (float4*)sitelink_odd, (float4*)sitelink_even, \
                                               (float2*)fatlink_odd, (float2*)fatlink_even, \
                                               (float2*)mulink_odd, (float2*)mulink_even, \
                                               (float)mycoeff, kparam); \
    } \
  }
  ENUMERATE_FUNCS_SAVE(mu,nu,save_staple);
}

#undef CALL_FUNCTION
void llfatOneLinkKernel(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink,
                        cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
                        QudaGaugeParam* param, double* act_path_coeff)
{
  QudaPrecision prec = cudaSiteLink.Precision();
  QudaReconstructType recon = cudaSiteLink.Reconstruct();

  BIND_SITE_AND_FAT_LINK;
  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];

  staple_bytes = cudaStaple.Bytes();

  // Launch geometry (assumption: a 1-d grid over the lattice volume; the
  // original's exact block size was not preserved in this fragment).
  dim3 blockDim(64, 1, 1);
  dim3 gridDim(volume/blockDim.x, 1, 1);

  if(prec == QUDA_DOUBLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      llfatOneLink18Kernel<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                  (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                  (double)act_path_coeff[0], (double)act_path_coeff[5], volume);
    }else{
      llfatOneLink12Kernel<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                  (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                  (double)act_path_coeff[0], (double)act_path_coeff[5], volume);
    }
  }else if(prec == QUDA_SINGLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      llfatOneLink18Kernel<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(),
                                                  (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                  (float)act_path_coeff[0], (float)act_path_coeff[5], volume);
    }else{
      llfatOneLink12Kernel<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(),
                                                  (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                  (float)act_path_coeff[0], (float)act_path_coeff[5], volume);
    }
  }else{
    errorQuda("Fat-link construction has not been built");
  }
}
void llfatOneLinkKernel_ex(cudaGaugeField& cudaFatLink, cudaGaugeField& cudaSiteLink,
                           cudaGaugeField& cudaStaple, cudaGaugeField& cudaStaple1,
                           QudaGaugeParam* param, double* act_path_coeff,
                           llfat_kernel_param_t kparam)
{
  QudaPrecision prec = cudaSiteLink.Precision();
  QudaReconstructType recon = cudaSiteLink.Reconstruct();

  BIND_SITE_AND_FAT_LINK;

  staple_bytes = cudaStaple.Bytes();

  // Launch geometry (assumption: taken from kparam as in the _ex staple
  // functions; the original's exact setup was not preserved in this fragment).
  dim3 blockDim = kparam.blockDim;
  dim3 gridDim = kparam.halfGridDim;

  if(prec == QUDA_DOUBLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                     (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                     (double)act_path_coeff[0], (double)act_path_coeff[5], kparam);
    }else{
      llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((const double2*)cudaSiteLink.Even_p(), (const double2*)cudaSiteLink.Odd_p(),
                                                     (double2*)cudaFatLink.Even_p(), (double2*)cudaFatLink.Odd_p(),
                                                     (double)act_path_coeff[0], (double)act_path_coeff[5], kparam);
    }
  }else if(prec == QUDA_SINGLE_PRECISION){
    if(recon == QUDA_RECONSTRUCT_NO){
      llfatOneLink18Kernel_ex<<<gridDim, blockDim>>>((float2*)cudaSiteLink.Even_p(), (float2*)cudaSiteLink.Odd_p(),
                                                     (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                     (float)act_path_coeff[0], (float)act_path_coeff[5], kparam);
    }else{
      llfatOneLink12Kernel_ex<<<gridDim, blockDim>>>((float4*)cudaSiteLink.Even_p(), (float4*)cudaSiteLink.Odd_p(),
                                                     (float2*)cudaFatLink.Even_p(), (float2*)cudaFatLink.Odd_p(),
                                                     (float)act_path_coeff[0], (float)act_path_coeff[5], kparam);
    }
  }else{
    errorQuda("Fat-link construction has not been built");
  }
}