11 #if defined(GPU_FERMION_FORCE)
15 namespace fermionforce {
20 using namespace fermionforce;
24 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, Vh)
26 #define LOAD_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \
27 Float2* hw = (oddness)?hw_odd:hw_even; \
28 var##0 = hw[idx + 0*Vh]; \
29 var##1 = hw[idx + 1*Vh]; \
30 var##2 = hw[idx + 2*Vh]; \
31 var##3 = hw[idx + 3*Vh]; \
32 var##4 = hw[idx + 4*Vh]; \
33 var##5 = hw[idx + 5*Vh]; \
36 #define WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \
37 Float2* hw = (oddness)?hw_odd:hw_even; \
38 hw[idx + 0*Vh] = var##0; \
39 hw[idx + 1*Vh] = var##1; \
40 hw[idx + 2*Vh] = var##2; \
41 hw[idx + 3*Vh] = var##3; \
42 hw[idx + 4*Vh] = var##4; \
43 hw[idx + 5*Vh] = var##5; \
46 #define LOAD_HW(hw_eve, hw_odd, idx, var, oddness) LOAD_HW_SINGLE(hw_eve, hw_odd, idx, var, oddness)
47 #define WRITE_HW(hw_even, hw_odd, idx, var, oddness) WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness)
48 #define LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE(src, dir, idx, var, Vh)
50 #define FF_SITE_MATRIX_LOAD_TEX 1
52 #define linkEvenTex siteLink0TexSingle_recon
53 #define linkOddTex siteLink1TexSingle_recon
55 #if (FF_SITE_MATRIX_LOAD_TEX == 1)
56 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE_TEX(((oddness)?linkOddTex:linkEvenTex), dir, idx, var, Vh)
58 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE(((oddness)?linkOdd:linkEven), dir, idx, var, Vh)
62 #define linka00_re LINKA0.x
63 #define linka00_im LINKA0.y
64 #define linka01_re LINKA0.z
65 #define linka01_im LINKA0.w
66 #define linka02_re LINKA1.x
67 #define linka02_im LINKA1.y
68 #define linka10_re LINKA1.z
69 #define linka10_im LINKA1.w
70 #define linka11_re LINKA2.x
71 #define linka11_im LINKA2.y
72 #define linka12_re LINKA2.z
73 #define linka12_im LINKA2.w
74 #define linka20_re LINKA3.x
75 #define linka20_im LINKA3.y
76 #define linka21_re LINKA3.z
77 #define linka21_im LINKA3.w
78 #define linka22_re LINKA4.x
79 #define linka22_im LINKA4.y
81 #define linkb00_re LINKB0.x
82 #define linkb00_im LINKB0.y
83 #define linkb01_re LINKB0.z
84 #define linkb01_im LINKB0.w
85 #define linkb02_re LINKB1.x
86 #define linkb02_im LINKB1.y
87 #define linkb10_re LINKB1.z
88 #define linkb10_im LINKB1.w
89 #define linkb11_re LINKB2.x
90 #define linkb11_im LINKB2.y
91 #define linkb12_re LINKB2.z
92 #define linkb12_im LINKB2.w
93 #define linkb20_re LINKB3.x
94 #define linkb20_im LINKB3.y
95 #define linkb21_re LINKB3.z
96 #define linkb21_im LINKB3.w
97 #define linkb22_re LINKB4.x
98 #define linkb22_im LINKB4.y
101 #define MAT_MUL_HW(M, HW, HWOUT) \
102 HWOUT##00_re = (M##00_re * HW##00_re - M##00_im * HW##00_im) \
103 + (M##01_re * HW##01_re - M##01_im * HW##01_im) \
104 + (M##02_re * HW##02_re - M##02_im * HW##02_im); \
105 HWOUT##00_im = (M##00_re * HW##00_im + M##00_im * HW##00_re) \
106 + (M##01_re * HW##01_im + M##01_im * HW##01_re) \
107 + (M##02_re * HW##02_im + M##02_im * HW##02_re); \
108 HWOUT##01_re = (M##10_re * HW##00_re - M##10_im * HW##00_im) \
109 + (M##11_re * HW##01_re - M##11_im * HW##01_im) \
110 + (M##12_re * HW##02_re - M##12_im * HW##02_im); \
111 HWOUT##01_im = (M##10_re * HW##00_im + M##10_im * HW##00_re) \
112 + (M##11_re * HW##01_im + M##11_im * HW##01_re) \
113 + (M##12_re * HW##02_im + M##12_im * HW##02_re); \
114 HWOUT##02_re = (M##20_re * HW##00_re - M##20_im * HW##00_im) \
115 + (M##21_re * HW##01_re - M##21_im * HW##01_im) \
116 + (M##22_re * HW##02_re - M##22_im * HW##02_im); \
117 HWOUT##02_im = (M##20_re * HW##00_im + M##20_im * HW##00_re) \
118 + (M##21_re * HW##01_im + M##21_im * HW##01_re) \
119 + (M##22_re * HW##02_im + M##22_im * HW##02_re); \
120 HWOUT##10_re = (M##00_re * HW##10_re - M##00_im * HW##10_im) \
121 + (M##01_re * HW##11_re - M##01_im * HW##11_im) \
122 + (M##02_re * HW##12_re - M##02_im * HW##12_im); \
123 HWOUT##10_im = (M##00_re * HW##10_im + M##00_im * HW##10_re) \
124 + (M##01_re * HW##11_im + M##01_im * HW##11_re) \
125 + (M##02_re * HW##12_im + M##02_im * HW##12_re); \
126 HWOUT##11_re = (M##10_re * HW##10_re - M##10_im * HW##10_im) \
127 + (M##11_re * HW##11_re - M##11_im * HW##11_im) \
128 + (M##12_re * HW##12_re - M##12_im * HW##12_im); \
129 HWOUT##11_im = (M##10_re * HW##10_im + M##10_im * HW##10_re) \
130 + (M##11_re * HW##11_im + M##11_im * HW##11_re) \
131 + (M##12_re * HW##12_im + M##12_im * HW##12_re); \
132 HWOUT##12_re = (M##20_re * HW##10_re - M##20_im * HW##10_im) \
133 + (M##21_re * HW##11_re - M##21_im * HW##11_im) \
134 + (M##22_re * HW##12_re - M##22_im * HW##12_im); \
135 HWOUT##12_im = (M##20_re * HW##10_im + M##20_im * HW##10_re) \
136 + (M##21_re * HW##11_im + M##21_im * HW##11_re) \
137 + (M##22_re * HW##12_im + M##22_im * HW##12_re);
140 #define ADJ_MAT_MUL_HW(M, HW, HWOUT) \
141 HWOUT##00_re = (M##00_re * HW##00_re + M##00_im * HW##00_im) \
142 + (M##10_re * HW##01_re + M##10_im * HW##01_im) \
143 + (M##20_re * HW##02_re + M##20_im * HW##02_im); \
144 HWOUT##00_im = (M##00_re * HW##00_im - M##00_im * HW##00_re) \
145 + (M##10_re * HW##01_im - M##10_im * HW##01_re) \
146 + (M##20_re * HW##02_im - M##20_im * HW##02_re); \
147 HWOUT##01_re = (M##01_re * HW##00_re + M##01_im * HW##00_im) \
148 + (M##11_re * HW##01_re + M##11_im * HW##01_im) \
149 + (M##21_re * HW##02_re + M##21_im * HW##02_im); \
150 HWOUT##01_im = (M##01_re * HW##00_im - M##01_im * HW##00_re) \
151 + (M##11_re * HW##01_im - M##11_im * HW##01_re) \
152 + (M##21_re * HW##02_im - M##21_im * HW##02_re); \
153 HWOUT##02_re = (M##02_re * HW##00_re + M##02_im * HW##00_im) \
154 + (M##12_re * HW##01_re + M##12_im * HW##01_im) \
155 + (M##22_re * HW##02_re + M##22_im * HW##02_im); \
156 HWOUT##02_im = (M##02_re * HW##00_im - M##02_im * HW##00_re) \
157 + (M##12_re * HW##01_im - M##12_im * HW##01_re) \
158 + (M##22_re * HW##02_im - M##22_im * HW##02_re); \
159 HWOUT##10_re = (M##00_re * HW##10_re + M##00_im * HW##10_im) \
160 + (M##10_re * HW##11_re + M##10_im * HW##11_im) \
161 + (M##20_re * HW##12_re + M##20_im * HW##12_im); \
162 HWOUT##10_im = (M##00_re * HW##10_im - M##00_im * HW##10_re) \
163 + (M##10_re * HW##11_im - M##10_im * HW##11_re) \
164 + (M##20_re * HW##12_im - M##20_im * HW##12_re); \
165 HWOUT##11_re = (M##01_re * HW##10_re + M##01_im * HW##10_im) \
166 + (M##11_re * HW##11_re + M##11_im * HW##11_im) \
167 + (M##21_re * HW##12_re + M##21_im * HW##12_im); \
168 HWOUT##11_im = (M##01_re * HW##10_im - M##01_im * HW##10_re) \
169 + (M##11_re * HW##11_im - M##11_im * HW##11_re) \
170 + (M##21_re * HW##12_im - M##21_im * HW##12_re); \
171 HWOUT##12_re = (M##02_re * HW##10_re + M##02_im * HW##10_im) \
172 + (M##12_re * HW##11_re + M##12_im * HW##11_im) \
173 + (M##22_re * HW##12_re + M##22_im * HW##12_im); \
174 HWOUT##12_im = (M##02_re * HW##10_im - M##02_im * HW##10_re) \
175 + (M##12_re * HW##11_im - M##12_im * HW##11_re) \
176 + (M##22_re * HW##12_im - M##22_im * HW##12_re);
179 #define SU3_PROJECTOR(va, vb, m) \
180 m##00_re = va##0_re * vb##0_re + va##0_im * vb##0_im; \
181 m##00_im = va##0_im * vb##0_re - va##0_re * vb##0_im; \
182 m##01_re = va##0_re * vb##1_re + va##0_im * vb##1_im; \
183 m##01_im = va##0_im * vb##1_re - va##0_re * vb##1_im; \
184 m##02_re = va##0_re * vb##2_re + va##0_im * vb##2_im; \
185 m##02_im = va##0_im * vb##2_re - va##0_re * vb##2_im; \
186 m##10_re = va##1_re * vb##0_re + va##1_im * vb##0_im; \
187 m##10_im = va##1_im * vb##0_re - va##1_re * vb##0_im; \
188 m##11_re = va##1_re * vb##1_re + va##1_im * vb##1_im; \
189 m##11_im = va##1_im * vb##1_re - va##1_re * vb##1_im; \
190 m##12_re = va##1_re * vb##2_re + va##1_im * vb##2_im; \
191 m##12_im = va##1_im * vb##2_re - va##1_re * vb##2_im; \
192 m##20_re = va##2_re * vb##0_re + va##2_im * vb##0_im; \
193 m##20_im = va##2_im * vb##0_re - va##2_re * vb##0_im; \
194 m##21_re = va##2_re * vb##1_re + va##2_im * vb##1_im; \
195 m##21_im = va##2_im * vb##1_re - va##2_re * vb##1_im; \
196 m##22_re = va##2_re * vb##2_re + va##2_im * vb##2_im; \
197 m##22_im = va##2_im * vb##2_re - va##2_re * vb##2_im;
200 #define SCALAR_MULT_ADD_SU3_VECTOR(va, vb, s, vc) do { \
201 vc##0_re = va##0_re + vb##0_re * s; \
202 vc##0_im = va##0_im + vb##0_im * s; \
203 vc##1_re = va##1_re + vb##1_re * s; \
204 vc##1_im = va##1_im + vb##1_im * s; \
205 vc##2_re = va##2_re + vb##2_re * s; \
206 vc##2_im = va##2_im + vb##2_im * s; \
210 #define FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx, new_idx) do { \
213 new_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \
214 new_x1 = (new_x1==X1m1)?0:new_x1+1; \
217 new_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \
218 new_x2 = (new_x2==X2m1)?0:new_x2+1; \
221 new_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
222 new_x3 = (new_x3==X3m1)?0:new_x3+1; \
225 new_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
226 new_x4 = (new_x4==X4m1)?0:new_x4+1; \
231 #define FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx, new_idx) do { \
234 new_idx = ( (new_x1==0)?idx+X1m1:idx-1); \
235 new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \
238 new_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \
239 new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \
242 new_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
243 new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \
246 new_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
247 new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \
253 #define FF_COMPUTE_NEW_FULL_IDX_PLUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \
256 new_idx = ( (old_x1==X1m1)?idx-X1m1:idx+1); \
259 new_idx = ( (old_x2==X2m1)?idx-X2X1mX1:idx+X1); \
262 new_idx = ( (old_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
265 new_idx = ( (old_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
270 #define FF_COMPUTE_NEW_FULL_IDX_MINUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \
273 new_idx = ( (old_x1==0)?idx+X1m1:idx-1); \
276 new_idx = ( (old_x2==0)?idx+X2X1mX1:idx-X1); \
279 new_idx = ( (old_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
282 new_idx = ( (old_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
288 #define ADD_FORCE_TO_MOM(hw1, hw2, idx, dir, cf,oddness) do{ \
291 if (GOES_BACKWARDS(dir)){ \
292 mydir=OPP_DIR(dir); \
293 my_coeff.x = -cf.x; \
294 my_coeff.y = -cf.y; \
301 tmp_coeff.x = my_coeff.x; \
302 tmp_coeff.y = my_coeff.y; \
304 tmp_coeff.x = - my_coeff.x; \
305 tmp_coeff.y = - my_coeff.y; \
307 Float2* mom = oddness?momOdd:momEven; \
308 LOAD_ANTI_HERMITIAN(mom, mydir, idx, AH); \
309 UNCOMPRESS_ANTI_HERMITIAN(ah, linka); \
310 SU3_PROJECTOR(hw1##0, hw2##0, linkb); \
311 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.x, linka); \
312 SU3_PROJECTOR(hw1##1, hw2##1, linkb); \
313 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.y, linka); \
314 MAKE_ANTI_HERMITIAN(linka, ah); \
315 WRITE_ANTI_HERMITIAN(mom, mydir, idx, AH, Vh); \
319 #define FF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
323 if ( (i4 & 1) == 1){ \
328 if ( ((i4+i1) & 1) == 1){ \
333 if ( ((i4+i1+i2) & 1) == 1){ \
346 #define hwa00_re HWA0.x
347 #define hwa00_im HWA0.y
348 #define hwa01_re HWA1.x
349 #define hwa01_im HWA1.y
350 #define hwa02_re HWA2.x
351 #define hwa02_im HWA2.y
352 #define hwa10_re HWA3.x
353 #define hwa10_im HWA3.y
354 #define hwa11_re HWA4.x
355 #define hwa11_im HWA4.y
356 #define hwa12_re HWA5.x
357 #define hwa12_im HWA5.y
359 #define hwb00_re HWB0.x
360 #define hwb00_im HWB0.y
361 #define hwb01_re HWB1.x
362 #define hwb01_im HWB1.y
363 #define hwb02_re HWB2.x
364 #define hwb02_im HWB2.y
365 #define hwb10_re HWB3.x
366 #define hwb10_im HWB3.y
367 #define hwb11_re HWB4.x
368 #define hwb11_im HWB4.y
369 #define hwb12_re HWB5.x
370 #define hwb12_im HWB5.y
372 #define hwc00_re HWC0.x
373 #define hwc00_im HWC0.y
374 #define hwc01_re HWC1.x
375 #define hwc01_im HWC1.y
376 #define hwc02_re HWC2.x
377 #define hwc02_im HWC2.y
378 #define hwc10_re HWC3.x
379 #define hwc10_im HWC3.y
380 #define hwc11_re HWC4.x
381 #define hwc11_im HWC4.y
382 #define hwc12_re HWC5.x
383 #define hwc12_im HWC5.y
385 #define hwd00_re HWD0.x
386 #define hwd00_im HWD0.y
387 #define hwd01_re HWD1.x
388 #define hwd01_im HWD1.y
389 #define hwd02_re HWD2.x
390 #define hwd02_im HWD2.y
391 #define hwd10_re HWD3.x
392 #define hwd10_im HWD3.y
393 #define hwd11_re HWD4.x
394 #define hwd11_im HWD4.y
395 #define hwd12_re HWD5.x
396 #define hwd12_im HWD5.y
398 #define hwe00_re HWE0.x
399 #define hwe00_im HWE0.y
400 #define hwe01_re HWE1.x
401 #define hwe01_im HWE1.y
402 #define hwe02_re HWE2.x
403 #define hwe02_im HWE2.y
404 #define hwe10_re HWE3.x
405 #define hwe10_im HWE3.y
406 #define hwe11_re HWE4.x
407 #define hwe11_im HWE4.y
408 #define hwe12_re HWE5.x
409 #define hwe12_im HWE5.y
416 #error "multi gpu is not supported for fermion force computation"
419 static int fermion_force_init_cuda_flag = 0;
421 if (fermion_force_init_cuda_flag)
return;
423 fermion_force_init_cuda_flag=1;
436 template<
int sig_positive,
int mu_positive,
int oddBit,
typename Float2>
438 do_middle_link_kernel(Float2* tempxEven, Float2* tempxOdd,
443 Float2* momEven, Float2* momOdd)
445 int sid = blockIdx.x * blockDim.x + threadIdx.x;
457 int new_x1, new_x2, new_x3, new_x4;
463 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
464 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
465 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
466 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
467 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
468 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
469 Float2 AH0, AH1, AH2, AH3, AH4;
495 point_d = (new_mem_idx >> 1);
498 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
500 ad_link_nbr_idx =
sid;
501 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4);
512 point_c = (new_mem_idx >> 1);
515 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
526 point_b = (new_mem_idx >> 1);
530 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
534 ab_link_nbr_idx =
sid;
535 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4);
538 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4);
541 LOAD_HW(tempxEven, tempxOdd, point_d, HWA, 1-
oddBit );
543 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, 1-
oddBit);
545 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA,
oddBit);
550 ADJ_MAT_MUL_HW(linka, hwa, hwd);
552 MAT_MUL_HW(linka, hwa, hwd);
554 WRITE_HW(PmuEven,PmuOdd, sid, HWD,
oddBit);
556 LOAD_HW(tempxEven,tempxOdd, point_c, HWA,
oddBit);
558 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA,
oddBit);
560 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, 1-
oddBit);
565 ADJ_MAT_MUL_HW(linka, hwa, hwb);
567 MAT_MUL_HW(linka, hwa, hwb);
570 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKB,
oddBit);
572 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKB, 1-
oddBit);
577 MAT_MUL_HW(linkb, hwb, hwc);
579 ADJ_MAT_MUL_HW(linkb, hwb, hwc);
581 WRITE_HW(P3Even, P3Odd, sid, HWC,
oddBit);
585 ADD_FORCE_TO_MOM(hwc, hwd, sid, sig, coeff,
oddBit);
590 template<
typename Float2>
592 middle_link_kernel(Float2* tempxEven, Float2* tempxOdd,
593 Float2* PmuEven, Float2* PmuOdd,
594 Float2* P3Even, Float2* P3Odd,
595 int sig,
int mu, Float2 coeff,
596 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
597 Float2* momEven, Float2* momOdd,
598 dim3 gridDim, dim3 BlockDim)
600 dim3 halfGridDim(gridDim.x/2, 1,1);
603 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
604 do_middle_link_kernel<sig_sign, mu_sign,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, \
610 do_middle_link_kernel<sig_sign, mu_sign, 1><<<halfGridDim, BlockDim>>>(tempxEven, tempxOdd, \
619 CALL_MIDDLE_LINK_KERNEL(1, 1);
621 CALL_MIDDLE_LINK_KERNEL(1, 0);
623 CALL_MIDDLE_LINK_KERNEL(0, 1);
625 CALL_MIDDLE_LINK_KERNEL(0, 0);
627 #undef CALL_MIDDLE_LINK_KERNEL
642 template<
int sig_positive,
int mu_positive,
int oddBit,
typename Float2>
644 do_side_link_kernel(Float2* P3Even, Float2* P3Odd,
645 Float2* P3muEven, Float2* P3muOdd,
646 Float2* TempxEven, Float2* TempxOdd,
647 Float2* PmuEven, Float2* PmuOdd,
650 float4* linkEven, float4* linkOdd,
651 Float2* momEven, Float2* momOdd)
657 int sid = blockIdx.x * blockDim.x + threadIdx.x;
660 int x1h = sid - z1*
X1h;
665 int x1odd = (x2 + x3 + x4 +
oddBit) & 1;
666 int x1 = 2*x1h +
x1odd;
667 int X = 2*sid +
x1odd;
669 int ad_link_sign = 1;
670 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
671 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
672 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
673 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
674 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
675 Float2 AH0, AH1, AH2, AH3, AH4;
708 point_d = (new_mem_idx >> 1);
712 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
714 ad_link_nbr_idx =
sid;
715 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4);
719 LOAD_HW(P3Even, P3Odd, sid, HWA,
oddBit);
721 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, 1 -
oddBit);
723 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA,
oddBit);
728 MAT_MUL_HW(linka, hwa, hwb);
730 ADJ_MAT_MUL_HW(linka, hwa, hwb);
736 LOAD_HW(TempxEven, TempxOdd, point_d, HWC, 1-
oddBit);
739 ADD_FORCE_TO_MOM(hwb, hwc, point_d, mu, coeff, 1-
oddBit);
741 ADD_FORCE_TO_MOM(hwc, hwb, point_d,
OPP_DIR(mu), mcoeff, 1-
oddBit);
744 LOAD_HW(PmuEven, PmuOdd, sid, HWC,
oddBit);
746 ADD_FORCE_TO_MOM(hwa, hwc, sid, mu, mcoeff,
oddBit);
754 LOAD_HW(shortPEven, shortPOdd, point_d, HWA, 1-
oddBit);
755 SCALAR_MULT_ADD_SU3_VECTOR(hwa0, hwb0, accumu_coeff.x, hwa0);
756 SCALAR_MULT_ADD_SU3_VECTOR(hwa1, hwb1, accumu_coeff.y, hwa1);
757 WRITE_HW(shortPEven, shortPOdd, point_d, HWA, 1-
oddBit);
763 template<
typename Float2>
765 side_link_kernel(Float2* P3Even, Float2* P3Odd,
766 Float2* P3muEven, Float2* P3muOdd,
767 Float2* TempxEven, Float2* TempxOdd,
768 Float2* PmuEven, Float2* PmuOdd,
769 Float2* shortPEven, Float2* shortPOdd,
770 int sig,
int mu, Float2 coeff, Float2 accumu_coeff,
771 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
772 Float2* momEven, Float2* momOdd,
773 dim3 gridDim, dim3 blockDim)
775 dim3 halfGridDim(gridDim.x/2,1,1);
777 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
778 do_side_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \
780 TempxEven, TempxOdd, \
782 shortPEven, shortPOdd, \
783 sig, mu, coeff, accumu_coeff, \
786 do_side_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \
788 TempxEven, TempxOdd, \
790 shortPEven, shortPOdd, \
791 sig, mu, coeff, accumu_coeff, \
796 CALL_SIDE_LINK_KERNEL(1,1);
798 CALL_SIDE_LINK_KERNEL(1,0);
800 CALL_SIDE_LINK_KERNEL(0,1);
802 CALL_SIDE_LINK_KERNEL(0,0);
805 #undef CALL_SIDE_LINK_KERNEL
820 template<
int sig_positive,
int mu_positive,
int oddBit,
typename Float2>
822 do_all_link_kernel(Float2* tempxEven, Float2* tempxOdd,
823 Float2* PmuEven, Float2* PmuOdd,
824 Float2* P3Even, Float2* P3Odd,
825 Float2* P3muEven, Float2* P3muOdd,
826 Float2* shortPEven, Float2* shortPOdd,
827 int sig,
int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff,
828 float4* linkEven, float4* linkOdd,
829 Float2* momEven, Float2* momOdd)
831 int sid = blockIdx.x * blockDim.x + threadIdx.x;
834 int x1h = sid - z1*
X1h;
839 int x1odd = (x2 + x3 + x4 +
oddBit) & 1;
840 int x1 = 2*x1h +
x1odd;
841 int X = 2*sid +
x1odd;
843 int new_x1, new_x2, new_x3, new_x4;
848 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
849 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
850 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
851 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
852 Float2 HWE0, HWE1, HWE2, HWE3, HWE4, HWE5;
853 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
854 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
855 float4 LINKC0, LINKC1, LINKC2, LINKC3, LINKC4;
856 Float2 AH0, AH1, AH2, AH3, AH4;
884 point_d = (new_mem_idx >> 1);
888 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
890 ad_link_nbr_idx =
sid;
891 FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4);
902 point_c = (new_mem_idx >> 1);
905 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
917 point_b = (new_mem_idx >> 1);
920 FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
924 ab_link_nbr_idx =
sid;
925 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4);
928 FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4);
931 LOAD_HW(tempxEven, tempxOdd, point_d, HWE, 1-
oddBit);
933 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKC, 1-
oddBit);
935 FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKC,
oddBit);
940 ADJ_MAT_MUL_HW(linkc, hwe, hwd);
942 MAT_MUL_HW(linkc, hwe, hwd);
947 LOAD_HW(tempxEven, tempxOdd, point_c, HWA,
oddBit);
949 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA,
oddBit);
951 FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, 1-
oddBit);
956 ADJ_MAT_MUL_HW(linka, hwa, hwb);
958 MAT_MUL_HW(linka, hwa, hwb);
961 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKA,
oddBit);
963 FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKA, 1-
oddBit);
968 MAT_MUL_HW(linka, hwb, hwc);
970 ADJ_MAT_MUL_HW(linka, hwb, hwc);
979 ADD_FORCE_TO_MOM(hwc, hwd, sid, sig, mcoeff,
oddBit);
985 MAT_MUL_HW(linkc, hwc, hwa);
987 ADJ_MAT_MUL_HW(linkc, hwc, hwa);
992 LOAD_HW(shortPEven, shortPOdd, point_d, HWB, 1-
oddBit);
993 SCALAR_MULT_ADD_SU3_VECTOR(hwb0, hwa0, accumu_coeff.x, hwb0);
994 SCALAR_MULT_ADD_SU3_VECTOR(hwb1, hwa1, accumu_coeff.y, hwb1);
995 WRITE_HW(shortPEven, shortPOdd, point_d, HWB, 1-
oddBit);
1001 ADD_FORCE_TO_MOM(hwa, hwe, point_d, mu, coeff, 1-
oddBit);
1003 ADD_FORCE_TO_MOM(hwe, hwa, point_d,
OPP_DIR(mu), mcoeff, 1-
oddBit);
1007 ADD_FORCE_TO_MOM(hwc, hwd, sid, mu, mcoeff,
oddBit);
1009 ADD_FORCE_TO_MOM(hwd, hwc, sid,
OPP_DIR(mu), coeff,
oddBit);
1019 template<
typename Float2>
1021 all_link_kernel(Float2* tempxEven, Float2* tempxOdd,
1022 Float2* PmuEven, Float2* PmuOdd,
1023 Float2* P3Even, Float2* P3Odd,
1024 Float2* P3muEven, Float2* P3muOdd,
1025 Float2* shortPEven, Float2* shortPOdd,
1026 int sig,
int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff,
1027 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
1028 Float2* momEven, Float2* momOdd,
1029 dim3 gridDim, dim3 blockDim)
1032 dim3 halfGridDim(gridDim.x/2, 1,1);
1034 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1035 do_all_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \
1038 P3muEven, P3muOdd, \
1039 shortPEven, shortPOdd, \
1040 sig, mu, coeff, mcoeff, accumu_coeff, \
1041 linkEven, linkOdd, \
1043 do_all_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \
1046 P3muEven, P3muOdd, \
1047 shortPEven, shortPOdd, \
1048 sig, mu, coeff, mcoeff, accumu_coeff, \
1049 linkEven, linkOdd, \
1054 CALL_ALL_LINK_KERNEL(1,1);
1056 CALL_ALL_LINK_KERNEL(1,0);
1058 CALL_ALL_LINK_KERNEL(0,1);
1060 CALL_ALL_LINK_KERNEL(0,0);
1063 #undef CALL_ALL_LINK_KERNEL
1074 template <
int oddBit,
typename Float2>
1076 do_one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd,
1077 Float2* PmuEven, Float2* PmuOdd,
1078 Float2* PnumuEven, Float2* PnumuOdd,
1079 int mu, Float2 OneLink, Float2 Naik, Float2 mNaik,
1080 float4* linkEven, float4* linkOdd,
1081 Float2* momEven, Float2* momOdd)
1083 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
1084 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
1085 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
1086 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
1087 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
1088 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
1089 Float2 AH0, AH1, AH2, AH3, AH4;
1091 int sid = blockIdx.x * blockDim.x + threadIdx.x;
1093 int x1h = sid - z1*
X1h;
1095 int x2 = z1 - z2*
X2;
1097 int x3 = z2 - x4*
X3;
1098 int x1odd = (x2 + x3 + x4 +
oddBit) & 1;
1099 int x1 = 2*x1h +
x1odd;
1103 int new_x1, new_x2, new_x3, new_x4, new_idx;
1108 LOAD_HW(PmuEven, PmuOdd, sid, HWA,
oddBit);
1109 LOAD_HW(TempxEven, TempxOdd, sid, HWB,
oddBit);
1110 ADD_FORCE_TO_MOM(hwa, hwb, sid,
OPP_DIR(mu), OneLink,
oddBit);
1113 dx[3]=dx[2]=dx[1]=dx[0]=0;
1115 new_x1 = (x1 + dx[0] +
X1)%
X1;
1116 new_x2 = (x2 + dx[1] +
X2)%X2;
1117 new_x3 = (x3 + dx[2] +
X3)%X3;
1118 new_x4 = (x4 + dx[3] +
X4)%
X4;
1119 new_idx = (new_x4*
X3X2X1+new_x3*
X2X1+new_x2*
X1+new_x1) >> 1;
1120 LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-
oddBit);
1122 FF_COMPUTE_RECONSTRUCT_SIGN(sign,
OPP_DIR(mu), new_x1,new_x2,new_x3,new_x4);
1124 ADJ_MAT_MUL_HW(linka, hwa, hwc);
1126 LOAD_HW(PnumuEven, PnumuOdd, sid, HWD,
oddBit);
1127 ADD_FORCE_TO_MOM(hwd, hwc, sid,
OPP_DIR(mu), mNaik,
oddBit);
1129 dx[3]=dx[2]=dx[1]=dx[0]=0;
1131 new_x1 = (x1 + dx[0] +
X1)%
X1;
1132 new_x2 = (x2 + dx[1] +
X2)%X2;
1133 new_x3 = (x3 + dx[2] +
X3)%X3;
1134 new_x4 = (x4 + dx[3] +
X4)%
X4;
1135 new_idx = (new_x4*
X3X2X1+new_x3*
X2X1+new_x2*
X1+new_x1) >> 1;
1136 LOAD_HW(PnumuEven, PnumuOdd, new_idx, HWA, 1-
oddBit);
1138 FF_COMPUTE_RECONSTRUCT_SIGN(sign,
OPP_DIR(mu), x1, x2, x3, x4);
1140 MAT_MUL_HW(linka, hwa, hwc);
1143 dx[3]=dx[2]=dx[1]=dx[0]=0;
1145 new_x1 = (x1 + dx[0] +
X1)%
X1;
1146 new_x2 = (x2 + dx[1] +
X2)%X2;
1147 new_x3 = (x3 + dx[2] +
X3)%X3;
1148 new_x4 = (x4 + dx[3] +
X4)%
X4;
1149 new_idx = (new_x4*
X3X2X1+new_x3*
X2X1+new_x2*
X1+new_x1) >> 1;
1150 LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-
oddBit);
1151 FF_LOAD_MATRIX(mu, sid, LINKA,
oddBit);
1152 FF_COMPUTE_RECONSTRUCT_SIGN(sign, mu, x1, x2, x3, x4);
1154 MAT_MUL_HW(linka, hwa, hwb);
1156 LOAD_HW(PnumuEven, PnumuOdd, sid, HWC,
oddBit);
1157 ADD_FORCE_TO_MOM(hwb, hwc, sid, mu, Naik,
oddBit);
1163 template<
typename Float2>
1165 one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd,
1166 Float2* PmuEven, Float2* PmuOdd,
1167 Float2* PnumuEven, Float2* PnumuOdd,
1168 int mu, Float2 OneLink, Float2 Naik, Float2 mNaik,
1169 float4* linkEven, float4* linkOdd,
1170 Float2* momEven, Float2* momOdd,
1171 dim3 gridDim, dim3 blockDim)
1173 dim3 halfGridDim(gridDim.x/2, 1,1);
1175 do_one_and_naik_terms_kernel<0><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd,
1177 PnumuEven, PnumuOdd,
1178 mu, OneLink, Naik, mNaik,
1181 do_one_and_naik_terms_kernel<1><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd,
1183 PnumuEven, PnumuOdd,
1184 mu, OneLink, Naik, mNaik,
1192 #define Pmu tempvec[0]
1193 #define Pnumu tempvec[1]
1194 #define Prhonumu tempvec[2]
1195 #define P7 tempvec[3]
1196 #define P7rho tempvec[4]
1197 #define P7rhonu tempvec[5]
1198 #define P5 tempvec[6]
1199 #define P3 tempvec[7]
1200 #define P5nu tempvec[3]
1201 #define P3mu tempvec[3]
1202 #define Popmu tempvec[4]
1203 #define Pmumumu tempvec[4]
1205 template<
typename Real>
1207 do_fermion_force_cuda(Real eps, Real weight1, Real weight2, Real* act_path_coeff,
FullHw cudaHw,
1211 int mu, nu, rho,
sig;
1214 float2 OneLink, Lepage, Naik, FiveSt, ThreeSt, SevenSt;
1215 float2 mNaik, mLepage, mFiveSt, mThreeSt, mSevenSt;
1218 ferm_epsilon = 2.0*weight1*eps;
1219 OneLink.x = act_path_coeff[0]*ferm_epsilon ;
1220 Naik.x = act_path_coeff[1]*ferm_epsilon ; mNaik.x = -Naik.x;
1221 ThreeSt.x = act_path_coeff[2]*ferm_epsilon ; mThreeSt.x = -ThreeSt.x;
1222 FiveSt.x = act_path_coeff[3]*ferm_epsilon ; mFiveSt.x = -FiveSt.x;
1223 SevenSt.x = act_path_coeff[4]*ferm_epsilon ; mSevenSt.x = -SevenSt.x;
1224 Lepage.x = act_path_coeff[5]*ferm_epsilon ; mLepage.x = -Lepage.x;
1226 ferm_epsilon = 2.0*weight2*eps;
1227 OneLink.y = act_path_coeff[0]*ferm_epsilon ;
1228 Naik.y = act_path_coeff[1]*ferm_epsilon ; mNaik.y = -Naik.y;
1229 ThreeSt.y = act_path_coeff[2]*ferm_epsilon ; mThreeSt.y = -ThreeSt.y;
1230 FiveSt.y = act_path_coeff[3]*ferm_epsilon ; mFiveSt.y = -FiveSt.y;
1231 SevenSt.y = act_path_coeff[4]*ferm_epsilon ; mSevenSt.y = -SevenSt.y;
1232 Lepage.y = act_path_coeff[5]*ferm_epsilon ; mLepage.y = -Lepage.y;
1234 int DirectLinks[8] ;
1236 for(mu=0;mu<8;mu++){
1237 DirectLinks[
mu] = 0 ;
1240 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
1242 dim3 gridDim(volume/blockDim.x, 1, 1);
1249 for(sig=0; sig < 8; sig++){
1250 for(mu = 0; mu < 8; mu++){
1251 if ( (mu == sig) || (mu ==
OPP_DIR(sig))){
1257 middle_link_kernel( (float2*)cudaHw.
even.
data, (float2*)cudaHw.
odd.
data,
1258 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1259 (float2*)
P3.even.data, (float2*)
P3.odd.data,
1261 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1262 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1265 for(nu=0; nu < 8; nu++){
1266 if (nu == sig || nu ==
OPP_DIR(sig)
1267 || nu == mu || nu ==
OPP_DIR(mu)){
1272 middle_link_kernel( (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1273 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1274 (float2*)
P5.even.data, (float2*)
P5.odd.data,
1276 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1277 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1281 for(rho =0; rho < 8; rho++){
1282 if (rho == sig || rho ==
OPP_DIR(sig)
1283 || rho == mu || rho ==
OPP_DIR(mu)
1284 || rho == nu || rho ==
OPP_DIR(nu)){
1290 if(FiveSt.x != 0)coeff.x = SevenSt.x/FiveSt.x ;
else coeff.x = 0;
1291 if(FiveSt.y != 0)coeff.y = SevenSt.y/FiveSt.y ;
else coeff.y = 0;
1292 all_link_kernel((float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1294 (float2*)
P7.even.data, (float2*)
P7.odd.data,
1295 (float2*)
P7rho.even.data, (float2*)
P7rho.odd.data,
1296 (float2*)
P5.even.data, (float2*)
P5.odd.data,
1297 sig, rho, SevenSt,mSevenSt,coeff,
1298 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1299 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1307 if(ThreeSt.x != 0)coeff.x = FiveSt.x/ThreeSt.x ;
else coeff.x = 0;
1308 if(ThreeSt.y != 0)coeff.y = FiveSt.y/ThreeSt.y ;
else coeff.y = 0;
1309 side_link_kernel((float2*)
P5.even.data, (float2*)
P5.odd.data,
1310 (float2*)
P5nu.even.data, (float2*)
P5nu.odd.data,
1311 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1312 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1313 (float2*)
P3.even.data, (float2*)
P3.odd.data,
1314 sig, nu, mFiveSt, coeff,
1315 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1316 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1324 middle_link_kernel( (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1325 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1326 (float2*)
P5.even.data, (float2*)
P5.odd.data,
1328 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1329 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1333 if(ThreeSt.x != 0)coeff.x = Lepage.x/ThreeSt.x ;
else coeff.x = 0;
1334 if(ThreeSt.y != 0)coeff.y = Lepage.y/ThreeSt.y ;
else coeff.y = 0;
1336 side_link_kernel((float2*)
P5.even.data, (float2*)
P5.odd.data,
1337 (float2*)
P5nu.even.data, (float2*)
P5nu.odd.data,
1338 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1339 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1340 (float2*)
P3.even.data, (float2*)
P3.odd.data,
1341 sig, mu, mLepage,coeff,
1342 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1343 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1349 side_link_kernel((float2*)
P3.even.data, (float2*)
P3.odd.data,
1350 (float2*)
P3mu.even.data, (float2*)
P3mu.odd.data,
1352 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1353 (float2*)NULL, (float2*)NULL,
1354 sig, mu, ThreeSt,coeff,
1355 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1356 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1361 if (!DirectLinks[mu]){
1364 one_and_naik_terms_kernel((float2*)cudaHw.
even.
data, (float2*)cudaHw.
odd.
data,
1365 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1366 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1367 mu, OneLink, Naik, mNaik,
1368 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(),
1369 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1405 errorQuda(
"Reconstruct type %d not supported for gauge field", siteLink.Reconstruct());
1408 errorQuda(
"Reconstruct type %d not supported for momentum field", cudaMom.Reconstruct());
1419 errorQuda(
"Double precision not supported?");
1421 do_fermion_force_cuda( (
float)eps, (
float)weight1, (
float)weight2, (
float*)act_path_coeff,
1422 cudaHw, siteLink, cudaMom, tempvec, param);
1435 #undef FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE
1436 #undef FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE
1440 #endif // defined(GPU_FERMION_FORCE)
FullHw createHwQuda(int *X, QudaPrecision precision)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const P3Odd
void fermion_force_cuda(double eps, double weight1, double weight2, void *act_path_coeff, FullHw cudaHw, cudaGaugeField &cudaSiteLink, cudaGaugeField &cudaMom, QudaGaugeParam *param)
texture< float4, 1, cudaReadModeElementType > siteLink1TexSingle_recon
void fermion_force_init_cuda(QudaGaugeParam *param)
void freeHwQuda(FullHw hw)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
#define FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx, new_idx)
#define RECONSTRUCT_LINK_12(sign, var)
__constant__ double coeff
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
texture< float4, 1, cudaReadModeElementType > siteLink0TexSingle_recon
#define FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx, new_idx)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const RealA *const shortPOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const PmuEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const P3Even
#define GOES_BACKWARDS(dir)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const shortPEven
#define GOES_FORWARDS(dir)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const PmuOdd