13 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, Vh)
15 #define LOAD_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \
16 Float2* hw = (oddness)?hw_odd:hw_even; \
17 var##0 = hw[idx + 0*Vh]; \
18 var##1 = hw[idx + 1*Vh]; \
19 var##2 = hw[idx + 2*Vh]; \
20 var##3 = hw[idx + 3*Vh]; \
21 var##4 = hw[idx + 4*Vh]; \
22 var##5 = hw[idx + 5*Vh]; \
25 #define WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \
26 Float2* hw = (oddness)?hw_odd:hw_even; \
27 hw[idx + 0*Vh] = var##0; \
28 hw[idx + 1*Vh] = var##1; \
29 hw[idx + 2*Vh] = var##2; \
30 hw[idx + 3*Vh] = var##3; \
31 hw[idx + 4*Vh] = var##4; \
32 hw[idx + 5*Vh] = var##5; \
35 #define LOAD_HW(hw_eve, hw_odd, idx, var, oddness) LOAD_HW_SINGLE(hw_eve, hw_odd, idx, var, oddness)
36 #define WRITE_HW(hw_even, hw_odd, idx, var, oddness) WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness)
37 #define LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE(src, dir, idx, var, Vh)
39 #define FF_SITE_MATRIX_LOAD_TEX 1
41 #define linkEvenTex siteLink0TexSingle_recon
42 #define linkOddTex siteLink1TexSingle_recon
44 #if (FF_SITE_MATRIX_LOAD_TEX == 1)
45 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE_TEX(((oddness)?linkOddTex:linkEvenTex), dir, idx, var, Vh)
47 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE(((oddness)?linkOdd:linkEven), dir, idx, var, Vh)
51 #define linka00_re LINKA0.x
52 #define linka00_im LINKA0.y
53 #define linka01_re LINKA0.z
54 #define linka01_im LINKA0.w
55 #define linka02_re LINKA1.x
56 #define linka02_im LINKA1.y
57 #define linka10_re LINKA1.z
58 #define linka10_im LINKA1.w
59 #define linka11_re LINKA2.x
60 #define linka11_im LINKA2.y
61 #define linka12_re LINKA2.z
62 #define linka12_im LINKA2.w
63 #define linka20_re LINKA3.x
64 #define linka20_im LINKA3.y
65 #define linka21_re LINKA3.z
66 #define linka21_im LINKA3.w
67 #define linka22_re LINKA4.x
68 #define linka22_im LINKA4.y
70 #define linkb00_re LINKB0.x
71 #define linkb00_im LINKB0.y
72 #define linkb01_re LINKB0.z
73 #define linkb01_im LINKB0.w
74 #define linkb02_re LINKB1.x
75 #define linkb02_im LINKB1.y
76 #define linkb10_re LINKB1.z
77 #define linkb10_im LINKB1.w
78 #define linkb11_re LINKB2.x
79 #define linkb11_im LINKB2.y
80 #define linkb12_re LINKB2.z
81 #define linkb12_im LINKB2.w
82 #define linkb20_re LINKB3.x
83 #define linkb20_im LINKB3.y
84 #define linkb21_re LINKB3.z
85 #define linkb21_im LINKB3.w
86 #define linkb22_re LINKB4.x
87 #define linkb22_im LINKB4.y
90 #define MAT_MUL_HW(M, HW, HWOUT) \
91 HWOUT##00_re = (M##00_re * HW##00_re - M##00_im * HW##00_im) \
92 + (M##01_re * HW##01_re - M##01_im * HW##01_im) \
93 + (M##02_re * HW##02_re - M##02_im * HW##02_im); \
94 HWOUT##00_im = (M##00_re * HW##00_im + M##00_im * HW##00_re) \
95 + (M##01_re * HW##01_im + M##01_im * HW##01_re) \
96 + (M##02_re * HW##02_im + M##02_im * HW##02_re); \
97 HWOUT##01_re = (M##10_re * HW##00_re - M##10_im * HW##00_im) \
98 + (M##11_re * HW##01_re - M##11_im * HW##01_im) \
99 + (M##12_re * HW##02_re - M##12_im * HW##02_im); \
100 HWOUT##01_im = (M##10_re * HW##00_im + M##10_im * HW##00_re) \
101 + (M##11_re * HW##01_im + M##11_im * HW##01_re) \
102 + (M##12_re * HW##02_im + M##12_im * HW##02_re); \
103 HWOUT##02_re = (M##20_re * HW##00_re - M##20_im * HW##00_im) \
104 + (M##21_re * HW##01_re - M##21_im * HW##01_im) \
105 + (M##22_re * HW##02_re - M##22_im * HW##02_im); \
106 HWOUT##02_im = (M##20_re * HW##00_im + M##20_im * HW##00_re) \
107 + (M##21_re * HW##01_im + M##21_im * HW##01_re) \
108 + (M##22_re * HW##02_im + M##22_im * HW##02_re); \
109 HWOUT##10_re = (M##00_re * HW##10_re - M##00_im * HW##10_im) \
110 + (M##01_re * HW##11_re - M##01_im * HW##11_im) \
111 + (M##02_re * HW##12_re - M##02_im * HW##12_im); \
112 HWOUT##10_im = (M##00_re * HW##10_im + M##00_im * HW##10_re) \
113 + (M##01_re * HW##11_im + M##01_im * HW##11_re) \
114 + (M##02_re * HW##12_im + M##02_im * HW##12_re); \
115 HWOUT##11_re = (M##10_re * HW##10_re - M##10_im * HW##10_im) \
116 + (M##11_re * HW##11_re - M##11_im * HW##11_im) \
117 + (M##12_re * HW##12_re - M##12_im * HW##12_im); \
118 HWOUT##11_im = (M##10_re * HW##10_im + M##10_im * HW##10_re) \
119 + (M##11_re * HW##11_im + M##11_im * HW##11_re) \
120 + (M##12_re * HW##12_im + M##12_im * HW##12_re); \
121 HWOUT##12_re = (M##20_re * HW##10_re - M##20_im * HW##10_im) \
122 + (M##21_re * HW##11_re - M##21_im * HW##11_im) \
123 + (M##22_re * HW##12_re - M##22_im * HW##12_im); \
124 HWOUT##12_im = (M##20_re * HW##10_im + M##20_im * HW##10_re) \
125 + (M##21_re * HW##11_im + M##21_im * HW##11_re) \
126 + (M##22_re * HW##12_im + M##22_im * HW##12_re);
129 #define ADJ_MAT_MUL_HW(M, HW, HWOUT) \
130 HWOUT##00_re = (M##00_re * HW##00_re + M##00_im * HW##00_im) \
131 + (M##10_re * HW##01_re + M##10_im * HW##01_im) \
132 + (M##20_re * HW##02_re + M##20_im * HW##02_im); \
133 HWOUT##00_im = (M##00_re * HW##00_im - M##00_im * HW##00_re) \
134 + (M##10_re * HW##01_im - M##10_im * HW##01_re) \
135 + (M##20_re * HW##02_im - M##20_im * HW##02_re); \
136 HWOUT##01_re = (M##01_re * HW##00_re + M##01_im * HW##00_im) \
137 + (M##11_re * HW##01_re + M##11_im * HW##01_im) \
138 + (M##21_re * HW##02_re + M##21_im * HW##02_im); \
139 HWOUT##01_im = (M##01_re * HW##00_im - M##01_im * HW##00_re) \
140 + (M##11_re * HW##01_im - M##11_im * HW##01_re) \
141 + (M##21_re * HW##02_im - M##21_im * HW##02_re); \
142 HWOUT##02_re = (M##02_re * HW##00_re + M##02_im * HW##00_im) \
143 + (M##12_re * HW##01_re + M##12_im * HW##01_im) \
144 + (M##22_re * HW##02_re + M##22_im * HW##02_im); \
145 HWOUT##02_im = (M##02_re * HW##00_im - M##02_im * HW##00_re) \
146 + (M##12_re * HW##01_im - M##12_im * HW##01_re) \
147 + (M##22_re * HW##02_im - M##22_im * HW##02_re); \
148 HWOUT##10_re = (M##00_re * HW##10_re + M##00_im * HW##10_im) \
149 + (M##10_re * HW##11_re + M##10_im * HW##11_im) \
150 + (M##20_re * HW##12_re + M##20_im * HW##12_im); \
151 HWOUT##10_im = (M##00_re * HW##10_im - M##00_im * HW##10_re) \
152 + (M##10_re * HW##11_im - M##10_im * HW##11_re) \
153 + (M##20_re * HW##12_im - M##20_im * HW##12_re); \
154 HWOUT##11_re = (M##01_re * HW##10_re + M##01_im * HW##10_im) \
155 + (M##11_re * HW##11_re + M##11_im * HW##11_im) \
156 + (M##21_re * HW##12_re + M##21_im * HW##12_im); \
157 HWOUT##11_im = (M##01_re * HW##10_im - M##01_im * HW##10_re) \
158 + (M##11_re * HW##11_im - M##11_im * HW##11_re) \
159 + (M##21_re * HW##12_im - M##21_im * HW##12_re); \
160 HWOUT##12_re = (M##02_re * HW##10_re + M##02_im * HW##10_im) \
161 + (M##12_re * HW##11_re + M##12_im * HW##11_im) \
162 + (M##22_re * HW##12_re + M##22_im * HW##12_im); \
163 HWOUT##12_im = (M##02_re * HW##10_im - M##02_im * HW##10_re) \
164 + (M##12_re * HW##11_im - M##12_im * HW##11_re) \
165 + (M##22_re * HW##12_im - M##22_im * HW##12_re);
168 #define SU3_PROJECTOR(va, vb, m) \
169 m##00_re = va##0_re * vb##0_re + va##0_im * vb##0_im; \
170 m##00_im = va##0_im * vb##0_re - va##0_re * vb##0_im; \
171 m##01_re = va##0_re * vb##1_re + va##0_im * vb##1_im; \
172 m##01_im = va##0_im * vb##1_re - va##0_re * vb##1_im; \
173 m##02_re = va##0_re * vb##2_re + va##0_im * vb##2_im; \
174 m##02_im = va##0_im * vb##2_re - va##0_re * vb##2_im; \
175 m##10_re = va##1_re * vb##0_re + va##1_im * vb##0_im; \
176 m##10_im = va##1_im * vb##0_re - va##1_re * vb##0_im; \
177 m##11_re = va##1_re * vb##1_re + va##1_im * vb##1_im; \
178 m##11_im = va##1_im * vb##1_re - va##1_re * vb##1_im; \
179 m##12_re = va##1_re * vb##2_re + va##1_im * vb##2_im; \
180 m##12_im = va##1_im * vb##2_re - va##1_re * vb##2_im; \
181 m##20_re = va##2_re * vb##0_re + va##2_im * vb##0_im; \
182 m##20_im = va##2_im * vb##0_re - va##2_re * vb##0_im; \
183 m##21_re = va##2_re * vb##1_re + va##2_im * vb##1_im; \
184 m##21_im = va##2_im * vb##1_re - va##2_re * vb##1_im; \
185 m##22_re = va##2_re * vb##2_re + va##2_im * vb##2_im; \
186 m##22_im = va##2_im * vb##2_re - va##2_re * vb##2_im;
189 #define SCALAR_MULT_ADD_SU3_VECTOR(va, vb, s, vc) do { \
190 vc##0_re = va##0_re + vb##0_re * s; \
191 vc##0_im = va##0_im + vb##0_im * s; \
192 vc##1_re = va##1_re + vb##1_re * s; \
193 vc##1_im = va##1_im + vb##1_im * s; \
194 vc##2_re = va##2_re + vb##2_re * s; \
195 vc##2_im = va##2_im + vb##2_im * s; \
199 #define FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx, new_idx) do { \
202 new_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \
203 new_x1 = (new_x1==X1m1)?0:new_x1+1; \
206 new_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \
207 new_x2 = (new_x2==X2m1)?0:new_x2+1; \
210 new_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
211 new_x3 = (new_x3==X3m1)?0:new_x3+1; \
214 new_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
215 new_x4 = (new_x4==X4m1)?0:new_x4+1; \
220 #define FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx, new_idx) do { \
223 new_idx = ( (new_x1==0)?idx+X1m1:idx-1); \
224 new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \
227 new_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \
228 new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \
231 new_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
232 new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \
235 new_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
236 new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \
242 #define FF_COMPUTE_NEW_FULL_IDX_PLUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \
245 new_idx = ( (old_x1==X1m1)?idx-X1m1:idx+1); \
248 new_idx = ( (old_x2==X2m1)?idx-X2X1mX1:idx+X1); \
251 new_idx = ( (old_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
254 new_idx = ( (old_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
259 #define FF_COMPUTE_NEW_FULL_IDX_MINUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \
262 new_idx = ( (old_x1==0)?idx+X1m1:idx-1); \
265 new_idx = ( (old_x2==0)?idx+X2X1mX1:idx-X1); \
268 new_idx = ( (old_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
271 new_idx = ( (old_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
277 #define ADD_FORCE_TO_MOM(hw1, hw2, idx, dir, cf,oddness) do{ \
280 if (GOES_BACKWARDS(dir)){ \
281 mydir=OPP_DIR(dir); \
282 my_coeff.x = -cf.x; \
283 my_coeff.y = -cf.y; \
290 tmp_coeff.x = my_coeff.x; \
291 tmp_coeff.y = my_coeff.y; \
293 tmp_coeff.x = - my_coeff.x; \
294 tmp_coeff.y = - my_coeff.y; \
296 Float2* mom = oddness?momOdd:momEven; \
297 LOAD_ANTI_HERMITIAN(mom, mydir, idx, AH); \
298 UNCOMPRESS_ANTI_HERMITIAN(ah, linka); \
299 SU3_PROJECTOR(hw1##0, hw2##0, linkb); \
300 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.x, linka); \
301 SU3_PROJECTOR(hw1##1, hw2##1, linkb); \
302 SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.y, linka); \
303 MAKE_ANTI_HERMITIAN(linka, ah); \
304 WRITE_ANTI_HERMITIAN(mom, mydir, idx, AH, Vh); \
308 #define FF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
312 if ( (i4 & 1) == 1){ \
317 if ( ((i4+i1) & 1) == 1){ \
322 if ( ((i4+i1+i2) & 1) == 1){ \
335 #define hwa00_re HWA0.x
336 #define hwa00_im HWA0.y
337 #define hwa01_re HWA1.x
338 #define hwa01_im HWA1.y
339 #define hwa02_re HWA2.x
340 #define hwa02_im HWA2.y
341 #define hwa10_re HWA3.x
342 #define hwa10_im HWA3.y
343 #define hwa11_re HWA4.x
344 #define hwa11_im HWA4.y
345 #define hwa12_re HWA5.x
346 #define hwa12_im HWA5.y
348 #define hwb00_re HWB0.x
349 #define hwb00_im HWB0.y
350 #define hwb01_re HWB1.x
351 #define hwb01_im HWB1.y
352 #define hwb02_re HWB2.x
353 #define hwb02_im HWB2.y
354 #define hwb10_re HWB3.x
355 #define hwb10_im HWB3.y
356 #define hwb11_re HWB4.x
357 #define hwb11_im HWB4.y
358 #define hwb12_re HWB5.x
359 #define hwb12_im HWB5.y
361 #define hwc00_re HWC0.x
362 #define hwc00_im HWC0.y
363 #define hwc01_re HWC1.x
364 #define hwc01_im HWC1.y
365 #define hwc02_re HWC2.x
366 #define hwc02_im HWC2.y
367 #define hwc10_re HWC3.x
368 #define hwc10_im HWC3.y
369 #define hwc11_re HWC4.x
370 #define hwc11_im HWC4.y
371 #define hwc12_re HWC5.x
372 #define hwc12_im HWC5.y
374 #define hwd00_re HWD0.x
375 #define hwd00_im HWD0.y
376 #define hwd01_re HWD1.x
377 #define hwd01_im HWD1.y
378 #define hwd02_re HWD2.x
379 #define hwd02_im HWD2.y
380 #define hwd10_re HWD3.x
381 #define hwd10_im HWD3.y
382 #define hwd11_re HWD4.x
383 #define hwd11_im HWD4.y
384 #define hwd12_re HWD5.x
385 #define hwd12_im HWD5.y
387 #define hwe00_re HWE0.x
388 #define hwe00_im HWE0.y
389 #define hwe01_re HWE1.x
390 #define hwe01_im HWE1.y
391 #define hwe02_re HWE2.x
392 #define hwe02_im HWE2.y
393 #define hwe10_re HWE3.x
394 #define hwe10_im HWE3.y
395 #define hwe11_re HWE4.x
396 #define hwe11_im HWE4.y
397 #define hwe12_re HWE5.x
398 #define hwe12_im HWE5.y
405 #error "multi gpu is not supported for fermion force computation"
408 static int fermion_force_init_cuda_flag = 0;
410 if (fermion_force_init_cuda_flag)
return;
412 fermion_force_init_cuda_flag=1;
425 template<
int sig_positive,
int mu_positive,
int oddBit,
typename Float2>
432 Float2* momEven, Float2* momOdd)
434 int sid = blockIdx.x * blockDim.x + threadIdx.x;
442 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
446 int new_x1, new_x2, new_x3, new_x4;
452 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
453 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
454 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
455 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
456 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
457 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
458 Float2 AH0, AH1, AH2, AH3, AH4;
484 point_d = (new_mem_idx >> 1);
489 ad_link_nbr_idx =
sid;
501 point_c = (new_mem_idx >> 1);
515 point_b = (new_mem_idx >> 1);
523 ab_link_nbr_idx =
sid;
530 LOAD_HW(tempxEven, tempxOdd, point_d, HWA, 1-oddBit );
543 WRITE_HW(PmuEven,PmuOdd, sid, HWD, oddBit);
545 LOAD_HW(tempxEven,tempxOdd, point_c, HWA, oddBit);
570 WRITE_HW(P3Even, P3Odd, sid, HWC, oddBit);
579 template<
typename Float2>
581 middle_link_kernel(Float2* tempxEven, Float2* tempxOdd,
586 Float2* momEven, Float2* momOdd,
587 dim3 gridDim, dim3 BlockDim)
589 dim3 halfGridDim(gridDim.x/2, 1,1);
592 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
593 do_middle_link_kernel<sig_sign, mu_sign,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, \
599 do_middle_link_kernel<sig_sign, mu_sign, 1><<<halfGridDim, BlockDim>>>(tempxEven, tempxOdd, \
616 #undef CALL_MIDDLE_LINK_KERNEL
631 template<
int sig_positive,
int mu_positive,
int oddBit,
typename Float2>
634 Float2* P3muEven, Float2* P3muOdd,
635 Float2* TempxEven, Float2* TempxOdd,
636 Float2* PmuEven, Float2* PmuOdd,
639 float4* linkEven, float4* linkOdd,
640 Float2* momEven, Float2* momOdd)
646 int sid = blockIdx.x * blockDim.x + threadIdx.x;
654 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
659 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
660 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
661 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
662 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
663 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
664 Float2 AH0, AH1, AH2, AH3, AH4;
697 point_d = (new_mem_idx >> 1);
703 ad_link_nbr_idx =
sid;
708 LOAD_HW(P3Even, P3Odd, sid, HWA, oddBit);
725 LOAD_HW(TempxEven, TempxOdd, point_d, HWC, 1-oddBit);
733 LOAD_HW(PmuEven, PmuOdd, sid, HWC, oddBit);
743 LOAD_HW(shortPEven, shortPOdd, point_d, HWA, 1-oddBit);
746 WRITE_HW(shortPEven, shortPOdd, point_d, HWA, 1-oddBit);
752 template<
typename Float2>
754 side_link_kernel(Float2* P3Even, Float2* P3Odd,
755 Float2* P3muEven, Float2* P3muOdd,
756 Float2* TempxEven, Float2* TempxOdd,
757 Float2* PmuEven, Float2* PmuOdd,
760 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
761 Float2* momEven, Float2* momOdd,
762 dim3 gridDim, dim3 blockDim)
764 dim3 halfGridDim(gridDim.x/2,1,1);
766 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
767 do_side_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \
769 TempxEven, TempxOdd, \
771 shortPEven, shortPOdd, \
772 sig, mu, coeff, accumu_coeff, \
775 do_side_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \
777 TempxEven, TempxOdd, \
779 shortPEven, shortPOdd, \
780 sig, mu, coeff, accumu_coeff, \
794 #undef CALL_SIDE_LINK_KERNEL
809 template<
int sig_positive,
int mu_positive,
int oddBit,
typename Float2>
812 Float2* PmuEven, Float2* PmuOdd,
813 Float2* P3Even, Float2* P3Odd,
814 Float2* P3muEven, Float2* P3muOdd,
815 Float2* shortPEven, Float2* shortPOdd,
816 int sig,
int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff,
817 float4* linkEven, float4* linkOdd,
818 Float2* momEven, Float2* momOdd)
820 int sid = blockIdx.x * blockDim.x + threadIdx.x;
828 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
832 int new_x1, new_x2, new_x3, new_x4;
837 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
838 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
839 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
840 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
841 Float2 HWE0, HWE1, HWE2, HWE3, HWE4, HWE5;
842 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
843 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
844 float4 LINKC0, LINKC1, LINKC2, LINKC3, LINKC4;
845 Float2 AH0, AH1, AH2, AH3, AH4;
873 point_d = (new_mem_idx >> 1);
879 ad_link_nbr_idx =
sid;
891 point_c = (new_mem_idx >> 1);
906 point_b = (new_mem_idx >> 1);
913 ab_link_nbr_idx =
sid;
920 LOAD_HW(tempxEven, tempxOdd, point_d, HWE, 1-oddBit);
936 LOAD_HW(tempxEven, tempxOdd, point_c, HWA, oddBit);
981 LOAD_HW(shortPEven, shortPOdd, point_d, HWB, 1-oddBit);
984 WRITE_HW(shortPEven, shortPOdd, point_d, HWB, 1-oddBit);
1008 template<
typename Float2>
1010 all_link_kernel(Float2* tempxEven, Float2* tempxOdd,
1011 Float2* PmuEven, Float2* PmuOdd,
1012 Float2* P3Even, Float2* P3Odd,
1013 Float2* P3muEven, Float2* P3muOdd,
1014 Float2* shortPEven, Float2* shortPOdd,
1015 int sig,
int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff,
1016 float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
1017 Float2* momEven, Float2* momOdd,
1018 dim3 gridDim, dim3 blockDim)
1021 dim3 halfGridDim(gridDim.x/2, 1,1);
1023 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1024 do_all_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \
1027 P3muEven, P3muOdd, \
1028 shortPEven, shortPOdd, \
1029 sig, mu, coeff, mcoeff, accumu_coeff, \
1030 linkEven, linkOdd, \
1032 do_all_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \
1035 P3muEven, P3muOdd, \
1036 shortPEven, shortPOdd, \
1037 sig, mu, coeff, mcoeff, accumu_coeff, \
1038 linkEven, linkOdd, \
1052 #undef CALL_ALL_LINK_KERNEL
1063 template <
int oddBit,
typename Float2>
1066 Float2* PmuEven, Float2* PmuOdd,
1067 Float2* PnumuEven, Float2* PnumuOdd,
1068 int mu, Float2 OneLink, Float2 Naik, Float2 mNaik,
1069 float4* linkEven, float4* linkOdd,
1070 Float2* momEven, Float2* momOdd)
1072 Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
1073 Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
1074 Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
1075 Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
1076 float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
1077 float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
1078 Float2 AH0, AH1, AH2, AH3, AH4;
1080 int sid = blockIdx.x * blockDim.x + threadIdx.x;
1084 int x2 = z1 - z2*
X2;
1086 int x3 = z2 - x4*
X3;
1087 int x1odd = (x2 + x3 + x4 + oddBit) & 1;
1092 int new_x1, new_x2, new_x3, new_x4, new_idx;
1097 LOAD_HW(PmuEven, PmuOdd, sid, HWA, oddBit);
1098 LOAD_HW(TempxEven, TempxOdd, sid, HWB, oddBit);
1102 dx[3]=dx[2]=dx[1]=dx[0]=0;
1104 new_x1 = (x1 + dx[0] +
X1)%
X1;
1105 new_x2 = (x2 + dx[1] +
X2)%X2;
1106 new_x3 = (x3 + dx[2] +
X3)%X3;
1107 new_x4 = (x4 + dx[3] +
X4)%
X4;
1108 new_idx = (new_x4*
X3X2X1+new_x3*
X2X1+new_x2*
X1+new_x1) >> 1;
1109 LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-oddBit);
1115 LOAD_HW(PnumuEven, PnumuOdd, sid, HWD, oddBit);
1118 dx[3]=dx[2]=dx[1]=dx[0]=0;
1120 new_x1 = (x1 + dx[0] +
X1)%
X1;
1121 new_x2 = (x2 + dx[1] +
X2)%X2;
1122 new_x3 = (x3 + dx[2] +
X3)%X3;
1123 new_x4 = (x4 + dx[3] +
X4)%
X4;
1124 new_idx = (new_x4*
X3X2X1+new_x3*
X2X1+new_x2*
X1+new_x1) >> 1;
1125 LOAD_HW(PnumuEven, PnumuOdd, new_idx, HWA, 1-oddBit);
1132 dx[3]=dx[2]=dx[1]=dx[0]=0;
1134 new_x1 = (x1 + dx[0] +
X1)%
X1;
1135 new_x2 = (x2 + dx[1] +
X2)%X2;
1136 new_x3 = (x3 + dx[2] +
X3)%X3;
1137 new_x4 = (x4 + dx[3] +
X4)%
X4;
1138 new_idx = (new_x4*
X3X2X1+new_x3*
X2X1+new_x2*
X1+new_x1) >> 1;
1139 LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-oddBit);
1145 LOAD_HW(PnumuEven, PnumuOdd, sid, HWC, oddBit);
1152 template<
typename Float2>
1154 one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd,
1155 Float2* PmuEven, Float2* PmuOdd,
1156 Float2* PnumuEven, Float2* PnumuOdd,
1157 int mu, Float2 OneLink, Float2 Naik, Float2 mNaik,
1158 float4* linkEven, float4* linkOdd,
1159 Float2* momEven, Float2* momOdd,
1160 dim3 gridDim, dim3 blockDim)
1162 dim3 halfGridDim(gridDim.x/2, 1,1);
1164 do_one_and_naik_terms_kernel<0><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd,
1166 PnumuEven, PnumuOdd,
1167 mu, OneLink, Naik, mNaik,
1170 do_one_and_naik_terms_kernel<1><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd,
1172 PnumuEven, PnumuOdd,
1173 mu, OneLink, Naik, mNaik,
1181 #define Pmu tempvec[0]
1182 #define Pnumu tempvec[1]
1183 #define Prhonumu tempvec[2]
1184 #define P7 tempvec[3]
1185 #define P7rho tempvec[4]
1186 #define P7rhonu tempvec[5]
1187 #define P5 tempvec[6]
1188 #define P3 tempvec[7]
1189 #define P5nu tempvec[3]
1190 #define P3mu tempvec[3]
1191 #define Popmu tempvec[4]
1192 #define Pmumumu tempvec[4]
1194 template<
typename Real>
1196 do_fermion_force_cuda(Real eps, Real weight1, Real weight2, Real* act_path_coeff,
FullHw cudaHw,
1200 int mu, nu, rho,
sig;
1203 float2 OneLink, Lepage, Naik, FiveSt, ThreeSt, SevenSt;
1204 float2 mNaik, mLepage, mFiveSt, mThreeSt, mSevenSt;
1207 ferm_epsilon = 2.0*weight1*eps;
1208 OneLink.x = act_path_coeff[0]*ferm_epsilon ;
1209 Naik.x = act_path_coeff[1]*ferm_epsilon ; mNaik.x = -Naik.x;
1210 ThreeSt.x = act_path_coeff[2]*ferm_epsilon ; mThreeSt.x = -ThreeSt.x;
1211 FiveSt.x = act_path_coeff[3]*ferm_epsilon ; mFiveSt.x = -FiveSt.x;
1212 SevenSt.x = act_path_coeff[4]*ferm_epsilon ; mSevenSt.x = -SevenSt.x;
1213 Lepage.x = act_path_coeff[5]*ferm_epsilon ; mLepage.x = -Lepage.x;
1215 ferm_epsilon = 2.0*weight2*eps;
1216 OneLink.y = act_path_coeff[0]*ferm_epsilon ;
1217 Naik.y = act_path_coeff[1]*ferm_epsilon ; mNaik.y = -Naik.y;
1218 ThreeSt.y = act_path_coeff[2]*ferm_epsilon ; mThreeSt.y = -ThreeSt.y;
1219 FiveSt.y = act_path_coeff[3]*ferm_epsilon ; mFiveSt.y = -FiveSt.y;
1220 SevenSt.y = act_path_coeff[4]*ferm_epsilon ; mSevenSt.y = -SevenSt.y;
1221 Lepage.y = act_path_coeff[5]*ferm_epsilon ; mLepage.y = -Lepage.y;
1223 int DirectLinks[8] ;
1225 for(mu=0;mu<8;mu++){
1226 DirectLinks[
mu] = 0 ;
1229 int volume = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3];
1231 dim3 gridDim(volume/blockDim.x, 1, 1);
1238 for(sig=0; sig < 8; sig++){
1239 for(mu = 0; mu < 8; mu++){
1240 if ( (mu == sig) || (mu ==
OPP_DIR(sig))){
1246 middle_link_kernel( (float2*)cudaHw.
even.
data, (float2*)cudaHw.
odd.
data,
1247 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1248 (float2*)
P3.even.data, (float2*)
P3.odd.data,
1250 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1251 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1254 for(nu=0; nu < 8; nu++){
1255 if (nu == sig || nu ==
OPP_DIR(sig)
1256 || nu == mu || nu ==
OPP_DIR(mu)){
1261 middle_link_kernel( (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1262 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1263 (float2*)
P5.even.data, (float2*)
P5.odd.data,
1265 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1266 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1270 for(rho =0; rho < 8; rho++){
1271 if (rho == sig || rho ==
OPP_DIR(sig)
1272 || rho == mu || rho ==
OPP_DIR(mu)
1273 || rho == nu || rho ==
OPP_DIR(nu)){
1279 if(FiveSt.x != 0)coeff.x = SevenSt.x/FiveSt.x ;
else coeff.x = 0;
1280 if(FiveSt.y != 0)coeff.y = SevenSt.y/FiveSt.y ;
else coeff.y = 0;
1281 all_link_kernel((float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1283 (float2*)
P7.even.data, (float2*)
P7.odd.data,
1284 (float2*)
P7rho.even.data, (float2*)
P7rho.odd.data,
1285 (float2*)
P5.even.data, (float2*)
P5.odd.data,
1286 sig, rho, SevenSt,mSevenSt,coeff,
1287 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1288 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1296 if(ThreeSt.x != 0)coeff.x = FiveSt.x/ThreeSt.x ;
else coeff.x = 0;
1297 if(ThreeSt.y != 0)coeff.y = FiveSt.y/ThreeSt.y ;
else coeff.y = 0;
1298 side_link_kernel((float2*)
P5.even.data, (float2*)
P5.odd.data,
1299 (float2*)
P5nu.even.data, (float2*)
P5nu.odd.data,
1300 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1301 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1302 (float2*)
P3.even.data, (float2*)
P3.odd.data,
1303 sig, nu, mFiveSt, coeff,
1304 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1305 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1313 middle_link_kernel( (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1314 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1315 (float2*)
P5.even.data, (float2*)
P5.odd.data,
1317 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1318 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1322 if(ThreeSt.x != 0)coeff.x = Lepage.x/ThreeSt.x ;
else coeff.x = 0;
1323 if(ThreeSt.y != 0)coeff.y = Lepage.y/ThreeSt.y ;
else coeff.y = 0;
1325 side_link_kernel((float2*)
P5.even.data, (float2*)
P5.odd.data,
1326 (float2*)
P5nu.even.data, (float2*)
P5nu.odd.data,
1327 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1328 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1329 (float2*)
P3.even.data, (float2*)
P3.odd.data,
1330 sig, mu, mLepage,coeff,
1331 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1332 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1338 side_link_kernel((float2*)
P3.even.data, (float2*)
P3.odd.data,
1339 (float2*)
P3mu.even.data, (float2*)
P3mu.odd.data,
1341 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1342 (float2*)NULL, (float2*)NULL,
1343 sig, mu, ThreeSt,coeff,
1344 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1345 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1350 if (!DirectLinks[mu]){
1353 one_and_naik_terms_kernel((float2*)cudaHw.
even.
data, (float2*)cudaHw.
odd.
data,
1354 (float2*)
Pmu.even.data, (float2*)
Pmu.odd.data,
1355 (float2*)
Pnumu.even.data, (float2*)
Pnumu.odd.data,
1356 mu, OneLink, Naik, mNaik,
1357 (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(),
1358 (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1408 errorQuda(
"Double precision not supported?");
1410 do_fermion_force_cuda( (
float)eps, (
float)weight1, (
float)weight2, (
float*)act_path_coeff,
1411 cudaHw, siteLink, cudaMom, tempvec, param);
1424 #undef FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE
1425 #undef FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE