QUDA  v0.7.0
A library for QCD on GPUs
fermion_force_quda.cu
1 
2 #include <dslash_quda.h>
3 #include <read_gauge.h>
4 #include <gauge_field.h>
5 #include <clover_field.h>
6 
7 #include <fermion_force_quda.h>
8 #include <force_common.h>
9 #include <hw_quda.h>
10 
11 #if defined(GPU_FERMION_FORCE)
12 namespace quda {
13 
14 
15  namespace fermionforce {
16 #include <dslash_constants.h>
17 #include <dslash_textures.h>
18  }
19 
20  using namespace fermionforce;
21 
22 #define BLOCK_DIM 64
23 
24 #define LOAD_ANTI_HERMITIAN(src, dir, idx, var) LOAD_ANTI_HERMITIAN_DIRECT(src, dir, idx, var, Vh)
25 
26 #define LOAD_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \
27  Float2* hw = (oddness)?hw_odd:hw_even; \
28  var##0 = hw[idx + 0*Vh]; \
29  var##1 = hw[idx + 1*Vh]; \
30  var##2 = hw[idx + 2*Vh]; \
31  var##3 = hw[idx + 3*Vh]; \
32  var##4 = hw[idx + 4*Vh]; \
33  var##5 = hw[idx + 5*Vh]; \
34  }while(0)
35 
36 #define WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness) do{ \
37  Float2* hw = (oddness)?hw_odd:hw_even; \
38  hw[idx + 0*Vh] = var##0; \
39  hw[idx + 1*Vh] = var##1; \
40  hw[idx + 2*Vh] = var##2; \
41  hw[idx + 3*Vh] = var##3; \
42  hw[idx + 4*Vh] = var##4; \
43  hw[idx + 5*Vh] = var##5; \
44  }while(0)
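  /* A half-Wilson vector is stored as six Float2 complex numbers (two
   * three-component color vectors). Component k of site "idx" lives at
   * hw[idx + k*Vh], where Vh is the number of sites of one parity; the
   * oddness flag selects the even- or odd-parity half of the field. */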
45 
46 #define LOAD_HW(hw_eve, hw_odd, idx, var, oddness) LOAD_HW_SINGLE(hw_eve, hw_odd, idx, var, oddness)
47 #define WRITE_HW(hw_even, hw_odd, idx, var, oddness) WRITE_HW_SINGLE(hw_even, hw_odd, idx, var, oddness)
48 #define LOAD_MATRIX(src, dir, idx, var) LOAD_MATRIX_12_SINGLE(src, dir, idx, var, Vh)
49 
50 #define FF_SITE_MATRIX_LOAD_TEX 1
51 
52 #define linkEvenTex siteLink0TexSingle_recon
53 #define linkOddTex siteLink1TexSingle_recon
54 
55 #if (FF_SITE_MATRIX_LOAD_TEX == 1)
56 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE_TEX(((oddness)?linkOddTex:linkEvenTex), dir, idx, var, Vh)
57 #else
58 #define FF_LOAD_MATRIX(dir, idx, var, oddness) LOAD_MATRIX_12_SINGLE(((oddness)?linkOdd:linkEven), dir, idx, var, Vh)
59 #endif
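  /* FF_LOAD_MATRIX fetches the first two rows of a 12-reconstructed SU(3)
   * link (12 real numbers) for the requested parity, either through the
   * bound textures (linkEvenTex/linkOddTex, bound in do_fermion_force_cuda)
   * or directly from global memory. The third row is rebuilt later by
   * RECONSTRUCT_LINK_12, using the sign from FF_COMPUTE_RECONSTRUCT_SIGN. */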
60 
61 
62 #define linka00_re LINKA0.x
63 #define linka00_im LINKA0.y
64 #define linka01_re LINKA0.z
65 #define linka01_im LINKA0.w
66 #define linka02_re LINKA1.x
67 #define linka02_im LINKA1.y
68 #define linka10_re LINKA1.z
69 #define linka10_im LINKA1.w
70 #define linka11_re LINKA2.x
71 #define linka11_im LINKA2.y
72 #define linka12_re LINKA2.z
73 #define linka12_im LINKA2.w
74 #define linka20_re LINKA3.x
75 #define linka20_im LINKA3.y
76 #define linka21_re LINKA3.z
77 #define linka21_im LINKA3.w
78 #define linka22_re LINKA4.x
79 #define linka22_im LINKA4.y
80 
81 #define linkb00_re LINKB0.x
82 #define linkb00_im LINKB0.y
83 #define linkb01_re LINKB0.z
84 #define linkb01_im LINKB0.w
85 #define linkb02_re LINKB1.x
86 #define linkb02_im LINKB1.y
87 #define linkb10_re LINKB1.z
88 #define linkb10_im LINKB1.w
89 #define linkb11_re LINKB2.x
90 #define linkb11_im LINKB2.y
91 #define linkb12_re LINKB2.z
92 #define linkb12_im LINKB2.w
93 #define linkb20_re LINKB3.x
94 #define linkb20_im LINKB3.y
95 #define linkb21_re LINKB3.z
96 #define linkb21_im LINKB3.w
97 #define linkb22_re LINKB4.x
98 #define linkb22_im LINKB4.y
99 
100 
101 #define MAT_MUL_HW(M, HW, HWOUT) \
102  HWOUT##00_re = (M##00_re * HW##00_re - M##00_im * HW##00_im) \
103  + (M##01_re * HW##01_re - M##01_im * HW##01_im) \
104  + (M##02_re * HW##02_re - M##02_im * HW##02_im); \
105  HWOUT##00_im = (M##00_re * HW##00_im + M##00_im * HW##00_re) \
106  + (M##01_re * HW##01_im + M##01_im * HW##01_re) \
107  + (M##02_re * HW##02_im + M##02_im * HW##02_re); \
108  HWOUT##01_re = (M##10_re * HW##00_re - M##10_im * HW##00_im) \
109  + (M##11_re * HW##01_re - M##11_im * HW##01_im) \
110  + (M##12_re * HW##02_re - M##12_im * HW##02_im); \
111  HWOUT##01_im = (M##10_re * HW##00_im + M##10_im * HW##00_re) \
112  + (M##11_re * HW##01_im + M##11_im * HW##01_re) \
113  + (M##12_re * HW##02_im + M##12_im * HW##02_re); \
114  HWOUT##02_re = (M##20_re * HW##00_re - M##20_im * HW##00_im) \
115  + (M##21_re * HW##01_re - M##21_im * HW##01_im) \
116  + (M##22_re * HW##02_re - M##22_im * HW##02_im); \
117  HWOUT##02_im = (M##20_re * HW##00_im + M##20_im * HW##00_re) \
118  + (M##21_re * HW##01_im + M##21_im * HW##01_re) \
119  + (M##22_re * HW##02_im + M##22_im * HW##02_re); \
120  HWOUT##10_re = (M##00_re * HW##10_re - M##00_im * HW##10_im) \
121  + (M##01_re * HW##11_re - M##01_im * HW##11_im) \
122  + (M##02_re * HW##12_re - M##02_im * HW##12_im); \
123  HWOUT##10_im = (M##00_re * HW##10_im + M##00_im * HW##10_re) \
124  + (M##01_re * HW##11_im + M##01_im * HW##11_re) \
125  + (M##02_re * HW##12_im + M##02_im * HW##12_re); \
126  HWOUT##11_re = (M##10_re * HW##10_re - M##10_im * HW##10_im) \
127  + (M##11_re * HW##11_re - M##11_im * HW##11_im) \
128  + (M##12_re * HW##12_re - M##12_im * HW##12_im); \
129  HWOUT##11_im = (M##10_re * HW##10_im + M##10_im * HW##10_re) \
130  + (M##11_re * HW##11_im + M##11_im * HW##11_re) \
131  + (M##12_re * HW##12_im + M##12_im * HW##12_re); \
132  HWOUT##12_re = (M##20_re * HW##10_re - M##20_im * HW##10_im) \
133  + (M##21_re * HW##11_re - M##21_im * HW##11_im) \
134  + (M##22_re * HW##12_re - M##22_im * HW##12_im); \
135  HWOUT##12_im = (M##20_re * HW##10_im + M##20_im * HW##10_re) \
136  + (M##21_re * HW##11_im + M##21_im * HW##11_re) \
137  + (M##22_re * HW##12_im + M##22_im * HW##12_re);
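  /* MAT_MUL_HW applies the 3x3 complex matrix M to both color vectors of a
   * half-Wilson vector: HWOUT(s)_i = sum_j M_ij * HW(s)_j for s = 0,1
   * (the first digit of the component name), with the complex arithmetic
   * written out in real/imaginary parts. */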
138 
139 
140 #define ADJ_MAT_MUL_HW(M, HW, HWOUT) \
141  HWOUT##00_re = (M##00_re * HW##00_re + M##00_im * HW##00_im) \
142  + (M##10_re * HW##01_re + M##10_im * HW##01_im) \
143  + (M##20_re * HW##02_re + M##20_im * HW##02_im); \
144  HWOUT##00_im = (M##00_re * HW##00_im - M##00_im * HW##00_re) \
145  + (M##10_re * HW##01_im - M##10_im * HW##01_re) \
146  + (M##20_re * HW##02_im - M##20_im * HW##02_re); \
147  HWOUT##01_re = (M##01_re * HW##00_re + M##01_im * HW##00_im) \
148  + (M##11_re * HW##01_re + M##11_im * HW##01_im) \
149  + (M##21_re * HW##02_re + M##21_im * HW##02_im); \
150  HWOUT##01_im = (M##01_re * HW##00_im - M##01_im * HW##00_re) \
151  + (M##11_re * HW##01_im - M##11_im * HW##01_re) \
152  + (M##21_re * HW##02_im - M##21_im * HW##02_re); \
153  HWOUT##02_re = (M##02_re * HW##00_re + M##02_im * HW##00_im) \
154  + (M##12_re * HW##01_re + M##12_im * HW##01_im) \
155  + (M##22_re * HW##02_re + M##22_im * HW##02_im); \
156  HWOUT##02_im = (M##02_re * HW##00_im - M##02_im * HW##00_re) \
157  + (M##12_re * HW##01_im - M##12_im * HW##01_re) \
158  + (M##22_re * HW##02_im - M##22_im * HW##02_re); \
159  HWOUT##10_re = (M##00_re * HW##10_re + M##00_im * HW##10_im) \
160  + (M##10_re * HW##11_re + M##10_im * HW##11_im) \
161  + (M##20_re * HW##12_re + M##20_im * HW##12_im); \
162  HWOUT##10_im = (M##00_re * HW##10_im - M##00_im * HW##10_re) \
163  + (M##10_re * HW##11_im - M##10_im * HW##11_re) \
164  + (M##20_re * HW##12_im - M##20_im * HW##12_re); \
165  HWOUT##11_re = (M##01_re * HW##10_re + M##01_im * HW##10_im) \
166  + (M##11_re * HW##11_re + M##11_im * HW##11_im) \
167  + (M##21_re * HW##12_re + M##21_im * HW##12_im); \
168  HWOUT##11_im = (M##01_re * HW##10_im - M##01_im * HW##10_re) \
169  + (M##11_re * HW##11_im - M##11_im * HW##11_re) \
170  + (M##21_re * HW##12_im - M##21_im * HW##12_re); \
171  HWOUT##12_re = (M##02_re * HW##10_re + M##02_im * HW##10_im) \
172  + (M##12_re * HW##11_re + M##12_im * HW##11_im) \
173  + (M##22_re * HW##12_re + M##22_im * HW##12_im); \
174  HWOUT##12_im = (M##02_re * HW##10_im - M##02_im * HW##10_re) \
175  + (M##12_re * HW##11_im - M##12_im * HW##11_re) \
176  + (M##22_re * HW##12_im - M##22_im * HW##12_re);
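  /* ADJ_MAT_MUL_HW is the adjoint version of the macro above:
   * HWOUT(s)_i = sum_j conj(M_ji) * HW(s)_j, i.e. HWOUT = M^dagger * HW
   * applied to each of the two color vectors. */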
177 
178 
179 #define SU3_PROJECTOR(va, vb, m) \
180  m##00_re = va##0_re * vb##0_re + va##0_im * vb##0_im; \
181  m##00_im = va##0_im * vb##0_re - va##0_re * vb##0_im; \
182  m##01_re = va##0_re * vb##1_re + va##0_im * vb##1_im; \
183  m##01_im = va##0_im * vb##1_re - va##0_re * vb##1_im; \
184  m##02_re = va##0_re * vb##2_re + va##0_im * vb##2_im; \
185  m##02_im = va##0_im * vb##2_re - va##0_re * vb##2_im; \
186  m##10_re = va##1_re * vb##0_re + va##1_im * vb##0_im; \
187  m##10_im = va##1_im * vb##0_re - va##1_re * vb##0_im; \
188  m##11_re = va##1_re * vb##1_re + va##1_im * vb##1_im; \
189  m##11_im = va##1_im * vb##1_re - va##1_re * vb##1_im; \
190  m##12_re = va##1_re * vb##2_re + va##1_im * vb##2_im; \
191  m##12_im = va##1_im * vb##2_re - va##1_re * vb##2_im; \
192  m##20_re = va##2_re * vb##0_re + va##2_im * vb##0_im; \
193  m##20_im = va##2_im * vb##0_re - va##2_re * vb##0_im; \
194  m##21_re = va##2_re * vb##1_re + va##2_im * vb##1_im; \
195  m##21_im = va##2_im * vb##1_re - va##2_re * vb##1_im; \
196  m##22_re = va##2_re * vb##2_re + va##2_im * vb##2_im; \
197  m##22_im = va##2_im * vb##2_re - va##2_re * vb##2_im;
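  /* SU3_PROJECTOR forms the outer product of two color vectors:
   * m_ij = va_i * conj(vb_j), i.e. m = va vb^dagger. */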
198 
199  //vc = va + vb*s
200 #define SCALAR_MULT_ADD_SU3_VECTOR(va, vb, s, vc) do { \
201  vc##0_re = va##0_re + vb##0_re * s; \
202  vc##0_im = va##0_im + vb##0_im * s; \
203  vc##1_re = va##1_re + vb##1_re * s; \
204  vc##1_im = va##1_im + vb##1_im * s; \
205  vc##2_re = va##2_re + vb##2_re * s; \
206  vc##2_im = va##2_im + vb##2_im * s; \
207  }while (0)
208 
209 
210 #define FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mydir, idx, new_idx) do { \
211  switch(mydir){ \
212  case 0: \
213  new_idx = ( (new_x1==X1m1)?idx-X1m1:idx+1); \
214  new_x1 = (new_x1==X1m1)?0:new_x1+1; \
215  break; \
216  case 1: \
217  new_idx = ( (new_x2==X2m1)?idx-X2X1mX1:idx+X1); \
218  new_x2 = (new_x2==X2m1)?0:new_x2+1; \
219  break; \
220  case 2: \
221  new_idx = ( (new_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
222  new_x3 = (new_x3==X3m1)?0:new_x3+1; \
223  break; \
224  case 3: \
225  new_idx = ( (new_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
226  new_x4 = (new_x4==X4m1)?0:new_x4+1; \
227  break; \
228  } \
229  }while(0)
230 
231 #define FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mydir, idx, new_idx) do { \
232  switch(mydir){ \
233  case 0: \
234  new_idx = ( (new_x1==0)?idx+X1m1:idx-1); \
235  new_x1 = (new_x1==0)?X1m1:new_x1 - 1; \
236  break; \
237  case 1: \
238  new_idx = ( (new_x2==0)?idx+X2X1mX1:idx-X1); \
239  new_x2 = (new_x2==0)?X2m1:new_x2 - 1; \
240  break; \
241  case 2: \
242  new_idx = ( (new_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
243  new_x3 = (new_x3==0)?X3m1:new_x3 - 1; \
244  break; \
245  case 3: \
246  new_idx = ( (new_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
247  new_x4 = (new_x4==0)?X4m1:new_x4 - 1; \
248  break; \
249  } \
250  }while(0)
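  /* The *_UPDATE macros step a full (non-checkerboarded) lattice index one
   * site forward or backward in direction mydir, with periodic wrap-around,
   * and update the corresponding coordinate new_x{1..4}. The wrap offsets
   * are the usual precomputed constants, e.g. X1m1 = X1-1,
   * X2X1mX1 = (X2-1)*X1, X3X2X1mX2X1 = (X3-1)*X2*X1, and
   * X4X3X2X1mX3X2X1 = (X4-1)*X3*X2*X1. */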
251 
252 
253 #define FF_COMPUTE_NEW_FULL_IDX_PLUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \
254  switch(mydir){ \
255  case 0: \
256  new_idx = ( (old_x1==X1m1)?idx-X1m1:idx+1); \
257  break; \
258  case 1: \
259  new_idx = ( (old_x2==X2m1)?idx-X2X1mX1:idx+X1); \
260  break; \
261  case 2: \
262  new_idx = ( (old_x3==X3m1)?idx-X3X2X1mX2X1:idx+X2X1); \
263  break; \
264  case 3: \
265  new_idx = ( (old_x4==X4m1)?idx-X4X3X2X1mX3X2X1:idx+X3X2X1); \
266  break; \
267  } \
268  }while(0)
269 
270 #define FF_COMPUTE_NEW_FULL_IDX_MINUS(old_x1, old_x2, old_x3, old_x4, idx, mydir, new_idx) do { \
271  switch(mydir){ \
272  case 0: \
273  new_idx = ( (old_x1==0)?idx+X1m1:idx-1); \
274  break; \
275  case 1: \
276  new_idx = ( (old_x2==0)?idx+X2X1mX1:idx-X1); \
277  break; \
278  case 2: \
279  new_idx = ( (old_x3==0)?idx+X3X2X1mX2X1:idx-X2X1); \
280  break; \
281  case 3: \
282  new_idx = ( (old_x4==0)?idx+X4X3X2X1mX3X2X1:idx-X3X2X1); \
283  break; \
284  } \
285  }while(0)
286 
287  //this macro requires the linka, linkb, and ah variables to be defined
288 #define ADD_FORCE_TO_MOM(hw1, hw2, idx, dir, cf,oddness) do{ \
289  Float2 my_coeff; \
290  int mydir; \
291  if (GOES_BACKWARDS(dir)){ \
292  mydir=OPP_DIR(dir); \
293  my_coeff.x = -cf.x; \
294  my_coeff.y = -cf.y; \
295  }else{ \
296  mydir=dir; \
297  my_coeff.x = cf.x; \
298  my_coeff.y = cf.y; \
299  } \
300  Float2 tmp_coeff; \
301  tmp_coeff.x = my_coeff.x; \
302  tmp_coeff.y = my_coeff.y; \
303  if(oddness){ \
304  tmp_coeff.x = - my_coeff.x; \
305  tmp_coeff.y = - my_coeff.y; \
306  } \
307  Float2* mom = oddness?momOdd:momEven; \
308  LOAD_ANTI_HERMITIAN(mom, mydir, idx, AH); \
309  UNCOMPRESS_ANTI_HERMITIAN(ah, linka); \
310  SU3_PROJECTOR(hw1##0, hw2##0, linkb); \
311  SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.x, linka); \
312  SU3_PROJECTOR(hw1##1, hw2##1, linkb); \
313  SCALAR_MULT_ADD_SU3_MATRIX(linka, linkb, tmp_coeff.y, linka); \
314  MAKE_ANTI_HERMITIAN(linka, ah); \
315  WRITE_ANTI_HERMITIAN(mom, mydir, idx, AH, Vh); \
316  }while(0)
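  /* ADD_FORCE_TO_MOM accumulates an outer-product force into the
   * anti-Hermitian momentum field: the stored momentum for (idx, mydir) is
   * uncompressed into linka, the projectors hw1(s) hw2(s)^dagger for the two
   * components are added with weights tmp_coeff.x and tmp_coeff.y, and the
   * result is made anti-Hermitian again and written back. The coefficient
   * is negated for backward directions (which are mapped to OPP_DIR) and
   * for odd-parity sites. */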
317 
318 
319 #define FF_COMPUTE_RECONSTRUCT_SIGN(sign, dir, i1,i2,i3,i4) do { \
320  sign =1; \
321  switch(dir){ \
322  case XUP: \
323  if ( (i4 & 1) == 1){ \
324  sign = -1; \
325  } \
326  break; \
327  case YUP: \
328  if ( ((i4+i1) & 1) == 1){ \
329  sign = -1; \
330  } \
331  break; \
332  case ZUP: \
333  if ( ((i4+i1+i2) & 1) == 1){ \
334  sign = -1; \
335  } \
336  break; \
337  case TUP: \
338  if (i4 == X4m1 ){ \
339  sign = -1; \
340  } \
341  break; \
342  } \
343  }while (0)
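  /* FF_COMPUTE_RECONSTRUCT_SIGN reproduces the sign folded into the stored
   * link at this site: (-1)^{x4} for XUP, (-1)^{x4+x1} for YUP,
   * (-1)^{x4+x1+x2} for ZUP, and -1 on the last time slice for TUP
   * (presumably the staggered phases plus the antiperiodic temporal
   * boundary condition). RECONSTRUCT_LINK_12 needs this sign to rebuild
   * the third row of the link correctly. */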
344 
345 
346 #define hwa00_re HWA0.x
347 #define hwa00_im HWA0.y
348 #define hwa01_re HWA1.x
349 #define hwa01_im HWA1.y
350 #define hwa02_re HWA2.x
351 #define hwa02_im HWA2.y
352 #define hwa10_re HWA3.x
353 #define hwa10_im HWA3.y
354 #define hwa11_re HWA4.x
355 #define hwa11_im HWA4.y
356 #define hwa12_re HWA5.x
357 #define hwa12_im HWA5.y
358 
359 #define hwb00_re HWB0.x
360 #define hwb00_im HWB0.y
361 #define hwb01_re HWB1.x
362 #define hwb01_im HWB1.y
363 #define hwb02_re HWB2.x
364 #define hwb02_im HWB2.y
365 #define hwb10_re HWB3.x
366 #define hwb10_im HWB3.y
367 #define hwb11_re HWB4.x
368 #define hwb11_im HWB4.y
369 #define hwb12_re HWB5.x
370 #define hwb12_im HWB5.y
371 
372 #define hwc00_re HWC0.x
373 #define hwc00_im HWC0.y
374 #define hwc01_re HWC1.x
375 #define hwc01_im HWC1.y
376 #define hwc02_re HWC2.x
377 #define hwc02_im HWC2.y
378 #define hwc10_re HWC3.x
379 #define hwc10_im HWC3.y
380 #define hwc11_re HWC4.x
381 #define hwc11_im HWC4.y
382 #define hwc12_re HWC5.x
383 #define hwc12_im HWC5.y
384 
385 #define hwd00_re HWD0.x
386 #define hwd00_im HWD0.y
387 #define hwd01_re HWD1.x
388 #define hwd01_im HWD1.y
389 #define hwd02_re HWD2.x
390 #define hwd02_im HWD2.y
391 #define hwd10_re HWD3.x
392 #define hwd10_im HWD3.y
393 #define hwd11_re HWD4.x
394 #define hwd11_im HWD4.y
395 #define hwd12_re HWD5.x
396 #define hwd12_im HWD5.y
397 
398 #define hwe00_re HWE0.x
399 #define hwe00_im HWE0.y
400 #define hwe01_re HWE1.x
401 #define hwe01_im HWE1.y
402 #define hwe02_re HWE2.x
403 #define hwe02_im HWE2.y
404 #define hwe10_re HWE3.x
405 #define hwe10_im HWE3.y
406 #define hwe11_re HWE4.x
407 #define hwe11_im HWE4.y
408 #define hwe12_re HWE5.x
409 #define hwe12_im HWE5.y
410 
411 
412  void fermion_force_init_cuda(QudaGaugeParam* param)
413  {
414 
415 #ifdef MULTI_GPU
416 #error "multi gpu is not supported for fermion force computation"
417 #endif
418 
419  static int fermion_force_init_cuda_flag = 0;
420 
421  if (fermion_force_init_cuda_flag) return;
422 
423  fermion_force_init_cuda_flag=1;
424 
425  }
426 
427  /*
428  * This function computes the contribution to the momentum from the middle link in a staple
429  *
430  * tempx: IN
431  * Pmu: OUT
432  * P3: OUT
433  *
434  */
435 
436  template<int sig_positive, int mu_positive, int oddBit, typename Float2>
437  __global__ void
438  do_middle_link_kernel(Float2* tempxEven, Float2* tempxOdd,
439  Float2* PmuEven, Float2* PmuOdd,
440  Float2* P3Even, Float2* P3Odd,
441  int sig, int mu, Float2 coeff,
442  float4* linkEven, float4* linkOdd,
443  Float2* momEven, Float2* momOdd)
444  {
445  int sid = blockIdx.x * blockDim.x + threadIdx.x;
446 
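  // Decode the checkerboard index: sid runs over the sites of one parity;
  // (x1,x2,x3,x4) are the full coordinates of the site and X = 2*sid + x1odd
  // is its full lexicographic index, with x1 running fastest.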
447  int z1 = sid / X1h;
448  int x1h = sid - z1*X1h;
449  int z2 = z1 / X2;
450  int x2 = z1 - z2*X2;
451  int x4 = z2 / X3;
452  int x3 = z2 - x4*X3;
453  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
454  int x1 = 2*x1h + x1odd;
455  int X = 2*sid + x1odd;
456 
457  int new_x1, new_x2, new_x3, new_x4;
458  int new_mem_idx;
459  int ad_link_sign=1;
460  int ab_link_sign=1;
461  int bc_link_sign=1;
462 
463  Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
464  Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
465  Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
466  Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
467  float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
468  float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
469  Float2 AH0, AH1, AH2, AH3, AH4;
470 
471  /* sig
472  * A________B
473  * mu | |
474  * D | |C
475  *
476  * A is the current point (sid)
477  */
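  /* Per-site outline: starting from A = sid, locate D (one step along the
   * mu side of the staple), C (D stepped along sig) and B (A stepped along
   * sig), together with the checkerboard indices and reconstruction signs
   * of the A-D, B-C and A-B links. Then
   *   Pmu(A) = transport of tempx(D) through the A-D link,
   *   P3(A)  = transport of tempx(C) through the B-C and A-B links,
   * and, when sig is a forward direction, the middle-link force built from
   * P3 and Pmu is added to the momentum at A along sig with coefficient
   * "coeff". */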
478 
479  int point_b, point_c, point_d;
480  int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx;
481  int mymu;
482 
483  new_x1 = x1;
484  new_x2 = x2;
485  new_x3 = x3;
486  new_x4 = x4;
487 
488  if(mu_positive){
489  mymu =mu;
490  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, X, new_mem_idx);
491  }else{
492  mymu = OPP_DIR(mu);
493  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), X, new_mem_idx);
494  }
495  point_d = (new_mem_idx >> 1);
496  if (mu_positive){
497  ad_link_nbr_idx = point_d;
498  FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
499  }else{
500  ad_link_nbr_idx = sid;
501  FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4);
502  }
503 
504  int mysig;
505  if(sig_positive){
506  mysig = sig;
507  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
508  }else{
509  mysig = OPP_DIR(sig);
510  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
511  }
512  point_c = (new_mem_idx >> 1);
513  if (mu_positive){
514  bc_link_nbr_idx = point_c;
515  FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
516  }
517  new_x1 = x1;
518  new_x2 = x2;
519  new_x3 = x3;
520  new_x4 = x4;
521  if(sig_positive){
522  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx);
523  }else{
524  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), X, new_mem_idx);
525  }
526  point_b = (new_mem_idx >> 1);
527 
528  if (!mu_positive){
529  bc_link_nbr_idx = point_b;
530  FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
531  }
532 
533  if(sig_positive){
534  ab_link_nbr_idx = sid;
535  FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4);
536  }else{
537  ab_link_nbr_idx = point_b;
538  FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4);
539  }
540 
541  LOAD_HW(tempxEven, tempxOdd, point_d, HWA, 1-oddBit );
542  if(mu_positive){
543  FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, 1-oddBit);
544  }else{
545  FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, oddBit);
546  }
547 
548  RECONSTRUCT_LINK_12(ad_link_sign, linka);
549  if (mu_positive){
550  ADJ_MAT_MUL_HW(linka, hwa, hwd);
551  }else{
552  MAT_MUL_HW(linka, hwa, hwd);
553  }
554  WRITE_HW(PmuEven,PmuOdd, sid, HWD, oddBit);
555 
556  LOAD_HW(tempxEven,tempxOdd, point_c, HWA, oddBit);
557  if(mu_positive){
558  FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, oddBit);
559  }else{
560  FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, 1-oddBit);
561  }
562 
563  RECONSTRUCT_LINK_12(bc_link_sign, linka);
564  if (mu_positive){
565  ADJ_MAT_MUL_HW(linka, hwa, hwb);
566  }else{
567  MAT_MUL_HW(linka, hwa, hwb);
568  }
569  if(sig_positive){
570  FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKB, oddBit);
571  }else{
572  FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKB, 1-oddBit);
573  }
574 
575  RECONSTRUCT_LINK_12(ab_link_sign, linkb);
576  if (sig_positive){
577  MAT_MUL_HW(linkb, hwb, hwc);
578  }else{
579  ADJ_MAT_MUL_HW(linkb, hwb, hwc);
580  }
581  WRITE_HW(P3Even, P3Odd, sid, HWC, oddBit);
582 
583  if (sig_positive){
584  //add the force to mom
585  ADD_FORCE_TO_MOM(hwc, hwd, sid, sig, coeff, oddBit);
586  }
587  }
588 
589 
590  template<typename Float2>
591  static void
592  middle_link_kernel(Float2* tempxEven, Float2* tempxOdd,
593  Float2* PmuEven, Float2* PmuOdd,
594  Float2* P3Even, Float2* P3Odd,
595  int sig, int mu, Float2 coeff,
596  float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
597  Float2* momEven, Float2* momOdd,
598  dim3 gridDim, dim3 BlockDim)
599  {
600  dim3 halfGridDim(gridDim.x/2, 1,1);
601 
602 
603 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
604  do_middle_link_kernel<sig_sign, mu_sign,0><<<halfGridDim, BlockDim>>>( tempxEven, tempxOdd, \
605  PmuEven, PmuOdd, \
606  P3Even, P3Odd, \
607  sig, mu, coeff, \
608  linkEven, linkOdd, \
609  momEven, momOdd); \
610  do_middle_link_kernel<sig_sign, mu_sign, 1><<<halfGridDim, BlockDim>>>(tempxEven, tempxOdd, \
611  PmuEven, PmuOdd, \
612  P3Even, P3Odd, \
613  sig, mu, coeff, \
614  linkEven, linkOdd, \
615  momEven, momOdd);
616 
617 
618  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
619  CALL_MIDDLE_LINK_KERNEL(1, 1);
620  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
621  CALL_MIDDLE_LINK_KERNEL(1, 0);
622  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
623  CALL_MIDDLE_LINK_KERNEL(0, 1);
624  }else{
625  CALL_MIDDLE_LINK_KERNEL(0, 0);
626  }
627 #undef CALL_MIDDLE_LINK_KERNEL
628 
629  }
630 
631  /*
632  * Computes the contribution to the momentum from the side links in a staple
633  *
634  * P3: IN
635  * P3mu: not used
636  * Tempx: IN
637  * Pmu: IN
638  * shortPE: OUT
639  *
640  */
641 
642  template<int sig_positive, int mu_positive, int oddBit, typename Float2>
643  __global__ void
644  do_side_link_kernel(Float2* P3Even, Float2* P3Odd,
645  Float2* P3muEven, Float2* P3muOdd,
646  Float2* TempxEven, Float2* TempxOdd,
647  Float2* PmuEven, Float2* PmuOdd,
648  Float2* shortPEven, Float2* shortPOdd,
649  int sig, int mu, Float2 coeff, Float2 accumu_coeff,
650  float4* linkEven, float4* linkOdd,
651  Float2* momEven, Float2* momOdd)
652  {
653  Float2 mcoeff;
654  mcoeff.x = -coeff.x;
655  mcoeff.y = -coeff.y;
656 
657  int sid = blockIdx.x * blockDim.x + threadIdx.x;
658 
659  int z1 = sid / X1h;
660  int x1h = sid - z1*X1h;
661  int z2 = z1 / X2;
662  int x2 = z1 - z2*X2;
663  int x4 = z2 / X3;
664  int x3 = z2 - x4*X3;
665  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
666  int x1 = 2*x1h + x1odd;
667  int X = 2*sid + x1odd;
668 
669  int ad_link_sign = 1;
670  Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
671  Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
672  Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
673  float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
674  float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
675  Float2 AH0, AH1, AH2, AH3, AH4;
676 
677 
678 
679  /*
680  * compute the side link contribution to the momentum
681  *
682  *
683  * sig
684  * A________B
685  * | | mu
686  * D | |C
687  *
688  * A is the current point (sid)
689  */
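  /* Per-site outline: P3(A) is transported through the A-D link (adjoint
   * or not depending on the sign of mu); the resulting side-link force is
   * added to the momentum (at D for forward mu using Tempx, at A for
   * backward mu using Pmu, with coeff or -coeff depending on the signs of
   * sig and mu); and, if a shortP field was supplied, the transported
   * vector is accumulated into shortP(D) with weights accumu_coeff.x/.y. */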
690 
691  int point_d;
692  int ad_link_nbr_idx;
693  int mymu;
694  int new_mem_idx;
695 
696  int new_x1 = x1;
697  int new_x2 = x2;
698  int new_x3 = x3;
699  int new_x4 = x4;
700 
701  if(mu_positive){
702  mymu =mu;
703  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mymu,X, new_mem_idx);
704  }else{
705  mymu = OPP_DIR(mu);
706  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mymu, X, new_mem_idx);
707  }
708  point_d = (new_mem_idx >> 1);
709 
710  if (mu_positive){
711  ad_link_nbr_idx = point_d;
712  FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
713  }else{
714  ad_link_nbr_idx = sid;
715  FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4);
716  }
717 
718 
719  LOAD_HW(P3Even, P3Odd, sid, HWA, oddBit);
720  if(mu_positive){
721  FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, 1 - oddBit);
722  }else{
723  FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKA, oddBit);
724  }
725 
726  RECONSTRUCT_LINK_12(ad_link_sign, linka);
727  if (mu_positive){
728  MAT_MUL_HW(linka, hwa, hwb);
729  }else{
730  ADJ_MAT_MUL_HW(linka, hwa, hwb);
731  }
732 
733 
734  //start to add side link force
735  if (mu_positive){
736  LOAD_HW(TempxEven, TempxOdd, point_d, HWC, 1-oddBit);
737 
738  if (sig_positive){
739  ADD_FORCE_TO_MOM(hwb, hwc, point_d, mu, coeff, 1-oddBit);
740  }else{
741  ADD_FORCE_TO_MOM(hwc, hwb, point_d, OPP_DIR(mu), mcoeff, 1- oddBit);
742  }
743  }else{
744  LOAD_HW(PmuEven, PmuOdd, sid, HWC, oddBit);
745  if (sig_positive){
746  ADD_FORCE_TO_MOM(hwa, hwc, sid, mu, mcoeff, oddBit);
747  }else{
748  ADD_FORCE_TO_MOM(hwc, hwa, sid, OPP_DIR(mu), coeff, oddBit);
749  }
750 
751  }
752 
753  if (shortPOdd){
754  LOAD_HW(shortPEven, shortPOdd, point_d, HWA, 1-oddBit);
755  SCALAR_MULT_ADD_SU3_VECTOR(hwa0, hwb0, accumu_coeff.x, hwa0);
756  SCALAR_MULT_ADD_SU3_VECTOR(hwa1, hwb1, accumu_coeff.y, hwa1);
757  WRITE_HW(shortPEven, shortPOdd, point_d, HWA, 1-oddBit);
758  }
759 
760  }
761 
762 
763  template<typename Float2>
764  static void
765  side_link_kernel(Float2* P3Even, Float2* P3Odd,
766  Float2* P3muEven, Float2* P3muOdd,
767  Float2* TempxEven, Float2* TempxOdd,
768  Float2* PmuEven, Float2* PmuOdd,
769  Float2* shortPEven, Float2* shortPOdd,
770  int sig, int mu, Float2 coeff, Float2 accumu_coeff,
771  float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
772  Float2* momEven, Float2* momOdd,
773  dim3 gridDim, dim3 blockDim)
774  {
775  dim3 halfGridDim(gridDim.x/2,1,1);
776 
777 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
778  do_side_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \
779  P3muEven, P3muOdd, \
780  TempxEven, TempxOdd, \
781  PmuEven, PmuOdd, \
782  shortPEven, shortPOdd, \
783  sig, mu, coeff, accumu_coeff, \
784  linkEven, linkOdd, \
785  momEven, momOdd); \
786  do_side_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>( P3Even, P3Odd, \
787  P3muEven, P3muOdd, \
788  TempxEven, TempxOdd, \
789  PmuEven, PmuOdd, \
790  shortPEven, shortPOdd, \
791  sig, mu, coeff, accumu_coeff, \
792  linkEven, linkOdd, \
793  momEven, momOdd);
794 
795  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
796  CALL_SIDE_LINK_KERNEL(1,1);
797  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
798  CALL_SIDE_LINK_KERNEL(1,0);
799  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
800  CALL_SIDE_LINK_KERNEL(0,1);
801  }else{
802  CALL_SIDE_LINK_KERNEL(0,0);
803  }
804 
805 #undef CALL_SIDE_LINK_KERNEL
806 
807  }
808 
809  /*
810  * This function computes the contribution to the momentum from the middle and side links
811  *
812  * tempx: IN
813  * Pmu: not used
814  * P3: not used
815  * P3mu: not used
816  * shortP: OUT
817  *
818  */
819 
820  template<int sig_positive, int mu_positive, int oddBit, typename Float2>
821  __global__ void
822  do_all_link_kernel(Float2* tempxEven, Float2* tempxOdd,
823  Float2* PmuEven, Float2* PmuOdd,
824  Float2* P3Even, Float2* P3Odd,
825  Float2* P3muEven, Float2* P3muOdd,
826  Float2* shortPEven, Float2* shortPOdd,
827  int sig, int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff,
828  float4* linkEven, float4* linkOdd,
829  Float2* momEven, Float2* momOdd)
830  {
831  int sid = blockIdx.x * blockDim.x + threadIdx.x;
832 
833  int z1 = sid / X1h;
834  int x1h = sid - z1*X1h;
835  int z2 = z1 / X2;
836  int x2 = z1 - z2*X2;
837  int x4 = z2 / X3;
838  int x3 = z2 - x4*X3;
839  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
840  int x1 = 2*x1h + x1odd;
841  int X = 2*sid + x1odd;
842 
843  int new_x1, new_x2, new_x3, new_x4;
844  int ad_link_sign=1;
845  int ab_link_sign=1;
846  int bc_link_sign=1;
847 
848  Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
849  Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
850  Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
851  Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
852  Float2 HWE0, HWE1, HWE2, HWE3, HWE4, HWE5;
853  float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
854  float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
855  float4 LINKC0, LINKC1, LINKC2, LINKC3, LINKC4;
856  Float2 AH0, AH1, AH2, AH3, AH4;
857 
858 
859  /* sig
860  * A________B
861  * mu | |
862  * D | |C
863  *
864  * A is the current point (sid)
865  */
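  /* This kernel fuses the middle-link and side-link steps of the 7-staple
   * ("kernel C") term: the intermediate Pmu and P3 products are kept in
   * registers instead of being written out, the middle-link force is added
   * for forward sig, the P7rho contribution is accumulated into shortP (P5)
   * with weights accumu_coeff, and the side-link force is added with coeff
   * or mcoeff depending on the signs of sig and mu. */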
866 
867 
868  int point_b, point_c, point_d;
869  int ad_link_nbr_idx, ab_link_nbr_idx, bc_link_nbr_idx;
870  int mymu;
871  int new_mem_idx;
872  new_x1 = x1;
873  new_x2 = x2;
874  new_x3 = x3;
875  new_x4 = x4;
876 
877  if(mu_positive){
878  mymu =mu;
879  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, X, new_mem_idx);
880  }else{
881  mymu = OPP_DIR(mu);
882  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), X, new_mem_idx);
883  }
884  point_d = (new_mem_idx >> 1);
885 
886  if (mu_positive){
887  ad_link_nbr_idx = point_d;
888  FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
889  }else{
890  ad_link_nbr_idx = sid;
891  FF_COMPUTE_RECONSTRUCT_SIGN(ad_link_sign, mymu, x1, x2, x3, x4);
892  }
893 
894  int mysig;
895  if(sig_positive){
896  mysig = sig;
897  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
898  }else{
899  mysig = OPP_DIR(sig);
900  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
901  }
902  point_c = (new_mem_idx >> 1);
903  if (mu_positive){
904  bc_link_nbr_idx = point_c;
905  FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
906  }
907 
908  new_x1 = x1;
909  new_x2 = x2;
910  new_x3 = x3;
911  new_x4 = x4;
912  if(sig_positive){
913  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx);
914  }else{
915  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), X, new_mem_idx);
916  }
917  point_b = (new_mem_idx >> 1);
918  if (!mu_positive){
919  bc_link_nbr_idx = point_b;
920  FF_COMPUTE_RECONSTRUCT_SIGN(bc_link_sign, mymu, new_x1,new_x2,new_x3,new_x4);
921  }
922 
923  if(sig_positive){
924  ab_link_nbr_idx = sid;
925  FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, x1, x2, x3, x4);
926  }else{
927  ab_link_nbr_idx = point_b;
928  FF_COMPUTE_RECONSTRUCT_SIGN(ab_link_sign, mysig, new_x1,new_x2,new_x3,new_x4);
929  }
930 
931  LOAD_HW(tempxEven, tempxOdd, point_d, HWE, 1-oddBit);
932  if (mu_positive){
933  FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKC, 1-oddBit);
934  }else{
935  FF_LOAD_MATRIX(mymu, ad_link_nbr_idx, LINKC, oddBit);
936  }
937 
938  RECONSTRUCT_LINK_12(ad_link_sign, linkc);
939  if (mu_positive){
940  ADJ_MAT_MUL_HW(linkc, hwe, hwd);
941  }else{
942  MAT_MUL_HW(linkc, hwe, hwd);
943  }
944  //we do not need to write Pmu here
945  //WRITE_HW(myPmu, sid, HWD);
946 
947  LOAD_HW(tempxEven, tempxOdd, point_c, HWA, oddBit);
948  if (mu_positive){
949  FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, oddBit);
950  }else{
951  FF_LOAD_MATRIX(mymu, bc_link_nbr_idx, LINKA, 1-oddBit);
952  }
953 
954  RECONSTRUCT_LINK_12(bc_link_sign, linka);
955  if (mu_positive){
956  ADJ_MAT_MUL_HW(linka, hwa, hwb);
957  }else{
958  MAT_MUL_HW(linka, hwa, hwb);
959  }
960  if (sig_positive){
961  FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKA, oddBit);
962  }else{
963  FF_LOAD_MATRIX(mysig, ab_link_nbr_idx, LINKA, 1-oddBit);
964  }
965 
966  RECONSTRUCT_LINK_12(ab_link_sign, linka);
967  if (sig_positive){
968  MAT_MUL_HW(linka, hwb, hwc);
969  }else{
970  ADJ_MAT_MUL_HW(linka, hwb, hwc);
971  }
972 
973  //we do not need to write P3 here
974  //WRITE_HW(myP3, sid, HWC);
975 
976  //The middle link contribution
977  if (sig_positive){
978  //add the force to mom
979  ADD_FORCE_TO_MOM(hwc, hwd, sid, sig, mcoeff, oddBit);
980  }
981 
982  //P3 is hwc
983  //ad_link is linkc
984  if (mu_positive){
985  MAT_MUL_HW(linkc, hwc, hwa);
986  }else{
987  ADJ_MAT_MUL_HW(linkc, hwc, hwa);
988  }
989 
990  //accumulate P7rho to P5
991  //WRITE_HW(otherP3mu, point_d, HWA);
992  LOAD_HW(shortPEven, shortPOdd, point_d, HWB, 1-oddBit);
993  SCALAR_MULT_ADD_SU3_VECTOR(hwb0, hwa0, accumu_coeff.x, hwb0);
994  SCALAR_MULT_ADD_SU3_VECTOR(hwb1, hwa1, accumu_coeff.y, hwb1);
995  WRITE_HW(shortPEven, shortPOdd, point_d, HWB, 1-oddBit);
996 
997  //hwe holds tempx at point_d
998  //hwd holds Pmu at point A(sid)
999  if (mu_positive){
1000  if (sig_positive){
1001  ADD_FORCE_TO_MOM(hwa, hwe, point_d, mu, coeff, 1-oddBit);
1002  }else{
1003  ADD_FORCE_TO_MOM(hwe, hwa, point_d, OPP_DIR(mu), mcoeff, 1- oddBit);
1004  }
1005  }else{
1006  if (sig_positive){
1007  ADD_FORCE_TO_MOM(hwc, hwd, sid, mu, mcoeff, oddBit);
1008  }else{
1009  ADD_FORCE_TO_MOM(hwd, hwc, sid, OPP_DIR(mu), coeff, oddBit);
1010  }
1011 
1012  }
1013 
1014 
1015  }
1016 
1017 
1018 
1019  template<typename Float2>
1020  static void
1021  all_link_kernel(Float2* tempxEven, Float2* tempxOdd,
1022  Float2* PmuEven, Float2* PmuOdd,
1023  Float2* P3Even, Float2* P3Odd,
1024  Float2* P3muEven, Float2* P3muOdd,
1025  Float2* shortPEven, Float2* shortPOdd,
1026  int sig, int mu, Float2 coeff, Float2 mcoeff, Float2 accumu_coeff,
1027  float4* linkEven, float4* linkOdd, cudaGaugeField &siteLink,
1028  Float2* momEven, Float2* momOdd,
1029  dim3 gridDim, dim3 blockDim)
1030 
1031  {
1032  dim3 halfGridDim(gridDim.x/2, 1,1);
1033 
1034 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1035  do_all_link_kernel<sig_sign,mu_sign,0><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \
1036  PmuEven, PmuOdd, \
1037  P3Even, P3Odd, \
1038  P3muEven, P3muOdd, \
1039  shortPEven, shortPOdd, \
1040  sig, mu, coeff, mcoeff, accumu_coeff, \
1041  linkEven, linkOdd, \
1042  momEven, momOdd); \
1043  do_all_link_kernel<sig_sign,mu_sign,1><<<halfGridDim, blockDim>>>(tempxEven, tempxOdd, \
1044  PmuEven, PmuOdd, \
1045  P3Even, P3Odd, \
1046  P3muEven, P3muOdd, \
1047  shortPEven, shortPOdd, \
1048  sig, mu, coeff, mcoeff, accumu_coeff, \
1049  linkEven, linkOdd, \
1050  momEven, momOdd);
1051 
1052 
1053  if (GOES_FORWARDS(sig) && GOES_FORWARDS(mu)){
1054  CALL_ALL_LINK_KERNEL(1,1);
1055  }else if (GOES_FORWARDS(sig) && GOES_BACKWARDS(mu)){
1056  CALL_ALL_LINK_KERNEL(1,0);
1057  }else if (GOES_BACKWARDS(sig) && GOES_FORWARDS(mu)){
1058  CALL_ALL_LINK_KERNEL(0,1);
1059  }else{
1060  CALL_ALL_LINK_KERNEL(0,0);
1061  }
1062 
1063 #undef CALL_ALL_LINK_KERNEL
1064 
1065  }
1066 
1067  /* This function computes the one-link and Naik terms' contribution to the momentum
1068  *
1069  * Tempx: IN
1070  * Pmu: IN
1071  * Pnumu: IN
1072  *
1073  */
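  /* For a backward direction mu this kernel adds the one-link force
   * (coefficient OneLink) built from Pmu and Tempx at the site, then the
   * two Naik (three-hop) contributions using the neighbouring sites one
   * step behind and one step ahead along OPP_DIR(mu); for a forward mu a
   * single Naik contribution is added using the neighbour one step ahead
   * along mu. Neighbours are addressed by recomputing the full coordinates
   * with periodic wrap-around. */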
1074  template <int oddBit, typename Float2>
1075  __global__ void
1076  do_one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd,
1077  Float2* PmuEven, Float2* PmuOdd,
1078  Float2* PnumuEven, Float2* PnumuOdd,
1079  int mu, Float2 OneLink, Float2 Naik, Float2 mNaik,
1080  float4* linkEven, float4* linkOdd,
1081  Float2* momEven, Float2* momOdd)
1082  {
1083  Float2 HWA0, HWA1, HWA2, HWA3, HWA4, HWA5;
1084  Float2 HWB0, HWB1, HWB2, HWB3, HWB4, HWB5;
1085  Float2 HWC0, HWC1, HWC2, HWC3, HWC4, HWC5;
1086  Float2 HWD0, HWD1, HWD2, HWD3, HWD4, HWD5;
1087  float4 LINKA0, LINKA1, LINKA2, LINKA3, LINKA4;
1088  float4 LINKB0, LINKB1, LINKB2, LINKB3, LINKB4;
1089  Float2 AH0, AH1, AH2, AH3, AH4;
1090 
1091  int sid = blockIdx.x * blockDim.x + threadIdx.x;
1092  int z1 = sid / X1h;
1093  int x1h = sid - z1*X1h;
1094  int z2 = z1 / X2;
1095  int x2 = z1 - z2*X2;
1096  int x4 = z2 / X3;
1097  int x3 = z2 - x4*X3;
1098  int x1odd = (x2 + x3 + x4 + oddBit) & 1;
1099  int x1 = 2*x1h + x1odd;
1100  //int X = 2*sid + x1odd;
1101 
1102  int dx[4];
1103  int new_x1, new_x2, new_x3, new_x4, new_idx;
1104  int sign=1;
1105 
1106  if (GOES_BACKWARDS(mu)){
1107  //The one link
1108  LOAD_HW(PmuEven, PmuOdd, sid, HWA, oddBit);
1109  LOAD_HW(TempxEven, TempxOdd, sid, HWB, oddBit);
1110  ADD_FORCE_TO_MOM(hwa, hwb, sid, OPP_DIR(mu), OneLink, oddBit);
1111 
1112  //Naik term
1113  dx[3]=dx[2]=dx[1]=dx[0]=0;
1114  dx[OPP_DIR(mu)] = -1;
1115  new_x1 = (x1 + dx[0] + X1)%X1;
1116  new_x2 = (x2 + dx[1] + X2)%X2;
1117  new_x3 = (x3 + dx[2] + X3)%X3;
1118  new_x4 = (x4 + dx[3] + X4)%X4;
1119  new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1;
1120  LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-oddBit);
1121  FF_LOAD_MATRIX(OPP_DIR(mu), new_idx, LINKA, 1-oddBit);
1122  FF_COMPUTE_RECONSTRUCT_SIGN(sign, OPP_DIR(mu), new_x1,new_x2,new_x3,new_x4);
1123  RECONSTRUCT_LINK_12(sign, linka);
1124  ADJ_MAT_MUL_HW(linka, hwa, hwc); //Popmu
1125 
1126  LOAD_HW(PnumuEven, PnumuOdd, sid, HWD, oddBit);
1127  ADD_FORCE_TO_MOM(hwd, hwc, sid, OPP_DIR(mu), mNaik, oddBit);
1128 
1129  dx[3]=dx[2]=dx[1]=dx[0]=0;
1130  dx[OPP_DIR(mu)] = 1;
1131  new_x1 = (x1 + dx[0] + X1)%X1;
1132  new_x2 = (x2 + dx[1] + X2)%X2;
1133  new_x3 = (x3 + dx[2] + X3)%X3;
1134  new_x4 = (x4 + dx[3] + X4)%X4;
1135  new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1;
1136  LOAD_HW(PnumuEven, PnumuOdd, new_idx, HWA, 1-oddBit);
1137  FF_LOAD_MATRIX(OPP_DIR(mu), sid, LINKA, oddBit);
1138  FF_COMPUTE_RECONSTRUCT_SIGN(sign, OPP_DIR(mu), x1, x2, x3, x4);
1139  RECONSTRUCT_LINK_12(sign, linka);
1140  MAT_MUL_HW(linka, hwa, hwc);
1141  ADD_FORCE_TO_MOM(hwc, hwb, sid, OPP_DIR(mu), Naik, oddBit);
1142  }else{
1143  dx[3]=dx[2]=dx[1]=dx[0]=0;
1144  dx[mu] = 1;
1145  new_x1 = (x1 + dx[0] + X1)%X1;
1146  new_x2 = (x2 + dx[1] + X2)%X2;
1147  new_x3 = (x3 + dx[2] + X3)%X3;
1148  new_x4 = (x4 + dx[3] + X4)%X4;
1149  new_idx = (new_x4*X3X2X1+new_x3*X2X1+new_x2*X1+new_x1) >> 1;
1150  LOAD_HW(TempxEven, TempxOdd, new_idx, HWA, 1-oddBit);
1151  FF_LOAD_MATRIX(mu, sid, LINKA, oddBit);
1152  FF_COMPUTE_RECONSTRUCT_SIGN(sign, mu, x1, x2, x3, x4);
1153  RECONSTRUCT_LINK_12(sign, linka);
1154  MAT_MUL_HW(linka, hwa, hwb);
1155 
1156  LOAD_HW(PnumuEven, PnumuOdd, sid, HWC, oddBit);
1157  ADD_FORCE_TO_MOM(hwb, hwc, sid, mu, Naik, oddBit);
1158 
1159 
1160  }
1161  }
1162 
1163  template<typename Float2>
1164  static void
1165  one_and_naik_terms_kernel(Float2* TempxEven, Float2* TempxOdd,
1166  Float2* PmuEven, Float2* PmuOdd,
1167  Float2* PnumuEven, Float2* PnumuOdd,
1168  int mu, Float2 OneLink, Float2 Naik, Float2 mNaik,
1169  float4* linkEven, float4* linkOdd,
1170  Float2* momEven, Float2* momOdd,
1171  dim3 gridDim, dim3 blockDim)
1172  {
1173  dim3 halfGridDim(gridDim.x/2, 1,1);
1174 
1175  do_one_and_naik_terms_kernel<0><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd,
1176  PmuEven, PmuOdd,
1177  PnumuEven, PnumuOdd,
1178  mu, OneLink, Naik, mNaik,
1179  linkEven, linkOdd,
1180  momEven, momOdd);
1181  do_one_and_naik_terms_kernel<1><<<halfGridDim, blockDim>>>(TempxEven, TempxOdd,
1182  PmuEven, PmuOdd,
1183  PnumuEven, PnumuOdd,
1184  mu, OneLink, Naik, mNaik,
1185  linkEven, linkOdd,
1186  momEven, momOdd);
1187  return;
1188  }
1189 
1190 
1191 
1192 #define Pmu tempvec[0]
1193 #define Pnumu tempvec[1]
1194 #define Prhonumu tempvec[2]
1195 #define P7 tempvec[3]
1196 #define P7rho tempvec[4]
1197 #define P7rhonu tempvec[5]
1198 #define P5 tempvec[6]
1199 #define P3 tempvec[7]
1200 #define P5nu tempvec[3]
1201 #define P3mu tempvec[3]
1202 #define Popmu tempvec[4]
1203 #define Pmumumu tempvec[4]
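  /* The eight temporary half-Wilson fields allocated in fermion_force_cuda
   * are aliased to the named intermediates of the force construction.
   * Slots 3 and 4 are reused (P7/P5nu/P3mu share tempvec[3] and
   * P7rho/Popmu/Pmumumu share tempvec[4]), presumably because those
   * intermediates are never needed at the same time. */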
1204 
1205  template<typename Real>
1206  static void
1207  do_fermion_force_cuda(Real eps, Real weight1, Real weight2, Real* act_path_coeff, FullHw cudaHw,
1208  cudaGaugeField &siteLink, cudaGaugeField &cudaMom, FullHw tempvec[8], QudaGaugeParam* param)
1209  {
1210 
1211  int mu, nu, rho, sig;
1212  float2 coeff;
1213 
1214  float2 OneLink, Lepage, Naik, FiveSt, ThreeSt, SevenSt;
1215  float2 mNaik, mLepage, mFiveSt, mThreeSt, mSevenSt;
1216 
1217  Real ferm_epsilon;
1218  ferm_epsilon = 2.0*weight1*eps;
1219  OneLink.x = act_path_coeff[0]*ferm_epsilon ;
1220  Naik.x = act_path_coeff[1]*ferm_epsilon ; mNaik.x = -Naik.x;
1221  ThreeSt.x = act_path_coeff[2]*ferm_epsilon ; mThreeSt.x = -ThreeSt.x;
1222  FiveSt.x = act_path_coeff[3]*ferm_epsilon ; mFiveSt.x = -FiveSt.x;
1223  SevenSt.x = act_path_coeff[4]*ferm_epsilon ; mSevenSt.x = -SevenSt.x;
1224  Lepage.x = act_path_coeff[5]*ferm_epsilon ; mLepage.x = -Lepage.x;
1225 
1226  ferm_epsilon = 2.0*weight2*eps;
1227  OneLink.y = act_path_coeff[0]*ferm_epsilon ;
1228  Naik.y = act_path_coeff[1]*ferm_epsilon ; mNaik.y = -Naik.y;
1229  ThreeSt.y = act_path_coeff[2]*ferm_epsilon ; mThreeSt.y = -ThreeSt.y;
1230  FiveSt.y = act_path_coeff[3]*ferm_epsilon ; mFiveSt.y = -FiveSt.y;
1231  SevenSt.y = act_path_coeff[4]*ferm_epsilon ; mSevenSt.y = -SevenSt.y;
1232  Lepage.y = act_path_coeff[5]*ferm_epsilon ; mLepage.y = -Lepage.y;
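  /* Each coefficient pair carries both weights: the .x components are
   * scaled by 2*weight1*eps and the .y components by 2*weight2*eps.
   * act_path_coeff supplies, in order, the one-link [0], Naik [1],
   * three-staple [2], five-staple [3], seven-staple [4] and Lepage [5]
   * coefficients of the action. */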
1233 
1234  int DirectLinks[8] ;
1235 
1236  for(mu=0;mu<8;mu++){
1237  DirectLinks[mu] = 0 ;
1238  }
1239 
1240  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
1241  dim3 blockDim(BLOCK_DIM,1,1);
1242  dim3 gridDim(volume/blockDim.x, 1, 1);
1243 
1244 
1245  cudaBindTexture(0, siteLink0TexSingle_recon, siteLink.Even_p(), siteLink.Bytes()/2);
1246  cudaBindTexture(0, siteLink1TexSingle_recon, siteLink.Odd_p(), siteLink.Bytes()/2);
1247 
1248 
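  /* Kernel sequence for each ordered pair of distinct directions (sig, mu),
   * following the labels in the comments below:
   *   A  - 3-staple middle link (coefficient -ThreeSt)
   *   B  - 5-staple middle link, for each nu (FiveSt)
   *   C  - 7-staple middle + side links, for each rho (SevenSt)
   *   B2 - 5-staple side link (-FiveSt)
   *   A2 - Lepage middle link (Lepage), then the Lepage side link (-Lepage)
   *        and the 3-staple side link (ThreeSt)
   *   Z  - one-link and Naik terms, run once per mu
   */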
1249  for(sig=0; sig < 8; sig++){
1250  for(mu = 0; mu < 8; mu++){
1251  if ( (mu == sig) || (mu == OPP_DIR(sig))){
1252  continue;
1253  }
1254  //3-link
1255  //Kernel A: middle link
1256 
1257  middle_link_kernel( (float2*)cudaHw.even.data, (float2*)cudaHw.odd.data,
1258  (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1259  (float2*)P3.even.data, (float2*)P3.odd.data,
1260  sig, mu, mThreeSt,
1261  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1262  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1263  gridDim, blockDim);
1264  checkCudaError();
1265  for(nu=0; nu < 8; nu++){
1266  if (nu == sig || nu == OPP_DIR(sig)
1267  || nu == mu || nu == OPP_DIR(mu)){
1268  continue;
1269  }
1270  //5-link: middle link
1271  //Kernel B
1272  middle_link_kernel( (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1273  (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data,
1274  (float2*)P5.even.data, (float2*)P5.odd.data,
1275  sig, nu, FiveSt,
1276  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1277  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1278  gridDim, blockDim);
1279  checkCudaError();
1280 
1281  for(rho =0; rho < 8; rho++){
1282  if (rho == sig || rho == OPP_DIR(sig)
1283  || rho == mu || rho == OPP_DIR(mu)
1284  || rho == nu || rho == OPP_DIR(nu)){
1285  continue;
1286  }
1287  //7-link: middle link and side link
1288  //kernel C
1289 
1290  if(FiveSt.x != 0)coeff.x = SevenSt.x/FiveSt.x ; else coeff.x = 0;
1291  if(FiveSt.y != 0)coeff.y = SevenSt.y/FiveSt.y ; else coeff.y = 0;
1292  all_link_kernel((float2*)Pnumu.even.data, (float2*)Pnumu.odd.data,
1293  (float2*)Prhonumu.even.data, (float2*)Prhonumu.odd.data,
1294  (float2*)P7.even.data, (float2*)P7.odd.data,
1295  (float2*)P7rho.even.data, (float2*)P7rho.odd.data,
1296  (float2*)P5.even.data, (float2*)P5.odd.data,
1297  sig, rho, SevenSt,mSevenSt,coeff,
1298  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1299  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1300  gridDim, blockDim);
1301  checkCudaError();
1302 
1303  }//rho
1304 
1305  //5-link: side link
1306  //kernel B2
1307  if(ThreeSt.x != 0)coeff.x = FiveSt.x/ThreeSt.x ; else coeff.x = 0;
1308  if(ThreeSt.y != 0)coeff.y = FiveSt.y/ThreeSt.y ; else coeff.y = 0;
1309  side_link_kernel((float2*)P5.even.data, (float2*)P5.odd.data,
1310  (float2*)P5nu.even.data, (float2*)P5nu.odd.data,
1311  (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1312  (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data,
1313  (float2*)P3.even.data, (float2*)P3.odd.data,
1314  sig, nu, mFiveSt, coeff,
1315  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1316  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1317  gridDim, blockDim);
1318  checkCudaError();
1319 
1320  }//nu
1321 
1322  //lepage
1323  //Kernel A2
1324  middle_link_kernel( (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1325  (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data,
1326  (float2*)P5.even.data, (float2*)P5.odd.data,
1327  sig, mu, Lepage,
1328  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1329  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1330  gridDim, blockDim);
1331  checkCudaError();
1332 
1333  if(ThreeSt.x != 0)coeff.x = Lepage.x/ThreeSt.x ; else coeff.x = 0;
1334  if(ThreeSt.y != 0)coeff.y = Lepage.y/ThreeSt.y ; else coeff.y = 0;
1335 
1336  side_link_kernel((float2*)P5.even.data, (float2*)P5.odd.data,
1337  (float2*)P5nu.even.data, (float2*)P5nu.odd.data,
1338  (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1339  (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data,
1340  (float2*)P3.even.data, (float2*)P3.odd.data,
1341  sig, mu, mLepage,coeff,
1342  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1343  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1344  gridDim, blockDim);
1345  checkCudaError();
1346 
1347  //3-link side link
1348  coeff.x=coeff.y=0;
1349  side_link_kernel((float2*)P3.even.data, (float2*)P3.odd.data,
1350  (float2*)P3mu.even.data, (float2*)P3mu.odd.data,
1351  (float2*)cudaHw.even.data, (float2*)cudaHw.odd.data,
1352  (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1353  (float2*)NULL, (float2*)NULL,
1354  sig, mu, ThreeSt,coeff,
1355  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(), siteLink,
1356  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1357  gridDim, blockDim);
1358  checkCudaError();
1359 
1360  //1-link and naik term
1361  if (!DirectLinks[mu]){
1362  DirectLinks[mu]=1;
1363  //kernel Z
1364  one_and_naik_terms_kernel((float2*)cudaHw.even.data, (float2*)cudaHw.odd.data,
1365  (float2*)Pmu.even.data, (float2*)Pmu.odd.data,
1366  (float2*)Pnumu.even.data, (float2*)Pnumu.odd.data,
1367  mu, OneLink, Naik, mNaik,
1368  (float4*)siteLink.Even_p(), (float4*)siteLink.Odd_p(),
1369  (float2*)cudaMom.Even_p(), (float2*)cudaMom.Odd_p(),
1370  gridDim, blockDim);
1371 
1372  checkCudaError();
1373  }
1374 
1375  }//mu
1376 
1377  }//sig
1378 
1379  cudaUnbindTexture(siteLink0TexSingle_recon);
1380  cudaUnbindTexture(siteLink1TexSingle_recon);
1381 
1382  }
1383 
1384 #undef Pmu
1385 #undef Pnumu
1386 #undef Prhonumu
1387 #undef P7
1388 #undef P7rho
1389 #undef P7rhonu
1390 #undef P5
1391 #undef P3
1392 #undef P5nu
1393 #undef P3mu
1394 #undef Popmu
1395 #undef Pmumumu
1396 
1397  void
1398  fermion_force_cuda(double eps, double weight1, double weight2, void* act_path_coeff,
1399  FullHw cudaHw, cudaGaugeField &siteLink, cudaGaugeField &cudaMom, QudaGaugeParam* param)
1400  {
1401  int i;
1402  FullHw tempvec[8];
1403 
1404  if (siteLink.Reconstruct() != QUDA_RECONSTRUCT_12)
1405  errorQuda("Reconstruct type %d not supported for gauge field", siteLink.Reconstruct());
1406 
1407  if (cudaMom.Reconstruct() != QUDA_RECONSTRUCT_10)
1408  errorQuda("Reconstruct type %d not supported for momentum field", cudaMom.Reconstruct());
1409 
1410  for(i=0;i < 8;i++){
1411  tempvec[i] = createHwQuda(param->X, param->cuda_prec);
1412  }
1413 
1414  if (param->cuda_prec == QUDA_DOUBLE_PRECISION){
1415  /*
1416  do_fermion_force_cuda( (double)eps, (double)weight1, (double)weight2, (double*)act_path_coeff,
1417  cudaHw, siteLink, cudaMom, tempvec, param);
1418  */
1419  errorQuda("Double precision not supported?");
1420  }else{
1421  do_fermion_force_cuda( (float)eps, (float)weight1, (float)weight2, (float*)act_path_coeff,
1422  cudaHw, siteLink, cudaMom, tempvec, param);
1423  }
1424 
1425  for(i=0;i < 8;i++){
1426  freeHwQuda(tempvec[i]);
1427  }
1428 
1429 
1430 
1431  }
1432 
1433 #undef BLOCK_DIM
1434 
1435 #undef FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE
1436 #undef FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE
1437 
1438 } // namespace quda
1439 
1440 #endif // defined(GPU_FERMION_FORCE)