QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_paths_force_core.h
Go to the documentation of this file.
1 
2 //macro KERNEL_ENABLED is used to control compile time, debug purpose only
3 #if (PRECISION == 0 && RECON == 18)
4 #define EXT _dp_18_
5 #ifdef COMPILE_HISQ_DP_18
6 #define KERNEL_ENABLED
7 #endif
8 #elif (PRECISION == 0 && RECON == 12)
9 #define EXT _dp_12_
10 #ifdef COMPILE_HISQ_DP_12
11 #define KERNEL_ENABLED
12 #endif
13 #elif (PRECISION == 1 && RECON == 18)
14 #define EXT _sp_18_
15 #ifdef COMPILE_HISQ_SP_18
16 #define KERNEL_ENABLED
17 #endif
18 #else
19 #define EXT _sp_12_
20 #ifdef COMPILE_HISQ_SP_12
21 #define KERNEL_ENABLED
22 #endif
23 #endif
24 
25 #undef D1
26 #undef D1h
27 #undef D2
28 #undef D3
29 #undef D4
30 #undef xcomm
31 #undef ycomm
32 #undef zcomm
33 #undef tcomm
34 
35 
36 #define D1 kparam.D1
37 #define D1h kparam.D1h
38 #define D2 kparam.D2
39 #define D3 kparam.D3
40 #define D4 kparam.D4
41 #define xcomm kparam.ghostDim[0]
42 #define ycomm kparam.ghostDim[1]
43 #define zcomm kparam.ghostDim[2]
44 #define tcomm kparam.ghostDim[3]
45 
46 
47 #define print_matrix(mul) \
48  printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
49  printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
50  printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
51 
52 
53 /**************************do_middle_link_kernel*****************************
54  *
55  *
56  * Generally we need
57  * READ
58  * 3 LINKS: ab_link, bc_link, ad_link
59  * 3 COLOR MATRIX: newOprod_at_A, oprod_at_C, Qprod_at_D
60  * WRITE
61  * 4 COLOR MATRIX: newOprod_at_A, P3_at_A, Pmu_at_B, Qmu_at_A
62  *
63  * Three call variations:
64  * 1. when Qprev == NULL: Qprod_at_D does not exit and is not read in
65  * 2. full read/write
66  * 3. when Pmu/Qmu == NULL, Pmu_at_B and Qmu_at_A are not written out
67  *
68  * In all three above case, if the direction sig is negative, newOprod_at_A is
69  * not read in or written out.
70  *
71  * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)
72  * Call 1: (called 48 times, half positive sig, half negative sig)
73  * if (sig is positive): (3, 6)
74  * else : (3, 4)
75  * Call 2: (called 192 time, half positive sig, half negative sig)
76  * if (sig is positive): (3, 7)
77  * else : (3, 5)
78  * Call 3: (called 48 times, half positive sig, half negative sig)
79  * if (sig is positive): (3, 5)
80  * else : (3, 2) no need to loadQprod_at_D in this case
81  *
82  * note: oprod_at_C could actually be read in from D when it is the fresh outer product
83  * and we call it oprod_at_C to simply naming. This does not affect our data traffic analysis
84  *
85  * Flop count, in two-number pair (matrix_multi, matrix_add)
86  * call 1: if (sig is positive) (3, 1)
87  * else (2, 0)
88  * call 2: if (sig is positive) (4, 1)
89  * else (3, 0)
90  * call 3: if (sig is positive) (4, 1)
91  * else (2, 0)
92  *
93  ****************************************************************************/
94 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
95  __global__ void
96  HISQ_KERNEL_NAME(do_middle_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
97  const RealA* const QprevEven, const RealA* const QprevOdd,
98  const RealB* const linkEven, const RealB* const linkOdd,
99  int sig, int mu,
101  RealA* const PmuEven, RealA* const PmuOdd,
102  RealA* const P3Even, RealA* const P3Odd,
103  RealA* const QmuEven, RealA* const QmuOdd,
104  RealA* const newOprodEven, RealA* const newOprodOdd,
106 {
107 
108 #ifdef KERNEL_ENABLED
109  int oddBit = _oddBit;
110  int sid = blockIdx.x * blockDim.x + threadIdx.x;
111  if(sid >= kparam.threads) return;
112  int x[4];
113  int z1 = sid/D1h;
114  int x1h = sid - z1*D1h;
115  int z2 = z1/D2;
116  x[1] = z1 - z2*D2;
117  x[3] = z2/D3;
118  x[2] = z2 - x[3]*D3;
119  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
120  x[0] = 2*x1h + x1odd;
121 
122  int new_x[4];
123  int new_mem_idx;
124 #if(RECON == 12)
125  int ad_link_sign;
126  int ab_link_sign;
127  int bc_link_sign;
128 #endif
129 
130  RealA ab_link[ArrayLength<RealA>::result];
131  RealA bc_link[ArrayLength<RealA>::result];
132  RealA ad_link[ArrayLength<RealA>::result];
133 
134  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
135  RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
136  RealA COLOR_MAT_X[ArrayLength<RealA>::result];
137 
138  /* A________B
139  * mu | |
140  * D| |C
141  *
142  * A is the current point (sid)
143  *
144  */
145 
146  int point_b, point_c, point_d;
148  int mymu;
149 #ifdef MULTI_GPU
150  int E[4]= {E1,E2,E3,E4};
151  x[0] = x[0] + kparam.base_idx[0];
152  x[1] = x[1] + kparam.base_idx[1];
153  x[2] = x[2] + kparam.base_idx[2];
154  x[3] = x[3] + kparam.base_idx[3];
155  new_x[0] = x[0];
156  new_x[1] = x[1];
157  new_x[2] = x[2];
158  new_x[3] = x[3];
159  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
160  int new_sid=(new_mem_idx >> 1);
161  oddBit = _oddBit ^ oddness_change;
162 
163 #else
164  int X = 2*sid + x1odd;
165  new_x[0] = x[0];
166  new_x[1] = x[1];
167  new_x[2] = x[2];
168  new_x[3] = x[3];
169  new_mem_idx = X;
170  int new_sid = sid;
171 #endif
172 
173  if(mu_positive){
174  mymu = mu;
175  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, new_mem_idx, new_mem_idx);
176  }else{
177  mymu = OPP_DIR(mu);
178  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), new_mem_idx, new_mem_idx);
179  }
180  point_d = (new_mem_idx >> 1);
181  if (mu_positive){
182  ad_link_nbr_idx = point_d;
183  COMPUTE_LINK_SIGN(&ad_link_sign, mymu, new_x);
184  }else{
185  ad_link_nbr_idx = new_sid;
186  COMPUTE_LINK_SIGN(&ad_link_sign, mymu, x);
187  }
188 
189  int mysig;
190  if(sig_positive){
191  mysig = sig;
192  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
193  }else{
194  mysig = OPP_DIR(sig);
195  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
196  }
197  point_c = (new_mem_idx >> 1);
198  if (mu_positive){
199  bc_link_nbr_idx = point_c;
200  COMPUTE_LINK_SIGN(&bc_link_sign, mymu, new_x);
201  }
202 
203 #ifdef MULTI_GPU
204  new_x[0] = x[0];
205  new_x[1] = x[1];
206  new_x[2] = x[2];
207  new_x[3] = x[3];
208  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
209 #else
210  new_x[0] = x[0];
211  new_x[1] = x[1];
212  new_x[2] = x[2];
213  new_x[3] = x[3];
214  new_mem_idx = X;
215 #endif
216 
217  if(sig_positive){
218  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
219  }else{
220  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
221  }
222  point_b = (new_mem_idx >> 1);
223 
224  if (!mu_positive){
225  bc_link_nbr_idx = point_b;
226  COMPUTE_LINK_SIGN(&bc_link_sign, mymu, new_x);
227  }
228 
229  if(sig_positive){
230  ab_link_nbr_idx = new_sid;
231  COMPUTE_LINK_SIGN(&ab_link_sign, mysig, x);
232  }else{
233  ab_link_nbr_idx = point_b;
234  COMPUTE_LINK_SIGN(&ab_link_sign, mysig, new_x);
235  }
236  // now we have ab_link_nbr_idx
237 
238 
239  // load the link variable connecting a and b
240  // Store in ab_link
241  if(sig_positive){
242  HISQ_LOAD_LINK(linkEven, linkOdd, mysig, ab_link_nbr_idx, ab_link, oddBit);
243  }else{
244  HISQ_LOAD_LINK(linkEven, linkOdd, mysig, ab_link_nbr_idx, ab_link, 1-oddBit);
245  }
246  RECONSTRUCT_SITE_LINK(ab_link, ab_link_sign)
247 
248  // load the link variable connecting b and c
249  // Store in bc_link
250  if(mu_positive){
251  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, bc_link_nbr_idx, bc_link, oddBit);
252  }else{
253  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, bc_link_nbr_idx, bc_link, 1-oddBit);
254  }
255  RECONSTRUCT_SITE_LINK(bc_link, bc_link_sign)
256 
257  if(QprevOdd == NULL){
258  if(sig_positive){
259  loadMatrixFromField(oprodEven, oprodOdd, sig, point_d, COLOR_MAT_Y, 1-oddBit, hf.color_matrix_stride);
260  }else{
261  loadMatrixFromField(oprodEven, oprodOdd, OPP_DIR(sig), point_c, COLOR_MAT_Y, oddBit, hf.color_matrix_stride);
262  adjointMatrix(COLOR_MAT_Y);
263  }
264  }else{ // QprevOdd != NULL
265  loadMatrixFromField(oprodEven, oprodOdd, point_c, COLOR_MAT_Y, oddBit, hf.color_matrix_stride);
266  }
267 
268 
269  MATRIX_PRODUCT(bc_link, COLOR_MAT_Y, !mu_positive, COLOR_MAT_W);
270  if(PmuOdd){
271  storeMatrixToField(COLOR_MAT_W, point_b, PmuEven, PmuOdd, 1-oddBit);
272  }
273  MATRIX_PRODUCT(ab_link, COLOR_MAT_W, sig_positive,COLOR_MAT_Y);
274  storeMatrixToField(COLOR_MAT_Y, new_sid, P3Even, P3Odd, oddBit);
275 
276 
277  if(mu_positive){
278  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, 1-oddBit);
279  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign)
280  }else{
281  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, oddBit);
282  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign)
283  adjointMatrix(ad_link);
284 
285  }
286 
287 
288  if(QprevOdd == NULL){
289  if(sig_positive){
290  MAT_MUL_MAT(COLOR_MAT_W, ad_link, COLOR_MAT_Y);
291  }
292 
293  if(QmuEven){
294  ASSIGN_MAT(ad_link, COLOR_MAT_X);
295  storeMatrixToField(COLOR_MAT_X, new_sid, QmuEven, QmuOdd, oddBit);
296  }
297  }else{
298  if(QmuEven || sig_positive){
299  loadMatrixFromField(QprevEven, QprevOdd, point_d, COLOR_MAT_Y, 1-oddBit, hf.color_matrix_stride);
300  MAT_MUL_MAT(COLOR_MAT_Y, ad_link, COLOR_MAT_X);
301  }
302  if(QmuEven){
303  storeMatrixToField(COLOR_MAT_X, new_sid, QmuEven, QmuOdd, oddBit);
304  }
305  if(sig_positive){
306  MAT_MUL_MAT(COLOR_MAT_W, COLOR_MAT_X, COLOR_MAT_Y);
307  }
308  }
309 
310  if(sig_positive){
311  addMatrixToNewOprod(COLOR_MAT_Y, sig, new_sid, coeff, newOprodEven, newOprodOdd, oddBit);
312  }
313 
314 #endif
315  return;
316 }
317 
318 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
319  __global__ void
320  HISQ_KERNEL_NAME(do_lepage_middle_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
321  const RealA* const QprevEven, const RealA* const QprevOdd,
322  const RealB* const linkEven, const RealB* const linkOdd,
323  int sig, int mu,
325  RealA* const P3Even, RealA* const P3Odd,
326  RealA* const newOprodEven, RealA* const newOprodOdd,
328 {
329 
330 #ifdef KERNEL_ENABLED
331  int oddBit = _oddBit;
332  int sid = blockIdx.x * blockDim.x + threadIdx.x;
333  if(sid >= kparam.threads) return;
334  int x[4];
335  int z1 = sid/D1h;
336  int x1h = sid - z1*D1h;
337  int z2 = z1/D2;
338  x[1] = z1 - z2*D2;
339  x[3] = z2/D3;
340  x[2] = z2 - x[3]*D3;
341  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
342  x[0] = 2*x1h + x1odd;
343 
344  int new_x[4];
345  int new_mem_idx;
346 #if(RECON == 12)
347  int ad_link_sign;
348  int ab_link_sign;
349  int bc_link_sign;
350 #endif
351 
352  RealA ab_link[ArrayLength<RealA>::result];
353  RealA bc_link[ArrayLength<RealA>::result];
354  RealA ad_link[ArrayLength<RealA>::result];
355 
356  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
357  RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
358  RealA COLOR_MAT_X[ArrayLength<RealA>::result];
359 
360  /* A________B
361  * mu | |
362  * D| |C
363  *
364  * A is the current point (sid)
365  *
366  */
367 
368  int point_b, point_c, point_d;
370  int mymu;
371 #ifdef MULTI_GPU
372  int E[4]= {E1,E2,E3,E4};
373  x[0] = x[0] + kparam.base_idx[0];
374  x[1] = x[1] + kparam.base_idx[1];
375  x[2] = x[2] + kparam.base_idx[2];
376  x[3] = x[3] + kparam.base_idx[3];
377  new_x[0] = x[0];
378  new_x[1] = x[1];
379  new_x[2] = x[2];
380  new_x[3] = x[3];
381  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
382  int new_sid=(new_mem_idx >> 1);
383  oddBit = _oddBit ^ oddness_change;
384 #else
385  int X = 2*sid + x1odd;
386  new_x[0] = x[0];
387  new_x[1] = x[1];
388  new_x[2] = x[2];
389  new_x[3] = x[3];
390  new_mem_idx = X;
391  int new_sid = sid;
392 #endif
393  if(mu_positive){
394  mymu = mu;
395  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, new_mem_idx, new_mem_idx);
396  }else{
397  mymu = OPP_DIR(mu);
398  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(OPP_DIR(mu), new_mem_idx, new_mem_idx);
399  }
400  point_d = (new_mem_idx >> 1);
401  if (mu_positive){
402  ad_link_nbr_idx = point_d;
403  COMPUTE_LINK_SIGN(&ad_link_sign, mymu, new_x);
404  }else{
405  ad_link_nbr_idx = new_sid;
406  COMPUTE_LINK_SIGN(&ad_link_sign, mymu, x);
407  }
408 
409  int mysig;
410  if(sig_positive){
411  mysig = sig;
412  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
413  }else{
414  mysig = OPP_DIR(sig);
415  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
416  }
417  point_c = (new_mem_idx >> 1);
418  if (mu_positive){
419  bc_link_nbr_idx = point_c;
420  COMPUTE_LINK_SIGN(&bc_link_sign, mymu, new_x);
421  }
422 
423 #ifdef MULTI_GPU
424  new_x[0] = x[0];
425  new_x[1] = x[1];
426  new_x[2] = x[2];
427  new_x[3] = x[3];
428  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
429 #else
430  new_x[0] = x[0];
431  new_x[1] = x[1];
432  new_x[2] = x[2];
433  new_x[3] = x[3];
434  new_mem_idx = X;
435 #endif
436 
437 
438  if(sig_positive){
439  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
440  }else{
441  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
442  }
443  point_b = (new_mem_idx >> 1);
444 
445  if (!mu_positive){
446  bc_link_nbr_idx = point_b;
447  COMPUTE_LINK_SIGN(&bc_link_sign, mymu, new_x);
448  }
449 
450  if(sig_positive){
451  ab_link_nbr_idx = new_sid;
452  COMPUTE_LINK_SIGN(&ab_link_sign, mysig, x);
453  }else{
454  ab_link_nbr_idx = point_b;
455  COMPUTE_LINK_SIGN(&ab_link_sign, mysig, new_x);
456  }
457  // now we have ab_link_nbr_idx
458 
459 
460  // load the link variable connecting a and b
461  // Store in ab_link
462  if(sig_positive){
463  HISQ_LOAD_LINK(linkEven, linkOdd, mysig, ab_link_nbr_idx, ab_link, oddBit);
464  }else{
465  HISQ_LOAD_LINK(linkEven, linkOdd, mysig, ab_link_nbr_idx, ab_link, 1-oddBit);
466  }
467  RECONSTRUCT_SITE_LINK(ab_link, ab_link_sign)
468 
469  // load the link variable connecting b and c
470  // Store in bc_link
471  if(mu_positive){
472  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, bc_link_nbr_idx, bc_link, oddBit);
473  }else{
474  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, bc_link_nbr_idx, bc_link, 1-oddBit);
475  }
476  RECONSTRUCT_SITE_LINK(bc_link, bc_link_sign)
477 
478  loadMatrixFromField(oprodEven, oprodOdd, point_c, COLOR_MAT_Y, oddBit, hf.color_matrix_stride);
479  MATRIX_PRODUCT(bc_link, COLOR_MAT_Y, !mu_positive, COLOR_MAT_W);
480  MATRIX_PRODUCT(ab_link, COLOR_MAT_W, sig_positive,COLOR_MAT_Y);
481  storeMatrixToField(COLOR_MAT_Y, new_sid, P3Even, P3Odd, oddBit);
482 
483  if(mu_positive){
484  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, 1-oddBit);
485  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign)
486  }else{
487  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, oddBit);
488  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign)
489  adjointMatrix(ad_link);
490  }
491 
492  if(sig_positive){
493  loadMatrixFromField(QprevEven, QprevOdd, point_d, COLOR_MAT_Y, 1-oddBit, hf.color_matrix_stride);
494  MAT_MUL_MAT(COLOR_MAT_Y, ad_link, COLOR_MAT_X);
495  MAT_MUL_MAT(COLOR_MAT_W, COLOR_MAT_X, COLOR_MAT_Y);
496  addMatrixToNewOprod(COLOR_MAT_Y, sig, new_sid, coeff, newOprodEven, newOprodOdd, oddBit);
497  }
498 
499 #endif
500  return;
501 }
502 
503 
504 
505 /***********************************do_side_link_kernel***************************
506  *
507  * In general we need
508  * READ
509  * 1 LINK: ad_link
510  * 4 COLOR MATRIX: shortP_at_D, newOprod, P3_at_A, Qprod_at_D,
511  * WRITE
512  * 2 COLOR MATRIX: shortP_at_D, newOprod,
513  *
514  * Two call variations:
515  * 1. full read/write
516  * 2. when shortP == NULL && Qprod == NULL:
517  * no need to read ad_link/shortP_at_D or write shortP_at_D
518  * Qprod_at_D does not exit and is not read in
519  *
520  *
521  * Therefore the data traffic, in two-number pair (num_of_links, num_of_color_matrix)
522  * Call 1: (called 192 times)
523  * (1, 6)
524  *
525  * Call 2: (called 48 times)
526  * (0, 3)
527  *
528  * note: newOprod can be at point D or A, depending on if mu is postive or negative
529  *
530  * Flop count, in two-number pair (matrix_multi, matrix_add)
531  * call 1: (2, 2)
532  * call 2: (0, 1)
533  *
534  *********************************************************************************/
535 
536 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
537  __global__ void
538  HISQ_KERNEL_NAME(do_side_link, EXT)(const RealA* const P3Even, const RealA* const P3Odd,
539  const RealA* const QprodEven, const RealA* const QprodOdd,
540  const RealB* const linkEven, const RealB* const linkOdd,
541  int sig, int mu,
542  typename RealTypeId<RealA>::Type coeff,
544  RealA* const shortPEven, RealA* const shortPOdd,
545  RealA* const newOprodEven, RealA* const newOprodOdd,
547 {
548 #ifdef KERNEL_ENABLED
549  int oddBit = _oddBit;
550  int sid = blockIdx.x * blockDim.x + threadIdx.x;
551  if(sid >= kparam.threads) return;
552 
553  int x[4];
554  int z1 = sid/D1h;
555  int x1h = sid - z1*D1h;
556  int z2 = z1/D2;
557  x[1] = z1 - z2*D2;
558  x[3] = z2/D3;
559  x[2] = z2 - x[3]*D3;
560  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
561  x[0] = 2*x1h + x1odd;
562 
563 #if(RECON == 12)
564  int ad_link_sign;
565 #endif
566 
567  int new_mem_idx;
568  int new_x[4];
569 #ifdef MULTI_GPU
570  int E[4]= {E1,E2,E3,E4};
571  x[0] = x[0] + kparam.base_idx[0];
572  x[1] = x[1] + kparam.base_idx[1];
573  x[2] = x[2] + kparam.base_idx[2];
574  x[3] = x[3] + kparam.base_idx[3];
575 
576  new_x[0] = x[0];
577  new_x[1] = x[1];
578  new_x[2] = x[2];
579  new_x[3] = x[3];
580 
581  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
582  int new_sid=(new_mem_idx >> 1);
583  oddBit = _oddBit ^ oddness_change;
584 #else
585  int X = 2*sid + x1odd;
586  new_x[0] = x[0];
587  new_x[1] = x[1];
588  new_x[2] = x[2];
589  new_x[3] = x[3];
590  new_mem_idx = X;
591  int new_sid = sid;
592 #endif
593 
594 
595  RealA ad_link[ArrayLength<RealA>::result];
596 
597  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
598  RealA COLOR_MAT_X[ArrayLength<RealA>::result];
599  RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
600  // The compiler probably knows to reorder so that loads are done early on
601  loadMatrixFromField(P3Even, P3Odd, new_sid, COLOR_MAT_Y, oddBit, hf.color_matrix_stride);
602 
603  /* compute the side link contribution to the momentum
604  *
605  * sig
606  * A________B
607  * | | mu
608  * D | |C
609  *
610  * A is the current point (sid)
611  *
612  */
613 
615  int point_d;
616  int ad_link_nbr_idx;
617  int mymu;
618 
619  if(mu_positive){
620  mymu=mu;
621  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mymu,new_mem_idx, new_mem_idx);
622  }else{
623  mymu = OPP_DIR(mu);
624  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mymu, new_mem_idx, new_mem_idx);
625  }
626  point_d = (new_mem_idx >> 1);
627 
628  if (mu_positive){
629  ad_link_nbr_idx = point_d;
630  COMPUTE_LINK_SIGN(&ad_link_sign, mymu, new_x);
631  }else{
632  ad_link_nbr_idx = new_sid;
633  COMPUTE_LINK_SIGN(&ad_link_sign, mymu, x);
634  }
635 
636 
637  if(mu_positive){
638  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, 1-oddBit);
639  }else{
640  HISQ_LOAD_LINK(linkEven, linkOdd, mymu, ad_link_nbr_idx, ad_link, oddBit);
641  }
642  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign);
643 
644  MATRIX_PRODUCT(ad_link, COLOR_MAT_Y, mu_positive, COLOR_MAT_W);
645  addMatrixToField(COLOR_MAT_W, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit);
646  mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;
647 
648  loadMatrixFromField(QprodEven, QprodOdd, point_d, COLOR_MAT_X, 1-oddBit, hf.color_matrix_stride);
649  if(mu_positive){
650  MAT_MUL_MAT(COLOR_MAT_Y, COLOR_MAT_X, COLOR_MAT_W);
651  if(!oddBit){ mycoeff = -mycoeff; }
652  addMatrixToNewOprod(COLOR_MAT_W, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
653  }else{
654  ADJ_MAT_MUL_ADJ_MAT(COLOR_MAT_X, COLOR_MAT_Y, COLOR_MAT_W);
655  if(oddBit){ mycoeff = -mycoeff; }
656  addMatrixToNewOprod(COLOR_MAT_W, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit);
657  }
658 #endif
659  return;
660 }
661 
662 
663 
664 
665 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
666  __global__ void
667  HISQ_KERNEL_NAME(do_side_link_short, EXT)(const RealA* const P3Even, const RealA* const P3Odd,
668  const RealB* const linkEven, const RealB* const linkOdd,
669  int sig, int mu,
670  typename RealTypeId<RealA>::Type coeff,
671  RealA* const newOprodEven, RealA* const newOprodOdd,
673 {
674 #ifdef KERNEL_ENABLED
675  int oddBit = _oddBit;
676  int sid = blockIdx.x * blockDim.x + threadIdx.x;
677  if(sid >= kparam.threads) return;
678 
679  int x[4];
680  int z1 = sid/D1h;
681  int x1h = sid - z1*D1h;
682  int z2 = z1/D2;
683  x[1] = z1 - z2*D2;
684  x[3] = z2/D3;
685  x[2] = z2 - x[3]*D3;
686  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
687  x[0] = 2*x1h + x1odd;
688 
689  int new_mem_idx;
690  int new_x[4];
691 #ifdef MULTI_GPU
692  int E[4]= {E1,E2,E3,E4};
693  x[0] = x[0] + kparam.base_idx[0];
694  x[1] = x[1] + kparam.base_idx[1];
695  x[2] = x[2] + kparam.base_idx[2];
696  x[3] = x[3] + kparam.base_idx[3];
697 
698  new_x[0] = x[0];
699  new_x[1] = x[1];
700  new_x[2] = x[2];
701  new_x[3] = x[3];
702 
703  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
704  int new_sid=(new_mem_idx >> 1);
705  oddBit = _oddBit ^ oddness_change;
706 #else
707  int X = 2*sid + x1odd;
708  new_x[0] = x[0];
709  new_x[1] = x[1];
710  new_x[2] = x[2];
711  new_x[3] = x[3];
712  new_mem_idx = X;
713  int new_sid = sid;
714 #endif
715 
716  /* compute the side link contribution to the momentum
717  *
718  * sig
719  * A________B
720  * | | mu
721  * D | |C
722  *
723  * A is the current point (sid)
724  *
725  */
726 
727  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
728  RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
729  loadMatrixFromField(P3Even, P3Odd, new_sid, COLOR_MAT_Y, oddBit, hf.color_matrix_stride);
730 
732  int point_d;
733  int mymu;
734 
735  if(mu_positive){
736  mymu=mu;
737  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mymu,new_mem_idx, new_mem_idx);
738  }else{
739  mymu = OPP_DIR(mu);
740  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mymu, new_mem_idx, new_mem_idx);
741  }
742  point_d = (new_mem_idx >> 1);
743  mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;
744 
745  if(mu_positive){
746  if(!oddBit){ mycoeff = -mycoeff;} // need to change this to get away from oddBit
747  addMatrixToNewOprod(COLOR_MAT_Y, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
748  }else{
749  if(oddBit){ mycoeff = -mycoeff; }
750  ADJ_MAT(COLOR_MAT_Y, COLOR_MAT_W);
751  addMatrixToNewOprod(COLOR_MAT_W, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit);
752  }
753 #endif // KERNEL_ENABLED
754  return;
755 }
756 
757 
758 
759 
760 
761 
762 /********************************do_all_link_kernel*********************************************
763 *
764 * In this function we need
765 * READ
766 * 3 LINKS: ad_link, ab_link, bc_link
767 * 5 COLOR MATRIX: Qprev_at_D, oprod_at_C, newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D
768 * WRITE:
769 * 3 COLOR MATRIX: newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D,
770 *
771 * If sig is negative, then we don't need to read/write the color matrix newOprod_at_A(sig)
772 *
773 * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)
774 *
775 * if (sig is positive): (3, 8)
776 * else : (3, 6)
777 *
778 * This function is called 384 times, half positive sig, half negative sig
779 *
780 * Flop count, in two-number pair (matrix_multi, matrix_add)
781 * if(sig is positive) (6,3)
782 * else (4,2)
783 *
784 ************************************************************************************************/
785 
786 #define SHORT int
787 
788 template<class RealA, class RealB, SHORT sig_positive, SHORT mu_positive, SHORT _oddBit, int oddness_change>
789  __global__ void
790  HISQ_KERNEL_NAME(do_all_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
791  const RealA* const QprevEven, const RealA* const QprevOdd,
792  const RealB* const linkEven, const RealB* const linkOdd,
793  SHORT sig, SHORT mu,
794  typename RealTypeId<RealA>::Type coeff,
796  RealA* const shortPEven, RealA* const shortPOdd,
797  RealA* const newOprodEven, RealA* const newOprodOdd,
799 {
800 #ifdef KERNEL_ENABLED
801  SHORT oddBit = _oddBit;
802  int sid = blockIdx.x * blockDim.x + threadIdx.x;
803  if(sid >= kparam.threads) return;
804 
805  SHORT x[4];
806  int z1 = sid/D1h;
807  SHORT x1h = sid - z1*D1h;
808  int z2 = z1/D2;
809  x[1] = z1 - z2*D2;
810  x[3] = z2/D3;
811  x[2] = z2 - x[3]*D3;
812  SHORT x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
813  x[0] = 2*x1h + x1odd;
814 
815 #if(RECON == 12)
816  int ad_link_sign;
817  int ab_link_sign;
818  int bc_link_sign;
819 #endif
820 
821  SHORT new_x[4];
822 
823  RealA ab_link[ArrayLength<RealA>::result];
824  RealA bc_link[ArrayLength<RealA>::result];
825  RealA ad_link[ArrayLength<RealA>::result];
826 
827  RealA COLOR_MAT_X[ArrayLength<RealA>::result];
828  RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
829  RealA COLOR_MAT_Z[ArrayLength<RealA>::result];
830  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
831 
832 
833  /* sig
834  * A________B
835  * mu | |
836  * D | |C
837  *
838  * A is the current point (sid)
839  *
840  */
841 
842  int point_b, point_c, point_d;
843  int ab_link_nbr_idx;
844  int new_mem_idx;
845 
846 #ifdef MULTI_GPU
847  x[0] = x[0] + kparam.base_idx[0];
848  x[1] = x[1] + kparam.base_idx[1];
849  x[2] = x[2] + kparam.base_idx[2];
850  x[3] = x[3] + kparam.base_idx[3];
851 
852  int E[4]= {E1,E2,E3,E4};
853  new_x[0] = x[0];
854  new_x[1] = x[1];
855  new_x[2] = x[2];
856  new_x[3] = x[3];
857 
858  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
859  int new_sid=(new_mem_idx >> 1);
860  oddBit = _oddBit ^ oddness_change;
861 #else
862  int X = 2*sid + x1odd;
863  new_x[0] = x[0];
864  new_x[1] = x[1];
865  new_x[2] = x[2];
866  new_x[3] = x[3];
867  new_mem_idx = X;
868  int new_sid = sid;
869 #endif
870 
871  if(sig_positive){
872  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
873  }else{
874  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
875  }
876  point_b = (new_mem_idx >> 1);
877  ab_link_nbr_idx = (sig_positive) ? new_sid : point_b;
878  if(sig_positive){
879  COMPUTE_LINK_SIGN(&ab_link_sign, sig, x);
880  }else{
881  COMPUTE_LINK_SIGN(&ab_link_sign, OPP_DIR(sig), new_x);
882  }
883  if(!mu_positive){
884  COMPUTE_LINK_SIGN(&bc_link_sign, OPP_DIR(mu), new_x);
885  }
886 #ifdef MULTI_GPU
887  new_x[0] = x[0];
888  new_x[1] = x[1];
889  new_x[2] = x[2];
890  new_x[3] = x[3];
891 
892  new_mem_idx = new_x[3]*E3E2E1 + new_x[2]*E2E1 + new_x[1]*E1 + new_x[0];
893 #else
894  new_x[0] = x[0];
895  new_x[1] = x[1];
896  new_x[2] = x[2];
897  new_x[3] = x[3];
898  new_mem_idx = X;
899 #endif
900 
901  const typename RealTypeId<RealA>::Type & mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;
902  if(mu_positive){ //positive mu
903  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(mu, new_mem_idx, new_mem_idx);
904  point_d = (new_mem_idx >> 1);
905 
906  COMPUTE_LINK_SIGN(&ad_link_sign, mu, new_x);
907 
908  if(sig_positive){
909  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
910  }else{
911  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
912  }
913  point_c = (new_mem_idx >> 1);
914 
915  loadMatrixFromField(QprevEven, QprevOdd, point_d, COLOR_MAT_X, 1-oddBit, hf.color_matrix_stride); // COLOR_MAT_X
916  HISQ_LOAD_LINK(linkEven, linkOdd, mu, point_d, ad_link, 1-oddBit);
917  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign)
918 
919  loadMatrixFromField(oprodEven,oprodOdd, point_c, COLOR_MAT_Y, oddBit, hf.color_matrix_stride); // COLOR_MAT_Y
920  HISQ_LOAD_LINK(linkEven, linkOdd, mu, point_c, bc_link, oddBit);
921  COMPUTE_LINK_SIGN(&bc_link_sign, mu, new_x);
922  RECONSTRUCT_SITE_LINK(bc_link, bc_link_sign)
923 
924  MATRIX_PRODUCT(bc_link, COLOR_MAT_Y, 0, COLOR_MAT_Z); // COMPUTE_LINK_X
925 
926 
927  if (sig_positive)
928  {
929  MAT_MUL_MAT(COLOR_MAT_X, ad_link, COLOR_MAT_Y);
930  MAT_MUL_MAT(COLOR_MAT_Z, COLOR_MAT_Y, COLOR_MAT_W);
931  //addMatrixToField(COLOR_MAT_W, sig, sid, Sign<oddBit>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
932  addMatrixToNewOprod(COLOR_MAT_W, sig, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
933  }
934 
935  if (sig_positive){
936  HISQ_LOAD_LINK(linkEven, linkOdd, sig, ab_link_nbr_idx, ab_link, oddBit);
937  }else{
938  HISQ_LOAD_LINK(linkEven, linkOdd, OPP_DIR(sig), ab_link_nbr_idx, ab_link, 1-oddBit);
939  }
940  RECONSTRUCT_SITE_LINK(ab_link, ab_link_sign)
941 
942  MATRIX_PRODUCT(ab_link, COLOR_MAT_Z, sig_positive, COLOR_MAT_Y); // COLOR_MAT_Y is assigned here
943 
944  MAT_MUL_MAT(COLOR_MAT_Y, COLOR_MAT_X, COLOR_MAT_W);
945  //addMatrixToField(COLOR_MAT_W, mu, point_d, -Sign<oddBit>::result*mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
946  addMatrixToNewOprod(COLOR_MAT_W, mu, point_d, -Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, 1-oddBit);
947 
948  MAT_MUL_MAT(ad_link, COLOR_MAT_Y, COLOR_MAT_W);
949  addMatrixToField(COLOR_MAT_W, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit);
950  } else{ //negative mu
951  mu = OPP_DIR(mu);
952  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(mu, new_mem_idx, new_mem_idx);
953  point_d = (new_mem_idx >> 1);
954  COMPUTE_LINK_SIGN(&ad_link_sign, mu, x);
955 
956  if(sig_positive){
957  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
958  }else{
959  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(OPP_DIR(sig), new_mem_idx, new_mem_idx);
960  }
961  point_c = (new_mem_idx >> 1);
962 
963  loadMatrixFromField(QprevEven, QprevOdd, point_d, COLOR_MAT_X, 1-oddBit, hf.color_matrix_stride); // COLOR_MAT_X used!
964 
965  HISQ_LOAD_LINK(linkEven, linkOdd, mu, new_sid, ad_link, oddBit);
966  RECONSTRUCT_SITE_LINK(ad_link, ad_link_sign);
967 
968  loadMatrixFromField(oprodEven, oprodOdd, point_c, COLOR_MAT_Y, oddBit, hf.color_matrix_stride); // COLOR_MAT_Y used
969  HISQ_LOAD_LINK(linkEven, linkOdd, mu, point_b, bc_link, 1-oddBit);
970  RECONSTRUCT_SITE_LINK(bc_link, bc_link_sign); //bc_link_sign is computed earlier in the function
971 
972  if(sig_positive){
973  MAT_MUL_ADJ_MAT(COLOR_MAT_X, ad_link, COLOR_MAT_W);
974  }
975  MAT_MUL_MAT(bc_link, COLOR_MAT_Y, COLOR_MAT_Z);
976  if (sig_positive){
977  MAT_MUL_MAT(COLOR_MAT_Z, COLOR_MAT_W, COLOR_MAT_Y);
978  //addMatrixToField(COLOR_MAT_Y, sig, sid, Sign<oddBit>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
979  addMatrixToNewOprod(COLOR_MAT_Y, sig, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
980  }
981 
982  if (sig_positive){
983  HISQ_LOAD_LINK(linkEven, linkOdd, sig, ab_link_nbr_idx, ab_link, oddBit);
984  }else{
985  HISQ_LOAD_LINK(linkEven, linkOdd, OPP_DIR(sig), ab_link_nbr_idx, ab_link, 1-oddBit);
986  }
987  RECONSTRUCT_SITE_LINK(ab_link, ab_link_sign)
988 
989  MATRIX_PRODUCT(ab_link, COLOR_MAT_Z, sig_positive, COLOR_MAT_Y);
990  ADJ_MAT_MUL_ADJ_MAT(COLOR_MAT_X, COLOR_MAT_Y, COLOR_MAT_W);
991  //addMatrixToField(COLOR_MAT_W, mu, sid, Sign<oddBit>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
992  addMatrixToNewOprod(COLOR_MAT_W, mu, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit);
993 
994  MATRIX_PRODUCT(ad_link, COLOR_MAT_Y, 0, COLOR_MAT_W);
995  addMatrixToField(COLOR_MAT_W, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit);
996  }
997 #endif
998  return;
999 }
1000 
1001 
1002 
1003 
1004 
1005 template<class RealA, class RealB, int oddBit>
1006  __global__ void
1007  HISQ_KERNEL_NAME(do_longlink, EXT)(const RealB* const linkEven, const RealB* const linkOdd,
1008  const RealA* const naikOprodEven, const RealA* const naikOprodOdd,
1009  int sig, typename RealTypeId<RealA>::Type coeff,
1010  RealA* const outputEven, RealA* const outputOdd,
1012 {
1013 #ifdef KERNEL_ENABLED
1014  int sid = blockIdx.x * blockDim.x + threadIdx.x;
1015  if (sid >= kparam.threads) return;
1016 
1017  int x[4];
1018  int z1 = sid/X1h;
1019  int x1h = sid - z1*X1h;
1020  int z2 = z1/X2;
1021  x[1] = z1 - z2*X2;
1022  x[3] = z2/X3;
1023  x[2] = z2 - x[3]*X3;
1024  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
1025  x[0] = 2*x1h + x1odd;
1026 
1027  int new_x[4];
1028 #ifdef MULTI_GPU
1029  int E[4]= {E1,E2,E3,E4};
1030  x[0] = x[0]+2;
1031  x[1] = x[1]+2;
1032  x[2] = x[2]+2;
1033  x[3] = x[3]+2;
1034 
1035  int X= x[3]*E3E2E1 + x[2]*E2E1 + x[1]*E1 + x[0];
1036  int new_sid = (X>> 1);
1037 #else
1038 
1039  int X = 2*sid + x1odd;
1040  int new_sid = sid;
1041 #endif
1042  new_x[0] = x[0];
1043  new_x[1] = x[1];
1044  new_x[2] = x[2];
1045  new_x[3] = x[3];
1046  int new_mem_idx = X;
1047 
1048 
1049  RealA ab_link[ArrayLength<RealA>::result];
1050  RealA bc_link[ArrayLength<RealA>::result];
1051  RealA de_link[ArrayLength<RealA>::result];
1052  RealA ef_link[ArrayLength<RealA>::result];
1053 
1054 #if(RECON == 12)
1055  int ab_link_sign =1;
1056  int bc_link_sign =1;
1057  int de_link_sign =1;
1058  int ef_link_sign =1;
1059 #endif
1060 
1061  RealA COLOR_MAT_U[ArrayLength<RealA>::result];
1062  RealA COLOR_MAT_V[ArrayLength<RealA>::result];
1063  RealA COLOR_MAT_W[ArrayLength<RealA>::result]; // used as a temporary
1064  RealA COLOR_MAT_X[ArrayLength<RealA>::result];
1065  RealA COLOR_MAT_Y[ArrayLength<RealA>::result];
1066  RealA COLOR_MAT_Z[ArrayLength<RealA>::result];
1067 
1068 
1069  const int & point_c = new_sid;
1070  int point_a, point_b, point_d, point_e;
1071 
1072 
1073  /*
1074  *
1075  * A B C D E
1076  * ---- ---- ---- ----
1077  *
1078  * ---> sig direction
1079  *
1080  * C is the current point (sid)
1081  *
1082  */
1083 
1084  // compute the force for forward long links
1085  if(GOES_FORWARDS(sig))
1086  {
1087 
1088  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, X, new_mem_idx);
1089  point_d = (new_mem_idx >> 1);
1090  COMPUTE_LINK_SIGN(&de_link_sign, sig, new_x);
1091 
1092 
1093  FF_COMPUTE_NEW_FULL_IDX_PLUS_UPDATE(sig, new_mem_idx, new_mem_idx);
1094  point_e = (new_mem_idx >> 1);
1095  COMPUTE_LINK_SIGN(&ef_link_sign, sig, new_x);
1096 
1097 
1098  new_x[0] = x[0];
1099  new_x[1] = x[1];
1100  new_x[2] = x[2];
1101  new_x[3] = x[3];
1102 
1103  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(sig, X, new_mem_idx);
1104  point_b = (new_mem_idx >> 1);
1105  COMPUTE_LINK_SIGN(&bc_link_sign, sig, new_x);
1106 
1107 
1108  FF_COMPUTE_NEW_FULL_IDX_MINUS_UPDATE(sig, new_mem_idx, new_mem_idx);
1109  point_a = (new_mem_idx >> 1);
1110  COMPUTE_LINK_SIGN(&ab_link_sign, sig, new_x);
1111 
1112  HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_a, ab_link, oddBit);
1113  HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_b, bc_link, 1-oddBit);
1114  HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_d, de_link, 1-oddBit);
1115  HISQ_LOAD_LINK(linkEven, linkOdd, sig, point_e, ef_link, oddBit);
1116 
1117  RECONSTRUCT_SITE_LINK(ab_link, ab_link_sign);
1118  RECONSTRUCT_SITE_LINK(bc_link, bc_link_sign);
1119  RECONSTRUCT_SITE_LINK(de_link, de_link_sign);
1120  RECONSTRUCT_SITE_LINK(ef_link, ef_link_sign);
1121 
1122  loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_c, COLOR_MAT_Z, oddBit, hf.color_matrix_stride);
1123  loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_b, COLOR_MAT_Y, 1-oddBit, hf.color_matrix_stride);
1124  loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_a, COLOR_MAT_X, oddBit, hf.color_matrix_stride);
1125 
1126  MAT_MUL_MAT(ef_link, COLOR_MAT_Z, COLOR_MAT_W); // link(d)*link(e)*Naik(c)
1127  MAT_MUL_MAT(de_link, COLOR_MAT_W, COLOR_MAT_V);
1128 
1129  MAT_MUL_MAT(de_link, COLOR_MAT_Y, COLOR_MAT_W); // link(d)*Naik(b)*link(b)
1130  MAT_MUL_MAT(COLOR_MAT_W, bc_link, COLOR_MAT_U);
1131  SCALAR_MULT_ADD_MATRIX(COLOR_MAT_V, COLOR_MAT_U, -1, COLOR_MAT_V);
1132 
1133  MAT_MUL_MAT(COLOR_MAT_X, ab_link, COLOR_MAT_W); // Naik(a)*link(a)*link(b)
1134  MAT_MUL_MAT(COLOR_MAT_W, bc_link, COLOR_MAT_U);
1135  SCALAR_MULT_ADD_MATRIX(COLOR_MAT_V, COLOR_MAT_U, 1, COLOR_MAT_V);
1136 
1137  addMatrixToField(COLOR_MAT_V, sig, new_sid, coeff, outputEven, outputOdd, oddBit);
1138  }
1139 #endif
1140  return;
1141 }
1142 
1143 
1144 template<class RealA, class RealB, int oddBit>
1145  __global__ void
1146  HISQ_KERNEL_NAME(do_complete_force, EXT)(const RealB* const linkEven, const RealB* const linkOdd,
1147  const RealA* const oprodEven, const RealA* const oprodOdd,
1148  int sig,
1149  RealA* const forceEven, RealA* const forceOdd,
1150  const int threads)
1152 #ifdef KERNEL_ENABLED
1153  int sid = blockIdx.x * blockDim.x + threadIdx.x;
1154  if (sid >= threads) return;
1155 
1156  int x[4];
1157  int z1 = sid/X1h;
1158  int x1h = sid - z1*X1h;
1159  int z2 = z1/X2;
1160  x[1] = z1 - z2*X2;
1161  x[3] = z2/X3;
1162  x[2] = z2 - x[3]*X3;
1163  int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
1164  x[0] = 2*x1h + x1odd;
1165 
1166 #if(RECON == 12)
1167  int link_sign;
1168 #endif
1169 
1170  int new_sid=sid;
1171 #ifdef MULTI_GPU
1172 
1173  x[0] = x[0]+2;
1174  x[1] = x[1]+2;
1175  x[2] = x[2]+2;
1176  x[3] = x[3]+2;
1177  new_sid = ( x[3]*E3E2E1 + x[2]*E2E1+x[1]*E1 + x[0])>>1;
1178 #endif
1179 
1180  RealA LINK_W[ArrayLength<RealA>::result];
1181  RealA COLOR_MAT_W[ArrayLength<RealA>::result];
1182  RealA COLOR_MAT_X[ArrayLength<RealA>::result];
1183 
1184 
1185  HISQ_LOAD_LINK(linkEven, linkOdd, sig, new_sid, LINK_W, oddBit);
1186  COMPUTE_LINK_SIGN(&link_sign, sig, x);
1187  RECONSTRUCT_SITE_LINK(LINK_W, link_sign);
1188 
1189  loadMatrixFromField(oprodEven, oprodOdd, sig, new_sid, COLOR_MAT_X, oddBit, hf.color_matrix_stride);
1190 
1191  typename RealTypeId<RealA>::Type coeff = (oddBit==1) ? -1 : 1;
1192  MAT_MUL_MAT(LINK_W, COLOR_MAT_X, COLOR_MAT_W);
1193 
1194  storeMatrixToMomentumField(COLOR_MAT_W, sig, sid, coeff, forceEven, forceOdd, oddBit);
1195 #endif
1196  return;
1197 }
1198 
1199 #undef EXT
1200 #undef KERNEL_ENABLED
1201 #undef D1
1202 #undef D2
1203 #undef D3
1204 #undef D4
1205 #undef D1h
1206 #undef xcomm
1207 #undef ycomm
1208 #undef zcomm
1209 #undef tcomm