QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_paths_force_core.h
Go to the documentation of this file.
1 
2 //macro KERNEL_ENABLED is used to control compile time, debug purpose only
3 #if (PRECISION == 0 && RECON == 18)
4 #define EXT _dp_18_
5 #elif (PRECISION == 0 && RECON == 12)
6 #define EXT _dp_12_
7 #elif (PRECISION == 1 && RECON == 18)
8 #define EXT _sp_18_
9 #else
10 #define EXT _sp_12_
11 #endif
12 
13 
14 #define print_matrix(mul) \
15 printf(" (%f %f) (%f %f) (%f %f)\n", mul##00_re, mul##00_im, mul##01_re, mul##01_im, mul##02_re, mul##02_im); \
16 printf(" (%f %f) (%f %f) (%f %f)\n", mul##10_re, mul##10_im, mul##11_re, mul##11_im, mul##12_re, mul##12_im); \
17 printf(" (%f %f) (%f %f) (%f %f)\n", mul##20_re, mul##20_im, mul##21_re, mul##21_im, mul##22_re, mul##22_im);
18 
19 
20 /**************************do_middle_link_kernel*****************************
21  *
22  *
23  * Generally we need
24  * READ
25  * 3 LINKS: ab_link, bc_link, ad_link
26  * 3 COLOR MATRIX: newOprod_at_A, oprod_at_C, Qprod_at_D
27  * WRITE
28  * 4 COLOR MATRIX: newOprod_at_A, P3_at_A, Pmu_at_B, Qmu_at_A
29  *
30  * Three call variations:
31  * 1. when Qprev == NULL: Qprod_at_D does not exist and is not read in
32  * 2. full read/write
33  * 3. when Pmu/Qmu == NULL, Pmu_at_B and Qmu_at_A are not written out
34  *
35  * In all three above case, if the direction sig is negative, newOprod_at_A is
36  * not read in or written out.
37  *
38  * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)
39  * Call 1: (called 48 times, half positive sig, half negative sig)
40  * if (sig is positive): (3, 6)
41  * else : (3, 4)
42  * Call 2: (called 192 time, half positive sig, half negative sig)
43  * if (sig is positive): (3, 7)
44  * else : (3, 5)
45  * Call 3: (called 48 times, half positive sig, half negative sig)
46  * if (sig is positive): (3, 5)
47  * else : (3, 2) no need to loadQprod_at_D in this case
48  *
49  * note: oprod_at_C could actually be read in from D when it is the fresh outer product
50  * and we call it oprod_at_C to simply naming. This does not affect our data traffic analysis
51  *
52  * Flop count, in two-number pair (matrix_multi, matrix_add)
53  * call 1: if (sig is positive) (3, 1)
54  * else (2, 0)
55  * call 2: if (sig is positive) (4, 1)
56  * else (3, 0)
57  * call 3: if (sig is positive) (4, 1)
58  * else (2, 0)
59  *
60  ****************************************************************************/
61 // call 1: if (sig is positive) 612 Flops per site
62 // else 396 Flops per site
63 //
64 // call 2: if (sig is positive) 810 Flops per site
65 // else 594 Flops per site
66 //
67 // call 3: if (sig is positive) 810 Flops per site
68 // else 396 Flops per site
69 //
70 // call 1: 24 times with +ve sig and 24 times with -ve sig
71 // 24192 Flops per site for the full 48 calls
72 //
73 // call 2: 96 times with +ve sig and 96 times with -ve sig
74 // 134784 Flops per site in total
75 //
76 // call 3 (Lepage)
77 // : 24 times with +ve sig and 24 times with -ve sig
78 // 28944 Flops per site in total
79 //
80 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
81  __global__ void
82  HISQ_KERNEL_NAME(do_middle_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
83  const RealA* const QprevEven, const RealA* const QprevOdd,
84  const RealB* const linkEven, const RealB* const linkOdd,
85  int sig, int mu,
86  typename RealTypeId<RealA>::Type coeff,
87  RealA* const PmuEven, RealA* const PmuOdd,
88  RealA* const P3Even, RealA* const P3Odd,
89  RealA* const QmuEven, RealA* const QmuOdd,
90  RealA* const newOprodEven, RealA* const newOprodOdd,
91  hisq_kernel_param_t kparam)
92 {
93 
94 
95  int oddBit = _oddBit;
96  int sid = blockIdx.x * blockDim.x + threadIdx.x;
97  if(sid >= kparam.threads) return;
98  int dx[4] = {0,0,0,0};
99  int x[4];
100 
101  getCoords(x, sid, kparam.D, oddBit);
102 
103 
106 
107 
108  /* A________B
109  * mu | |
110  * D| |C
111  *
112  * A is the current point (sid)
113  *
114  */
115 
118  int mymu = posDir(mu);
119 
120 
121 
122 #ifdef MULTI_GPU
123  int E[4]= {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
124 
125  x[0] = x[0] + kparam.base_idx[0];
126  x[1] = x[1] + kparam.base_idx[1];
127  x[2] = x[2] + kparam.base_idx[2];
128  x[3] = x[3] + kparam.base_idx[3];
129  int new_sid = linkIndex(x,dx,E);
130  oddBit = _oddBit ^ oddness_change;
131 
132 #else
133  int E[4] = {kparam.X[0], kparam.X[1], kparam.X[2], kparam.X[3]};
134  int new_sid = sid;
135 #endif
136 
137  int y[4] = {x[0], x[1], x[2], x[3]};
138 
139  mymu = posDir(mu);
140 
141  updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
142 
143  point_d = linkIndex(y, dx, E);
144 
145  if (mu_positive){
146  ad_link_nbr_idx = point_d;
147  }else{
148  ad_link_nbr_idx = new_sid;
149  }
150 
151  int mysig = posDir(sig);
152  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
153  point_c = linkIndex(y, dx, E);
154 
155  if (mu_positive){
156  bc_link_nbr_idx = point_c;
157  }
158 
159 
160  for(int dir=0; dir<4; ++dir) y[dir] = x[dir];
161  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
162  point_b = linkIndex(y, dx, E);
163 
164  if (!mu_positive){
165  bc_link_nbr_idx = point_b;
166  }
167 
168  if(sig_positive){
169  ab_link_nbr_idx = new_sid;
170  }else{
171  ab_link_nbr_idx = point_b;
172  }
173  // now we have ab_link_nbr_idx
174 
175 
176  // load the link variable connecting a and b
177  // Store in ab_link
178  //loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);
179  loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);
180 
181 
182  // load the link variable connecting b and c
183  // Store in bc_link
184  loadLink<18>(linkEven, linkOdd, mymu, bc_link_nbr_idx, Ubc.data, mu_positive^(1-oddBit), kparam.thin_link_stride);
185 
186  if(QprevOdd == NULL){
187  loadMatrixFromField(oprodEven, oprodOdd, posDir(sig), (sig_positive ? point_d : point_c), Oy.data, sig_positive^oddBit, kparam.color_matrix_stride);
188  if(!sig_positive) Oy = conj(Oy);
189  }else{ // QprevOdd != NULL
190  loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride);
191  }
192 
193 
194  if(!mu_positive){
195  Ow = Ubc*Oy;
196  }else{
197  Ow = conj(Ubc)*Oy;
198  }
199 
200  if(PmuOdd){
201  storeMatrixToField(Ow.data, point_b, PmuEven, PmuOdd, 1-oddBit, kparam.color_matrix_stride);
202  }
203  if(sig_positive){
204  Oy = Uab*Ow;
205  }else{
206  Oy = conj(Uab)*Ow;
207  }
208 
209  storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride);
210 
211  loadLink<18>(linkEven, linkOdd, mymu, ad_link_nbr_idx, Uad.data, mu_positive^oddBit, kparam.thin_link_stride);
212  if(!mu_positive) Uad = conj(Uad);
213 
214 
215  if(QprevOdd == NULL){
216  if(sig_positive){
217  Oy = Ow*Uad;
218  }
219 
220  if(QmuEven){
221  Ox = Uad;
222  storeMatrixToField(Ox.data, new_sid, QmuEven, QmuOdd, oddBit, kparam.color_matrix_stride);
223  }
224  }else{
225  if(QmuEven || sig_positive){
226  loadMatrixFromField(QprevEven, QprevOdd, point_d, Oy.data, 1-oddBit, kparam.color_matrix_stride);
227  Ox = Oy*Uad;
228  }
229  if(QmuEven){
230  storeMatrixToField(Ox.data, new_sid, QmuEven, QmuOdd, oddBit, kparam.color_matrix_stride);
231  }
232  if(sig_positive){
233  Oy = Ow*Ox;
234  }
235  }
236 
237  if(sig_positive){
238  addMatrixToNewOprod(Oy.data, sig, new_sid, coeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
239  }
240 
241  return;
242 }
243 
244 // Flop count, in two-number pair (matrix_multi, matrix_add)
245 // if (sig is positive) (4, 1)
246 // else (2, 0)
247 // if(sig is positive) 810 flops per lattice site
248 // else 396 flops per lattice site
249 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
250  __global__ void
251 HISQ_KERNEL_NAME(do_lepage_middle_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
252  const RealA* const QprevEven, const RealA* const QprevOdd,
253  const RealB* const linkEven, const RealB* const linkOdd,
254  int sig, int mu,
255  typename RealTypeId<RealA>::Type coeff,
256  RealA* const P3Even, RealA* const P3Odd,
257  RealA* const newOprodEven, RealA* const newOprodOdd,
258  hisq_kernel_param_t kparam)
259 {
260 
261  int sid = blockIdx.x * blockDim.x + threadIdx.x;
262  if(sid >= kparam.threads) return;
263  int oddBit = _oddBit;
264 
265 
268 
269 
270 
271 
272  /* A________B
273  * mu | |
274  * D| |C
275  *
276  * A is the current point (sid)
277  *
278  */
279 
280  int point_b, point_c, point_d;
282  int mymu;
283 
284  int x[4];
285  int dx[4] = {0,0,0,0};
286  getCoords(x, sid, kparam.D, oddBit);
287 
288 #ifdef MULTI_GPU
289  int E[4]= {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
290  x[0] = x[0] + kparam.base_idx[0];
291  x[1] = x[1] + kparam.base_idx[1];
292  x[2] = x[2] + kparam.base_idx[2];
293  x[3] = x[3] + kparam.base_idx[3];
294  int new_sid = linkIndex(x,dx,E);
295  oddBit = _oddBit ^ oddness_change;
296 #else
297  int E[4]= {kparam.X[0], kparam.X[1], kparam.X[2], kparam.X[3]};
298  int new_sid = sid;
299 #endif
300 
301 
302  mymu = posDir(mu);
303  int y[4] = {x[0], x[1], x[2], x[3]};
304  updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
305  point_d = linkIndex(y, dx, E);
306 
307 
308 
309  if (mu_positive){
310  ad_link_nbr_idx = point_d;
311  }else{
312  ad_link_nbr_idx = new_sid;
313  }
314 
315  int mysig = posDir(sig);
316  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
317  point_c = linkIndex(y, dx, E);
318 
319 
320 
321  if (mu_positive){
322  bc_link_nbr_idx = point_c;
323  }
324 
325 
326  for(int dir=0; dir<4; ++dir) y[dir] = x[dir];
327  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
328  point_b = linkIndex(y, dx, E);
329 
330  if (!mu_positive){
331  bc_link_nbr_idx = point_b;
332  }
333 
334  if(sig_positive){
335  ab_link_nbr_idx = new_sid;
336  }else{
337  ab_link_nbr_idx = point_b;
338  }
339  // now we have ab_link_nbr_idx
340  //
341  //
342  // load the link variable connecting a and b
343  // Store in ab_link
344  loadLink<18>(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);
345 
346  // load the link variable connecting b and c
347  // Store in bc_link
348  loadLink<18>(linkEven, linkOdd, mymu, bc_link_nbr_idx, Ubc.data, mu_positive^(1-oddBit), kparam.thin_link_stride);
349 
350 
351  loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride);
352 
353  if(!mu_positive){
354  Ow = Ubc*Oy;
355  }else{
356  Ow = conj(Ubc)*Oy;
357  }
358 
359  if(sig_positive){
360  Oy = Uab*Ow;
361  }else{
362  Oy = conj(Uab)*Ow;
363  }
364 
365 
366  storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride);
367  if(sig_positive){
368  loadLink<18>(linkEven, linkOdd, mymu, ad_link_nbr_idx, Uad.data, mu_positive^oddBit, kparam.thin_link_stride);
369  if(!mu_positive) Uad = conj(Uad);
370 
371  loadMatrixFromField(QprevEven, QprevOdd, point_d, Oy.data, 1-oddBit, kparam.color_matrix_stride);
372 
373  Ox = Oy*Uad;
374  Oy = Ow*Ox;
375 
376  addMatrixToNewOprod(Oy.data, sig, new_sid, coeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
377  }
378 
379 //#endif
380  return;
381 }
382 
383 
384 
385 
386 
387 
388 
389 /***********************************do_side_link_kernel***************************
390  *
391  * In general we need
392  * READ
393  * 1 LINK: ad_link
394  * 4 COLOR MATRIX: shortP_at_D, newOprod, P3_at_A, Qprod_at_D,
395  * WRITE
396  * 2 COLOR MATRIX: shortP_at_D, newOprod,
397  *
398  * Two call variations:
399  * 1. full read/write
400  * 2. when shortP == NULL && Qprod == NULL:
401  * no need to read ad_link/shortP_at_D or write shortP_at_D
402  * Qprod_at_D does not exit and is not read in
403  *
404  *
405  * Therefore the data traffic, in two-number pair (num_of_links, num_of_color_matrix)
406  * Call 1: (called 192 times)
407  * (1, 6)
408  *
409  * Call 2: (called 48 times)
410  * (0, 3)
411  *
412  * note: newOprod can be at point D or A, depending on if mu is postive or negative
413  *
414  * Flop count, in two-number pair (matrix_multi, matrix_add)
415  * call 1: (2, 2)
416  * call 2: (0, 1)
417  *
418  *********************************************************************************/
419 
420 // Flop count, in two-number pair (matrix_mult, matrix_add)
421 // (2,2)
422 // call 1: 432 Flops per site
423 // call 2 (short)
424 // : 18 Flops per site
425 //
426 // call 1: 240 calls
427 // call 2: 48 calls
428 //
429 // Aggregate Flops:
430 // call 1: 103680
431 // call 2: 864
432 
433 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
434  __global__ void
435 HISQ_KERNEL_NAME(do_side_link, EXT)(const RealA* const P3Even, const RealA* const P3Odd,
436  const RealA* const QprodEven, const RealA* const QprodOdd,
437  const RealB* const linkEven, const RealB* const linkOdd,
438  int sig, int mu,
439  typename RealTypeId<RealA>::Type coeff,
440  typename RealTypeId<RealA>::Type accumu_coeff,
441  RealA* const shortPEven, RealA* const shortPOdd,
442  RealA* const newOprodEven, RealA* const newOprodOdd,
443  hisq_kernel_param_t kparam)
444 {
445  int oddBit = _oddBit;
446  int sid = blockIdx.x * blockDim.x + threadIdx.x;
447  if(sid >= kparam.threads) return;
448 
449 
450  int x[4];
451  int dx[4] = {0,0,0,0};
452  getCoords(x, sid, kparam.D, oddBit);
453 
454 
455 
456 
457 
458 #ifdef MULTI_GPU
459  int E[4]= {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
460  x[0] = x[0] + kparam.base_idx[0];
461  x[1] = x[1] + kparam.base_idx[1];
462  x[2] = x[2] + kparam.base_idx[2];
463  x[3] = x[3] + kparam.base_idx[3];
464  int new_sid = linkIndex(x,dx,E);
465  oddBit = _oddBit ^ oddness_change;
466 #else
467  int E[4]= {kparam.X[0], kparam.X[1], kparam.X[2], kparam.X[3]};
468  int new_sid = sid;
469 #endif
470 
471 
472 
475 
476 
477 
478 
479 
480  loadMatrixFromField(P3Even, P3Odd, new_sid, Oy.data, oddBit, kparam.color_matrix_stride);
481 
482  /* compute the side link contribution to the momentum
483  *
484  * sig
485  * A________B
486  * | | mu
487  * D | |C
488  *
489  * A is the current point (sid)
490  *
491  */
492 
493 
494  int y[4] = {x[0], x[1], x[2], x[3]};
495 
496  typename RealTypeId<RealA>::Type mycoeff;
497  int point_d;
498  int ad_link_nbr_idx;
499  int mymu = posDir(mu);
500  updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
501  point_d = linkIndex(y,dx,E);
502 
503 
504 
505  if (mu_positive){
506  ad_link_nbr_idx = point_d;
507  }else{
508  ad_link_nbr_idx = new_sid;
509  }
510 
511  loadLink<18>(linkEven, linkOdd, mymu, ad_link_nbr_idx, Uad.data, mu_positive^oddBit, kparam.thin_link_stride);
512 
513 
514  if(mu_positive){
515  Ow = Uad*Oy;
516  }else{
517  Ow = conj(Uad)*Oy;
518  }
519 
520 
521 
522  addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride);
523  mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;
524 
525  loadMatrixFromField(QprodEven, QprodOdd, point_d, Ox.data, 1-oddBit, kparam.color_matrix_stride);
526 
527  if(mu_positive){
528  Ow = Oy*Ox;
529  if(!oddBit){ mycoeff = -mycoeff; }
530  addMatrixToNewOprod(Ow.data, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit, kparam.color_matrix_stride);
531  }else{
532  Ow = conj(Ox)*conj(Oy);
533  if(oddBit){ mycoeff = -mycoeff; }
534  addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
535  }
536  return;
537 }
538 
539 
540 // Flop count, in two-number pair (matrix_mult, matrix_add)
541 // (0,1)
542 
543 
544 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
545  __global__ void
546 HISQ_KERNEL_NAME(do_side_link_short, EXT)(const RealA* const P3Even, const RealA* const P3Odd,
547  const RealB* const linkEven, const RealB* const linkOdd,
548  int sig, int mu,
549  typename RealTypeId<RealA>::Type coeff,
550  RealA* const newOprodEven, RealA* const newOprodOdd,
551  hisq_kernel_param_t kparam)
552 {
553  int oddBit = _oddBit;
554  int sid = blockIdx.x * blockDim.x + threadIdx.x;
555  if(sid >= kparam.threads) return;
556 
557 
558  int x[4];
559  int dx[4] = {0,0,0,0};
560  getCoords(x, sid, kparam.D, oddBit);
561 
562 #ifdef MULTI_GPU
563  int E[4]= {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
564  x[0] = x[0] + kparam.base_idx[0];
565  x[1] = x[1] + kparam.base_idx[1];
566  x[2] = x[2] + kparam.base_idx[2];
567  x[3] = x[3] + kparam.base_idx[3];
568  int new_sid = linkIndex(x,dx,E);
569  oddBit = _oddBit ^ oddness_change;
570 #else
571  int E[4]= {kparam.X[0], kparam.X[1], kparam.X[2], kparam.X[3]};
572  int new_sid = sid;
573 #endif
574 
575  /* compute the side link contribution to the momentum
576  *
577  * sig
578  * A________B
579  * | | mu
580  * D | |C
581  *
582  * A is the current point (sid)
583  *
584  */
585 
587 
588 
589  loadMatrixFromField(P3Even, P3Odd, new_sid, Oy.data, oddBit, kparam.color_matrix_stride);
590 
591  typename RealTypeId<RealA>::Type mycoeff;
592  int point_d;
593  int mymu = posDir(mu);
594  int y[4] = {x[0], x[1], x[2], x[3]};
595 
596  updateCoords(y, mymu, (mu_positive ? -1 : 1), kparam.X, kparam.ghostDim[mymu]);
597  point_d = linkIndex(y,dx,E);
598  mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;
599 
600  if(mu_positive){
601  if(!oddBit){ mycoeff = -mycoeff;} // need to change this to get away from oddBit
602  addMatrixToNewOprod(Oy.data, mu, point_d, mycoeff, newOprodEven, newOprodOdd, 1-oddBit, kparam.color_matrix_stride);
603  }else{
604  if(oddBit){ mycoeff = -mycoeff; }
605  Ow = conj(Oy);
606  addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
607  }
608  return;
609 }
610 
611 
612 
613 
614 
615 
616 /********************************do_all_link_kernel*********************************************
617  *
618  * In this function we need
619  * READ
620  * 3 LINKS: ad_link, ab_link, bc_link
621  * 5 COLOR MATRIX: Qprev_at_D, oprod_at_C, newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D
622  * WRITE:
623  * 3 COLOR MATRIX: newOprod_at_A(sig), newOprod_at_D/newOprod_at_A(mu), shortP_at_D,
624  *
625  * If sig is negative, then we don't need to read/write the color matrix newOprod_at_A(sig)
626  *
627  * Therefore the data traffic, in two-number pair (num_of_link, num_of_color_matrix)
628  *
629  * if (sig is positive): (3, 8)
630  * else : (3, 6)
631  *
632  * This function is called 384 times, half positive sig, half negative sig
633  *
634  * Flop count, in two-number pair (matrix_multi, matrix_add)
635  * if(sig is positive) (6,3)
636  * else (4,2)
637  *
638  ************************************************************************************************/
639 
640 // 198 flops per matrix multiply
641 // 18 flops per matrix addition
642 // if(sig is positive) 1242 Flops per lattice site
643 // else 828 Flops per lattice site
644 //
645 // Aggregate Flops per site
646 // 1242*192 + 828*192
647 // = 397440 Flops per site
648 
649 template<class RealA, class RealB, int sig_positive, int mu_positive, int _oddBit, int oddness_change>
650  __global__ void
651 HISQ_KERNEL_NAME(do_all_link, EXT)(const RealA* const oprodEven, const RealA* const oprodOdd,
652  const RealA* const QprevEven, const RealA* const QprevOdd,
653  const RealB* const linkEven, const RealB* const linkOdd,
654  int sig, int mu,
655  typename RealTypeId<RealA>::Type coeff,
656  typename RealTypeId<RealA>::Type accumu_coeff,
657  RealA* const shortPEven, RealA* const shortPOdd,
658  RealA* const newOprodEven, RealA* const newOprodOdd,
659  hisq_kernel_param_t kparam)
660 {
661  int oddBit = _oddBit;
662  int sid = blockIdx.x * blockDim.x + threadIdx.x;
663  if(sid >= kparam.threads) return;
664 
665 
666  int x[4];
667  int dx[4] = {0,0,0,0};
668  getCoords(x, sid, kparam.D, oddBit);
669 
670 
671 
674 
675 
676  /* sig
677  * A________B
678  * mu | |
679  * D | |C
680  *
681  * A is the current point (sid)
682  *
683  */
684 
685  int point_b, point_c, point_d;
686  int ab_link_nbr_idx;
687 
688 #ifdef MULTI_GPU
689  x[0] = x[0] + kparam.base_idx[0];
690  x[1] = x[1] + kparam.base_idx[1];
691  x[2] = x[2] + kparam.base_idx[2];
692  x[3] = x[3] + kparam.base_idx[3];
693 
694  int E[4]= {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
695  int new_sid = linkIndex(x,dx,E);
696  oddBit = _oddBit ^ oddness_change;
697 #else
698  int E[4]= {kparam.X[0], kparam.X[1], kparam.X[2], kparam.X[3]};
699  int new_sid = sid;
700 #endif
701 
702  int y[4] = {x[0], x[1], x[2], x[3]};
703  int mysig = posDir(sig);
704  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
705  point_b = linkIndex(y,dx,E);
706 
707  ab_link_nbr_idx = (sig_positive) ? new_sid : point_b;
708 
709  for(int dir=0; dir<4; ++dir) y[dir] = x[dir];
710 
711 
712  const typename RealTypeId<RealA>::Type & mycoeff = CoeffSign<sig_positive,_oddBit ^ oddness_change>::result*coeff;
713  if(mu_positive){ //positive mu
714 
715  updateCoords(y, mu, -1, kparam.X, kparam.ghostDim[mu]);
716  point_d = linkIndex(y,dx,E);
717 
718  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
719  point_c = linkIndex(y,dx,E);
720 
721  loadMatrixFromField(QprevEven, QprevOdd, point_d, Ox.data, 1-oddBit, kparam.color_matrix_stride); // COLOR_MAT_X
722  loadLink<18>(linkEven, linkOdd, mu, point_d, Uad.data, 1-oddBit, kparam.thin_link_stride);
723 
724  loadMatrixFromField(oprodEven,oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride); // COLOR_MAT_Y
725  loadLink<18>(linkEven, linkOdd, mu, point_c, Ubc.data, oddBit, kparam.thin_link_stride);
726 
727  Oz = conj(Ubc)*Oy;
728 
729  if (sig_positive)
730  {
731  Ow = Oz*Ox*Uad;
732  addMatrixToNewOprod(Ow.data, sig, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
733  }
734 
735 
736  loadLink<18>(linkEven, linkOdd, posDir(sig), ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);
737 
738  if(sig_positive){
739  Oy = Uab*Oz;
740  }else{
741  Oy = conj(Uab)*Oz;
742  }
743 
744 
745  Ow = Oy*Ox;
746  addMatrixToNewOprod(Ow.data, mu, point_d, -Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, 1-oddBit, kparam.color_matrix_stride);
747  Ow = Uad*Oy;
748  addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride);
749 
750  } else{ //negative mu
751 
752  mu = OPP_DIR(mu);
753  updateCoords(y, mu, 1, kparam.X, kparam.ghostDim[mu]);
754  point_d = linkIndex(y,dx,E);
755  updateCoords(y, mysig, (sig_positive ? 1 : -1), kparam.X, kparam.ghostDim[mysig]);
756  point_c = linkIndex(y,dx,E);
757 
758  loadMatrixFromField(QprevEven, QprevOdd, point_d, Ox.data, 1-oddBit, kparam.color_matrix_stride); // COLOR_MAT_X used!
759 
760  loadLink<18>(linkEven, linkOdd, mu, new_sid, Uad.data, oddBit, kparam.thin_link_stride);
761 
762  loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride); // COLOR_MAT_Y used
763  loadLink<18>(linkEven, linkOdd, mu, point_b, Ubc.data, 1-oddBit, kparam.thin_link_stride);
764 
765 
766  if(sig_positive){
767  Ow = Ox*conj(Uad);
768  }
769  Oz = Ubc*Oy;
770 
771  if (sig_positive){
772  Oy = Oz*Ow;
773  addMatrixToNewOprod(Oy.data, sig, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
774  }
775  loadLink<18>(linkEven, linkOdd, posDir(sig), ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride);
776 
777 
778  if(sig_positive){
779  Oy = Uab*Oz;
780  }else{
781  Oy = conj(Uab)*Oz;
782  }
783 
784  Ow = conj(Ox)*conj(Oy);
785 
786  addMatrixToNewOprod(Ow.data, mu, new_sid, Sign<_oddBit ^ oddness_change>::result*mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride);
787 
788  Ow = conj(Uad)*Oy;
789 
790  addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride);
791 
792  }
793  return;
794 }
795 
796 
797 
798 // Flops count, in two-number pair (matrix_mult, matrix_add)
799 // (24, 12)
800 // 4968 Flops per site in total
801 template<class RealA, class RealB, int oddBit>
802  __global__ void
803 HISQ_KERNEL_NAME(do_longlink, EXT)(const RealB* const linkEven, const RealB* const linkOdd,
804  const RealA* const naikOprodEven, const RealA* const naikOprodOdd,
805  typename RealTypeId<RealA>::Type coeff,
806  RealA* const outputEven, RealA* const outputOdd,
807  hisq_kernel_param_t kparam)
808 {
809  int sid = blockIdx.x * blockDim.x + threadIdx.x;
810  if (sid >= kparam.threads) return;
811 
812 
813  int x[4];
814  int dx[4] = {0,0,0,0};
815 
816  getCoords(x, sid, kparam.X, oddBit);
817 #ifdef MULTI_GPU
818  int E[4]= {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
819  for(int i=0; i<4; ++i) x[i] += 2;
820  int new_sid = linkIndex(x,dx,E);
821 #else
822  int E[4] = {kparam.X[0], kparam.X[1], kparam.X[2], kparam.X[3]};
823  int new_sid = sid;
824 #endif
825 
826 
827 
828  const int & point_c = new_sid;
830 
831 
832  /*
833  *
834  * A B C D E
835  * ---- ---- ---- ----
836  *
837  * ---> sig direction
838  *
839  * C is the current point (sid)
840  *
841  */
842 
843 
846 
847  // compute the force for forward long links
848  for(int sig=0; sig<4; ++sig){
849 
850  dx[sig]++;
851  point_d = linkIndex(x,dx,E);
852 
853  dx[sig]++;
854  point_e = linkIndex(x,dx,E);
855 
856  dx[sig] = -1;
857  point_b = linkIndex(x,dx,E);
858 
859  dx[sig]--;
860  point_a = linkIndex(x,dx,E);
861  dx[sig]=0;
862 
863 
864  loadLink<18>(linkEven, linkOdd, sig, point_a, Uab.data, oddBit, kparam.thin_link_stride);
865  loadLink<18>(linkEven, linkOdd, sig, point_b, Ubc.data, 1-oddBit, kparam.thin_link_stride);
866  loadLink<18>(linkEven, linkOdd, sig, point_d, Ude.data, 1-oddBit, kparam.thin_link_stride);
867  loadLink<18>(linkEven, linkOdd, sig, point_e, Uef.data, oddBit, kparam.thin_link_stride);
868 
869  loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_c, Oz.data, oddBit, kparam.color_matrix_stride);
870  loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_b, Oy.data, 1-oddBit, kparam.color_matrix_stride);
871  loadMatrixFromField(naikOprodEven, naikOprodOdd, sig, point_a, Ox.data, oddBit, kparam.color_matrix_stride);
872 
873  Matrix<RealA,3> temp = Ude*Uef*Oz - Ude*Oy*Ubc + Ox*Uab*Ubc;
874 
875  addMatrixToField(temp.data, sig, new_sid, coeff, outputEven, outputOdd, oddBit, kparam.color_matrix_stride);
876  } // loop over sig
877 
878  return;
879 }
880 
881 
882 // Flops count: 4 matrix multiplications per lattice site = 792 Flops per site
883 template<class RealA, class RealB, int oddBit>
884  __global__ void
885 HISQ_KERNEL_NAME(do_complete_force, EXT)(const RealB* const linkEven, const RealB* const linkOdd,
886  const RealA* const oprodEven, const RealA* const oprodOdd,
887  RealA* const forceEven, RealA* const forceOdd,
888  hisq_kernel_param_t kparam)
889 {
890  int sid = blockIdx.x * blockDim.x + threadIdx.x;
891  if (sid >= kparam.threads) return;
892 
893 
894  int x[4];
895  int dx[4] = {0,0,0,0};
896  getCoords(x, sid, kparam.X, oddBit);
897 
898  int new_sid=sid;
899 #ifdef MULTI_GPU
900  x[0] = x[0]+2;
901  x[1] = x[1]+2;
902  x[2] = x[2]+2;
903  x[3] = x[3]+2;
904  int E[4] = {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
905  new_sid = linkIndex(x,dx,E);
906 #endif
907 
908  for(int sig=0; sig<4; ++sig){
909 
910  Matrix<RealA,3> Uw, Ow, Ox;
911 
912  loadLink<18>(linkEven, linkOdd, sig, new_sid, Uw.data, oddBit, kparam.thin_link_stride);
913 
914  loadMatrixFromField(oprodEven, oprodOdd, sig, new_sid, Ox.data, oddBit, kparam.color_matrix_stride);
915  typename RealTypeId<RealA>::Type coeff = (oddBit==1) ? -1 : 1;
916  Ow = Uw*Ox;
917 
918  storeMatrixToMomentumField(Ow.data, sig, sid, coeff, forceEven, forceOdd, oddBit, kparam.momentum_stride);
919  }
920  return;
921 }
922 
923 #undef EXT
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
int bc_link_nbr_idx
Matrix< RealA, 3 > Uad
int point_d
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
__global__ void const RealA *const const RealA *const const RealA *const QprevOdd
__global__ void const RealA *const const RealA *const const RealA *const QprodOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const P3Odd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const newOprodEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const QmuEven
__global__ void const RealB *const const RealA *const const RealA *const naikOprodOdd
int point_b
__global__ void const RealB *const const RealA *const const RealA *const RealA *const RealA *const forceOdd
__global__ void HISQ_KERNEL_NAME(do_middle_link, EXT)(const RealA *const oprodEven
int point_c
#define OPP_DIR(dir)
Definition: force_common.h:16
Matrix< RealA, 3 > Uab
loadLink< 18 >(linkEven, linkOdd, mysig, ab_link_nbr_idx, Uab.data, sig_positive^(1-oddBit), kparam.thin_link_stride)
int E[4]
__global__ void const RealB *const const RealA *const oprodEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const QmuOdd
addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride)
__global__ void const RealB *const const RealA *const const RealA *const RealA *const forceEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
__global__ void const RealA *const const RealA *const QprevEven
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const outputEven
Matrix< RealA, 3 > Ubc
#define EXT
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type coeff
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const RealA *const outputOdd
int new_sid
updateCoords(y, mymu,(mu_positive?-1:1), kparam.X, kparam.ghostDim[mymu])
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
__global__ void const RealA *const const RealA *const QprodEven
int point_a
int x[4]
Matrix< RealA, 3 > Uef
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const RealA *const shortPOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const PmuEven
int dx[4]
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const P3Even
__global__ void const RealB *const const RealA *const naikOprodEven
int point_e
int ab_link_nbr_idx
Matrix< RealA, 3 > Ox
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)
Matrix< RealA, 3 > Oz
Matrix< RealA, 3 > Ow
Matrix< RealA, 3 > Ude
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type RealA *const shortPEven
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const newOprodOdd
Matrix< RealA, 3 > Oy
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
RealTypeId< RealA >::Type mycoeff
getCoords(x, sid, kparam.D, oddBit)
int ad_link_nbr_idx
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)
int y[4]
int oddBit
__global__ void const RealA *const oprodOdd
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const PmuOdd