QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
hisq_force_reference.cpp
Go to the documentation of this file.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <math.h>
4 #include <string.h>
5 
6 #include "quda.h"
7 #include "test_util.h"
8 #include "misc.h"
9 #include "hisq_force_reference.h"
10 
11 extern int Z[4];
12 extern int V;
13 extern int Vh;
14 
15 
16 #define CADD(a,b,c) { (c).real = (a).real + (b).real; \
17  (c).imag = (a).imag + (b).imag; }
18 #define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \
19  (c).imag = (a).real*(b).imag + (a).imag*(b).real; }
20 #define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; }
21 
22 /* c = a* * b */
23 #define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
24  (c).imag = (a).real*(b).imag - (a).imag*(b).real; }
25 
26 /* c = a * b* */
27 #define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \
28  (c).imag = (a).imag*(b).real - (a).real*(b).imag; }
29 
30 #define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; }
31 
32 typedef struct {
33  float real;
34  float imag;
35 } fcomplex;
36 
37 /* specific for double complex */
38 typedef struct {
39  double real;
40  double imag;
41 } dcomplex;
42 
43 typedef struct { fcomplex e[3][3]; } fsu3_matrix;
44 typedef struct { fcomplex c[3]; } fsu3_vector;
45 typedef struct { dcomplex e[3][3]; } dsu3_matrix;
46 typedef struct { dcomplex c[3]; } dsu3_vector;
47 
48 typedef struct {
49  fcomplex m01,m02,m12;
50  float m00im,m11im,m22im;
51  float space;
53 
54 typedef struct {
55  dcomplex m01,m02,m12;
56  double m00im,m11im,m22im;
57  double space;
59 
60 typedef struct { fsu3_vector h[2]; } fhalf_wilson_vector;
61 typedef struct { dsu3_vector h[2]; } dhalf_wilson_vector;
62 
63 
64 template<typename su3_matrix>
65 su3_matrix* get_su3_matrix(int gauge_order, su3_matrix* p, int idx, int dir)
66 {
67  if(gauge_order == QUDA_MILC_GAUGE_ORDER){
68  return (p + 4*idx + dir);
69  }else if(gauge_order == QUDA_QDP_GAUGE_ORDER){ // This is nasty!
70  su3_matrix* data = ((su3_matrix**)p)[dir];
71  return data + idx;
72  }else{
73  errorQuda("get_su3_matrix: unsupported ordering scheme!\n");
74  }
75  return NULL;
76 }
77 
78 template<typename su3_matrix>
79 static void
80 su3_adjoint( su3_matrix *a, su3_matrix *b )
81 {
82  int i,j;
83  for(i=0;i<3;i++)for(j=0;j<3;j++){
84  CONJG( a->e[j][i], b->e[i][j] );
85  }
86 }
87 
88 template<typename su3_matrix>
89 static void
90 adjoint_su3_matrix( su3_matrix *a)
91 {
92  su3_matrix b;
93  int i,j;
94  for(i=0;i<3;i++)for(j=0;j<3;j++){
95  CONJG( a->e[j][i], b.e[i][j] );
96  }
97 
98  *a = b;
99 }
100 
101 template<typename su3_matrix, typename anti_hermitmat>
102 static void
103 make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 )
104 {
105 
106  typeof(ah3->m00im) temp =
107  (m3->e[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333333333333;
108  ah3->m00im = m3->e[0][0].imag - temp;
109  ah3->m11im = m3->e[1][1].imag - temp;
110  ah3->m22im = m3->e[2][2].imag - temp;
111  ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5;
112  ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5;
113  ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5;
114  ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5;
115  ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5;
116  ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5;
117 
118 }
119 
120 template <typename anti_hermitmat, typename su3_matrix>
121 static void
122 uncompress_anti_hermitian(anti_hermitmat *mat_antihermit,
123  su3_matrix *mat_su3 )
124 {
125  typeof(mat_antihermit->m00im) temp1;
126  mat_su3->e[0][0].imag=mat_antihermit->m00im;
127  mat_su3->e[0][0].real=0.;
128  mat_su3->e[1][1].imag=mat_antihermit->m11im;
129  mat_su3->e[1][1].real=0.;
130  mat_su3->e[2][2].imag=mat_antihermit->m22im;
131  mat_su3->e[2][2].real=0.;
132  mat_su3->e[0][1].imag=mat_antihermit->m01.imag;
133  temp1=mat_antihermit->m01.real;
134  mat_su3->e[0][1].real=temp1;
135  mat_su3->e[1][0].real= -temp1;
136  mat_su3->e[1][0].imag=mat_antihermit->m01.imag;
137  mat_su3->e[0][2].imag=mat_antihermit->m02.imag;
138  temp1=mat_antihermit->m02.real;
139  mat_su3->e[0][2].real=temp1;
140  mat_su3->e[2][0].real= -temp1;
141  mat_su3->e[2][0].imag=mat_antihermit->m02.imag;
142  mat_su3->e[1][2].imag=mat_antihermit->m12.imag;
143  temp1=mat_antihermit->m12.real;
144  mat_su3->e[1][2].real=temp1;
145  mat_su3->e[2][1].real= -temp1;
146  mat_su3->e[2][1].imag=mat_antihermit->m12.imag;
147 }
148 
149 template <typename su3_matrix, typename Float>
150 static void
151 scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c)
152 {
153  int i,j;
154  for(i=0;i<3;i++){
155  for(j=0;j<3;j++){
156  c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real;
157  c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag;
158  }
159  }
160 }
161 
162 template <typename su3_matrix, typename Float>
163 static void
164 scalar_mult_add_su3_matrix(su3_matrix *a,su3_matrix *b, Float s, su3_matrix *c)
165 {
166  int i,j;
167  for(i=0;i<3;i++){
168  for(j=0;j<3;j++){
169  c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real;
170  c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag;
171  }
172  }
173 }
174 template <typename su3_matrix, typename Float>
175 static void
176 scale_su3_matrix(su3_matrix *a, Float s)
177 {
178  int i,j;
179  for(i=0;i<3;i++){
180  for(j=0;j<3;j++){
181  a->e[i][j].real = a->e[i][j].real * s;
182  a->e[i][j].imag = a->e[i][j].imag * s;
183  }
184  }
185 }
186 
187 template<typename su3_matrix, typename su3_vector>
188 static void
189 mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c )
190 {
191  int i,j;
192  typeof(a->e[0][0]) x,y;
193  for(i=0;i<3;i++){
194  x.real=x.imag=0.0;
195  for(j=0;j<3;j++){
196  CMUL( a->e[i][j] , b->c[j] , y );
197  CSUM( x , y );
198  }
199  c->c[i]=x;
200  }
201 }
202 template<typename su3_matrix, typename su3_vector>
203 static void
204 mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c )
205 {
206  int i,j;
207  typeof(a->e[0][0]) x,y,z;
208  for(i=0;i<3;i++){
209  x.real=x.imag=0.0;
210  for(j=0;j<3;j++){
211  CONJG( a->e[j][i], z );
212  CMUL( z , b->c[j], y );
213  CSUM( x , y );
214  }
215  c->c[i] = x;
216  }
217 }
218 
219 template<typename su3_vector, typename su3_matrix>
220 static void
221 su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c )
222 {
223  int i,j;
224  for(i=0;i<3;i++)for(j=0;j<3;j++){
225  CMUL_J( a->c[i], b->c[j], c->e[i][j] );
226  }
227 }
228 
229 template<typename su3_vector, typename Real>
230 static void
231 scalar_mult_add_su3_vector(su3_vector *a, su3_vector *b, Real s,
232  su3_vector *c)
233 {
234  int i;
235  for(i=0;i<3;i++){
236  c->c[i].real = a->c[i].real + s*b->c[i].real;
237  c->c[i].imag = a->c[i].imag + s*b->c[i].imag;
238  }
239 }
240 
241 
242 
243 template < typename su3_matrix>
244 static void
245 print_su3_matrix(su3_matrix *a)
246 {
247  int i, j;
248  for(i=0;i < 3; i++){
249  for(j=0;j < 3;j++){
250  printf("(%f %f)\t", a->e[i][j].real, a->e[i][j].imag);
251  }
252  printf("\n");
253  }
254 
255 }
256 
257 
258 
259 // Add a matrix multiplication function
260 template<typename su3_matrix>
261 static void
262 matrix_mult_nn(su3_matrix* a, su3_matrix* b, su3_matrix* c){
263  // c = a*b
264  typeof(c->e[0][0]) x;
265  for(int i=0; i<3; i++){
266  for(int j=0; j<3; j++){
267  c->e[i][j].real = 0.;
268  c->e[i][j].imag = 0.;
269  for(int k=0; k<3; k++){
270  CMUL(a->e[i][k],b->e[k][j],x);
271  c->e[i][j].real += x.real;
272  c->e[i][j].imag += x.imag;
273  }
274  }
275  }
276  return;
277 }
278 
279 
280 template<typename su3_matrix>
281 static void
282 matrix_mult_an(su3_matrix* a, su3_matrix* b, su3_matrix* c){
283  // c = (a^{\dagger})*b
284  typeof(c->e[0][0]) x;
285  for(int i=0; i<3; i++){
286  for(int j=0; j<3; j++){
287  c->e[i][j].real = 0.;
288  c->e[i][j].imag = 0.;
289  for(int k=0; k<3; k++){
290  CMULJ_(a->e[k][i],b->e[k][j],x);
291  c->e[i][j].real += x.real;
292  c->e[i][j].imag += x.imag;
293  }
294  }
295  }
296  return;
297 }
298 
299 
300 
301 template<typename su3_matrix>
302 static void
303 matrix_mult_na(su3_matrix* a, su3_matrix* b, su3_matrix* c){
304  // c = a*b^{\dagger}
305  typeof(c->e[0][0]) x;
306  for(int i=0; i<3; i++){
307  for(int j=0; j<3; j++){
308  c->e[i][j].real = 0.; c->e[i][j].imag = 0.;
309  for(int k=0; k<3; k++){
310  CMUL_J(a->e[i][k],b->e[j][k],x);
311  c->e[i][j].real += x.real;
312  c->e[i][j].imag += x.imag;
313  }
314  }
315  }
316  return;
317 }
318 
319 template<typename su3_matrix>
320 static void
321 matrix_mult_aa(su3_matrix* a, su3_matrix* b, su3_matrix* c){
322 
323  su3_matrix a_adjoint;
324  su3_adjoint(a, &a_adjoint);
325  matrix_mult_na(&a_adjoint, b, c);
326 }
327 
328 template <typename su3_matrix, typename anti_hermitmat, typename Float>
329 static void
330 update_mom(anti_hermitmat* momentum, int dir, su3_matrix* sitelink,
331  su3_matrix* staple, Float eb3)
332 {
333  int i;
334  for(i=0;i <V; i++){
335  su3_matrix tmat1;
336  su3_matrix tmat2;
337  su3_matrix tmat3;
338 
339  su3_matrix* lnk = sitelink + 4*i+dir;
340  su3_matrix* stp = staple + i;
341  anti_hermitmat* mom = momentum + 4*i+dir;
342 
343  mult_su3_na(lnk, stp, &tmat1);
344  uncompress_anti_hermitian(mom, &tmat2);
345 
346  scalar_mult_sub_su3_matrix(&tmat2, &tmat1, eb3, &tmat3);
347  make_anti_hermitian(&tmat3, mom);
348 
349  }
350 
351 }
352 
353 
354 template<typename half_wilson_vector, typename su3_matrix>
355 static void
356 u_shift_hw(half_wilson_vector *src, half_wilson_vector *dest, int dir, su3_matrix* sitelink )
357 {
358  int i ;
359  int dx[4];
360 
361  dx[3]=dx[2]=dx[1]=dx[0]=0;
362 
363  if(GOES_FORWARDS(dir)){
364  dx[dir]=1;
365  for(i=0;i < V; i++){
366  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
367  half_wilson_vector* hw = src + nbr_idx;
368  su3_matrix* link = sitelink + i*4 + dir;
369  mult_su3_mat_vec(link, &hw->h[0], &dest[i].h[0]);
370  mult_su3_mat_vec(link, &hw->h[1], &dest[i].h[1]);
371  }
372  }else{
373  dx[OPP_DIR(dir)]=-1;
374  for(i=0;i < V; i++){
375  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
376  half_wilson_vector* hw = src + nbr_idx;
377  su3_matrix* link = sitelink + nbr_idx*4 + OPP_DIR(dir);
378  mult_adj_su3_mat_vec(link, &hw->h[0], &dest[i].h[0]);
379  mult_adj_su3_mat_vec(link, &hw->h[1], &dest[i].h[1]);
380  }
381  }
382 }
383 
384 
385 
386 template<typename half_wilson_vector, typename su3_matrix>
387 static void
388 shifted_outer_prod(half_wilson_vector *src, su3_matrix* dest, int dir)
389 {
390 
391  int i;
392  int dx[4];
393 
394  dx[3]=dx[2]=dx[1]=dx[0]=0;
395 
396  if(GOES_FORWARDS(dir)){
397  dx[dir]=1;
398  }else{ dx[OPP_DIR(dir)]=-1; }
399 
400  for(i=0;i < V; i++){
401  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
402  half_wilson_vector* hw = src + nbr_idx;
403  su3_projector( &src[i].h[0], &(hw->h[0]), &dest[i]);
404  }
405 }
406 
407 
408 template<typename half_wilson_vector, typename su3_matrix>
409 static void
410 forward_shifted_outer_prod(half_wilson_vector *src, su3_matrix* dest, int dir)
411 {
412 
413  int i;
414  int dx[4];
415 
416  dx[3]=dx[2]=dx[1]=dx[0]=0;
417 
418  if(GOES_FORWARDS(dir)){
419  dx[dir]=1;
420  }else{ dx[OPP_DIR(dir)]=-1; }
421 
422  for(i=0;i < V; i++){
423  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
424  half_wilson_vector* hw = src + nbr_idx;
425  //su3_projector( &src[i].h[0], &(hw->h[0]), &dest[i]);
426  su3_projector( &(hw->h[0]), &src[i].h[0], &dest[i]);
427  }
428 
429  return;
430 }
431 
432 
433 
434 
435 
436 template <typename half_wilson_vector, typename su3_matrix>
437 static void
438 computeLinkOrderedOuterProduct(half_wilson_vector *src, su3_matrix* dest, int gauge_order)
439 {
440  int dx[4];
441  for(int i=0; i<V; ++i){
442  for(int dir=0; dir<4; ++dir){
443  dx[3]=dx[2]=dx[1]=dx[0]=0;
444  dx[dir] = 1;
445  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
446  half_wilson_vector* hw = src + nbr_idx;
447  su3_matrix* p = get_su3_matrix(gauge_order, dest, i, dir);
448  su3_projector( &(hw->h[0]), &src[i].h[0], p);
449  } // dir
450  } // i
451  return;
452 }
453 
454 
455 template <typename half_wilson_vector, typename su3_matrix>
456 static void
457 computeLinkOrderedOuterProduct(half_wilson_vector *src, su3_matrix* dest, size_t nhops, int gauge_order)
458 {
459  int dx[4];
460  for(int i=0; i<V; ++i){
461  for(int dir=0; dir<4; ++dir){
462  dx[3]=dx[2]=dx[1]=dx[0]=0;
463  dx[dir] = nhops;
464  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
465  half_wilson_vector* hw = src + nbr_idx;
466  su3_matrix* p = get_su3_matrix(gauge_order, dest, i, dir);
467  su3_projector( &(hw->h[0]), &src[i].h[0], p);
468  } // dir
469  } // i
470  return;
471 }
472 
473 
474 
475 
476 void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, int gauge_order)
477 {
478  if(precision == QUDA_SINGLE_PRECISION){
480  }else{
482  }
483  return;
484 }
485 
486 void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, size_t nhops, int gauge_order)
487 {
488  if(precision == QUDA_SINGLE_PRECISION){
489  computeLinkOrderedOuterProduct((fhalf_wilson_vector*)src,(fsu3_matrix*)dst, nhops, gauge_order);
490  }else{
491  computeLinkOrderedOuterProduct((dhalf_wilson_vector*)src,(dsu3_matrix*)dst, nhops, gauge_order);
492  }
493  return;
494 }
495 
496 
497 
498 template<typename half_wilson_vector, typename su3_matrix>
499 static void shiftedOuterProduct(half_wilson_vector *src, su3_matrix* dest){
500  for(int dir=0; dir<4; dir++){
501  shifted_outer_prod(src, &dest[dir*V], OPP_DIR(dir));
502  }
503 }
504 
505 template<typename half_wilson_vector, typename su3_matrix>
506 static void forwardShiftedOuterProduct(half_wilson_vector *src, su3_matrix* dest){
507  for(int dir=0; dir<4; dir++){
508  forward_shifted_outer_prod(src, &dest[dir*V], dir);
509  }
510 }
511 
512 
513 void computeHisqOuterProduct(void* src, void* dest, QudaPrecision precision){
514  if(precision == QUDA_SINGLE_PRECISION){
515  forwardShiftedOuterProduct((fhalf_wilson_vector*)src,(fsu3_matrix*)dest);
516  }else{
517  forwardShiftedOuterProduct((dhalf_wilson_vector*)src,(dsu3_matrix*)dest);
518  }
519 }
520 
521 
522 // Note that the hisq routines do not involve half-wilson vectors,
523 // but instead require colour matrix shifts
524 
525 
526 template<typename su3_matrix>
527 static void
528 u_shift_mat(su3_matrix *src, su3_matrix *dest, int dir, su3_matrix* sitelink)
529 {
530  int i;
531  int dx[4];
532  dx[3]=dx[2]=dx[1]=dx[0]=0;
533 
534  if(GOES_FORWARDS(dir)){
535  dx[dir]=1;
536  for(i=0; i<V; i++){
537  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
538  su3_matrix* mat = src+nbr_idx; // No need for a factor of 4 here, the colour matrices do not have a Lorentz index
539  su3_matrix* link = sitelink + i*4 + dir;
540  matrix_mult_nn(link, mat, &dest[i]);
541  }
542  }else{
543  dx[OPP_DIR(dir)]=-1;
544  for(i=0; i<V; i++){
545  int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
546  su3_matrix* mat = src+nbr_idx; // No need for a factor of 4 here, the colour matrices do not have a Lorentz index
547  su3_matrix* link = sitelink + nbr_idx*4 + OPP_DIR(dir);
548  matrix_mult_an(link, mat, &dest[i]);
549  }
550  }
551  return;
552 }
553 
554 
555 
556 
557 
558 
559 
560 template <typename half_wilson_vector,
561  typename anti_hermitmat, typename Real>
562 static void
563 add_3f_force_to_mom(half_wilson_vector *back, half_wilson_vector *forw,
564  int dir, Real coeff[2], anti_hermitmat* momentum)
565 {
566  Real my_coeff[2] ;
567  Real tmp_coeff[2] ;
568  int mydir;
569  int i;
570 
571  if(GOES_BACKWARDS(dir)){
572  mydir = OPP_DIR(dir);
573  my_coeff[0] = -coeff[0];
574  my_coeff[1] = -coeff[1];
575  }else{
576  mydir = dir;
577  my_coeff[0] = coeff[0];
578  my_coeff[1] = coeff[1];
579  }
580 
581  for(i=0;i < V;i++){
582  if (i < Vh){
583  tmp_coeff[0] = my_coeff[0];
584  tmp_coeff[1] = my_coeff[1];
585  }else{
586  tmp_coeff[0] = -my_coeff[0];
587  tmp_coeff[1] = -my_coeff[1] ;
588  }
589 
590  if (sizeof(Real) == sizeof(float)){
591  fsu3_matrix tmat;
592  fsu3_matrix mom_matrix;
593  anti_hermitmat* mom = momentum+ 4* i + mydir;
594  uncompress_anti_hermitian(mom, &mom_matrix);
595  su3_projector( &back[i].h[0], &forw[i].h[0], &tmat);
596  scalar_mult_add_su3_matrix(&mom_matrix, &tmat, tmp_coeff[0], &mom_matrix );
597  make_anti_hermitian(&mom_matrix, mom);
598  }else{
599  dsu3_matrix tmat;
600  dsu3_matrix mom_matrix;
601  anti_hermitmat* mom = momentum+ 4* i + mydir;
602  uncompress_anti_hermitian(mom, &mom_matrix);
603  su3_projector( &back[i].h[0], &forw[i].h[0], &tmat);
604  scalar_mult_add_su3_matrix(&mom_matrix, &tmat, tmp_coeff[0], &mom_matrix );
605  make_anti_hermitian(&mom_matrix, mom);
606  }
607  }
608 }
609 
610 
611 template<typename Real, typename half_wilson_vector, typename anti_hermitmat>
612  static void
613 side_link_3f_force(int mu, int nu, Real coeff[2], half_wilson_vector *Path ,
614  half_wilson_vector *Path_nu, half_wilson_vector *Path_mu,
615  half_wilson_vector *Path_numu, anti_hermitmat* mom)
616 {
617  Real m_coeff[2] ;
618 
619  m_coeff[0] = -coeff[0] ;
620  m_coeff[1] = -coeff[1] ;
621 
622  if(GOES_FORWARDS(mu)){
623  if(GOES_FORWARDS(nu)){
624  add_3f_force_to_mom(Path_numu, Path, mu, coeff, mom) ;
625  }else{
626  add_3f_force_to_mom(Path,Path_numu,OPP_DIR(mu),m_coeff, mom);
627  }
628  }
629  else{
630  if(GOES_FORWARDS(nu))
631  add_3f_force_to_mom(Path_nu, Path_mu, mu, m_coeff, mom);
632  else
633  add_3f_force_to_mom(Path_mu, Path_nu, OPP_DIR(mu), coeff, mom) ;
634  }
635 }
636 
637 
638 template<typename Real, typename su3_matrix, typename anti_hermitmat>
639  static void
640 add_force_to_momentum(su3_matrix *back, su3_matrix *forw,
641  int dir, Real coeff, anti_hermitmat* momentum)
642 {
643  Real my_coeff;
644  Real tmp_coeff;
645  int mydir;
646  int i;
647 
648  if(GOES_BACKWARDS(dir)){
649  mydir = OPP_DIR(dir);
650  my_coeff = -coeff;
651  }else{
652  mydir = dir;
653  my_coeff = coeff;
654  }
655 
656 
657  for(i=0; i<V; i++){
658  if(i<Vh){ tmp_coeff = my_coeff; }
659  else{ tmp_coeff = -my_coeff; }
660 
661 
662  su3_matrix tmat;
663  su3_matrix mom_matrix;
664  anti_hermitmat* mom = momentum + 4*i + mydir;
665  uncompress_anti_hermitian(mom, &mom_matrix);
666  matrix_mult_na(&back[i], &forw[i], &tmat);
667  scalar_mult_add_su3_matrix(&mom_matrix, &tmat, tmp_coeff, &mom_matrix);
668 
669  make_anti_hermitian(&mom_matrix, mom);
670  }
671  return;
672 }
673 
674 
675 template<typename Real, typename su3_matrix, typename anti_hermitmat>
676 static void
677 side_link_force(int mu, int nu, Real coeff, su3_matrix *Path,
678  su3_matrix *Path_nu, su3_matrix *Path_mu,
679  su3_matrix *Path_numu, anti_hermitmat* mom)
680 {
681  Real m_coeff;
682  m_coeff = -coeff;
683 
684  if(GOES_FORWARDS(mu)){
685  if(GOES_FORWARDS(nu)){
686  add_force_to_momentum(Path_numu, Path, mu, coeff, mom);
687  // In the example below:
688  // add_force_to_momentum(P7rho, Qnumu, rho, coeff, mom)
689  }else{
690  add_force_to_momentum(Path, Path_numu, OPP_DIR(mu), m_coeff, mom);
691  // add_force_to_momentum(Qnumu, P7rho, -Qnumu, rho, m_coeff, mom)
692  }
693  }
694  else { // if (GOES_BACKWARDS(mu))
695  if(GOES_FORWARDS(nu)){
696  add_force_to_momentum(Path_nu, Path_mu, mu, m_coeff, mom);
697  // add_force_to_momentum(P7, Qrhonumu, rho, m_coeff, mom)
698  }else{
699  add_force_to_momentum(Path_mu, Path_nu, OPP_DIR(mu), coeff, mom);
700  // add_force_to_momentum(Qrhonumu, P7, rho, coeff, mom)
701  }
702  }
703  return;
704 }
705 
706 
707 
708 
709 #define Pmu tempmat[0]
710 #define Pnumu tempmat[1]
711 #define Prhonumu tempmat[2]
712 #define P7 tempmat[3]
713 #define P7rho tempmat[4]
714 #define P5 tempmat[5]
715 #define P3 tempmat[6]
716 #define P5nu tempmat[3]
717 #define P3mu tempmat[3]
718 #define Popmu tempmat[4]
719 #define Pmumumu tempmat[4]
720 
721 #define Qmu tempmat[7]
722 #define Qnumu tempmat[8]
723 #define Qrhonumu tempmat[2] // same as Prhonumu
724 
725 
726 
727 
728 
729 template<typename su3_matrix>
730 static void set_identity_matrix(su3_matrix* mat)
731 {
732  for(int i=0; i<3; i++){
733  for(int j=0; j<3; j++){
734  mat->e[i][j].real=0;
735  mat->e[i][j].imag=0;
736  }
737  mat->e[i][i].real=1;
738  }
739 }
740 
741 
742 template<typename su3_matrix>
743 static void set_identity(su3_matrix* matrices, int num_dirs){
744  for(int i=0; i<V*num_dirs; i++){
745  set_identity_matrix(&matrices[i]);
746  }
747 }
748 
749 
750 template <typename Real, typename su3_matrix, typename anti_hermitmat>
751 void do_color_matrix_hisq_force_reference(Real eps, Real weight,
752  su3_matrix* temp_xx, Real* act_path_coeff,
753  su3_matrix* sitelink, anti_hermitmat* mom)
754 {
755  int i;
756  int mu, nu, rho, sig;
757  Real coeff;
758  Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
759 // Real Naik;
760  Real mLepage, mFiveSt, mThreeSt, mSevenSt;
761 
762  su3_matrix* tempmat[9];
763  int sites_on_node = V;
764  //int DirectLinks[8] ;
765 
766  Real ferm_epsilon;
767  ferm_epsilon = 2.0*weight*eps;
768  OneLink = act_path_coeff[0]*ferm_epsilon ;
769 // Naik = act_path_coeff[1]*ferm_epsilon ; mNaik = -Naik;
770  ThreeSt = act_path_coeff[2]*ferm_epsilon ; mThreeSt = -ThreeSt;
771  FiveSt = act_path_coeff[3]*ferm_epsilon ; mFiveSt = -FiveSt;
772  SevenSt = act_path_coeff[4]*ferm_epsilon ; mSevenSt = -SevenSt;
773  Lepage = act_path_coeff[5]*ferm_epsilon ; mLepage = -Lepage;
774 
775 // for(mu=0;mu<8;mu++){
776 // DirectLinks[mu] = 0 ;
777 // }
778 
779  for(mu=0; mu<9; mu++){
780  tempmat[mu] = (su3_matrix *)malloc( sites_on_node*sizeof(su3_matrix) );
781  }
782 
783 
784  su3_matrix* id;
785  id = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) );
786  //su3_matrix* id4;
787  //id4 = (su3_matrix *)malloc(sites_on_node*4*sizeof(su3_matrix) );
788 
789  // su3_matrix* temp_mat;
790 // temp_mat = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) );
791 
792 
793  // su3_matrix* temp_xx;
794  // temp_xx = (su3_matrix *)malloc(sites_on_node*8*sizeof(su3_matrix) );
795  // shiftedOuterProduct(temp_x, temp_xx);
796 
797  // initialise id so that it is the identity matrix on each lattice site
798  set_identity(id,1);
799 
800  printf("Calling modified hisq\n");
801  return;
802 
803 
804  for(sig=0; sig < 8; sig++){
805  // One-link term - don't have the savings here that we get when working with the
806  // half-wilson vectors
807  if(GOES_FORWARDS(sig)){
808  u_shift_mat(&temp_xx[OPP_DIR(sig)*V], Pmu, sig, sitelink);
809  add_force_to_momentum(Pmu, id, sig, OneLink, mom); // I could optimise functions which
810  // involve id
811  }
812 
813 
814 
815  for(mu = 0; mu < 8; mu++){
816  if ( (mu == sig) || (mu == OPP_DIR(sig))){
817  continue;
818  }
819  // 3 link path
820  // sig
821  // A _______
822  // | |
823  // mu /|\ \|/
824  // | |
825  //
826  //
827  u_shift_mat(&temp_xx[OPP_DIR(sig)*V], Pmu, OPP_DIR(mu), sitelink); // temp_xx[sig] stores |X(x)><X(x-sig)|
828  u_shift_mat(id, Qmu, OPP_DIR(mu), sitelink); // This returns the path less the outer-product of quark fields at the end
829  // Qmu = U[mu]
830 
831 
832  u_shift_mat(Pmu, P3, sig, sitelink); // P3 is U[sig](X)U[-mu](X+sig) temp_xx
833  if (GOES_FORWARDS(sig)){
834  // add contribution from middle link
835  add_force_to_momentum(P3, Qmu, sig, mThreeSt, mom);
836  // matrix_mult_na(P3[x],Qmu[x],tmp);
837  // mom[sig][x] += mThreeSt*tmp;
838  }
839  for(nu=0; nu < 8; nu++){
840  if (nu == sig || nu == OPP_DIR(sig)
841  || nu == mu || nu == OPP_DIR(mu)){
842  continue;
843  }
844 
845  /*
846  5 link path
847 
848  sig
849  A ________
850  | |
851  /|\ \|/
852  | |
853  \ \
854  \ \
855  */
856 
857  u_shift_mat(Pmu, Pnumu, OPP_DIR(nu), sitelink);
858  u_shift_mat(Qmu, Qnumu, OPP_DIR(nu), sitelink);
859 
860  u_shift_mat(Pnumu, P5, sig, sitelink);
861  if (GOES_FORWARDS(sig)){
862  add_force_to_momentum(P5, Qnumu, sig, FiveSt, mom);
863  } // seems to do what I think it should
864  for(rho =0; rho < 8; rho++){
865  if (rho == sig || rho == OPP_DIR(sig)
866  || rho == mu || rho == OPP_DIR(mu)
867  || rho == nu || rho == OPP_DIR(nu)){
868  continue;
869  }
870 
871  //7 link path
872  u_shift_mat(Pnumu, Prhonumu, OPP_DIR(rho), sitelink);
873  u_shift_mat(Prhonumu, P7, sig, sitelink);
874 
875  // Prhonumu = tempmat[2] is not needed again
876  // => store Qrhonumu in the same memory
877  u_shift_mat(Qnumu, Qrhonumu, OPP_DIR(rho), sitelink);
878 
879 
880  if(GOES_FORWARDS(sig)){
881  add_force_to_momentum(P7, Qrhonumu, sig, mSevenSt, mom) ;
882  }
883  u_shift_mat(P7, P7rho, rho, sitelink);
884  side_link_force(rho, sig, SevenSt, Qnumu, P7, Qrhonumu, P7rho, mom);
885  if(FiveSt != 0)coeff = SevenSt/FiveSt ; else coeff = 0;
886  for(i=0; i<V; i++){
887  scalar_mult_add_su3_matrix(&P5[i], &P7rho[i], coeff, &P5[i]);
888  } // end loop over volume
889  } // end loop over rho
890 
891 
892  u_shift_mat(P5, P5nu, nu, sitelink);
893  side_link_force(nu,sig,mFiveSt,Qmu,P5,Qnumu,P5nu,mom); // I believe this should do what I want it to
894  // check this!
895  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
896 
897  for(i=0; i<V; i++){
898  scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
899  } // end loop over volume
900  } // end loop over nu
901 
902  // Lepage term
903  u_shift_mat(Pmu, Pnumu, OPP_DIR(mu), sitelink);
904  u_shift_mat(Qmu, Qnumu, OPP_DIR(mu), sitelink);
905 
906  u_shift_mat(Pnumu, P5, sig, sitelink);
907  if(GOES_FORWARDS(sig)){
908  add_force_to_momentum(P5, Qnumu, sig, Lepage, mom);
909  }
910 
911  u_shift_mat(P5, P5nu, mu, sitelink);
912  side_link_force(mu, sig, mLepage, Qmu, P5, Qnumu, P5nu, mom);
913 
914 
915  if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
916 
917  for(i=0; i<V; i++){
918  scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
919  }
920 
921  if(GOES_FORWARDS(mu)){
922  u_shift_mat(P3, P3mu, mu, sitelink);
923  }
924  side_link_force(mu, sig, ThreeSt, id, P3, Qmu, P3mu, mom);
925  } // end loop over mu
926  } // end loop over sig
927 } // modified hisq_force_reference
928 
929 
930 
931 
932 
933 
934 // This version of the test routine uses
935 // half-wilson vectors instead of color matrices.
936 template <typename Real, typename su3_matrix, typename anti_hermitmat, typename half_wilson_vector>
937 void do_halfwilson_hisq_force_reference(Real eps, Real weight,
938  half_wilson_vector* temp_x, Real* act_path_coeff,
939  su3_matrix* sitelink, anti_hermitmat* mom)
940 {
941  int i;
942  int mu, nu, rho, sig;
943  Real coeff;
944  Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
945  //Real Naik;
946  Real mLepage, mFiveSt, mThreeSt, mSevenSt;
947 
948  su3_matrix* tempmat[9];
949  int sites_on_node = V;
950 // int DirectLinks[8] ;
951 
952  Real ferm_epsilon;
953  ferm_epsilon = 2.0*weight*eps;
954  OneLink = act_path_coeff[0]*ferm_epsilon ;
955 // Naik = act_path_coeff[1]*ferm_epsilon ;
956  ThreeSt = act_path_coeff[2]*ferm_epsilon ; mThreeSt = -ThreeSt;
957  FiveSt = act_path_coeff[3]*ferm_epsilon ; mFiveSt = -FiveSt;
958  SevenSt = act_path_coeff[4]*ferm_epsilon ; mSevenSt = -SevenSt;
959  Lepage = act_path_coeff[5]*ferm_epsilon ; mLepage = -Lepage;
960 
961 
962 // for(mu=0;mu<8;mu++){
963 // DirectLinks[mu] = 0 ;
964 // }
965 
966  for(mu=0; mu<9; mu++){
967  tempmat[mu] = (su3_matrix *)malloc( sites_on_node*sizeof(su3_matrix) );
968  }
969 
970 
971  su3_matrix* id;
972  id = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) );
973  su3_matrix* id4;
974  id4 = (su3_matrix *)malloc(sites_on_node*4*sizeof(su3_matrix) );
975 
976  su3_matrix* temp_mat;
977  temp_mat = (su3_matrix *)malloc(sites_on_node*sizeof(su3_matrix) );
978 
979  // initialise id so that it is the identity matrix on each lattice site
980  set_identity(id,1);
981 
982  printf("Calling hisq reference routine\n");
983  for(sig=0; sig < 8; sig++){
984  shifted_outer_prod(temp_x, temp_mat, OPP_DIR(sig));
985 
986  // One-link term - don't have the savings here that we get when working with the
987  // half-wilson vectors
988  if(GOES_FORWARDS(sig)){
989  u_shift_mat(temp_mat, Pmu, sig, sitelink);
990  add_force_to_momentum(Pmu, id, sig, OneLink, mom);
991  }
992 
993  for(mu = 0; mu < 8; mu++){
994  if ( (mu == sig) || (mu == OPP_DIR(sig))){
995  continue;
996  }
997  // 3 link path
998  // sig
999  // A _______
1000  // | |
1001  // mu /|\ \|/
1002  // | |
1003  //
1004  //
1005  u_shift_mat(temp_mat, Pmu, OPP_DIR(mu), sitelink); // temp_xx[sig] stores |X(x)><X(x-sig)|
1006  u_shift_mat(id, Qmu, OPP_DIR(mu), sitelink); // This returns the path less the outer-product of quark fields at the end
1007  // Qmu = U[mu]
1008 
1009 
1010  u_shift_mat(Pmu, P3, sig, sitelink); // P3 is U[sig](X)U[-mu](X+sig) temp_xx
1011  if (GOES_FORWARDS(sig)){
1012  // add contribution from middle link
1013  add_force_to_momentum(P3, Qmu, sig, mThreeSt, mom); // matrix_mult_na(P3[x],Qmu[x],tmp);
1014  // mom[sig][x] += mThreeSt*tmp;
1015  }
1016  for(nu=0; nu < 8; nu++){
1017  if (nu == sig || nu == OPP_DIR(sig)
1018  || nu == mu || nu == OPP_DIR(mu)){
1019  continue;
1020  }
1021 
1022  /*
1023  5 link path
1024 
1025  sig
1026  A ________
1027  | |
1028  /|\ \|/
1029  | |
1030  \ \
1031  \ \
1032  */
1033 
1034  u_shift_mat(Pmu, Pnumu, OPP_DIR(nu), sitelink);
1035  u_shift_mat(Qmu, Qnumu, OPP_DIR(nu), sitelink);
1036 
1037  u_shift_mat(Pnumu, P5, sig, sitelink);
1038  if (GOES_FORWARDS(sig)){
1039  add_force_to_momentum(P5, Qnumu, sig, FiveSt, mom);
1040  } // seems to do what I think it should
1041 
1042  for(rho =0; rho < 8; rho++){
1043  if (rho == sig || rho == OPP_DIR(sig)
1044  || rho == mu || rho == OPP_DIR(mu)
1045  || rho == nu || rho == OPP_DIR(nu)){
1046  continue;
1047  }
1048 
1049  //7 link path
1050  u_shift_mat(Pnumu, Prhonumu, OPP_DIR(rho), sitelink);
1051  u_shift_mat(Prhonumu, P7, sig, sitelink);
1052 
1053  // Prhonumu = tempmat[2] is not needed again
1054  // => store Qrhonumu in the same memory
1055  u_shift_mat(Qnumu, Qrhonumu, OPP_DIR(rho), sitelink);
1056 
1057 
1058  if(GOES_FORWARDS(sig)){
1059  add_force_to_momentum(P7, Qrhonumu, sig, mSevenSt, mom) ;
1060  }
1061 
1062  u_shift_mat(P7, P7rho, rho, sitelink);
1063  side_link_force(rho, sig, SevenSt, Qnumu, P7, Qrhonumu, P7rho, mom);
1064  if(FiveSt != 0)coeff = SevenSt/FiveSt ; else coeff = 0;
1065  for(i=0; i<V; i++){
1066  scalar_mult_add_su3_matrix(&P5[i], &P7rho[i], coeff, &P5[i]);
1067  } // end loop over volume
1068  } // end loop over rho
1069 
1070 
1071  u_shift_mat(P5, P5nu, nu, sitelink);
1072  side_link_force(nu,sig,mFiveSt,Qmu,P5,Qnumu,P5nu,mom); // I believe this should do what I want it to
1073  // check this!
1074  if(ThreeSt != 0)coeff = FiveSt/ThreeSt; else coeff = 0;
1075 
1076  for(i=0; i<V; i++){
1077  scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
1078  } // end loop over volume
1079  } // end loop over nu
1080 
1081  // Lepage term
1082  u_shift_mat(Pmu, Pnumu, OPP_DIR(mu), sitelink);
1083  u_shift_mat(Qmu, Qnumu, OPP_DIR(mu), sitelink);
1084 
1085  u_shift_mat(Pnumu, P5, sig, sitelink);
1086  if(GOES_FORWARDS(sig)){
1087  add_force_to_momentum(P5, Qnumu, sig, Lepage, mom);
1088  }
1089 
1090  u_shift_mat(P5, P5nu, mu, sitelink);
1091  side_link_force(mu, sig, mLepage, Qmu, P5, Qnumu, P5nu, mom);
1092 
1093 
1094  if(ThreeSt != 0)coeff = Lepage/ThreeSt; else coeff = 0;
1095 
1096  for(i=0; i<V; i++){
1097  scalar_mult_add_su3_matrix(&P3[i], &P5nu[i], coeff, &P3[i]);
1098  }
1099 
1100  if(GOES_FORWARDS(mu)){
1101  u_shift_mat(P3, P3mu, mu, sitelink);
1102  }
1103  side_link_force(mu, sig, ThreeSt, id, P3, Qmu, P3mu, mom);
1104  } // end loop over mu
1105  } // end loop over sig
1106 
1107  for(mu=0; mu<9; mu++){
1108  free(tempmat[mu]);
1109  }
1110 
1111  free(id);
1112  free(id4);
1113  free(temp_mat);
1114 }
1115 
1116 #undef Pmu
1117 #undef Pnumu
1118 #undef Prhonumu
1119 #undef P3
1120 #undef P3mu
1121 #undef P5
1122 #undef P5nu
1123 #undef P7
1124 #undef P7rho
1125 #undef P7rhonu
1126 
1127 #undef Popmu
1128 #undef Pmumumu
1129 
1130 #undef Qmu
1131 #undef Qnumu
1132 #undef Qrhonumu
1133 
1134 
1135 
1136 
1137 
1138 
1139 void halfwilson_hisq_force_reference(float eps, float weight,
1140  void* act_path_coeff, void* temp_x,
1141  void* sitelink, void* mom)
1142 {
1143  do_halfwilson_hisq_force_reference((float)eps, (float)weight,
1144  (fhalf_wilson_vector*) temp_x, (float*)act_path_coeff,
1145  (fsu3_matrix*)sitelink, (fanti_hermitmat*)mom);
1146  return;
1147 }
1148 
1149 
1150 
1151 void halfwilson_hisq_force_reference(double eps, double weight,
1152  void* act_path_coeff, void* temp_x,
1153  void* sitelink, void* mom)
1154 {
1155  do_halfwilson_hisq_force_reference((double)eps, (double)weight,
1156  (dhalf_wilson_vector*) temp_x, (double*)act_path_coeff,
1157  (dsu3_matrix*)sitelink, (danti_hermitmat*)mom);
1158  return;
1159 }
1160 
1161 
1162 
1163 void color_matrix_hisq_force_reference(float eps, float weight,
1164  void* act_path_coeff, void* temp_xx,
1165  void* sitelink, void* mom)
1166 {
1167  do_color_matrix_hisq_force_reference((float)eps, (float)weight,
1168  (fsu3_matrix*) temp_xx, (float*)act_path_coeff,
1169  (fsu3_matrix*)sitelink, (fanti_hermitmat*)mom);
1170  return;
1171 }
1172 
1173 
1174 
#define P7rho
enum QudaPrecision_s QudaPrecision
int y[4]
#define errorQuda(...)
Definition: util_quda.h:73
void print_su3_matrix(su3_matrix *a)
#define P5nu
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
void make_anti_hermitian(su3_matrix *m3, anti_hermitmat *ah3)
int neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:521
void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, int gauge_order)
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
void halfwilson_hisq_force_reference(float eps, float weight, void *act_path_coeff, void *temp_x, void *sitelink, void *mom)
#define Qmu
#define CMUL_J(a, b, c)
#define OPP_DIR(dir)
Definition: force_common.h:16
#define CMUL(a, b, c)
void su3_adjoint(su3_matrix *a, su3_matrix *b)
#define CONJG(a, b)
FloatingPoint< float > Float
Definition: gtest.h:7350
for(int s=0;s< param.Ls;s++)
__constant__ double coeff
#define Qrhonumu
int x[4]
void do_color_matrix_hisq_force_reference(Real eps, Real weight, su3_matrix *temp_xx, Real *act_path_coeff, su3_matrix *sitelink, anti_hermitmat *mom)
void computeHisqOuterProduct(void *src, void *dest, QudaPrecision precision)
int dx[4]
#define P3mu
#define GOES_BACKWARDS(dir)
Definition: force_common.h:18
QudaGaugeFieldOrder gauge_order
Main header file for the QUDA library.
#define P7
#define Pmu
#define P5
#define Qnumu
#define CSUM(a, b)
#define GOES_FORWARDS(dir)
Definition: force_common.h:17
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
#define Pnumu
VOLATILE spinorFloat * s
int Z[4]
Definition: test_util.cpp:28
void do_halfwilson_hisq_force_reference(Real eps, Real weight, half_wilson_vector *temp_x, Real *act_path_coeff, su3_matrix *sitelink, anti_hermitmat *mom)
#define Prhonumu
int Vh
void color_matrix_hisq_force_reference(float eps, float weight, void *act_path_coeff, void *temp_xx, void *sitelink, void *mom)
su3_matrix * get_su3_matrix(int gauge_order, su3_matrix *p, int idx, int dir)
int V
Definition: test_util.cpp:29
#define CMULJ_(a, b, c)
#define P3