QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dslash_util.h
Go to the documentation of this file.
1 #ifndef _DSLASH_UTIL_H
2 #define _DSLASH_UTIL_H
3 
4 #include <test_util.h>
5 
6 template <typename Float>
7 static inline void sum(Float *dst, Float *a, Float *b, int cnt) {
8  for (int i = 0; i < cnt; i++)
9  dst[i] = a[i] + b[i];
10 }
11 
12 template <typename Float>
13 static inline void sub(Float *dst, Float *a, Float *b, int cnt) {
14  for (int i = 0; i < cnt; i++)
15  dst[i] = a[i] - b[i];
16 }
17 
18 template <typename Float>
19 static inline void ax(Float *dst, Float a, Float *x, int cnt) {
20  for (int i = 0; i < cnt; i++)
21  dst[i] = a * x[i];
22 }
23 
24 // performs the operation y[i] = x[i] + a*y[i]
25 template <typename Float>
26 static inline void xpay(Float *x, Float a, Float *y, int len) {
27  for (int i=0; i<len; i++) y[i] = x[i] + a*y[i];
28 }
29 
30 // performs the operation y[i] = a*x[i] + y[i]
31 template <typename Float>
32 static inline void axpy(Float a, Float *x, Float *y, int len) {
33  for (int i=0; i<len; i++) y[i] = a*x[i] + y[i];
34 }
35 
36 // performs the operation y[i] = a*x[i] + b*y[i]
37 template <typename Float>
38 static inline void axpby(Float a, Float *x, Float b, Float *y, int len) {
39  for (int i=0; i<len; i++) y[i] = a*x[i] + b*y[i];
40 }
41 
42 // performs the operation y[i] = a*x[i] - y[i]
43 template <typename Float>
44 static inline void axmy(Float *x, Float a, Float *y, int len) {
45  for (int i=0; i<len; i++) y[i] = a*x[i] - y[i];
46 }
47 
48 template <typename Float>
49 static double norm2(Float *v, int len) {
50  double sum=0.0;
51  for (int i=0; i<len; i++) sum += v[i]*v[i];
52  return sum;
53 }
54 
55 template <typename Float>
56 static inline void negx(Float *x, int len) {
57  for (int i=0; i<len; i++) x[i] = -x[i];
58 }
59 
60 template <typename sFloat, typename gFloat>
61 static inline void dot(sFloat* res, gFloat* a, sFloat* b) {
62  res[0] = res[1] = 0;
63  for (int m = 0; m < 3; m++) {
64  sFloat a_re = a[2*m+0];
65  sFloat a_im = a[2*m+1];
66  sFloat b_re = b[2*m+0];
67  sFloat b_im = b[2*m+1];
68  res[0] += a_re * b_re - a_im * b_im;
69  res[1] += a_re * b_im + a_im * b_re;
70  }
71 }
72 
73 template <typename Float>
74 static inline void su3Transpose(Float *res, Float *mat) {
75  for (int m = 0; m < 3; m++) {
76  for (int n = 0; n < 3; n++) {
77  res[m*(3*2) + n*(2) + 0] = + mat[n*(3*2) + m*(2) + 0];
78  res[m*(3*2) + n*(2) + 1] = - mat[n*(3*2) + m*(2) + 1];
79  }
80  }
81 }
82 
83 
84 template <typename sFloat, typename gFloat>
85 static inline void su3Mul(sFloat *res, gFloat *mat, sFloat *vec) {
86  for (int n = 0; n < 3; n++) dot(&res[n*(2)], &mat[n*(3*2)], vec);
87 }
88 
89 template <typename sFloat, typename gFloat>
90 static inline void su3Tmul(sFloat *res, gFloat *mat, sFloat *vec) {
91  gFloat matT[3*3*2];
92  su3Transpose(matT, mat);
93  su3Mul(res, matT, vec);
94 }
95 
96 
97 // i represents a "half index" into an even or odd "half lattice".
98 // when oddBit={0,1} the half lattice is {even,odd}.
99 //
100 // the displacements, such as dx, refer to the full lattice coordinates.
101 //
102 // neighborIndex() takes a "half index", displaces it, and returns the
103 // new "half index", which can be an index into either the even or odd lattices.
104 // displacements of magnitude one always interchange odd and even lattices.
105 //
106 
107 
108 template <typename Float>
109 static inline Float *gaugeLink(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, int nbr_distance) {
110  Float **gaugeField;
111  int j;
112  int d = nbr_distance;
113  if (dir % 2 == 0) {
114  j = i;
115  gaugeField = (oddBit ? gaugeOdd : gaugeEven);
116  }
117  else {
118  switch (dir) {
119  case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -d); break;
120  case 3: j = neighborIndex(i, oddBit, 0, 0, -d, 0); break;
121  case 5: j = neighborIndex(i, oddBit, 0, -d, 0, 0); break;
122  case 7: j = neighborIndex(i, oddBit, -d, 0, 0, 0); break;
123  default: j = -1; break;
124  }
125  gaugeField = (oddBit ? gaugeEven : gaugeOdd);
126  }
127 
128  return &gaugeField[dir/2][j*(3*3*2)];
129 }
130 
131 template <typename Float>
132 static inline Float *spinorNeighbor(int i, int dir, int oddBit, Float *spinorField, int neighbor_distance)
133 {
134  int j;
135  int nb = neighbor_distance;
136  switch (dir) {
137  case 0: j = neighborIndex(i, oddBit, 0, 0, 0, +nb); break;
138  case 1: j = neighborIndex(i, oddBit, 0, 0, 0, -nb); break;
139  case 2: j = neighborIndex(i, oddBit, 0, 0, +nb, 0); break;
140  case 3: j = neighborIndex(i, oddBit, 0, 0, -nb, 0); break;
141  case 4: j = neighborIndex(i, oddBit, 0, +nb, 0, 0); break;
142  case 5: j = neighborIndex(i, oddBit, 0, -nb, 0, 0); break;
143  case 6: j = neighborIndex(i, oddBit, +nb, 0, 0, 0); break;
144  case 7: j = neighborIndex(i, oddBit, -nb, 0, 0, 0); break;
145  default: j = -1; break;
146  }
147 
148  return &spinorField[j*(mySpinorSiteSize)];
149 }
150 
151 
152 #ifdef MULTI_GPU
153 
154 static inline int
155 x4_mg(int i, int oddBit)
156 {
157  int Y = fullLatticeIndex(i, oddBit);
158  int x4 = Y/(Z[2]*Z[1]*Z[0]);
159  return x4;
160 }
161 
162 template <typename Float>
163 static inline Float *gaugeLink_mg4dir(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd,
164  Float** ghostGaugeEven, Float** ghostGaugeOdd, int n_ghost_faces, int nbr_distance) {
165  Float **gaugeField;
166  int j;
167  int d = nbr_distance;
168  if (dir % 2 == 0) {
169  j = i;
170  gaugeField = (oddBit ? gaugeOdd : gaugeEven);
171  }
172  else {
173 
174  int Y = fullLatticeIndex(i, oddBit);
175  int x4 = Y/(Z[2]*Z[1]*Z[0]);
176  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
177  int x2 = (Y/Z[0]) % Z[1];
178  int x1 = Y % Z[0];
179  int X1= Z[0];
180  int X2= Z[1];
181  int X3= Z[2];
182  int X4= Z[3];
183  Float* ghostGaugeField;
184 
185  switch (dir) {
186  case 1:
187  { //-X direction
188  int new_x1 = (x1 - d + X1 )% X1;
189  if (x1 -d < 0){
190  ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
191  int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
192  return &ghostGaugeField[offset*(3*3*2)];
193  }
194  j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
195  break;
196  }
197  case 3:
198  { //-Y direction
199  int new_x2 = (x2 - d + X2 )% X2;
200  if (x2 -d < 0){
201  ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
202  int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
203  return &ghostGaugeField[offset*(3*3*2)];
204  }
205  j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
206  break;
207 
208  }
209  case 5:
210  { //-Z direction
211  int new_x3 = (x3 - d + X3 )% X3;
212  if (x3 -d < 0){
213  ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
214  int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
215  return &ghostGaugeField[offset*(3*3*2)];
216  }
217  j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
218  break;
219  }
220  case 7:
221  { //-T direction
222  int new_x4 = (x4 - d + X4)% X4;
223  if (x4 -d < 0){
224  ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
225  int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
226  return &ghostGaugeField[offset*(3*3*2)];
227  }
228  j = (new_x4*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;
229  break;
230  }//7
231 
232  default: j = -1; printf("ERROR: wrong dir \n"); exit(1);
233  }
234  gaugeField = (oddBit ? gaugeEven : gaugeOdd);
235 
236  }
237 
238  return &gaugeField[dir/2][j*(3*3*2)];
239 }
240 
241 template <typename Float>
242 static inline Float *spinorNeighbor_mg4dir(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor,
243  Float** back_nbr_spinor, int neighbor_distance, int nFace)
244 {
245  int j;
246  int nb = neighbor_distance;
247  int Y = fullLatticeIndex(i, oddBit);
248  int x4 = Y/(Z[2]*Z[1]*Z[0]);
249  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
250  int x2 = (Y/Z[0]) % Z[1];
251  int x1 = Y % Z[0];
252  int X1= Z[0];
253  int X2= Z[1];
254  int X3= Z[2];
255  int X4= Z[3];
256 
257  switch (dir) {
258  case 0://+X
259  {
260  int new_x1 = (x1 + nb)% X1;
261  if(x1+nb >=X1){
262  int offset = ( x1 + nb -X1)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
263  return fwd_nbr_spinor[0] + offset*mySpinorSiteSize;
264  }
265  j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
266  break;
267 
268  }
269  case 1://-X
270  {
271  int new_x1 = (x1 - nb + X1)% X1;
272  if(x1 - nb < 0){
273  int offset = ( x1+nFace- nb)*X4*X3*X2/2+(x4*X3*X2 + x3*X2+x2)/2;
274  return back_nbr_spinor[0] + offset*mySpinorSiteSize;
275  }
276  j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
277  break;
278  }
279  case 2://+Y
280  {
281  int new_x2 = (x2 + nb)% X2;
282  if(x2+nb >=X2){
283  int offset = ( x2 + nb -X2)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
284  return fwd_nbr_spinor[1] + offset*mySpinorSiteSize;
285  }
286  j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
287  break;
288  }
289  case 3:// -Y
290  {
291  int new_x2 = (x2 - nb + X2)% X2;
292  if(x2 - nb < 0){
293  int offset = ( x2 + nFace -nb)*X4*X3*X1/2+(x4*X3*X1 + x3*X1+x1)/2;
294  return back_nbr_spinor[1] + offset*mySpinorSiteSize;
295  }
296  j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
297  break;
298  }
299  case 4://+Z
300  {
301  int new_x3 = (x3 + nb)% X3;
302  if(x3+nb >=X3){
303  int offset = ( x3 + nb -X3)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
304  return fwd_nbr_spinor[2] + offset*mySpinorSiteSize;
305  }
306  j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
307  break;
308  }
309  case 5://-Z
310  {
311  int new_x3 = (x3 - nb + X3)% X3;
312  if(x3 - nb < 0){
313  int offset = ( x3 + nFace -nb)*X4*X2*X1/2+(x4*X2*X1 + x2*X1+x1)/2;
314  return back_nbr_spinor[2] + offset*mySpinorSiteSize;
315  }
316  j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
317  break;
318  }
319  case 6://+T
320  {
321  j = neighborIndex_mg(i, oddBit, +nb, 0, 0, 0);
322  int x4 = x4_mg(i, oddBit);
323  if ( (x4 + nb) >= Z[3]){
324  int offset = (x4+nb - Z[3])*Vsh_t;
325  return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
326  }
327  break;
328  }
329  case 7://-T
330  {
331  j = neighborIndex_mg(i, oddBit, -nb, 0, 0, 0);
332  int x4 = x4_mg(i, oddBit);
333  if ( (x4 - nb) < 0){
334  int offset = ( x4 - nb +nFace)*Vsh_t;
335  return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
336  }
337  break;
338  }
339  default: j = -1; printf("ERROR: wrong dir\n"); exit(1);
340  }
341 
342  return &spinorField[j*(mySpinorSiteSize)];
343 }
344 
345 #endif // MULTI_GPU
346 
347 #endif // _DSLASH_UTIL_H
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
__constant__ int X2
int y[4]
void axpby(const Float &a, const Float *x, const Float &b, Float *y, const int N)
Definition: blas_cpu.cpp:8
__constant__ int X1
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
#define Vsh_t
Definition: llfat_core.h:4
void ax(double a, void *x, int len, QudaPrecision precision)
int fullLatticeIndex(int dim[4], int index, int oddBit)
Definition: test_util.cpp:400
FloatingPoint< float > Float
Definition: gtest.h:7350
#define mySpinorSiteSize
double norm2(Float *v, int len)
int x[4]
int neighborIndex_mg(int i, int oddBit, int dx4, int dx3, int dx2, int dx1)
Definition: test_util.cpp:485
void axpy(double a, void *x, void *y, int len, QudaPrecision precision)
int Z[4]
Definition: test_util.cpp:28
__constant__ int X3
int oddBit
__constant__ int X4