QUDA  v0.7.0
A library for QCD on GPUs
domain_wall_dslash_reference.cpp
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <math.h>
6 
7 #include <quda.h>
8 #include <test_util.h>
9 #include <dslash_util.h>
11 #include <blas_reference.h>
12 
13 #include <gauge_field.h>
14 #include <color_spinor_field.h>
15 #include <face_quda.h>
16 
17 using namespace quda;
18 
19 // i represents a "half index" into an even or odd "half lattice".
20 // when oddBit={0,1} the half lattice is {even,odd}.
21 //
22 // the displacements, such as dx, refer to the full lattice coordinates.
23 //
24 // neighborIndex() takes a "half index", displaces it, and returns the
25 // new "half index", which can be an index into either the even or odd lattices.
26 // displacements of magnitude one always interchange odd and even lattices.
27 //
28 //
29 int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1) {
30  // fullLatticeIndex was adapted into fullLatticeIndex_4d; it lives in util_quda.cpp.
31  // This code may not handle 5d preconditioning (5dPC) properly.
32  int X = fullLatticeIndex_5d(i, oddBit);
33  // Checked that this matches code in dslash_core_ante.h.
34  int xs = X/(Z[3]*Z[2]*Z[1]*Z[0]);
35  int x4 = (X/(Z[2]*Z[1]*Z[0])) % Z[3];
36  int x3 = (X/(Z[1]*Z[0])) % Z[2];
37  int x2 = (X/Z[0]) % Z[1];
38  int x1 = X % Z[0];
39  // Displace and project back into domain 0,...,Ls-1.
40  // Note that we add Ls so the left operand of % stays non-negative
41  // (in C, the % operator can return a negative result).
42  xs = (xs+dxs+Ls) % Ls;
43  // Etc.
44  x4 = (x4+dx4+Z[3]) % Z[3];
45  x3 = (x3+dx3+Z[2]) % Z[2];
46  x2 = (x2+dx2+Z[1]) % Z[1];
47  x1 = (x1+dx1+Z[0]) % Z[0];
48  // Return linear half index. Remember that integer division
49  // rounds down.
50  return (xs*(Z[3]*Z[2]*Z[1]*Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
51 }
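// --- Annotation (not part of the original file): a minimal sketch of the
// lexicographic decomposition used above. Splitting a full 5d index X into
// (xs,x4,x3,x2,x1) and recombining must give back X; dividing by two at the
// end converts the full index into a "half index", since each checkerboard
// stores only the sites of one parity.
//
//   int X  = fullLatticeIndex_5d(i, oddBit);
//   int x1 = X % Z[0];
//   int x2 = (X / Z[0]) % Z[1];
//   int x3 = (X / (Z[1]*Z[0])) % Z[2];
//   int x4 = (X / (Z[2]*Z[1]*Z[0])) % Z[3];
//   int xs =  X / (Z[3]*Z[2]*Z[1]*Z[0]);
//   assert(X == xs*Z[3]*Z[2]*Z[1]*Z[0] + x4*Z[2]*Z[1]*Z[0]
//            + x3*Z[1]*Z[0] + x2*Z[0] + x1);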
52 
53 // index calculator for 4d even-odd preconditioning method
54 int neighborIndex_5d_4dpc(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1) {
55  // fullLatticeIndex was adapted into fullLatticeIndex_4d; it lives in util_quda.cpp.
56  // This code may not handle 5d preconditioning (5dPC) properly.
57  int X = fullLatticeIndex_5d_4dpc(i, oddBit);
58  // Checked that this matches code in dslash_core_ante.h.
59  int xs = X/(Z[3]*Z[2]*Z[1]*Z[0]);
60  int x4 = (X/(Z[2]*Z[1]*Z[0])) % Z[3];
61  int x3 = (X/(Z[1]*Z[0])) % Z[2];
62  int x2 = (X/Z[0]) % Z[1];
63  int x1 = X % Z[0];
64  // Displace and project back into domain 0,...,Ls-1.
65  // Note that we add Ls so the left operand of % stays non-negative
66  // (in C, the % operator can return a negative result).
67  xs = (xs+dxs+Ls) % Ls;
68  // Etc.
69  x4 = (x4+dx4+Z[3]) % Z[3];
70  x3 = (x3+dx3+Z[2]) % Z[2];
71  x2 = (x2+dx2+Z[1]) % Z[1];
72  x1 = (x1+dx1+Z[0]) % Z[0];
73  // Return linear half index. Remember that integer division
74  // rounds down.
75  return (xs*(Z[3]*Z[2]*Z[1]*Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
76 }
77 
78 // i represents a "half index" into an even or odd "half lattice".
79 // when oddBit={0,1} the half lattice is {even,odd}.
80 //
81 // the displacements, such as dx, refer to the full lattice coordinates.
82 //
83 // neighborIndex() takes a "half index", displaces it, and returns the
84 // new "half index", which can be an index into either the even or odd lattices.
85 // displacements of magnitude one always interchange odd and even lattices.
86 //
87 //
88 int neighborIndex_4d(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
89  // On input i should be in the range [0 , ... , Z[0]*Z[1]*Z[2]*Z[3]/2-1].
90  if (i < 0 || i >= (Z[0]*Z[1]*Z[2]*Z[3]/2))
91  { printf("i out of range in neighborIndex_4d\n"); exit(-1); }
92  // Compute the linear index. Then dissect.
93  // fullLatticeIndex_4d is in util_quda.cpp.
94  // The gauge fields live on a 4d sublattice.
95  int X = fullLatticeIndex_4d(i, oddBit);
96  int x4 = X/(Z[2]*Z[1]*Z[0]);
97  int x3 = (X/(Z[1]*Z[0])) % Z[2];
98  int x2 = (X/Z[0]) % Z[1];
99  int x1 = X % Z[0];
100 
101  x4 = (x4+dx4+Z[3]) % Z[3];
102  x3 = (x3+dx3+Z[2]) % Z[2];
103  x2 = (x2+dx2+Z[1]) % Z[1];
104  x1 = (x1+dx1+Z[0]) % Z[0];
105 
106  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
107 }
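// --- Annotation (not part of the original file): the "+ Z[d]" in the wrap
// arithmetic above is needed because in C/C++ the % operator takes the sign
// of the dividend, so (0 - 1) % 8 == -1, not 7. Adding the extent first
// keeps the left operand non-negative for any |dx| <= Z[d]:
//
//   int wrap(int x, int dx, int extent) { return (x + dx + extent) % extent; }
//   // wrap(0, -1, 8) == 7, whereas (0 - 1) % 8 == -1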
108 
109 //BEGIN NEW
110 //#ifdef MULTI_GPU
111 int
112 neighborIndex_5d_mgpu(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
113 {
114  int ret;
115 
116  int Y = fullLatticeIndex_5d(i, oddBit);
117 
118  int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
119  int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
120  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
121  int x2 = (Y/Z[0]) % Z[1];
122  int x1 = Y % Z[0];
123 
124  int ghost_x4 = x4 + dx4;
125 
126  xs = (xs+dxs+Ls) % Ls;
127  x4 = (x4+dx4+Z[3]) % Z[3];
128  x3 = (x3+dx3+Z[2]) % Z[2];
129  x2 = (x2+dx2+Z[1]) % Z[1];
130  x1 = (x1+dx1+Z[0]) % Z[0];
131 
132  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
133  ret = (xs*Z[3]*Z[2]*Z[1]*Z[0] + x4*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
134  }else{
135  ret = (xs*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
136  }
137 
138  return ret;
139 }
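// --- Annotation (not part of the original file): when the displaced site
// leaves the local time extent (ghost_x4 < 0 or ghost_x4 >= Z[3]), the x4
// stride is dropped and the return value is a half index into a T-face
// buffer of Ls*Z[2]*Z[1]*Z[0]/2 sites rather than into the local volume;
// the caller (spinorNeighbor_5d_mgpu below) adds the ghost-layer offset,
// a multiple of Vsh_t.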
140 
141 int neighborIndex_5d_4dpc_mgpu(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
142 {
143  int ret;
144 
145  int Y = fullLatticeIndex_5d_4dpc(i, oddBit);
146 
147  int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
148  int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
149  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
150  int x2 = (Y/Z[0]) % Z[1];
151  int x1 = Y % Z[0];
152 
153  int ghost_x4 = x4 + dx4;
154 
155  xs = (xs+dxs+Ls) % Ls;
156  x4 = (x4+dx4+Z[3]) % Z[3];
157  x3 = (x3+dx3+Z[2]) % Z[2];
158  x2 = (x2+dx2+Z[1]) % Z[1];
159  x1 = (x1+dx1+Z[0]) % Z[0];
160 
161  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
162  ret = (xs*Z[3]*Z[2]*Z[1]*Z[0] + x4*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
163  }else{
164  ret = (xs*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
165  }
166 
167  return ret;
168 }
169 
170 int
171 x4_5d_mgpu(int i, int oddBit)
172 {
173  int Y = fullLatticeIndex_5d(i, oddBit);
174  return (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
175 }
176 
177 //#endif
178 int x4_5d_4dpc_mgpu(int i, int oddBit)
179 {
180  int Y = fullLatticeIndex_5d_4dpc(i, oddBit);
181  return (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
182 }
183 
184 //#endif
185 
186 //END NEW
187 
188 //#ifndef MULTI_GPU
189 // This is just a copy of gaugeLink() from the quda code, except
190 // that neighborIndex() is replaced by the renamed version
191 // neighborIndex_4d().
192 //ok
193 template <typename Float>
194 Float *gaugeLink_sgpu(int i, int dir, int oddBit, Float **gaugeEven,
195  Float **gaugeOdd) {
196  Float **gaugeField;
197  int j;
198 
199  // If going forward, just grab link at site, U_\mu(x).
200  if (dir % 2 == 0) {
201  j = i;
202  // j will get used in the return statement below.
203  gaugeField = (oddBit ? gaugeOdd : gaugeEven);
204  } else {
205  // If going backward, a shift must occur, U_\mu(x-\muhat)^\dagger;
206  // dagger happens elsewhere, here we're just doing index gymnastics.
207  switch (dir) {
208  case 1: j = neighborIndex_4d(i, oddBit, 0, 0, 0, -1); break;
209  case 3: j = neighborIndex_4d(i, oddBit, 0, 0, -1, 0); break;
210  case 5: j = neighborIndex_4d(i, oddBit, 0, -1, 0, 0); break;
211  case 7: j = neighborIndex_4d(i, oddBit, -1, 0, 0, 0); break;
212  default: j = -1; break;
213  }
214  gaugeField = (oddBit ? gaugeEven : gaugeOdd);
215  }
216 
217  return &gaugeField[dir/2][j*(3*3*2)];
218 }
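// --- Annotation (not part of the original file): directions are encoded as
// dir = 2*mu + (0 for a forward hop, 1 for a backward hop), mu = 0..3, so
// gaugeField[dir/2] selects U_mu. A hedged usage sketch, assuming
// double-precision fields laid out as above:
//
//   int mu = 0;
//   double *link = gaugeLink_sgpu(i, 2*mu, oddBit, gaugeEven, gaugeOdd);
//   // link points at U_mu(x): a 3x3 complex matrix stored as 18 doubles.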
219 
220 
221 //#else
222 
223 //Standard 4d version (nothing to change)
224 template <typename Float>
225 Float *gaugeLink_mgpu(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, Float** ghostGaugeEven, Float** ghostGaugeOdd, int n_ghost_faces, int nbr_distance) {
226  Float **gaugeField;
227  int j;
228  int d = nbr_distance;
229  if (dir % 2 == 0) {
230  j = i;
231  gaugeField = (oddBit ? gaugeOdd : gaugeEven);
232  }
233  else {
234 
235  int Y = fullLatticeIndex(i, oddBit);
236  int x4 = Y/(Z[2]*Z[1]*Z[0]);
237  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
238  int x2 = (Y/Z[0]) % Z[1];
239  int x1 = Y % Z[0];
240  int X1= Z[0];
241  int X2= Z[1];
242  int X3= Z[2];
243  int X4= Z[3];
244  Float* ghostGaugeField;
245 
246  switch (dir) {
247  case 1:
248  { //-X direction
249  int new_x1 = (x1 - d + X1 )% X1;
250  if (x1 -d < 0){
251  ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
252  int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
253  return &ghostGaugeField[offset*(3*3*2)];
254  }
255  j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
256  break;
257  }
258  case 3:
259  { //-Y direction
260  int new_x2 = (x2 - d + X2 )% X2;
261  if (x2 -d < 0){
262  ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
263  int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
264  return &ghostGaugeField[offset*(3*3*2)];
265  }
266  j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
267  break;
268 
269  }
270  case 5:
271  { //-Z direction
272  int new_x3 = (x3 - d + X3 )% X3;
273  if (x3 -d < 0){
274  ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
275  int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
276  return &ghostGaugeField[offset*(3*3*2)];
277  }
278  j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
279  break;
280  }
281  case 7:
282  { //-T direction
283  int new_x4 = (x4 - d + X4)% X4;
284  if (x4 -d < 0){
285  ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
286  int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
287  return &ghostGaugeField[offset*(3*3*2)];
288  }
289  j = (new_x4*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;
290  break;
291  }//7
292 
293  default: j = -1; printf("ERROR: wrong dir \n"); exit(1);
294  }
295  gaugeField = (oddBit ? gaugeEven : gaugeOdd);
296 
297  }
298 
299  return &gaugeField[dir/2][j*(3*3*2)];
300 }
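// --- Annotation (not part of the original file): for a backward hop that
// crosses the local boundary (x_mu - d < 0), the link is fetched from the
// ghost buffer of that dimension instead of the local field. In the offset
// (n_ghost_faces + x_mu - d)*face_volume/2 + (index within the face)/2, the
// first term selects the ghost layer and the second is the half index of the
// site inside the 3d face, mirroring the even/odd layout of the bulk field.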
301 
302 //A.S.: this is valid for DW dslash with space-time decomposition.
303 template <typename Float>
304 Float *spinorNeighbor_5d_mgpu(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor, Float** back_nbr_spinor, int neighbor_distance, int nFace)
305 {
306  int j;
307  int nb = neighbor_distance;
308  int Y = fullLatticeIndex_5d(i, oddBit);
309 
310  int mySpinorSiteSize = 24;
311 
312  int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
313  int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
314  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
315  int x2 = (Y/Z[0]) % Z[1];
316  int x1 = Y % Z[0];
317 
318  int X1= Z[0];
319  int X2= Z[1];
320  int X3= Z[2];
321  int X4= Z[3];
322  switch (dir) {
323  case 0://+X
324  {
325  int new_x1 = (x1 + nb)% X1;
326  if(x1+nb >=X1){
327  int offset = ((x1 + nb -X1)*Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
328  return fwd_nbr_spinor[0] + offset*mySpinorSiteSize;
329  }
330  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
331  break;
332 
333  }
334  case 1://-X
335  {
336  int new_x1 = (x1 - nb + X1)% X1;
337  if(x1 - nb < 0){
338  int offset = (( x1+nFace- nb)*Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
339  return back_nbr_spinor[0] + offset*mySpinorSiteSize;
340  }
341  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
342  break;
343  }
344  case 2://+Y
345  {
346  int new_x2 = (x2 + nb)% X2;
347  if(x2+nb >=X2){
348  int offset = (( x2 + nb -X2)*Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
349  return fwd_nbr_spinor[1] + offset*mySpinorSiteSize;
350  }
351  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
352  break;
353  }
354  case 3:// -Y
355  {
356  int new_x2 = (x2 - nb + X2)% X2;
357  if(x2 - nb < 0){
358  int offset = (( x2 + nFace -nb)*Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
359  return back_nbr_spinor[1] + offset*mySpinorSiteSize;
360  }
361  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
362  break;
363  }
364  case 4://+Z
365  {
366  int new_x3 = (x3 + nb)% X3;
367  if(x3+nb >=X3){
368  int offset = (( x3 + nb -X3)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
369  return fwd_nbr_spinor[2] + offset*mySpinorSiteSize;
370  }
371  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
372  break;
373  }
374  case 5://-Z
375  {
376  int new_x3 = (x3 - nb + X3)% X3;
377  if(x3 - nb < 0){
378  int offset = (( x3 + nFace -nb)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
379  return back_nbr_spinor[2] + offset*mySpinorSiteSize;
380  }
381  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
382  break;
383  }
384  case 6://+T
385  {
386  j = neighborIndex_5d_mgpu(i, oddBit, 0, +nb, 0, 0, 0);
387  int x4 = x4_5d_mgpu(i, oddBit);
388  if ( (x4 + nb) >= Z[3])
389  {
390  int offset = (x4+nb - Z[3])*Vsh_t;//?
391  return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
392  }
393  break;
394  }
395  case 7://-T
396  {
397  j = neighborIndex_5d_mgpu(i, oddBit, 0, -nb, 0, 0, 0);
398  int x4 = x4_5d_mgpu(i, oddBit);
399  if ( (x4 - nb) < 0){
400  int offset = ( x4 - nb +nFace)*Vsh_t;//?
401  return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
402  }
403  break;
404  }
405  default: j = -1; printf("ERROR: wrong dir\n"); exit(1);
406  }
407 
408  return &spinorField[j*(mySpinorSiteSize)];
409 }
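// --- Annotation (not part of the original file): for hops that cross a
// local boundary the return value points into fwd_nbr_spinor[d] or
// back_nbr_spinor[d] rather than the local field. Each ghost buffer holds
// nFace layers, and each layer covers the Ls * (3d face) half volume with
// 24 reals (mySpinorSiteSize) per site; in-bulk hops fall through to a
// plain half index into spinorField.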
410 
411 template <typename Float>
412 Float *spinorNeighbor_5d_4dpc_mgpu(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor, Float** back_nbr_spinor, int neighbor_distance, int nFace)
413 {
414  int j;
415  int nb = neighbor_distance;
416  int Y = fullLatticeIndex_5d_4dpc(i, oddBit);
417 
418  int mySpinorSiteSize = 24;
419 
420  int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
421  int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
422  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
423  int x2 = (Y/Z[0]) % Z[1];
424  int x1 = Y % Z[0];
425 
426  int X1= Z[0];
427  int X2= Z[1];
428  int X3= Z[2];
429  int X4= Z[3];
430  switch (dir) {
431  case 0://+X
432  {
433  int new_x1 = (x1 + nb)% X1;
434  if(x1+nb >=X1){
435  int offset = ((x1 + nb -X1)*Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
436  return fwd_nbr_spinor[0] + offset*mySpinorSiteSize;
437  }
438  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
439  break;
440 
441  }
442  case 1://-X
443  {
444  int new_x1 = (x1 - nb + X1)% X1;
445  if(x1 - nb < 0){
446  int offset = (( x1+nFace- nb)*Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
447  return back_nbr_spinor[0] + offset*mySpinorSiteSize;
448  }
449  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
450  break;
451  }
452  case 2://+Y
453  {
454  int new_x2 = (x2 + nb)% X2;
455  if(x2+nb >=X2){
456  int offset = (( x2 + nb -X2)*Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
457  return fwd_nbr_spinor[1] + offset*mySpinorSiteSize;
458  }
459  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
460  break;
461  }
462  case 3:// -Y
463  {
464  int new_x2 = (x2 - nb + X2)% X2;
465  if(x2 - nb < 0){
466  int offset = (( x2 + nFace -nb)*Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
467  return back_nbr_spinor[1] + offset*mySpinorSiteSize;
468  }
469  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
470  break;
471  }
472  case 4://+Z
473  {
474  int new_x3 = (x3 + nb)% X3;
475  if(x3+nb >=X3){
476  int offset = (( x3 + nb -X3)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
477  return fwd_nbr_spinor[2] + offset*mySpinorSiteSize;
478  }
479  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
480  break;
481  }
482  case 5://-Z
483  {
484  int new_x3 = (x3 - nb + X3)% X3;
485  if(x3 - nb < 0){
486  int offset = (( x3 + nFace -nb)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
487  return back_nbr_spinor[2] + offset*mySpinorSiteSize;
488  }
489  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
490  break;
491  }
492  case 6://+T
493  {
494  j = neighborIndex_5d_4dpc_mgpu(i, oddBit, 0, +nb, 0, 0, 0);
495  int x4 = x4_5d_4dpc_mgpu(i, oddBit);
496  if ( (x4 + nb) >= Z[3])
497  {
498  int offset = (x4+nb - Z[3])*Vsh_t;//?
499  return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
500  }
501  break;
502  }
503  case 7://-T
504  {
505  j = neighborIndex_5d_4dpc_mgpu(i, oddBit, 0, -nb, 0, 0, 0);
506  int x4 = x4_5d_4dpc_mgpu(i, oddBit);
507  if ( (x4 - nb) < 0){
508  int offset = ( x4 - nb +nFace)*Vsh_t;//?
509  return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
510  }
511  break;
512  }
513  default: j = -1; printf("ERROR: wrong dir\n"); exit(1);
514  }
515 
516  return &spinorField[j*(mySpinorSiteSize)];
517 }
518 
519 //#endif
520 
521 
522 template <typename Float>
523 Float *spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField) {
524  int j;
525  switch (dir) {
526  case 0: j = neighborIndex_5d(i, oddBit, 0, 0, 0, 0, +1); break;
527  case 1: j = neighborIndex_5d(i, oddBit, 0, 0, 0, 0, -1); break;
528  case 2: j = neighborIndex_5d(i, oddBit, 0, 0, 0, +1, 0); break;
529  case 3: j = neighborIndex_5d(i, oddBit, 0, 0, 0, -1, 0); break;
530  case 4: j = neighborIndex_5d(i, oddBit, 0, 0, +1, 0, 0); break;
531  case 5: j = neighborIndex_5d(i, oddBit, 0, 0, -1, 0, 0); break;
532  case 6: j = neighborIndex_5d(i, oddBit, 0, +1, 0, 0, 0); break;
533  case 7: j = neighborIndex_5d(i, oddBit, 0, -1, 0, 0, 0); break;
534  case 8: j = neighborIndex_5d(i, oddBit, +1, 0, 0, 0, 0); break;
535  case 9: j = neighborIndex_5d(i, oddBit, -1, 0, 0, 0, 0); break;
536  default: j = -1; break;
537  }
538 
539  return &spinorField[j*(4*3*2)];
540 }
541 
542 template <typename Float>
543 Float *spinorNeighbor_5d_4dpc(int i, int dir, int oddBit, Float *spinorField) {
544  int j;
545  switch (dir) {
546  case 0: j = neighborIndex_5d_4dpc(i, oddBit, 0, 0, 0, 0, +1); break;
547  case 1: j = neighborIndex_5d_4dpc(i, oddBit, 0, 0, 0, 0, -1); break;
548  case 2: j = neighborIndex_5d_4dpc(i, oddBit, 0, 0, 0, +1, 0); break;
549  case 3: j = neighborIndex_5d_4dpc(i, oddBit, 0, 0, 0, -1, 0); break;
550  case 4: j = neighborIndex_5d_4dpc(i, oddBit, 0, 0, +1, 0, 0); break;
551  case 5: j = neighborIndex_5d_4dpc(i, oddBit, 0, 0, -1, 0, 0); break;
552  case 6: j = neighborIndex_5d_4dpc(i, oddBit, 0, +1, 0, 0, 0); break;
553  case 7: j = neighborIndex_5d_4dpc(i, oddBit, 0, -1, 0, 0, 0); break;
554  case 8: j = neighborIndex_5d_4dpc(i, oddBit, +1, 0, 0, 0, 0); break;
555  case 9: j = neighborIndex_5d_4dpc(i, oddBit, -1, 0, 0, 0, 0); break;
556  default: j = -1; break;
557  }
558 
559  return &spinorField[j*(4*3*2)];
560 }
561 
562 
563 //J Directions 0..7 were used in the 4d code.
564 //J Directions 8,9 will be for P_- and P_+, chiral
565 //J projectors.
566 const double projector[10][4][4][2] = {
567  {
568  {{1,0}, {0,0}, {0,0}, {0,-1}},
569  {{0,0}, {1,0}, {0,-1}, {0,0}},
570  {{0,0}, {0,1}, {1,0}, {0,0}},
571  {{0,1}, {0,0}, {0,0}, {1,0}}
572  },
573  {
574  {{1,0}, {0,0}, {0,0}, {0,1}},
575  {{0,0}, {1,0}, {0,1}, {0,0}},
576  {{0,0}, {0,-1}, {1,0}, {0,0}},
577  {{0,-1}, {0,0}, {0,0}, {1,0}}
578  },
579  {
580  {{1,0}, {0,0}, {0,0}, {1,0}},
581  {{0,0}, {1,0}, {-1,0}, {0,0}},
582  {{0,0}, {-1,0}, {1,0}, {0,0}},
583  {{1,0}, {0,0}, {0,0}, {1,0}}
584  },
585  {
586  {{1,0}, {0,0}, {0,0}, {-1,0}},
587  {{0,0}, {1,0}, {1,0}, {0,0}},
588  {{0,0}, {1,0}, {1,0}, {0,0}},
589  {{-1,0}, {0,0}, {0,0}, {1,0}}
590  },
591  {
592  {{1,0}, {0,0}, {0,-1}, {0,0}},
593  {{0,0}, {1,0}, {0,0}, {0,1}},
594  {{0,1}, {0,0}, {1,0}, {0,0}},
595  {{0,0}, {0,-1}, {0,0}, {1,0}}
596  },
597  {
598  {{1,0}, {0,0}, {0,1}, {0,0}},
599  {{0,0}, {1,0}, {0,0}, {0,-1}},
600  {{0,-1}, {0,0}, {1,0}, {0,0}},
601  {{0,0}, {0,1}, {0,0}, {1,0}}
602  },
603  {
604  {{1,0}, {0,0}, {-1,0}, {0,0}},
605  {{0,0}, {1,0}, {0,0}, {-1,0}},
606  {{-1,0}, {0,0}, {1,0}, {0,0}},
607  {{0,0}, {-1,0}, {0,0}, {1,0}}
608  },
609  {
610  {{1,0}, {0,0}, {1,0}, {0,0}},
611  {{0,0}, {1,0}, {0,0}, {1,0}},
612  {{1,0}, {0,0}, {1,0}, {0,0}},
613  {{0,0}, {1,0}, {0,0}, {1,0}}
614  },
615  // P_+ = P_R
616  {
617  {{0,0}, {0,0}, {0,0}, {0,0}},
618  {{0,0}, {0,0}, {0,0}, {0,0}},
619  {{0,0}, {0,0}, {2,0}, {0,0}},
620  {{0,0}, {0,0}, {0,0}, {2,0}}
621  },
622  // P_- = P_L
623  {
624  {{2,0}, {0,0}, {0,0}, {0,0}},
625  {{0,0}, {2,0}, {0,0}, {0,0}},
626  {{0,0}, {0,0}, {0,0}, {0,0}},
627  {{0,0}, {0,0}, {0,0}, {0,0}}
628  }
629 };
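// --- Annotation (not part of the original file): entries 0..7 are the
// Wilson hopping projectors 1 -/+ gamma_mu written as 4x4 complex matrices
// (each entry above is an {re,im} pair). Entries 8 and 9 are 2*P_+ and
// 2*P_-, twice the chiral projectors, so the fifth-dimension hops carry the
// same overall normalization (largest eigenvalue 2) as the 4d projectors.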
630 
631 
632 // todo pass projector
633 template <typename Float>
634 void multiplySpinorByDiracProjector5(Float *res, int projIdx, Float *spinorIn) {
635  for (int i=0; i<4*3*2; i++) res[i] = 0.0;
636 
637  for (int s = 0; s < 4; s++) {
638  for (int t = 0; t < 4; t++) {
639  Float projRe = projector[projIdx][s][t][0];
640  Float projIm = projector[projIdx][s][t][1];
641 
642  for (int m = 0; m < 3; m++) {
643  Float spinorRe = spinorIn[t*(3*2) + m*(2) + 0];
644  Float spinorIm = spinorIn[t*(3*2) + m*(2) + 1];
645  res[s*(3*2) + m*(2) + 0] += projRe*spinorRe - projIm*spinorIm;
646  res[s*(3*2) + m*(2) + 1] += projRe*spinorIm + projIm*spinorRe;
647  }
648  }
649  }
650 }
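// --- Annotation (not part of the original file): a hedged usage sketch.
// The index formula used by every caller below,
//   projIdx = 2*(dir/2) + (dir+daggerBit)%2,
// pairs each hop direction with its projector and swaps the forward/backward
// members of each pair when daggerBit is set (dir 0<->1, 2<->3, ...):
//
//   double in[4*3*2], out[4*3*2];
//   // ... fill in[] with a color spinor, indexed [spin][color][re/im] ...
//   multiplySpinorByDiracProjector5(out, /*projIdx=*/0, in);
//   // out now holds projector[0] applied to in.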
651 
652 
653 //#ifndef MULTI_GPU
654 // dslashReference_4d()
655 //J This is just the 4d wilson dslash of quda code, with a
656 //J few small changes to take into account that the spinors
657 //J are 5d and the gauge fields are 4d.
658 //
659 // if oddBit is zero: calculate odd parity spinor elements (using even parity spinor)
660 // if oddBit is one: calculate even parity spinor elements
661 //
662 // if daggerBit is zero: perform ordinary dslash operator
663 // if daggerBit is one: perform hermitian conjugate of dslash
664 //
665 //An "ok" will only be granted once check2.tex is deemed complete,
666 //since the logic in this function is important and nontrivial.
667 template <typename sFloat, typename gFloat>
668 void dslashReference_4d_sgpu(sFloat *res, gFloat **gaugeFull, sFloat *spinorField,
669  int oddBit, int daggerBit) {
670 
671  // Initialize the return half-spinor to zero. Note that it is a
672  // 5d spinor, hence the use of V5h.
673  for (int i=0; i<V5h*4*3*2; i++) res[i] = 0.0;
674 
675  // Some pointers that we use to march through arrays.
676  gFloat *gaugeEven[4], *gaugeOdd[4];
677  // Initialize to beginning of even and odd parts of
678  // gauge array.
679  for (int dir = 0; dir < 4; dir++) {
680  gaugeEven[dir] = gaugeFull[dir];
681  // Note the use of Vh here, since the gauge fields
682  // are 4-dim'l.
683  gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
684  }
685  int sp_idx,oddBit_gge;
686  for (int xs=0;xs<Ls;xs++) {
687  for (int gge_idx = 0; gge_idx < Vh; gge_idx++) {
688  for (int dir = 0; dir < 8; dir++) {
689  sp_idx=gge_idx+Vh*xs;
690  // Here is a function call to study. It is defined near
691  // Line 90 of this file.
692  // Here we have to switch oddBit depending on the value of xs. E.g., suppose
693  // xs=1. Then the odd spinor site x1=x2=x3=x4=0 wants the even gauge array
694  // element 0, so that we get U_\mu(0).
695  if ((xs % 2) == 0) oddBit_gge=oddBit;
696  else oddBit_gge= (oddBit+1) % 2;
697  gFloat *gauge = gaugeLink_sgpu(gge_idx, dir, oddBit_gge, gaugeEven, gaugeOdd);
698 
699  // Even though we're doing the 4d part of the dslash, we need
700  // to use a 5d neighbor function, to get the offsets right.
701  sFloat *spinor = spinorNeighbor_5d(sp_idx, dir, oddBit, spinorField);
702  sFloat projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
703  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
704  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
705 
706  for (int s = 0; s < 4; s++) {
707  if (dir % 2 == 0) {
708  su3Mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
709 #ifdef DBUG_VERBOSE
710  std::cout << "spinor:" << std::endl;
711  printSpinorElement(&projectedSpinor[s*(3*2)],0,QUDA_DOUBLE_PRECISION);
712  std::cout << "gauge:" << std::endl;
713 #endif
714  } else {
715  su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
716  }
717  }
718 
719  sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
720  }
721  }
722  }
723 }
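// --- Annotation (not part of the original file): in this 5d-preconditioned
// variant the checkerboard parity includes the fifth coordinate, so the
// parity of the 4d gauge field flips with xs (the oddBit_gge logic above).
// In the 4dpc variant that follows, parity is defined on the 4d volume only,
// and the gauge field is read with parity oddBit for every xs.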
724 
725 template <typename sFloat, typename gFloat>
726 void dslashReference_4d_4dpc_sgpu(sFloat *res, gFloat **gaugeFull, sFloat *spinorField,
727  int oddBit, int daggerBit) {
728 
729  // Initialize the return half-spinor to zero. Note that it is a
730  // 5d spinor, hence the use of V5h.
731  for (int i=0; i<V5h*4*3*2; i++) res[i] = 0.0;
732 
733  // Some pointers that we use to march through arrays.
734  gFloat *gaugeEven[4], *gaugeOdd[4];
735  // Initialize to beginning of even and odd parts of
736  // gauge array.
737  for (int dir = 0; dir < 4; dir++) {
738  gaugeEven[dir] = gaugeFull[dir];
739  // Note the use of Vh here, since the gauge fields
740  // are 4-dim'l.
741  gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
742  }
743  int sp_idx;//oddBit_gge;
744  for (int xs=0;xs<Ls;xs++) {
745  for (int gge_idx = 0; gge_idx < Vh; gge_idx++) {
746  for (int dir = 0; dir < 8; dir++) {
747  sp_idx=gge_idx+Vh*xs;
748  // Here is a function call to study. It is defined near
749  // Line 90 of this file.
750  // Here we have to switch oddBit depending on the value of xs. E.g., suppose
751  // xs=1. Then the odd spinor site x1=x2=x3=x4=0 wants the even gauge array
752  // element 0, so that we get U_\mu(0).
753  //if ((xs % 2) == 0) oddBit_gge=oddBit;
754  //else oddBit_gge= (oddBit+1) % 2;
755  gFloat *gauge = gaugeLink_sgpu(gge_idx, dir, oddBit, gaugeEven, gaugeOdd);
756 
757  // Even though we're doing the 4d part of the dslash, we need
758  // to use a 5d neighbor function, to get the offsets right.
759  sFloat *spinor = spinorNeighbor_5d_4dpc(sp_idx, dir, oddBit, spinorField);
760  sFloat projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
761  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
762  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
763 
764  for (int s = 0; s < 4; s++) {
765  if (dir % 2 == 0) {
766  su3Mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
767 #ifdef DBUG_VERBOSE
768  std::cout << "spinor:" << std::endl;
769  printSpinorElement(&projectedSpinor[s*(3*2)],0,QUDA_DOUBLE_PRECISION);
770  std::cout << "gauge:" << std::endl;
771 #endif
772  } else {
773  su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
774  }
775  }
776 
777  sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
778  }
779  }
780  }
781 }
782 //#else
783 
784 template <typename sFloat, typename gFloat>
785 void dslashReference_4d_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor, int oddBit, int daggerBit)
786 {
787 
788  int mySpinorSiteSize = 24;
789  for (int i=0; i<V5h*mySpinorSiteSize; i++) res[i] = 0.0;
790 
791  gFloat *gaugeEven[4], *gaugeOdd[4];
792  gFloat *ghostGaugeEven[4], *ghostGaugeOdd[4];
793 
794  for (int dir = 0; dir < 4; dir++)
795  {
796  gaugeEven[dir] = gaugeFull[dir];
797  gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
798 
799  ghostGaugeEven[dir] = ghostGauge[dir];
800  ghostGaugeOdd[dir] = ghostGauge[dir] + (faceVolume[dir]/2)*gaugeSiteSize;
801  }
802  for (int xs=0;xs<Ls;xs++)
803  {
804  int sp_idx;
805  for (int i = 0; i < Vh; i++)
806  {
807  sp_idx = i + Vh*xs;
808  for (int dir = 0; dir < 8; dir++)
809  {
810  int oddBit_gge;
811 
812  if ((xs % 2) == 0) oddBit_gge=oddBit;
813  else oddBit_gge= (oddBit+1) % 2;
814 
815  gFloat *gauge = gaugeLink_mgpu(i, dir, oddBit_gge, gaugeEven, gaugeOdd, ghostGaugeEven, ghostGaugeOdd, 1, 1);//this is unchanged from the MPI version
816  sFloat *spinor = spinorNeighbor_5d_mgpu(sp_idx, dir, oddBit, spinorField, fwdSpinor, backSpinor, 1, 1);
817 
818  sFloat projectedSpinor[mySpinorSiteSize], gaugedSpinor[mySpinorSiteSize];
819  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
820  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
821 
822  for (int s = 0; s < 4; s++)
823  {
824  if (dir % 2 == 0) su3Mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
825  else su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
826  }
827  sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
828  }
829  }
830  }
831 }
832 
833 template <typename sFloat, typename gFloat>
834 void dslashReference_4d_4dpc_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor, int oddBit, int daggerBit)
835 {
836 
837  int mySpinorSiteSize = 24;
838  for (int i=0; i<V5h*mySpinorSiteSize; i++) res[i] = 0.0;
839 
840  gFloat *gaugeEven[4], *gaugeOdd[4];
841  gFloat *ghostGaugeEven[4], *ghostGaugeOdd[4];
842 
843  for (int dir = 0; dir < 4; dir++)
844  {
845  gaugeEven[dir] = gaugeFull[dir];
846  gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
847 
848  ghostGaugeEven[dir] = ghostGauge[dir];
849  ghostGaugeOdd[dir] = ghostGauge[dir] + (faceVolume[dir]/2)*gaugeSiteSize;
850  }
851  for (int xs=0;xs<Ls;xs++)
852  {
853  int sp_idx;
854  for (int i = 0; i < Vh; i++)
855  {
856  sp_idx = i + Vh*xs;
857  for (int dir = 0; dir < 8; dir++)
858  {
859  //int oddBit_gge;
860 
861  //if ((xs % 2) == 0) oddBit_gge=oddBit;
862  // else oddBit_gge= (oddBit+1) % 2;
863 
864  gFloat *gauge = gaugeLink_mgpu(i, dir, oddBit, gaugeEven, gaugeOdd, ghostGaugeEven, ghostGaugeOdd, 1, 1);//this is unchanged from the MPI version
865  sFloat *spinor = spinorNeighbor_5d_4dpc_mgpu(sp_idx, dir, oddBit, spinorField, fwdSpinor, backSpinor, 1, 1);
866  //sFloat *spinor = spinorNeighbor_5d_4dpc(sp_idx, dir, oddBit, spinorField);
867 
868  sFloat projectedSpinor[mySpinorSiteSize], gaugedSpinor[mySpinorSiteSize];
869  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
870  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
871 
872  for (int s = 0; s < 4; s++)
873  {
874  if (dir % 2 == 0) su3Mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
875  else su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
876  }
877  sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
878  }
879  }
880  }
881 }
882 //#endif
883 
884 //Currently we consider only spacetime decomposition (not in 5th dim), so this operator is local
885 template <typename sFloat>
886 void dslashReference_5th(sFloat *res, sFloat *spinorField,
887  int oddBit, int daggerBit, sFloat mferm) {
888  for (int i = 0; i < V5h; i++) {
889  for (int dir = 8; dir < 10; dir++) {
890  // Calls for an extension of the original function.
891  // 8 is forward hop, which wants P_+, 9 is backward hop,
892  // which wants P_-. Dagger reverses these.
893  sFloat *spinor = spinorNeighbor_5d(i, dir, oddBit, spinorField);
894  sFloat projectedSpinor[4*3*2];
895  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
896  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
897  //J Need a conditional here for s=0 and s=Ls-1.
898  int X = fullLatticeIndex_5d(i, oddBit);
899  int xs = X/(Z[3]*Z[2]*Z[1]*Z[0]);
900 
901  if ( (xs == 0 && dir == 9) || (xs == Ls-1 && dir == 8) ) {
902  ax(projectedSpinor,(sFloat)(-mferm),projectedSpinor,4*3*2);
903  }
904  sum(&res[i*(4*3*2)], &res[i*(4*3*2)], projectedSpinor, 4*3*2);
905  }
906  }
907 }
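// --- Annotation (not part of the original file): the fifth-dimension hops
// wrap around in the index arithmetic (xs +/- 1 mod Ls), but the domain-wall
// operator is not periodic: links crossing the wall between xs = Ls-1 and
// xs = 0 carry a factor of -mferm, which is what the ax() rescaling above
// implements for dir==9 at xs==0 and dir==8 at xs==Ls-1. Note also that this
// routine accumulates into res on top of the 4d contribution already stored
// there, whereas dslashReference_5th_4d below zeroes res first because it is
// also used standalone.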
908 
909 //Currently we consider only spacetime decomposition (not in 5th dim), so this operator is local
910 template <typename sFloat>
911 void dslashReference_5th_4d(sFloat *res, sFloat *spinorField,
912  int oddBit, int daggerBit, sFloat mferm) {
913  for (int i = 0; i < V5h; i++) {
914  for(int one_site = 0 ; one_site < 24 ; one_site++)
915  res[i*(4*3*2)+one_site] = 0.0;
916  for (int dir = 8; dir < 10; dir++) {
917  // Calls for an extension of the original function.
918  // 8 is forward hop, which wants P_+, 9 is backward hop,
919  // which wants P_-. Dagger reverses these.
920  sFloat *spinor = spinorNeighbor_5d_4dpc(i, dir, oddBit, spinorField);
921  sFloat projectedSpinor[4*3*2];
922  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
923  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
924  //J Need a conditional here for s=0 and s=Ls-1.
925  int X = fullLatticeIndex_5d_4dpc(i, oddBit);
926  int xs = X/(Z[3]*Z[2]*Z[1]*Z[0]);
927 
928  if ( (xs == 0 && dir == 9) || (xs == Ls-1 && dir == 8) ) {
929  ax(projectedSpinor,(sFloat)(-mferm),projectedSpinor,4*3*2);
930  }
931  sum(&res[i*(4*3*2)], &res[i*(4*3*2)], projectedSpinor, 4*3*2);
932  }
933  }
934 }
935 
936 //Currently we consider only spacetime decomposition (not in 5th dim), so this operator is local
937 template <typename sFloat>
938 void dslashReference_5th_inv(sFloat *res, sFloat *spinorField,
939  int oddBit, int daggerBit, sFloat mferm, double *kappa) {
940  double *inv_Ftr = (double*)malloc(Ls*sizeof(double)); // sizeof(double): sizeof(sFloat) would under-allocate when sFloat is float
941  double *Ftr = (double*)malloc(Ls*sizeof(double));
942  for(int xs = 0 ; xs < Ls ; xs++)
943  {
944  inv_Ftr[xs] = 1.0/(1.0+pow(2.0*kappa[xs], Ls)*mferm);
945  Ftr[xs] = -2.0*kappa[xs]*mferm*inv_Ftr[xs];
946  for (int i = 0; i < Vh; i++) {
947  memcpy(&res[24*(i+Vh*xs)], &spinorField[24*(i+Vh*xs)], 24*sizeof(sFloat));
948  }
949  }
950  if(daggerBit == 0)
951  {
952  // s = 0
953  for (int i = 0; i < Vh; i++) {
954  ax(&res[12+24*(i+Vh*(Ls-1))],(sFloat)(inv_Ftr[0]), &spinorField[12+24*(i+Vh*(Ls-1))], 12);
955  }
956 
957  // s = 1 ... ls-2
958  for(int xs = 0 ; xs <= Ls-2 ; ++xs)
959  {
960  for (int i = 0; i < Vh; i++) {
961  axpy((sFloat)(2.0*kappa[xs]), &res[24*(i+Vh*xs)], &res[24*(i+Vh*(xs+1))], 12);
962  axpy((sFloat)Ftr[xs], &res[12+24*(i+Vh*xs)], &res[12+24*(i+Vh*(Ls-1))], 12);
963  }
964  for (int tmp_s = 0 ; tmp_s < Ls ; tmp_s++)
965  Ftr[tmp_s] *= 2.0*kappa[tmp_s];
966  }
967  for(int xs = 0 ; xs < Ls ; xs++)
968  {
969  Ftr[xs] = -pow(2.0*kappa[xs],Ls-1)*mferm*inv_Ftr[xs];
970  }
971  // s = ls-2 ... 0
972  for(int xs = Ls-2 ; xs >=0 ; --xs)
973  {
974  for (int i = 0; i < Vh; i++) {
975  axpy((sFloat)Ftr[xs], &res[24*(i+Vh*(Ls-1))], &res[24*(i+Vh*xs)], 12);
976  axpy((sFloat)(2.0*kappa[xs]), &res[12+24*(i+Vh*(xs+1))], &res[12+24*(i+Vh*xs)], 12);
977  }
978  for (int tmp_s = 0 ; tmp_s < Ls ; tmp_s++)
979  Ftr[tmp_s] /= 2.0*kappa[tmp_s];
980  }
981  // s = ls -1
982  for (int i = 0; i < Vh; i++) {
983  ax(&res[24*(i+Vh*(Ls-1))], (sFloat)(inv_Ftr[Ls-1]), &res[24*(i+Vh*(Ls-1))], 12);
984  }
985  }
986  else
987  {
988  // s = 0
989  for (int i = 0; i < Vh; i++) {
990  ax(&res[24*(i+Vh*(Ls-1))],(sFloat)(inv_Ftr[0]), &spinorField[24*(i+Vh*(Ls-1))], 12);
991  }
992 
993  // s = 1 ... ls-2
994  for(int xs = 0 ; xs <= Ls-2 ; ++xs)
995  {
996  for (int i = 0; i < Vh; i++) {
997  axpy((sFloat)Ftr[xs], &res[24*(i+Vh*xs)], &res[24*(i+Vh*(Ls-1))], 12);
998  axpy((sFloat)(2.0*kappa[xs]), &res[12+24*(i+Vh*xs)], &res[12+24*(i+Vh*(xs+1))], 12);
999  }
1000  for (int tmp_s = 0 ; tmp_s < Ls ; tmp_s++)
1001  Ftr[tmp_s] *= 2.0*kappa[tmp_s];
1002  }
1003  for(int xs = 0 ; xs < Ls ; xs++)
1004  {
1005  Ftr[xs] = -pow(2.0*kappa[xs],Ls-1)*mferm*inv_Ftr[xs];
1006  }
1007  // s = ls-2 ... 0
1008  for(int xs = Ls-2 ; xs >=0 ; --xs)
1009  {
1010  for (int i = 0; i < Vh; i++) {
1011  axpy((sFloat)(2.0*kappa[xs]), &res[24*(i+Vh*(xs+1))], &res[24*(i+Vh*xs)], 12);
1012  axpy((sFloat)Ftr[xs], &res[12+24*(i+Vh*(Ls-1))], &res[12+24*(i+Vh*xs)], 12);
1013  }
1014  for (int tmp_s = 0 ; tmp_s < Ls ; tmp_s++)
1015  Ftr[tmp_s] /= 2.0*kappa[tmp_s];
1016  }
1017  // s = ls -1
1018  for (int i = 0; i < Vh; i++) {
1019  ax(&res[12+24*(i+Vh*(Ls-1))], (sFloat)(inv_Ftr[Ls-1]), &res[12+24*(i+Vh*(Ls-1))], 12);
1020  }
1021  }
1022  free(inv_Ftr);
1023  free(Ftr);
1024 }
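// --- Annotation (not part of the original file, hedged): for each chirality
// (upper/lower 12 reals of a site) the fifth-dimension matrix inverted here
// is triangular apart from a single mass-induced corner entry, so its
// inverse can be applied as a forward-substitution sweep followed by a
// backward sweep. inv_Ftr[s] = 1/(1 + (2*kappa[s])^Ls * mferm) absorbs the
// corner term and Ftr accumulates the powers of 2*kappa picked up along a
// sweep, giving O(Ls*Vh) cost instead of an explicit Ls x Ls inverse per
// 4d site.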
1025 
1026 // Recall that dslash is only the off-diagonal parts, so m0_dwf is not needed.
1027 //
1028 //#ifndef MULTI_GPU
1029 /*
1030 void dslash(void *res, void **gaugeFull, void *spinorField,
1031  int oddBit, int daggerBit,
1032  QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm) {
1033 
1034  if (sPrecision == QUDA_DOUBLE_PRECISION) {
1035  if (gPrecision == QUDA_DOUBLE_PRECISION) {
1036  // Do the 4d part, which hasn't changed.
1037  printf("doing 4d part\n"); fflush(stdout);
1038  dslashReference_4d_sgpu<double,double>((double*)res, (double**)gaugeFull,
1039  (double*)spinorField, oddBit, daggerBit);
1040  // Now add in the 5th dim.
1041  printf("doing 5th dimen. part\n"); fflush(stdout);
1042  dslashReference_5th<double>((double*)res, (double*)spinorField,
1043  oddBit, daggerBit, mferm);
1044  } else {
1045  dslashReference_4d_sgpu<double,float>((double*)res, (float**)gaugeFull, (double*)spinorField, oddBit, daggerBit);
1046  dslashReference_5th<double>((double*)res, (double*)spinorField, oddBit, daggerBit, mferm);
1047  }
1048  } else {
1049  // Single-precision spinor.
1050  if (gPrecision == QUDA_DOUBLE_PRECISION) {
1051  dslashReference_4d_sgpu<float,double>((float*)res, (double**)gaugeFull, (float*)spinorField, oddBit, daggerBit);
1052  dslashReference_5th<float>((float*)res, (float*)spinorField, oddBit, daggerBit, mferm);
1053  } else {
1054  // Do the 4d part, which hasn't changed.
1055  printf("CPU reference: doing 4d part all single precision\n"); fflush(stdout);
1056  dslashReference_4d_sgpu<float,float>((float*)res, (float**)gaugeFull, (float*)spinorField, oddBit, daggerBit);
1057  // Now add in the 5th dim.
1058  printf("CPU reference: doing 5th dimen. part all single precision\n"); fflush(stdout);
1059  dslashReference_5th<float>((float*)res, (float*)spinorField, oddBit, daggerBit, mferm);
1060  }
1061  }
1062 }
1063 */
1064 //#endif
1065 
1066 //BEGIN NEW
1067 // this actually applies the preconditioned dslash, e.g., D_ee^{-1} D_eo or D_oo^{-1} D_oe
1068 void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
1069 {
1070 #ifndef MULTI_GPU
1071  if (precision == QUDA_DOUBLE_PRECISION) {
1072  dslashReference_4d_sgpu((double*)out, (double**)gauge, (double*)in, oddBit, daggerBit);
1073  dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm);
1074  } else {
1075  dslashReference_4d_sgpu((float*)out, (float**)gauge, (float*)in, oddBit, daggerBit);
1076  dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
1077  }
1078 #else
1079 
1080 // void *ghostGauge[4], *sendGauge[4];
1081 // for (int d=0; d<4; d++) {
1082 // ghostGauge[d] = malloc(faceVolume[d]*gaugeSiteSize*precision);
1083 // sendGauge[d] = malloc(faceVolume[d]*gaugeSiteSize*precision);
1084 // }
1085 
1086 // { // Exchange gauge matrices at boundary
1087 // set_dim(Z);///?
1088 // pack_ghost(gauge, sendGauge, 1, precision);
1089 // int nFace = 1;
1090 // FaceBuffer faceBuf(Z, 4, gaugeSiteSize, nFace, precision);
1091 // faceBuf.exchangeLink(ghostGauge, sendGauge, QUDA_CPU_FIELD_LOCATION);
1092 // }
1093 
1094 //BEGINOFNEW
1095  GaugeFieldParam gauge_field_param(gauge, gauge_param);
1096  cpuGaugeField cpu(gauge_field_param);
1097  void **ghostGauge = (void**)cpu.Ghost();
1098 //ENDOFNEW
1099 
1100  // Get spinor ghost fields
1101  // First wrap the input spinor into a ColorSpinorField
1102  ColorSpinorParam csParam;
1103  csParam.v = in;
1104  csParam.nColor = 3;
1105  csParam.nSpin = 4;
1106  csParam.nDim = 5; //for DW dslash
1107  for (int d=0; d<4; d++) csParam.x[d] = Z[d];
1108  csParam.x[4] = Ls;//5th dimension
1109  csParam.precision = precision;
1110  csParam.pad = 0;
1111  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
1112  csParam.x[0] /= 2;
1113  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
1114  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
1115  csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
1116  csParam.create = QUDA_REFERENCE_FIELD_CREATE;
1117  csParam.PCtype = QUDA_5D_PC;
1118 
1119  cpuColorSpinorField inField(csParam);
1120 
1121  { // Now do the exchange
1122  QudaParity otherParity = QUDA_INVALID_PARITY;
1123  if (oddBit == QUDA_EVEN_PARITY) otherParity = QUDA_ODD_PARITY;
1124  else if (oddBit == QUDA_ODD_PARITY) otherParity = QUDA_EVEN_PARITY;
1125  else errorQuda("ERROR: full parity not supported in function %s", __FUNCTION__);
1126 
1127  int nFace = 1;
1128  FaceBuffer faceBuf(Z, 5, mySpinorSiteSize, nFace, precision, Ls);//4 <-> 5
1129  faceBuf.exchangeCpuSpinor(inField, otherParity, daggerBit);
1130  }
1131  void** fwd_nbr_spinor = inField.fwdGhostFaceBuffer;
1132  void** back_nbr_spinor = inField.backGhostFaceBuffer;
1133  //NOTE: hopping in 5th dimension does not use MPI.
1134  if (precision == QUDA_DOUBLE_PRECISION)
1135  {
1136  dslashReference_4d_mgpu((double*)out, (double**)gauge, (double**)ghostGauge, (double*)in,(double**)fwd_nbr_spinor, (double**)back_nbr_spinor, oddBit, daggerBit);
1137  dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm);
1138  } else
1139  {
1140  dslashReference_4d_mgpu((float*)out, (float**)gauge, (float**)ghostGauge, (float*)in,
1141  (float**)fwd_nbr_spinor, (float**)back_nbr_spinor, oddBit, daggerBit);
1142  dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
1143  }
1144 
1145 #endif
1146 
1147 }
1148 
1149 void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
1150 {
1151 #ifndef MULTI_GPU
1152  if (precision == QUDA_DOUBLE_PRECISION) {
1153  dslashReference_4d_4dpc_sgpu((double*)out, (double**)gauge, (double*)in, oddBit, daggerBit);
1154  } else {
1155  dslashReference_4d_4dpc_sgpu((float*)out, (float**)gauge, (float*)in, oddBit, daggerBit);
1156  }
1157 #else
1158 
1159 // void *ghostGauge[4], *sendGauge[4];
1160 // for (int d=0; d<4; d++) {
1161 // ghostGauge[d] = malloc(faceVolume[d]*gaugeSiteSize*precision);
1162 // sendGauge[d] = malloc(faceVolume[d]*gaugeSiteSize*precision);
1163 // }
1164 
1165 // { // Exchange gauge matrices at boundary
1166 // set_dim(Z);///?
1167 // pack_ghost(gauge, sendGauge, 1, precision);
1168 // int nFace = 1;
1169 // FaceBuffer faceBuf(Z, 4, gaugeSiteSize, nFace, precision);
1170 // faceBuf.exchangeLink(ghostGauge, sendGauge, QUDA_CPU_FIELD_LOCATION);
1171 // }
1172 
1173 //BEGINOFNEW
1174  GaugeFieldParam gauge_field_param(gauge, gauge_param);
1175  cpuGaugeField cpu(gauge_field_param);
1176  void **ghostGauge = (void**)cpu.Ghost();
1177 //ENDOFNEW
1178 
1179  // Get spinor ghost fields
1180  // First wrap the input spinor into a ColorSpinorField
1181  ColorSpinorParam csParam;
1182  csParam.v = in;
1183  csParam.nColor = 3;
1184  csParam.nSpin = 4;
1185  csParam.nDim = 5; //for DW dslash
1186  for (int d=0; d<4; d++) csParam.x[d] = Z[d];
1187  csParam.x[4] = Ls;//5th dimension
1188  csParam.precision = precision;
1189  csParam.pad = 0;
1190  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
1191  csParam.x[0] /= 2;
1192  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
1193  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
1194  csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
1195  csParam.create = QUDA_REFERENCE_FIELD_CREATE;
1196  csParam.PCtype = QUDA_4D_PC;
1197 
1198  cpuColorSpinorField inField(csParam);
1199 
1200  { // Now do the exchange
1201  QudaParity otherParity = QUDA_INVALID_PARITY;
1202  if (oddBit == QUDA_EVEN_PARITY) otherParity = QUDA_ODD_PARITY;
1203  else if (oddBit == QUDA_ODD_PARITY) otherParity = QUDA_EVEN_PARITY;
1204  else errorQuda("ERROR: full parity not supported in function %s", __FUNCTION__);
1205 
1206  int nFace = 1;
1207  FaceBuffer faceBuf(Z, 5, mySpinorSiteSize, nFace, precision, Ls);//4 <-> 5
1208  faceBuf.exchangeCpuSpinor(inField, otherParity, daggerBit);
1209  }
1210  void** fwd_nbr_spinor = inField.fwdGhostFaceBuffer;
1211  void** back_nbr_spinor = inField.backGhostFaceBuffer;
1212  //NOTE: hopping in 5th dimension does not use MPI.
1213  if (precision == QUDA_DOUBLE_PRECISION)
1214  {
1215  dslashReference_4d_4dpc_mgpu((double*)out, (double**)gauge, (double**)ghostGauge, (double*)in,(double**)fwd_nbr_spinor, (double**)back_nbr_spinor, oddBit, daggerBit);
1216  } else
1217  {
1218  dslashReference_4d_4dpc_mgpu((float*)out, (float**)gauge, (float**)ghostGauge, (float*)in,
1219  (float**)fwd_nbr_spinor, (float**)back_nbr_spinor, oddBit, daggerBit);
1220  }
1221 
1222 #endif
1223 
1224 }
1225 //END NEW
1226 void dw_dslash_5_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
1227 {
1228  if (precision == QUDA_DOUBLE_PRECISION) {
1229  dslashReference_5th_4d((double*)out, (double*)in, oddBit, daggerBit, mferm);
1230  } else {
1231  dslashReference_5th_4d((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
1232  }
1233 }
1234 
1235 void dslash_5_inv(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
1236 {
1237  if (precision == QUDA_DOUBLE_PRECISION) {
1238  dslashReference_5th_inv((double*)out, (double*)in, oddBit, daggerBit, mferm, kappa);
1239  } else {
1240  dslashReference_5th_inv((float*)out, (float*)in, oddBit, daggerBit, (float)mferm, kappa);
1241  }
1242 }
1243 
1244 void mdw_dslash_5(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *kappa)
1245 {
1246  if (precision == QUDA_DOUBLE_PRECISION) {
1247  dslashReference_5th_4d((double*)out, (double*)in, oddBit, daggerBit, mferm);
1248  for(int xs = 0 ; xs < Ls ; xs++)
1249  {
1250  xpay((double*)in + Vh*spinorSiteSize*xs, kappa[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1251  }
1252  } else {
1253  dslashReference_5th_4d((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
1254  for(int xs = 0 ; xs < Ls ; xs++)
1255  {
1256  xpay((float*)in + Vh*spinorSiteSize*xs, (float)(kappa[xs]), (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1257  }
1258  }
1259 }
1260 
1261 void mdw_dslash_4_pre(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5)
1262 {
1263  if (precision == QUDA_DOUBLE_PRECISION) {
1264  dslashReference_5th_4d((double*)out, (double*)in, oddBit, daggerBit, mferm);
1265  for(int xs = 0 ; xs < Ls ; xs++)
1266  {
1267  axpby(b5[xs],(double*)in + Vh*spinorSiteSize*xs,0.5*c5[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1268  }
1269  } else {
1270  dslashReference_5th_4d((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
1271  for(int xs = 0 ; xs < Ls ; xs++)
1272  {
1273  axpby((float)(b5[xs]),(float*)in + Vh*spinorSiteSize*xs, (float)(0.5*c5[xs]), (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1274  }
1275  }
1276 
1277 }
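// --- Annotation (not part of the original file, hedged): mdw_dslash_4_pre
// combines the identity and the fifth-dimension hopping term with the Mobius
// coefficients, out = b5[s]*in + 0.5*c5[s]*D5(in), where D5 is the
// dslashReference_5th_4d hopping term (including its -mferm wall factors);
// the axpby call supplies the two per-slice coefficients.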
1278 
1279 void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) {
1280 
1281  void *inEven = in;
1282  void *inOdd = (char*)in + V5h*spinorSiteSize*precision;
1283  void *outEven = out;
1284  void *outOdd = (char*)out + V5h*spinorSiteSize*precision;
1285 
1286  dw_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param, mferm);
1287  dw_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param, mferm);
1288 
1289  // lastly apply the kappa term
1290  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V5*spinorSiteSize);
1291  else xpay((float*)in, -(float)kappa, (float*)out, V5*spinorSiteSize);
1292 }
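// --- Annotation (not part of the original file): dw_mat applies the full
// unpreconditioned operator M = 1 - kappa*D, where D is the 5d hopping term
// (4d part plus fifth-dimension part) mapping even->odd and odd->even; the
// final xpay supplies the identity and the -kappa factor. A hedged usage
// sketch, assuming double precision:
//
//   void *out = malloc(V5*spinorSiteSize*sizeof(double));
//   dw_mat(out, gauge, in, kappa, 0, QUDA_DOUBLE_PRECISION, gauge_param, mferm);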
1293 
1294 //void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) {
1295 //
1296 // void *inEven = in;
1297 // void *inOdd = (char*)in + V5h*spinorSiteSize*precision;
1298 // void *outEven = out;
1299 // void *outOdd = (char*)out + V5h*spinorSiteSize*precision;
1300 //
1301 // dw_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param, mferm);
1302 // dw_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param, mferm);
1303 //
1304 // // lastly apply the kappa term
1305 // if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V5*spinorSiteSize);
1306 // else xpay((float*)in, -(float)kappa, (float*)out, V5*spinorSiteSize);
1307 //}
1308 
1309 //void mdw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5) {
1310 //
1311 // void *inEven = in;
1312 // void *inOdd = (char*)in + V5h*spinorSiteSize*precision;
1313 // void *outEven = out;
1314 // void *outOdd = (char*)out + V5h*spinorSiteSize*precision;
1315 //
1316 // dw_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param, mferm);
1317 // dw_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param, mferm);
1318 //
1319 // // lastly apply the kappa term
1320 // if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V5*spinorSiteSize);
1321 // else xpay((float*)in, -(float)kappa, (float*)out, V5*spinorSiteSize);
1322 //}
1323 
1324 //
1325 void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
1326 {
1327 
1328  void *tmp = malloc(V5*spinorSiteSize*precision);
1329  dw_mat(tmp, gauge, in, kappa, dagger_bit, precision, gauge_param, mferm);
1330  dagger_bit = (dagger_bit == 1) ? 0 : 1;
1331  dw_mat(out, gauge, tmp, kappa, dagger_bit, precision, gauge_param, mferm);
1332 
1333  free(tmp);
1334 }
1335 
1336 void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
1337 {
1338  void *tmp = malloc(V5h*spinorSiteSize*precision);
1339 
1340  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
1341  dw_dslash(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1342  dw_dslash(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm);
1343  } else {
1344  dw_dslash(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1345  dw_dslash(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm);
1346  }
1347 
1348  // lastly apply the kappa term
1349  double kappa2 = -kappa*kappa;
1350  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, kappa2, (double*)out, V5h*spinorSiteSize);
1351  else xpay((float*)in, (float)kappa2, (float*)out, V5h*spinorSiteSize);
1352 
1353  free(tmp);
1354 }
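// --- Annotation (not part of the original file): this is the symmetric
// even-even (or odd-odd) preconditioned operator
//   M_pc = 1 - kappa^2 * D_eo * D_oe,
// built from two dw_dslash applications and a final xpay in which
// kappa2 = -kappa*kappa multiplies the hopping product.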
1355 
1356 
1357 void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
1358 {
1359  double kappa2 = -kappa*kappa;
1360  double *kappa5 = (double*)malloc(Ls*sizeof(double));
1361  for(int xs = 0; xs < Ls ; xs++)
1362  kappa5[xs] = kappa;
1363  void *tmp = malloc(V5h*spinorSiteSize*precision);
1364  //------------------------------------------
1365  // Zero the output field; memset gets the byte count right for both
1366  // precisions (zeroing doubles here would overrun a single-precision buffer).
1367  memset(out, 0, V5h*spinorSiteSize*precision);
1368  //------------------------------------------
1369 
1370  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
1371  dslash_4_4d(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1372  dslash_5_inv(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, kappa5);
1373  dslash_4_4d(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm);
1374  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, kappa2, (double*)tmp, V5h*spinorSiteSize);
1375  else xpay((float*)in, (float)kappa2, (float*)tmp, V5h*spinorSiteSize);
1376  dw_dslash_5_4d(out, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1377  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V5h*spinorSiteSize);
1378  else xpay((float*)tmp, -(float)kappa, (float*)out, V5h*spinorSiteSize);
1379  } else {
1380  dslash_4_4d(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1381  dslash_5_inv(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, kappa5);
1382  dslash_4_4d(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm);
1383  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, kappa2, (double*)tmp, V5h*spinorSiteSize);
1384  else xpay((float*)in, (float)kappa2, (float*)tmp, V5h*spinorSiteSize);
1385  dw_dslash_5_4d(out, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1386  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V5h*spinorSiteSize);
1387  else xpay((float*)tmp, -(float)kappa, (float*)out, V5h*spinorSiteSize);
1388  }
1389  free(tmp);
1390  free(kappa5);
1391 }
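// --- Annotation (not part of the original file, hedged): writing D4 for the
// 4d hop, D5 for the fifth-dimension hopping term and D5hat = 1 - kappa*D5
// for the full fifth-dimension block that dslash_5_inv inverts, the sequence
// above assembles the 4d-preconditioned operator
//   M_pc = D5hat - kappa^2 * D4 * D5hat^{-1} * D4,
// i.e. out = in - kappa*D5(in) - kappa^2 * D4(D5hat^{-1}(D4(in))).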
1392 
1393 void mdw_matpc(void *out, void **gauge, void *in, double *kappa_b, double *kappa_c, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double *b5, double *c5)
1394 {
1395  void *tmp = malloc(V5h*spinorSiteSize*precision);
1396  double *kappa5 = (double*)malloc(Ls*sizeof(double));
1397  double *kappa2 = (double*)malloc(Ls*sizeof(double));
1398  double *kappa_mdwf = (double*)malloc(Ls*sizeof(double));
1399  for(int xs = 0; xs < Ls ; xs++)
1400  {
1401  kappa5[xs] = 0.5*kappa_b[xs]/kappa_c[xs];
1402  kappa2[xs] = -kappa_b[xs]*kappa_b[xs];
1403  kappa_mdwf[xs] = -kappa5[xs];
1404  }
1405 
1406  if(dagger_bit == 0)
1407  {
1408  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
1409  mdw_dslash_4_pre(out, gauge, in, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1410  dslash_4_4d(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm);
1411  dslash_5_inv(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1412  mdw_dslash_4_pre(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1413  dslash_4_4d(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm);
1414  mdw_dslash_5(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm, kappa5);
1415  for(int xs = 0 ; xs < Ls ; xs++)
1416  {
1417  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1418  else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1419  }
1420  }
1421  else {
1422  mdw_dslash_4_pre(out, gauge, in, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1423  dslash_4_4d(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm);
1424  dslash_5_inv(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1425  mdw_dslash_4_pre(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1426  dslash_4_4d(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm);
1427  mdw_dslash_5(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm, kappa5);
1428  for(int xs = 0 ; xs < Ls ; xs++)
1429  {
1430  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1431  else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1432  }
1433  }
1434  } else
1435  {
1436  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
1437  dslash_4_4d(out, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
1438  mdw_dslash_4_pre(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1439  dslash_5_inv(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1440  dslash_4_4d(tmp, gauge, out, 1, dagger_bit, precision, gauge_param, mferm);
1441  mdw_dslash_4_pre(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1442  mdw_dslash_5(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm, kappa5);
1443  for(int xs = 0 ; xs < Ls ; xs++)
1444  {
1445  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1446  else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1447  }
1448  }
1449  else {
1450  dslash_4_4d(out, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
1451  mdw_dslash_4_pre(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm, b5, c5);
1452  dslash_5_inv(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, kappa_mdwf);
1453  dslash_4_4d(tmp, gauge, out, 0, dagger_bit, precision, gauge_param, mferm);
1454  mdw_dslash_4_pre(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm, b5, c5);
1455  mdw_dslash_5(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm, kappa5);
1456  for(int xs = 0 ; xs < Ls ; xs++)
1457  {
1458  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp + Vh*spinorSiteSize*xs, kappa2[xs], (double*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1459  else xpay((float*)tmp + Vh*spinorSiteSize*xs, (float)kappa2[xs], (float*)out + Vh*spinorSiteSize*xs, Vh*spinorSiteSize);
1460  }
1461  }
1462  }
1463  free(tmp);
1464  free(kappa5);
1465  free(kappa2);
1466  free(kappa_mdwf);
1467 }
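// --- Annotation (not part of the original file, hedged): the Mobius
// preconditioned operator composes the b5/c5 pre-operator, the 4d hop and
// the inverse fifth-dimension block, then adds the diagonal fifth-dimension
// piece from mdw_dslash_5 through the final per-slice xpay. The couplings
// set up above are
//   kappa5[s]     = 0.5*kappa_b[s]/kappa_c[s],
//   kappa2[s]     = -kappa_b[s]*kappa_b[s],
//   kappa_mdwf[s] = -kappa5[s],
// with kappa2 playing the role of -kappa^2 in the plain domain-wall case.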
1468 
1469 /*
1470 // Apply the even-odd preconditioned Dirac operator
1471 template <typename sFloat, typename gFloat>
1472 void MatPC(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
1473  QudaMatPCType matpc_type, sFloat mferm) {
1474 
1475  sFloat *tmp = (sFloat*)malloc(V5h*spinorSiteSize*sizeof(sFloat));
1476 
1477  // full dslash operator
1478  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
1479  dslashReference_4d(tmp, gauge, inEven, 1, 0);
1480  dslashReference_5th(tmp, inEven, 1, 0, mferm);
1481  dslashReference_4d(outEven, gauge, tmp, 0, 0);
1482  dslashReference_5th(outEven, tmp, 0, 0, mferm);
1483  } else {
1484  dslashReference_4d(tmp, gauge, inEven, 0, 0);
1485  dslashReference_5th(tmp, inEven, 0, 0, mferm);
1486  dslashReference_4d(outEven, gauge, tmp, 1, 0);
1487  dslashReference_5th(outEven, tmp, 1, 0, mferm);
1488  }
1489 
1490  // lastly apply the kappa term
1491  sFloat kappa2 = -kappa*kappa;
1492  xpay(inEven, kappa2, outEven, V5h*spinorSiteSize);
1493  free(tmp);
1494 }
1495 
1496 // Apply the dagger of the even-odd preconditioned Dirac operator
1497 template <typename sFloat, typename gFloat>
1498 void MatPCDag(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
1499  QudaMatPCType matpc_type, sFloat mferm) {
1500 
1501  sFloat *tmp = (sFloat*)malloc(V5h*spinorSiteSize*sizeof(sFloat));
1502 
1503  // full dslash operator
1504  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
1505  dslashReference_4d(tmp, gauge, inEven, 1, 1);
1506  dslashReference_5th(tmp, inEven, 1, 1, mferm);
1507  dslashReference_4d(outEven, gauge, tmp, 0, 1);
1508  dslashReference_5th(outEven, tmp, 0, 1, mferm);
1509  } else {
1510  dslashReference_4d(tmp, gauge, inEven, 0, 1);
1511  dslashReference_5th(tmp, inEven, 0, 1, mferm);
1512  dslashReference_4d(outEven, gauge, tmp, 1, 1);
1513  dslashReference_5th(outEven, tmp, 1, 1, mferm);
1514  }
1515 
1516  sFloat kappa2 = -kappa*kappa;
1517  xpay(inEven, kappa2, outEven, V5h*spinorSiteSize);
1518  free(tmp);
1519 }
1520 */
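
For reference: in the even-even case the disabled MatPC/MatPCDag templates above implement the Schur-complement operator M_pc = 1 - kappa^2 D_eo D_oe (and its dagger). The two dslashReference_4d/dslashReference_5th pairs build D_eo D_oe, and the final xpay with kappa2 = -kappa*kappa supplies the identity term, giving outEven = inEven - kappa^2 (D_eo D_oe) inEven.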
1521 
1522 void matpc(void *outEven, void **gauge, void *inEven, double kappa,
1523  QudaMatPCType matpc_type, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision,
1524  double mferm) {
1525 /*
1526  if (!dagger_bit) {
1527  if (sPrecision == QUDA_DOUBLE_PRECISION)
1528  if (gPrecision == QUDA_DOUBLE_PRECISION)
1529  MatPC((double*)outEven, (double**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
1530  else
1531  MatPC((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
1532  else
1533  if (gPrecision == QUDA_DOUBLE_PRECISION)
1534  MatPC((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
1535  else
1536  MatPC((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
1537  } else {
1538  if (sPrecision == QUDA_DOUBLE_PRECISION)
1539  if (gPrecision == QUDA_DOUBLE_PRECISION)
1540  MatPCDag((double*)outEven, (double**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
1541  else
1542  MatPCDag((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
1543  else
1544  if (gPrecision == QUDA_DOUBLE_PRECISION)
1545  MatPCDag((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
1546  else
1547  MatPCDag((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
1548  }
1549 */
1550 }
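
Note that the precision dispatch in matpc above is commented out, so the wrapper is currently a no-op. If the templates were restored, the four-way dispatch repeated across these wrappers could be collapsed; a minimal sketch, assuming the MatPC/MatPCDag templates above are re-enabled (matpc_dispatch is a hypothetical name):

template <typename sFloat>
static void matpc_dispatch(sFloat *outEven, void **gauge, sFloat *inEven, sFloat kappa,
                           QudaMatPCType matpc_type, int dagger_bit,
                           QudaPrecision gPrecision, sFloat mferm)
{
  // Resolve the gauge precision and the dagger flag in one place.
  if (!dagger_bit) {
    if (gPrecision == QUDA_DOUBLE_PRECISION) MatPC(outEven, (double**)gauge, inEven, kappa, matpc_type, mferm);
    else                                     MatPC(outEven, (float**)gauge,  inEven, kappa, matpc_type, mferm);
  } else {
    if (gPrecision == QUDA_DOUBLE_PRECISION) MatPCDag(outEven, (double**)gauge, inEven, kappa, matpc_type, mferm);
    else                                     MatPCDag(outEven, (float**)gauge,  inEven, kappa, matpc_type, mferm);
  }
}

// matpc() would then reduce to a single branch on sPrecision, calling
// matpc_dispatch<double> or matpc_dispatch<float>.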
1551 
1552 /*
1553 template <typename sFloat, typename gFloat>
1554 void MatDagMat(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa, sFloat mferm)
1555 {
1556  // Allocate a full spinor.
1557  sFloat *tmp = (sFloat*)malloc(V5*spinorSiteSize*sizeof(sFloat));
1558  // Call templates above.
1559  Mat(tmp, gauge, in, kappa, mferm);
1560  MatDag(out, gauge, tmp, kappa, mferm);
1561  free(tmp);
1562 }
1563 
1564 template <typename sFloat, typename gFloat>
1565 void MatPCDagMatPC(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa,
1566  QudaMatPCType matpc_type, sFloat mferm)
1567 {
1568 
1569  // Allocate half spinor
1570  sFloat *tmp = (sFloat*)malloc(V5h*spinorSiteSize*sizeof(sFloat));
1571  // Apply the PC templates above
1572  MatPC(tmp, gauge, in, kappa, matpc_type, mferm);
1573  MatPCDag(out, gauge, tmp, kappa, matpc_type, mferm);
1574  free(tmp);
1575 }
1576 */
1577 // Wrapper around the templates above that handles the different precisions.
1578 void matdagmat(void *out, void **gauge, void *in, double kappa,
1579  QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm)
1580 {
1581 /*
1582  if (sPrecision == QUDA_DOUBLE_PRECISION) {
1583  if (gPrecision == QUDA_DOUBLE_PRECISION)
1584  MatDagMat((double*)out, (double**)gauge, (double*)in, (double)kappa,
1585  (double)mferm);
1586  else
1587  MatDagMat((double*)out, (float**)gauge, (double*)in, (double)kappa, (double)mferm);
1588  } else {
1589  if (gPrecision == QUDA_DOUBLE_PRECISION)
1590  MatDagMat((float*)out, (double**)gauge, (float*)in, (float)kappa,
1591  (float)mferm);
1592  else
1593  MatDagMat((float*)out, (float**)gauge, (float*)in, (float)kappa, (float)mferm);
1594  }
1595 */
1596 }
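
matdagmat's dispatch is likewise disabled; the enabled domain-wall path forms the same product through dw_mat (or dw_matdagmat directly). A minimal, hypothetical sketch of the composition on full 5d spinors, with buffers assumed allocated (V5*spinorSiteSize elements each), the source initialized, and illustrative parameter values:

void matdagmat_sketch(void *out, void **gauge, void *in, void *tmp,
                      QudaGaugeParam &gauge_param)
{
  double kappa = 0.1, mferm = 0.01;  // illustrative values only
  // Apply M, then M^dag, on a full (un-preconditioned) 5d spinor.
  dw_mat(tmp, gauge, in,  kappa, 0, QUDA_DOUBLE_PRECISION, gauge_param, mferm);
  dw_mat(out, gauge, tmp, kappa, 1, QUDA_DOUBLE_PRECISION, gauge_param, mferm);
}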
1597 
1598 // Wrapper around the templates above that handles the different precisions.
1599 void matpcdagmatpc(void *out, void **gauge, void *in, double kappa,
1600  QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm, QudaMatPCType matpc_type)
1601 {
1602 /*
1603  if (sPrecision == QUDA_DOUBLE_PRECISION) {
1604  if (gPrecision == QUDA_DOUBLE_PRECISION)
1605  MatPCDagMatPC((double*)out, (double**)gauge, (double*)in, (double)kappa,
1606  matpc_type, (double)mferm);
1607  else
1608  MatPCDagMatPC((double*)out, (float**)gauge, (double*)in, (double)kappa,
1609  matpc_type, (double)mferm);
1610  } else {
1611  if (gPrecision == QUDA_DOUBLE_PRECISION)
1612  MatPCDagMatPC((float*)out, (double**)gauge, (float*)in, (float)kappa,
1613  matpc_type, (float)mferm);
1614  else
1615  MatPCDagMatPC((float*)out, (float**)gauge, (float*)in, (float)kappa,
1616  matpc_type, (float)mferm);
1617  }
1618 */
1619 }
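
Like matpc and matdagmat, this wrapper is currently a no-op. Were the templates restored, the product could also be composed from two matpc calls with the dagger flag toggled, avoiding a second four-way dispatch. A structural sketch only (matpc's own body is commented out, so this illustrates the intended composition rather than working code):

void matpcdagmatpc_composed(void *out, void **gauge, void *in, double kappa,
                            QudaPrecision sPrecision, QudaPrecision gPrecision,
                            double mferm, QudaMatPCType matpc_type)
{
  size_t elemBytes = (sPrecision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  void *tmp = malloc((size_t)V5h * spinorSiteSize * elemBytes);
  matpc(tmp, gauge, in,  kappa, matpc_type, 0, sPrecision, gPrecision, mferm);  // M_pc
  matpc(out, gauge, tmp, kappa, matpc_type, 1, sPrecision, gPrecision, mferm);  // M_pc^dag
  free(tmp);
}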
1620 
1621 