QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
domain_wall_dslash_reference.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <math.h>
5 
6 #include <quda.h>
7 #include <test_util.h>
8 #include <dslash_util.h>
10 #include <blas_reference.h>
11 
12 #include <gauge_field.h>
13 #include <color_spinor_field.h>
14 #include <face_quda.h>
15 
16 using namespace quda;
17 
18 // i represents a "half index" into an even or odd "half lattice".
19 // when oddBit={0,1} the half lattice is {even,odd}.
20 //
21 // the displacements, such as dx, refer to the full lattice coordinates.
22 //
23 // neighborIndex() takes a "half index", displaces it, and returns the
24 // new "half index", which can be an index into either the even or odd lattices.
25 // displacements of magnitude one always interchange odd and even lattices.
26 //
27 //
28 int neighborIndex_5d(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1) {
29  // fullLatticeIndex was modified for fullLatticeIndex_4d. It is in util_quda.cpp.
30  // This code bit may not properly perform 5dPC.
31  int X = fullLatticeIndex_5d(i, oddBit);
32  // Checked that this matches code in dslash_core_ante.h.
33  int xs = X/(Z[3]*Z[2]*Z[1]*Z[0]);
34  int x4 = (X/(Z[2]*Z[1]*Z[0])) % Z[3];
35  int x3 = (X/(Z[1]*Z[0])) % Z[2];
36  int x2 = (X/Z[0]) % Z[1];
37  int x1 = X % Z[0];
38  // Displace and project back into domain 0,...,Ls-1.
39  // Note that we add Ls to avoid the negative problem
40  // of the C % operator.
41  xs = (xs+dxs+Ls) % Ls;
42  // Etc.
43  x4 = (x4+dx4+Z[3]) % Z[3];
44  x3 = (x3+dx3+Z[2]) % Z[2];
45  x2 = (x2+dx2+Z[1]) % Z[1];
46  x1 = (x1+dx1+Z[0]) % Z[0];
47  // Return linear half index. Remember that integer division
48  // rounds down.
49  return (xs*(Z[3]*Z[2]*Z[1]*Z[0]) + x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
50 }
51 
52 // i represents a "half index" into an even or odd "half lattice".
53 // when oddBit={0,1} the half lattice is {even,odd}.
54 //
55 // the displacements, such as dx, refer to the full lattice coordinates.
56 //
57 // neighborIndex() takes a "half index", displaces it, and returns the
58 // new "half index", which can be an index into either the even or odd lattices.
59 // displacements of magnitude one always interchange odd and even lattices.
60 //
61 //
62 int neighborIndex_4d(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) {
63  // On input i should be in the range [0 , ... , Z[0]*Z[1]*Z[2]*Z[3]/2-1].
64  if (i < 0 || i >= (Z[0]*Z[1]*Z[2]*Z[3]/2))
65  { printf("i out of range in neighborIndex_4d\n"); exit(-1); }
66  // Compute the linear index. Then dissect.
67  // fullLatticeIndex_4d is in util_quda.cpp.
68  // The gauge fields live on a 4d sublattice.
69  int X = fullLatticeIndex_4d(i, oddBit);
70  int x4 = X/(Z[2]*Z[1]*Z[0]);
71  int x3 = (X/(Z[1]*Z[0])) % Z[2];
72  int x2 = (X/Z[0]) % Z[1];
73  int x1 = X % Z[0];
74 
75  x4 = (x4+dx4+Z[3]) % Z[3];
76  x3 = (x3+dx3+Z[2]) % Z[2];
77  x2 = (x2+dx2+Z[1]) % Z[1];
78  x1 = (x1+dx1+Z[0]) % Z[0];
79 
80  return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2;
81 }
82 
83 //BEGIN NEW
84 //#ifdef MULTI_GPU
85 int
86 neighborIndex_5d_mgpu(int i, int oddBit, int dxs, int dx4, int dx3, int dx2, int dx1)
87 {
88  int ret;
89 
90  int Y = fullLatticeIndex_5d(i, oddBit);
91 
92  int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
93  int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
94  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
95  int x2 = (Y/Z[0]) % Z[1];
96  int x1 = Y % Z[0];
97 
98  int ghost_x4 = x4+ dx4;
99 
100  xs = (xs+dxs+Ls) % Ls;
101  x4 = (x4+dx4+Z[3]) % Z[3];
102  x3 = (x3+dx3+Z[2]) % Z[2];
103  x2 = (x2+dx2+Z[1]) % Z[1];
104  x1 = (x1+dx1+Z[0]) % Z[0];
105 
106  if ( ghost_x4 >= 0 && ghost_x4 < Z[3]){
107  ret = (xs*Z[3]*Z[2]*Z[1]*Z[0] + x4*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
108  }else{
109  ret = (xs*Z[2]*Z[1]*Z[0] + x3*Z[1]*Z[0] + x2*Z[0] + x1) >> 1;
110  }
111 
112 
113  return ret;
114 }
115 
116 int
117 x4_5d_mgpu(int i, int oddBit)
118 {
119  int Y = fullLatticeIndex_5d(i, oddBit);
120  return (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
121 }
122 
123 //#endif
124 
125 //END NEW
126 
127 //#ifndef MULTI_GPU
128 // This is just a copy of gaugeLink() from the quda code, except
129 // that neighborIndex() is replaced by the renamed version
130 // neighborIndex_4d().
131 //ok
132 template <typename Float>
133 Float *gaugeLink_sgpu(int i, int dir, int oddBit, Float **gaugeEven,
134  Float **gaugeOdd) {
135  Float **gaugeField;
136  int j;
137 
138  // If going forward, just grab link at site, U_\mu(x).
139  if (dir % 2 == 0) {
140  j = i;
141  // j will get used in the return statement below.
142  gaugeField = (oddBit ? gaugeOdd : gaugeEven);
143  } else {
144  // If going backward, a shift must occur, U_\mu(x-\muhat)^\dagger;
145  // dagger happens elsewhere, here we're just doing index gymnastics.
146  switch (dir) {
147  case 1: j = neighborIndex_4d(i, oddBit, 0, 0, 0, -1); break;
148  case 3: j = neighborIndex_4d(i, oddBit, 0, 0, -1, 0); break;
149  case 5: j = neighborIndex_4d(i, oddBit, 0, -1, 0, 0); break;
150  case 7: j = neighborIndex_4d(i, oddBit, -1, 0, 0, 0); break;
151  default: j = -1; break;
152  }
153  gaugeField = (oddBit ? gaugeEven : gaugeOdd);
154  }
155 
156  return &gaugeField[dir/2][j*(3*3*2)];
157 }
158 
159 
160 //#else
161 
162 //Standard 4d version (nothing to change)
163 template <typename Float>
164 Float *gaugeLink_mgpu(int i, int dir, int oddBit, Float **gaugeEven, Float **gaugeOdd, Float** ghostGaugeEven, Float** ghostGaugeOdd, int n_ghost_faces, int nbr_distance) {
165  Float **gaugeField;
166  int j;
167  int d = nbr_distance;
168  if (dir % 2 == 0) {
169  j = i;
170  gaugeField = (oddBit ? gaugeOdd : gaugeEven);
171  }
172  else {
173 
174  int Y = fullLatticeIndex(i, oddBit);
175  int x4 = Y/(Z[2]*Z[1]*Z[0]);
176  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
177  int x2 = (Y/Z[0]) % Z[1];
178  int x1 = Y % Z[0];
179  int X1= Z[0];
180  int X2= Z[1];
181  int X3= Z[2];
182  int X4= Z[3];
183  Float* ghostGaugeField;
184 
185  switch (dir) {
186  case 1:
187  { //-X direction
188  int new_x1 = (x1 - d + X1 )% X1;
189  if (x1 -d < 0){
190  ghostGaugeField = (oddBit?ghostGaugeEven[0]: ghostGaugeOdd[0]);
191  int offset = (n_ghost_faces + x1 -d)*X4*X3*X2/2 + (x4*X3*X2 + x3*X2+x2)/2;
192  return &ghostGaugeField[offset*(3*3*2)];
193  }
194  j = (x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) / 2;
195  break;
196  }
197  case 3:
198  { //-Y direction
199  int new_x2 = (x2 - d + X2 )% X2;
200  if (x2 -d < 0){
201  ghostGaugeField = (oddBit?ghostGaugeEven[1]: ghostGaugeOdd[1]);
202  int offset = (n_ghost_faces + x2 -d)*X4*X3*X1/2 + (x4*X3*X1 + x3*X1+x1)/2;
203  return &ghostGaugeField[offset*(3*3*2)];
204  }
205  j = (x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) / 2;
206  break;
207 
208  }
209  case 5:
210  { //-Z direction
211  int new_x3 = (x3 - d + X3 )% X3;
212  if (x3 -d < 0){
213  ghostGaugeField = (oddBit?ghostGaugeEven[2]: ghostGaugeOdd[2]);
214  int offset = (n_ghost_faces + x3 -d)*X4*X2*X1/2 + (x4*X2*X1 + x2*X1+x1)/2;
215  return &ghostGaugeField[offset*(3*3*2)];
216  }
217  j = (x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) / 2;
218  break;
219  }
220  case 7:
221  { //-T direction
222  int new_x4 = (x4 - d + X4)% X4;
223  if (x4 -d < 0){
224  ghostGaugeField = (oddBit?ghostGaugeEven[3]: ghostGaugeOdd[3]);
225  int offset = (n_ghost_faces + x4 -d)*X1*X2*X3/2 + (x3*X2*X1 + x2*X1+x1)/2;
226  return &ghostGaugeField[offset*(3*3*2)];
227  }
228  j = (new_x4*(X3*X2*X1) + x3*(X2*X1) + x2*(X1) + x1) / 2;
229  break;
230  }//7
231 
232  default: j = -1; printf("ERROR: wrong dir \n"); exit(1);
233  }
234  gaugeField = (oddBit ? gaugeEven : gaugeOdd);
235 
236  }
237 
238  return &gaugeField[dir/2][j*(3*3*2)];
239 }
240 
241 //A.S.: this is valid for DW dslash wiht space-time decomposition.
242 template <typename Float>
243 Float *spinorNeighbor_5d_mgpu(int i, int dir, int oddBit, Float *spinorField, Float** fwd_nbr_spinor, Float** back_nbr_spinor, int neighbor_distance, int nFace)
244 {
245  int j;
246  int nb = neighbor_distance;
247  int Y = fullLatticeIndex_5d(i, oddBit);
248 
249  int mySpinorSiteSize = 24;
250 
251  int xs = Y/(Z[3]*Z[2]*Z[1]*Z[0]);
252  int x4 = (Y/(Z[2]*Z[1]*Z[0])) % Z[3];
253  int x3 = (Y/(Z[1]*Z[0])) % Z[2];
254  int x2 = (Y/Z[0]) % Z[1];
255  int x1 = Y % Z[0];
256 
257  int X1= Z[0];
258  int X2= Z[1];
259  int X3= Z[2];
260  int X4= Z[3];
261  switch (dir) {
262  case 0://+X
263  {
264  int new_x1 = (x1 + nb)% X1;
265  if(x1+nb >=X1){
266  int offset = ((x1 + nb -X1)*Ls*X4*X3*X2+xs*X4*X3*X2+x4*X3*X2 + x3*X2+x2) >> 1;
267  return fwd_nbr_spinor[0] + offset*mySpinorSiteSize;
268  }
269  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
270  break;
271 
272  }
273  case 1://-X
274  {
275  int new_x1 = (x1 - nb + X1)% X1;
276  if(x1 - nb < 0){
277  int offset = (( x1+nFace- nb)*Ls*X4*X3*X2 + xs*X4*X3*X2 + x4*X3*X2 + x3*X2 + x2) >> 1;
278  return back_nbr_spinor[0] + offset*mySpinorSiteSize;
279  }
280  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + x2*X1 + new_x1) >> 1;
281  break;
282  }
283  case 2://+Y
284  {
285  int new_x2 = (x2 + nb)% X2;
286  if(x2+nb >=X2){
287  int offset = (( x2 + nb -X2)*Ls*X4*X3*X1+xs*X4*X3*X1+x4*X3*X1 + x3*X1+x1) >> 1;
288  return fwd_nbr_spinor[1] + offset*mySpinorSiteSize;
289  }
290  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
291  break;
292  }
293  case 3:// -Y
294  {
295  int new_x2 = (x2 - nb + X2)% X2;
296  if(x2 - nb < 0){
297  int offset = (( x2 + nFace -nb)*Ls*X4*X3*X1+xs*X4*X3*X1+ x4*X3*X1 + x3*X1+x1) >> 1;
298  return back_nbr_spinor[1] + offset*mySpinorSiteSize;
299  }
300  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + x3*X2*X1 + new_x2*X1 + x1) >> 1;
301  break;
302  }
303  case 4://+Z
304  {
305  int new_x3 = (x3 + nb)% X3;
306  if(x3+nb >=X3){
307  int offset = (( x3 + nb -X3)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1 + x2*X1+x1) >> 1;
308  return fwd_nbr_spinor[2] + offset*mySpinorSiteSize;
309  }
310  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
311  break;
312  }
313  case 5://-Z
314  {
315  int new_x3 = (x3 - nb + X3)% X3;
316  if(x3 - nb < 0){
317  int offset = (( x3 + nFace -nb)*Ls*X4*X2*X1+xs*X4*X2*X1+x4*X2*X1+x2*X1+x1) >> 1;
318  return back_nbr_spinor[2] + offset*mySpinorSiteSize;
319  }
320  j = (xs*X4*X3*X2*X1 + x4*X3*X2*X1 + new_x3*X2*X1 + x2*X1 + x1) >> 1;
321  break;
322  }
323  case 6://+T
324  {
325  j = neighborIndex_5d_mgpu(i, oddBit, 0, +nb, 0, 0, 0);
326  int x4 = x4_5d_mgpu(i, oddBit);
327  if ( (x4 + nb) >= Z[3])
328  {
329  int offset = (x4+nb - Z[3])*Vsh_t;//?
330  return &fwd_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
331  }
332  break;
333  }
334  case 7://-T
335  {
336  j = neighborIndex_5d_mgpu(i, oddBit, 0, -nb, 0, 0, 0);
337  int x4 = x4_5d_mgpu(i, oddBit);
338  if ( (x4 - nb) < 0){
339  int offset = ( x4 - nb +nFace)*Vsh_t;//?
340  return &back_nbr_spinor[3][(offset+j)*mySpinorSiteSize];
341  }
342  break;
343  }
344  default: j = -1; printf("ERROR: wrong dir\n"); exit(1);
345  }
346 
347  return &spinorField[j*(mySpinorSiteSize)];
348 }
349 
350 //#endif
351 
352 
353 template <typename Float>
354 Float *spinorNeighbor_5d(int i, int dir, int oddBit, Float *spinorField) {
355  int j;
356  switch (dir) {
357  case 0: j = neighborIndex_5d(i, oddBit, 0, 0, 0, 0, +1); break;
358  case 1: j = neighborIndex_5d(i, oddBit, 0, 0, 0, 0, -1); break;
359  case 2: j = neighborIndex_5d(i, oddBit, 0, 0, 0, +1, 0); break;
360  case 3: j = neighborIndex_5d(i, oddBit, 0, 0, 0, -1, 0); break;
361  case 4: j = neighborIndex_5d(i, oddBit, 0, 0, +1, 0, 0); break;
362  case 5: j = neighborIndex_5d(i, oddBit, 0, 0, -1, 0, 0); break;
363  case 6: j = neighborIndex_5d(i, oddBit, 0, +1, 0, 0, 0); break;
364  case 7: j = neighborIndex_5d(i, oddBit, 0, -1, 0, 0, 0); break;
365  case 8: j = neighborIndex_5d(i, oddBit, +1, 0, 0, 0, 0); break;
366  case 9: j = neighborIndex_5d(i, oddBit, -1, 0, 0, 0, 0); break;
367  default: j = -1; break;
368  }
369 
370  return &spinorField[j*(4*3*2)];
371 }
372 
373 
374 //J Directions 0..7 were used in the 4d code.
375 //J Directions 8,9 will be for P_- and P_+, chiral
376 //J projectors.
377 const double projector[10][4][4][2] = {
378  {
379  {{1,0}, {0,0}, {0,0}, {0,-1}},
380  {{0,0}, {1,0}, {0,-1}, {0,0}},
381  {{0,0}, {0,1}, {1,0}, {0,0}},
382  {{0,1}, {0,0}, {0,0}, {1,0}}
383  },
384  {
385  {{1,0}, {0,0}, {0,0}, {0,1}},
386  {{0,0}, {1,0}, {0,1}, {0,0}},
387  {{0,0}, {0,-1}, {1,0}, {0,0}},
388  {{0,-1}, {0,0}, {0,0}, {1,0}}
389  },
390  {
391  {{1,0}, {0,0}, {0,0}, {1,0}},
392  {{0,0}, {1,0}, {-1,0}, {0,0}},
393  {{0,0}, {-1,0}, {1,0}, {0,0}},
394  {{1,0}, {0,0}, {0,0}, {1,0}}
395  },
396  {
397  {{1,0}, {0,0}, {0,0}, {-1,0}},
398  {{0,0}, {1,0}, {1,0}, {0,0}},
399  {{0,0}, {1,0}, {1,0}, {0,0}},
400  {{-1,0}, {0,0}, {0,0}, {1,0}}
401  },
402  {
403  {{1,0}, {0,0}, {0,-1}, {0,0}},
404  {{0,0}, {1,0}, {0,0}, {0,1}},
405  {{0,1}, {0,0}, {1,0}, {0,0}},
406  {{0,0}, {0,-1}, {0,0}, {1,0}}
407  },
408  {
409  {{1,0}, {0,0}, {0,1}, {0,0}},
410  {{0,0}, {1,0}, {0,0}, {0,-1}},
411  {{0,-1}, {0,0}, {1,0}, {0,0}},
412  {{0,0}, {0,1}, {0,0}, {1,0}}
413  },
414  {
415  {{1,0}, {0,0}, {-1,0}, {0,0}},
416  {{0,0}, {1,0}, {0,0}, {-1,0}},
417  {{-1,0}, {0,0}, {1,0}, {0,0}},
418  {{0,0}, {-1,0}, {0,0}, {1,0}}
419  },
420  {
421  {{1,0}, {0,0}, {1,0}, {0,0}},
422  {{0,0}, {1,0}, {0,0}, {1,0}},
423  {{1,0}, {0,0}, {1,0}, {0,0}},
424  {{0,0}, {1,0}, {0,0}, {1,0}}
425  },
426  // P_+ = P_R
427  {
428  {{0,0}, {0,0}, {0,0}, {0,0}},
429  {{0,0}, {0,0}, {0,0}, {0,0}},
430  {{0,0}, {0,0}, {2,0}, {0,0}},
431  {{0,0}, {0,0}, {0,0}, {2,0}}
432  },
433  // P_- = P_L
434  {
435  {{2,0}, {0,0}, {0,0}, {0,0}},
436  {{0,0}, {2,0}, {0,0}, {0,0}},
437  {{0,0}, {0,0}, {0,0}, {0,0}},
438  {{0,0}, {0,0}, {0,0}, {0,0}}
439  }
440 };
441 
442 
443 // todo pass projector
444 template <typename Float>
445 void multiplySpinorByDiracProjector5(Float *res, int projIdx, Float *spinorIn) {
446  for (int i=0; i<4*3*2; i++) res[i] = 0.0;
447 
448  for (int s = 0; s < 4; s++) {
449  for (int t = 0; t < 4; t++) {
450  Float projRe = projector[projIdx][s][t][0];
451  Float projIm = projector[projIdx][s][t][1];
452 
453  for (int m = 0; m < 3; m++) {
454  Float spinorRe = spinorIn[t*(3*2) + m*(2) + 0];
455  Float spinorIm = spinorIn[t*(3*2) + m*(2) + 1];
456  res[s*(3*2) + m*(2) + 0] += projRe*spinorRe - projIm*spinorIm;
457  res[s*(3*2) + m*(2) + 1] += projRe*spinorIm + projIm*spinorRe;
458  }
459  }
460  }
461 }
462 
463 
464 //#ifndef MULTI_GPU
465 // dslashReference_4d()
466 //J This is just the 4d wilson dslash of quda code, with a
467 //J few small changes to take into account that the spinors
468 //J are 5d and the gauge fields are 4d.
469 //
470 // if oddBit is zero: calculate odd parity spinor elements (using even parity spinor)
471 // if oddBit is one: calculate even parity spinor elements
472 //
473 // if daggerBit is zero: perform ordinary dslash operator
474 // if daggerBit is one: perform hermitian conjugate of dslash
475 //
476 //An "ok" will only be granted once check2.tex is deemed complete,
477 //since the logic in this function is important and nontrivial.
478 template <typename sFloat, typename gFloat>
479 void dslashReference_4d_sgpu(sFloat *res, gFloat **gaugeFull, sFloat *spinorField,
480  int oddBit, int daggerBit) {
481 
482  // Initialize the return half-spinor to zero. Note that it is a
483  // 5d spinor, hence the use of V5h.
484  for (int i=0; i<V5h*4*3*2; i++) res[i] = 0.0;
485 
486  // Some pointers that we use to march through arrays.
487  gFloat *gaugeEven[4], *gaugeOdd[4];
488  // Initialize to beginning of even and odd parts of
489  // gauge array.
490  for (int dir = 0; dir < 4; dir++) {
491  gaugeEven[dir] = gaugeFull[dir];
492  // Note the use of Vh here, since the gauge fields
493  // are 4-dim'l.
494  gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
495  }
496  int sp_idx,oddBit_gge;
497  for (int xs=0;xs<Ls;xs++) {
498  for (int gge_idx = 0; gge_idx < Vh; gge_idx++) {
499  for (int dir = 0; dir < 8; dir++) {
500  sp_idx=gge_idx+Vh*xs;
501  // Here is a function call to study. It is defined near
502  // Line 90 of this file.
503  // Here we have to switch oddBit depending on the value of xs. E.g., suppose
504  // xs=1. Then the odd spinor site x1=x2=x3=x4=0 wants the even gauge array
505  // element 0, so that we get U_\mu(0).
506  if ((xs % 2) == 0) oddBit_gge=oddBit;
507  else oddBit_gge= (oddBit+1) % 2;
508  gFloat *gauge = gaugeLink_sgpu(gge_idx, dir, oddBit_gge, gaugeEven, gaugeOdd);
509 
510  // Even though we're doing the 4d part of the dslash, we need
511  // to use a 5d neighbor function, to get the offsets right.
512  sFloat *spinor = spinorNeighbor_5d(sp_idx, dir, oddBit, spinorField);
513  sFloat projectedSpinor[4*3*2], gaugedSpinor[4*3*2];
514  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
515  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
516 
517  for (int s = 0; s < 4; s++) {
518  if (dir % 2 == 0) {
519  su3Mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
520 #ifdef DBUG_VERBOSE
521  std::cout << "spinor:" << std::endl;
522  printSpinorElement(&projectedSpinor[s*(3*2)],0,QUDA_DOUBLE_PRECISION);
523  std::cout << "gauge:" << std::endl;
524 #endif
525  } else {
526  su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
527  }
528  }
529 
530  sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
531  }
532  }
533  }
534 }
535 //#else
536 
537 template <typename sFloat, typename gFloat>
538 void dslashReference_4d_mgpu(sFloat *res, gFloat **gaugeFull, gFloat **ghostGauge, sFloat *spinorField, sFloat **fwdSpinor, sFloat **backSpinor, int oddBit, int daggerBit)
539 {
540 
541  int mySpinorSiteSize = 24;
542  for (int i=0; i<V5h*mySpinorSiteSize; i++) res[i] = 0.0;
543 
544  gFloat *gaugeEven[4], *gaugeOdd[4];
545  gFloat *ghostGaugeEven[4], *ghostGaugeOdd[4];
546 
547  for (int dir = 0; dir < 4; dir++)
548  {
549  gaugeEven[dir] = gaugeFull[dir];
550  gaugeOdd[dir] = gaugeFull[dir]+Vh*gaugeSiteSize;
551 
552  ghostGaugeEven[dir] = ghostGauge[dir];
553  ghostGaugeOdd[dir] = ghostGauge[dir] + (faceVolume[dir]/2)*gaugeSiteSize;
554  }
555  for (int xs=0;xs<Ls;xs++)
556  {
557  int sp_idx;
558  for (int i = 0; i < Vh; i++)
559  {
560  sp_idx = i + Vh*xs;
561  for (int dir = 0; dir < 8; dir++)
562  {
563  int oddBit_gge;
564 
565  if ((xs % 2) == 0) oddBit_gge=oddBit;
566  else oddBit_gge= (oddBit+1) % 2;
567 
568  gFloat *gauge = gaugeLink_mgpu(i, dir, oddBit_gge, gaugeEven, gaugeOdd, ghostGaugeEven, ghostGaugeOdd, 1, 1);//this is unchanged from MPi version
569  sFloat *spinor = spinorNeighbor_5d_mgpu(sp_idx, dir, oddBit, spinorField, fwdSpinor, backSpinor, 1, 1);
570 
571  sFloat projectedSpinor[mySpinorSiteSize], gaugedSpinor[mySpinorSiteSize];
572  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
573  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
574 
575  for (int s = 0; s < 4; s++)
576  {
577  if (dir % 2 == 0) su3Mul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
578  else su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &projectedSpinor[s*(3*2)]);
579  }
580  sum(&res[sp_idx*(4*3*2)], &res[sp_idx*(4*3*2)], gaugedSpinor, 4*3*2);
581  }
582  }
583  }
584 }
585 //#endif
586 
587 //Currently we consider only spacetime decomposition (not in 5th dim), so this operator is local
588 template <typename sFloat>
589 void dslashReference_5th(sFloat *res, sFloat *spinorField,
590  int oddBit, int daggerBit, sFloat mferm) {
591  for (int i = 0; i < V5h; i++) {
592  for (int dir = 8; dir < 10; dir++) {
593  // Calls for an extension of the original function.
594  // 8 is forward hop, which wants P_+, 9 is backward hop,
595  // which wants P_-. Dagger reverses these.
596  sFloat *spinor = spinorNeighbor_5d(i, dir, oddBit, spinorField);
597  sFloat projectedSpinor[4*3*2];
598  int projIdx = 2*(dir/2)+(dir+daggerBit)%2;
599  multiplySpinorByDiracProjector5(projectedSpinor, projIdx, spinor);
600  //J Need a conditional here for s=0 and s=Ls-1.
601  int X = fullLatticeIndex_5d(i, oddBit);
602  int xs = X/(Z[3]*Z[2]*Z[1]*Z[0]);
603 
604  if ( (xs == 0 && dir == 9) || (xs == Ls-1 && dir == 8) ) {
605  ax(projectedSpinor,(sFloat)(-mferm),projectedSpinor,4*3*2);
606  }
607  sum(&res[i*(4*3*2)], &res[i*(4*3*2)], projectedSpinor, 4*3*2);
608  }
609  }
610 }
611 
612 // Recall that dslash is only the off-diagonal parts, so m0_dwf is not needed.
613 //
614 //#ifndef MULTI_GPU
615 /*
616 void dslash(void *res, void **gaugeFull, void *spinorField,
617  int oddBit, int daggerBit,
618  QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm) {
619 
620  if (sPrecision == QUDA_DOUBLE_PRECISION) {
621  if (gPrecision == QUDA_DOUBLE_PRECISION) {
622  // Do the 4d part, which hasn't changed.
623  printf("doing 4d part\n"); fflush(stdout);
624  dslashReference_4d_sgpu<double,double>((double*)res, (double**)gaugeFull,
625  (double*)spinorField, oddBit, daggerBit);
626  // Now add in the 5th dim.
627  printf("doing 5th dimen. part\n"); fflush(stdout);
628  dslashReference_5th<double>((double*)res, (double*)spinorField,
629  oddBit, daggerBit, mferm);
630  } else {
631  dslashReference_4d_sgpu<double,float>((double*)res, (float**)gaugeFull, (double*)spinorField, oddBit, daggerBit);
632  dslashReference_5th<double>((double*)res, (double*)spinorField, oddBit, daggerBit, mferm);
633  }
634  } else {
635  // Single-precision spinor.
636  if (gPrecision == QUDA_DOUBLE_PRECISION) {
637  dslashReference_4d_sgpu<float,double>((float*)res, (double**)gaugeFull, (float*)spinorField, oddBit, daggerBit);
638  dslashReference_5th<float>((float*)res, (float*)spinorField, oddBit, daggerBit, mferm);
639  } else {
640  // Do the 4d part, which hasn't changed.
641  printf("CPU reference: doing 4d part all single precision\n"); fflush(stdout);
642  dslashReference_4d_sgpu<float,float>((float*)res, (float**)gaugeFull, (float*)spinorField, oddBit, daggerBit);
643  // Now add in the 5th dim.
644  printf("CPU reference: doing 5th dimen. part all single precision\n"); fflush(stdout);
645  dslashReference_5th<float>((float*)res, (float*)spinorField, oddBit, daggerBit, mferm);
646  }
647  }
648 }
649 */
650 //#endif
651 
652 //BEGIN NEW
653 // this actually applies the preconditioned dslash, e.g., D_ee^{-1} D_eo or D_oo^{-1} D_oe
654 void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
655 {
656 #ifndef MULTI_GPU
657  if (precision == QUDA_DOUBLE_PRECISION) {
658  dslashReference_4d_sgpu((double*)out, (double**)gauge, (double*)in, oddBit, daggerBit);
659  dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm);
660  } else {
661  dslashReference_4d_sgpu((float*)out, (float**)gauge, (float*)in, oddBit, daggerBit);
662  dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
663  }
664 #else
665 
666 // void *ghostGauge[4], *sendGauge[4];
667 // for (int d=0; d<4; d++) {
668 // ghostGauge[d] = malloc(faceVolume[d]*gaugeSiteSize*precision);
669 // sendGauge[d] = malloc(faceVolume[d]*gaugeSiteSize*precision);
670 // }
671 
672 // { // Exchange gauge matrices at boundary
673 // set_dim(Z);///?
674 // pack_ghost(gauge, sendGauge, 1, precision);
675 // int nFace = 1;
676 // FaceBuffer faceBuf(Z, 4, gaugeSiteSize, nFace, precision);
677 // faceBuf.exchangeCpuLink(ghostGauge, sendGauge);
678 // }
679 
680 //BEGINOFNEW
681  GaugeFieldParam gauge_field_param(gauge, gauge_param);
682  cpuGaugeField cpu(gauge_field_param);
683  cpu.exchangeGhost();
684  void **ghostGauge = (void**)cpu.Ghost();
685 //ENDOFNEW
686 
687  // Get spinor ghost fields
688  // First wrap the input spinor into a ColorSpinorField
690  csParam.v = in;
691  csParam.nColor = 3;
692  csParam.nSpin = 4;
693  csParam.nDim = 5; //for DW dslash
694  for (int d=0; d<4; d++) csParam.x[d] = Z[d];
695  csParam.x[4] = Ls;//5th dimention
696  csParam.precision = precision;
697  csParam.pad = 0;
699  csParam.x[0] /= 2;
704 
705  cpuColorSpinorField inField(csParam);
706 
707  { // Now do the exchange
708  QudaParity otherParity = QUDA_INVALID_PARITY;
709  if (oddBit == QUDA_EVEN_PARITY) otherParity = QUDA_ODD_PARITY;
710  else if (oddBit == QUDA_ODD_PARITY) otherParity = QUDA_EVEN_PARITY;
711  else errorQuda("ERROR: full parity not supported in function %s", __FUNCTION__);
712 
713  int nFace = 1;
714  FaceBuffer faceBuf(Z, 5, mySpinorSiteSize, nFace, precision, Ls);//4 <-> 5
715  faceBuf.exchangeCpuSpinor(inField, otherParity, daggerBit);
716  }
717  void** fwd_nbr_spinor = inField.fwdGhostFaceBuffer;
718  void** back_nbr_spinor = inField.backGhostFaceBuffer;
719  //NOTE: hopping in 5th dimension does not use MPI.
720  if (precision == QUDA_DOUBLE_PRECISION)
721  {
722  dslashReference_4d_mgpu((double*)out, (double**)gauge, (double**)ghostGauge, (double*)in,(double**)fwd_nbr_spinor, (double**)back_nbr_spinor, oddBit, daggerBit);
723  dslashReference_5th((double*)out, (double*)in, oddBit, daggerBit, mferm);
724  } else
725  {
726  dslashReference_4d_mgpu((float*)out, (float**)gauge, (float**)ghostGauge, (float*)in,
727  (float**)fwd_nbr_spinor, (float**)back_nbr_spinor, oddBit, daggerBit);
728  dslashReference_5th((float*)out, (float*)in, oddBit, daggerBit, (float)mferm);
729  }
730 
731 #endif
732 
733 }
734 
735 //END NEW
736 
737 
738 
739 void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) {
740 
741  void *inEven = in;
742  void *inOdd = (char*)in + V5h*spinorSiteSize*precision;
743  void *outEven = out;
744  void *outOdd = (char*)out + V5h*spinorSiteSize*precision;
745 
746  dw_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param, mferm);
747  dw_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param, mferm);
748 
749  // lastly apply the kappa term
750  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V5*spinorSiteSize);
751  else xpay((float*)in, -(float)kappa, (float*)out, V5*spinorSiteSize);
752 }
753 
754 void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
755 {
756 
757  void *tmp = malloc(V5*spinorSiteSize*precision);
758  dw_mat(tmp, gauge, in, kappa, dagger_bit, precision, gauge_param, mferm);
759  dagger_bit = (dagger_bit == 1) ? 0 : 1;
760  dw_mat(out, gauge, tmp, kappa, dagger_bit, precision, gauge_param, mferm);
761 
762  free(tmp);
763 }
764 
765 void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
766 {
767  void *tmp = malloc(V5h*spinorSiteSize*precision);
768 
769  if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) {
770  dw_dslash(tmp, gauge, in, 1, dagger_bit, precision, gauge_param, mferm);
771  dw_dslash(out, gauge, tmp, 0, dagger_bit, precision, gauge_param, mferm);
772  } else {
773  dw_dslash(tmp, gauge, in, 0, dagger_bit, precision, gauge_param, mferm);
774  dw_dslash(out, gauge, tmp, 1, dagger_bit, precision, gauge_param, mferm);
775  }
776 
777  // lastly apply the kappa term
778  double kappa2 = -kappa*kappa;
779  if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, kappa2, (double*)out, V5h*spinorSiteSize);
780  else xpay((float*)in, (float)kappa2, (float*)out, V5h*spinorSiteSize);
781 
782  free(tmp);
783 }
784 
785 /*
786 // Apply the even-odd preconditioned Dirac operator
787 template <typename sFloat, typename gFloat>
788 void MatPC(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
789  QudaMatPCType matpc_type, sFloat mferm) {
790 
791  sFloat *tmp = (sFloat*)malloc(V5h*spinorSiteSize*sizeof(sFloat));
792 
793  // full dslash operator
794  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
795  dslashReference_4d(tmp, gauge, inEven, 1, 0);
796  dslashReference_5th(tmp, inEven, 1, 0, mferm);
797  dslashReference_4d(outEven, gauge, tmp, 0, 0);
798  dslashReference_5th(outEven, tmp, 0, 0, mferm);
799  } else {
800  dslashReference_4d(tmp, gauge, inEven, 0, 0);
801  dslashReference_5th(tmp, inEven, 0, 0, mferm);
802  dslashReference_4d(outEven, gauge, tmp, 1, 0);
803  dslashReference_5th(outEven, tmp, 1, 0, mferm);
804  }
805 
806  // lastly apply the kappa term
807  sFloat kappa2 = -kappa*kappa;
808  xpay(inEven, kappa2, outEven, V5h*spinorSiteSize);
809  free(tmp);
810 }
811 
812 // Apply the even-odd preconditioned Dirac operator
813 template <typename sFloat, typename gFloat>
814 void MatPCDag(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
815  QudaMatPCType matpc_type, sFloat mferm) {
816 
817  sFloat *tmp = (sFloat*)malloc(V5h*spinorSiteSize*sizeof(sFloat));
818 
819  // full dslash operator
820  if (matpc_type == QUDA_MATPC_EVEN_EVEN) {
821  dslashReference_4d(tmp, gauge, inEven, 1, 1);
822  dslashReference_5th(tmp, inEven, 1, 1, mferm);
823  dslashReference_4d(outEven, gauge, tmp, 0, 1);
824  dslashReference_5th(outEven, tmp, 0, 1, mferm);
825  } else {
826  dslashReference_4d(tmp, gauge, inEven, 0, 1);
827  dslashReference_5th(tmp, inEven, 0, 1, mferm);
828  dslashReference_4d(outEven, gauge, tmp, 1, 1);
829  dslashReference_5th(outEven, tmp, 1, 1, mferm);
830  }
831 
832  sFloat kappa2 = -kappa*kappa;
833  xpay(inEven, kappa2, outEven, V5h*spinorSiteSize);
834  free(tmp);
835 }
836 */
837 
838 void matpc(void *outEven, void **gauge, void *inEven, double kappa,
839  QudaMatPCType matpc_type, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision,
840  double mferm) {
841 /*
842  if (!dagger_bit) {
843  if (sPrecision == QUDA_DOUBLE_PRECISION)
844  if (gPrecision == QUDA_DOUBLE_PRECISION)
845  MatPC((double*)outEven, (double**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
846  else
847  MatPC((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
848  else
849  if (gPrecision == QUDA_DOUBLE_PRECISION)
850  MatPC((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
851  else
852  MatPC((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
853  } else {
854  if (sPrecision == QUDA_DOUBLE_PRECISION)
855  if (gPrecision == QUDA_DOUBLE_PRECISION)
856  MatPCDag((double*)outEven, (double**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
857  else
858  MatPCDag((double*)outEven, (float**)gauge, (double*)inEven, (double)kappa, matpc_type, (double)mferm);
859  else
860  if (gPrecision == QUDA_DOUBLE_PRECISION)
861  MatPCDag((float*)outEven, (double**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
862  else
863  MatPCDag((float*)outEven, (float**)gauge, (float*)inEven, (float)kappa, matpc_type, (float)mferm);
864  }
865 */
866 }
867 
868 /*
869 template <typename sFloat, typename gFloat>
870 void MatDagMat(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa, sFloat mferm)
871 {
872  // Allocate a full spinor.
873  sFloat *tmp = (sFloat*)malloc(V5*spinorSiteSize*sizeof(sFloat));
874  // Call templates above.
875  Mat(tmp, gauge, in, kappa, mferm);
876  MatDag(out, gauge, tmp, kappa, mferm);
877  free(tmp);
878 }
879 
880 template <typename sFloat, typename gFloat>
881 void MatPCDagMatPC(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa,
882  QudaMatPCType matpc_type, sFloat mferm)
883 {
884 
885  // Allocate half spinor
886  sFloat *tmp = (sFloat*)malloc(V5h*spinorSiteSize*sizeof(sFloat));
887  // Apply the PC templates above
888  MatPC(tmp, gauge, in, kappa, matpc_type, mferm);
889  MatPCDag(out, gauge, tmp, kappa, matpc_type, mferm);
890  free(tmp);
891 }
892 */
893 // Wrapper to templates that handles different precisions.
894 void matdagmat(void *out, void **gauge, void *in, double kappa,
895  QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm)
896 {
897 /*
898  if (sPrecision == QUDA_DOUBLE_PRECISION) {
899  if (gPrecision == QUDA_DOUBLE_PRECISION)
900  MatDagMat((double*)out, (double**)gauge, (double*)in, (double)kappa,
901  (double)mferm);
902  else
903  MatDagMat((double*)out, (float**)gauge, (double*)in, (double)kappa, (double)mferm);
904  } else {
905  if (gPrecision == QUDA_DOUBLE_PRECISION)
906  MatDagMat((float*)out, (double**)gauge, (float*)in, (float)kappa,
907  (float)mferm);
908  else
909  MatDagMat((float*)out, (float**)gauge, (float*)in, (float)kappa, (float)mferm);
910  }
911 */
912 }
913 
914 // Wrapper to templates that handles different precisions.
915 void matpcdagmatpc(void *out, void **gauge, void *in, double kappa,
916  QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm, QudaMatPCType matpc_type)
917 {
918 /*
919  if (sPrecision == QUDA_DOUBLE_PRECISION) {
920  if (gPrecision == QUDA_DOUBLE_PRECISION)
921  MatPCDagMatPC((double*)out, (double**)gauge, (double*)in, (double)kappa,
922  matpc_type, (double)mferm);
923  else
924  MatPCDagMatPC((double*)out, (float**)gauge, (double*)in, (double)kappa,
925  matpc_type, (double)mferm);
926  } else {
927  if (gPrecision == QUDA_DOUBLE_PRECISION)
928  MatPCDagMatPC((float*)out, (double**)gauge, (float*)in, (float)kappa,
929  matpc_type, (float)mferm);
930  else
931  MatPCDagMatPC((float*)out, (float**)gauge, (float*)in, (float)kappa,
932  matpc_type, (float)mferm);
933  }
934 */
935 }
936 
937