QUDA  0.9.0
dyninv_clover_mg.cuh
Go to the documentation of this file.
1 #ifdef DYNAMIC_CLOVER
2 /* Some variables that will become handy later */
3 
4 #define c00_01 (conj(c01_00))
5 #define c00_02 (conj(c02_00))
6 #define c01_02 (conj(c02_01))
7 #define c00_10 (conj(c10_00))
8 #define c01_10 (conj(c10_01))
9 #define c02_10 (conj(c10_02))
10 #define c00_11 (conj(c11_00))
11 #define c01_11 (conj(c11_01))
12 #define c02_11 (conj(c11_02))
13 #define c10_11 (conj(c11_10))
14 #define c00_12 (conj(c12_00))
15 #define c01_12 (conj(c12_01))
16 #define c02_12 (conj(c12_02))
17 #define c10_12 (conj(c12_10))
18 #define c11_12 (conj(c12_11))
19 
20 #define tri5 tri0
21 #define tri9 tri1
22 #define tri13 tri2
23 #define tri14 tri3
24 #define tmp0 tri0.x
25 #define tmp1 tri0.y
26 #define tmp2 tri1.x
27 #define tmp3 tri1.y
28 #define tmp4 tri2.x
29 #define tmp5 tri2.y
30 #define v1_0 tri3
31 #define v1_1 tri4
32 #define v1_2 tri6
33 #define v1_3 tri7
34 #define v1_4 tri8
35 #define v1_5 tri10
36 #define sum tri11
37 
38 #define Clv(s1,c1,s2,c2)\
39 (arg.C(0, parity, x_cb, s1+ch, s2+ch, c1, c2))
40 
41 #define InvClv(s1,c1,s2,c2)\
42 c##s1##c1##_##s2##c2
43 
44 template<typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
45 __device__ __host__ inline void applyInvClover(Arg &arg, int parity, int x_cb) {
46  /* Applies the inverse of the clover term squared plus mu2 to the spinor */
47  /* Compute (T^2 + mu2) first, then invert */
48  /* We proceed by chiral blocks */
49 
50 
51  for (int ch = 0; ch < 4; ch += 2) { /* Loop over chiral blocks */
52  complex<Float> tri0, tri1, tri2, tri3, tri4, tri6, tri7, tri8, tri10, tri11, tri12;
53  Float d, c00_00, c01_01, c02_02, c10_10, c11_11, c12_12;
54  complex<Float> c01_00, c02_00, c10_00, c11_00, c12_00;
55  complex<Float> c02_01, c10_01, c11_01, c12_01;
56  complex<Float> c10_02, c11_02, c12_02;
57  complex<Float> c11_10, c12_10;
58  complex<Float> c12_11;
59 
60 
61  tri0 = Clv(0,1,0,0)*Clv(0,0,0,0).real() + Clv(0,1,0,1)*Clv(0,1,0,0) + Clv(0,1,0,2)*Clv(0,2,0,0) + Clv(0,1,1,0)*Clv(1,0,0,0) + Clv(0,1,1,1)*Clv(1,1,0,0) + Clv(0,1,1,2)*Clv(1,2,0,0);
62  tri1 = Clv(0,2,0,0)*Clv(0,0,0,0).real() + Clv(0,2,0,2)*Clv(0,2,0,0) + Clv(0,2,0,1)*Clv(0,1,0,0) + Clv(0,2,1,0)*Clv(1,0,0,0) + Clv(0,2,1,1)*Clv(1,1,0,0) + Clv(0,2,1,2)*Clv(1,2,0,0);
63  tri3 = Clv(1,0,0,0)*Clv(0,0,0,0).real() + Clv(1,0,1,0)*Clv(1,0,0,0) + Clv(1,0,0,1)*Clv(0,1,0,0) + Clv(1,0,0,2)*Clv(0,2,0,0) + Clv(1,0,1,1)*Clv(1,1,0,0) + Clv(1,0,1,2)*Clv(1,2,0,0);
64  tri6 = Clv(1,1,0,0)*Clv(0,0,0,0).real() + Clv(1,1,1,1)*Clv(1,1,0,0) + Clv(1,1,0,1)*Clv(0,1,0,0) + Clv(1,1,0,2)*Clv(0,2,0,0) + Clv(1,1,1,0)*Clv(1,0,0,0) + Clv(1,1,1,2)*Clv(1,2,0,0);
65  tri10 = Clv(1,2,0,0)*Clv(0,0,0,0).real() + Clv(1,2,1,2)*Clv(1,2,0,0) + Clv(1,2,0,1)*Clv(0,1,0,0) + Clv(1,2,0,2)*Clv(0,2,0,0) + Clv(1,2,1,0)*Clv(1,0,0,0) + Clv(1,2,1,1)*Clv(1,1,0,0);
66 
67  d = arg.mu*arg.mu + Clv(0,0,0,0).real()*Clv(0,0,0,0).real() + norm(Clv(0,1,0,0)) + norm(Clv(0,2,0,0)) + norm(Clv(1,0,0,0)) + norm(Clv(1,1,0,0)) + norm(Clv(1,2,0,0));
68 
69  InvClv(0,0,0,0) = d;
70 
71  tri2 = Clv(0,2,0,1)*Clv(0,1,0,1).real() + Clv(0,2,0,2)*Clv(0,2,0,1) + Clv(0,2,0,0)*Clv(0,0,0,1) + Clv(0,2,1,0)*Clv(1,0,0,1) + Clv(0,2,1,1)*Clv(1,1,0,1) + Clv(0,2,1,2)*Clv(1,2,0,1);
72  tri4 = Clv(1,0,0,1)*Clv(0,1,0,1).real() + Clv(1,0,1,0)*Clv(1,0,0,1) + Clv(1,0,0,0)*Clv(0,0,0,1) + Clv(1,0,0,2)*Clv(0,2,0,1) + Clv(1,0,1,1)*Clv(1,1,0,1) + Clv(1,0,1,2)*Clv(1,2,0,1);
73  tri7 = Clv(1,1,0,1)*Clv(0,1,0,1).real() + Clv(1,1,1,1)*Clv(1,1,0,1) + Clv(1,1,0,0)*Clv(0,0,0,1) + Clv(1,1,0,2)*Clv(0,2,0,1) + Clv(1,1,1,0)*Clv(1,0,0,1) + Clv(1,1,1,2)*Clv(1,2,0,1);
74  tri11 = Clv(1,2,0,1)*Clv(0,1,0,1).real() + Clv(1,2,1,2)*Clv(1,2,0,1) + Clv(1,2,0,0)*Clv(0,0,0,1) + Clv(1,2,0,2)*Clv(0,2,0,1) + Clv(1,2,1,0)*Clv(1,0,0,1) + Clv(1,2,1,1)*Clv(1,1,0,1);
75 
76  d = arg.mu*arg.mu + Clv(0,1,0,1).real()*Clv(0,1,0,1).real() + norm(Clv(0,0,0,1)) + norm(Clv(0,2,0,1)) + norm(Clv(1,0,0,1)) + norm(Clv(1,1,0,1)) + norm(Clv(1,2,0,1));
77 
78  InvClv(0,1,0,1) = d;
79  InvClv(0,1,0,0) = tri0; /* We freed tri0, so we can reuse it as tri5 to save memory */
80 
81  tri5 = Clv(1,0,0,2)*Clv(0,2,0,2).real() + Clv(1,0,1,0)*Clv(1,0,0,2) + Clv(1,0,0,0)*Clv(0,0,0,2) + Clv(1,0,0,1)*Clv(0,1,0,2) + Clv(1,0,1,1)*Clv(1,1,0,2) + Clv(1,0,1,2)*Clv(1,2,0,2);
82  tri8 = Clv(1,1,0,2)*Clv(0,2,0,2).real() + Clv(1,1,1,1)*Clv(1,1,0,2) + Clv(1,1,0,0)*Clv(0,0,0,2) + Clv(1,1,0,1)*Clv(0,1,0,2) + Clv(1,1,1,0)*Clv(1,0,0,2) + Clv(1,1,1,2)*Clv(1,2,0,2);
83  tri12 = Clv(1,2,0,2)*Clv(0,2,0,2).real() + Clv(1,2,1,2)*Clv(1,2,0,2) + Clv(1,2,0,0)*Clv(0,0,0,2) + Clv(1,2,0,1)*Clv(0,1,0,2) + Clv(1,2,1,0)*Clv(1,0,0,2) + Clv(1,2,1,1)*Clv(1,1,0,2);
84 
85  d = arg.mu*arg.mu + Clv(0,2,0,2).real()*Clv(0,2,0,2).real() + norm(Clv(0,0,0,2)) + norm(Clv(0,1,0,2)) + norm(Clv(1,0,0,2)) + norm(Clv(1,1,0,2)) + norm(Clv(1,2,0,2));
86 
87  InvClv(0,2,0,2) = d;
88  InvClv(0,2,0,0) = tri1; /* We freed tri1, so we can reuse it as tri9 to save memory */
89  InvClv(0,2,0,1) = tri2; /* We freed tri2, so we can reuse it as tri13 to save memory */
90 
91  tri9 = Clv(1,1,1,0)*Clv(1,0,1,0).real() + Clv(1,1,1,1)*Clv(1,1,1,0) + Clv(1,1,0,0)*Clv(0,0,1,0) + Clv(1,1,0,1)*Clv(0,1,1,0) + Clv(1,1,0,2)*Clv(0,2,1,0) + Clv(1,1,1,2)*Clv(1,2,1,0);
92  tri13 = Clv(1,2,1,0)*Clv(1,0,1,0).real() + Clv(1,2,1,2)*Clv(1,2,1,0) + Clv(1,2,0,0)*Clv(0,0,1,0) + Clv(1,2,0,1)*Clv(0,1,1,0) + Clv(1,2,0,2)*Clv(0,2,1,0) + Clv(1,2,1,1)*Clv(1,1,1,0);
93 
94  d = arg.mu*arg.mu + Clv(1,0,1,0).real()*Clv(1,0,1,0).real() + norm(Clv(0,0,1,0)) + norm(Clv(0,1,1,0)) + norm(Clv(0,2,1,0)) + norm(Clv(1,1,1,0)) + norm(Clv(1,2,1,0));
95 
96  InvClv(1,0,1,0) = d;
97  InvClv(1,0,0,0) = tri3; /* We freed tri3, so we can reuse it as tri14 to save memory */
98 
99  tri14 = Clv(1,2,1,1)*Clv(1,1,1,1).real() + Clv(1,2,1,2)*Clv(1,2,1,1) + Clv(1,2,0,0)*Clv(0,0,1,1) + Clv(1,2,0,1)*Clv(0,1,1,1) + Clv(1,2,0,2)*Clv(0,2,1,1) + Clv(1,2,1,0)*Clv(1,0,1,1);
100 
101  d = arg.mu*arg.mu + Clv(1,1,1,1).real()*Clv(1,1,1,1).real() + norm(Clv(0,0,1,1)) + norm(Clv(0,1,1,1)) + norm(Clv(0,2,1,1)) + norm(Clv(1,0,1,1)) + norm(Clv(1,2,1,1));
102 
103  InvClv(1,1,1,1) = d;
104 
105  d = arg.mu*arg.mu + Clv(1,2,1,2).real()*Clv(1,2,1,2).real() + norm(Clv(0,0,1,2)) + norm(Clv(0,1,1,2)) + norm(Clv(0,2,1,2)) + norm(Clv(1,0,1,2)) + norm(Clv(1,1,1,2));
106 
107  InvClv(1,2,1,2) = d;
108 
109  InvClv(1,2,1,1) = tri14;
110  InvClv(1,2,1,0) = tri13;
111  InvClv(1,2,0,2) = tri12;
112  InvClv(1,2,0,1) = tri11;
113  InvClv(1,2,0,0) = tri10;
114  InvClv(1,1,1,0) = tri9;
115  InvClv(1,1,0,2) = tri8;
116  InvClv(1,1,0,1) = tri7;
117  InvClv(1,1,0,0) = tri6;
118  InvClv(1,0,0,2) = tri5;
119  InvClv(1,0,0,1) = tri4;
120 
121  /* INVERSION STARTS */
122 
123  /* j = 0 */
124  InvClv(0,0,0,0) = sqrt(InvClv(0,0,0,0));
125  tmp0 = 1. / InvClv(0,0,0,0);
126  InvClv(0,1,0,0) *= tmp0;
127  InvClv(0,2,0,0) *= tmp0;
128  InvClv(1,0,0,0) *= tmp0;
129  InvClv(1,1,0,0) *= tmp0;
130  InvClv(1,2,0,0) *= tmp0;
131 
132  /* k = 1 kj = 0 */
133  InvClv(0,1,0,1) -= norm(InvClv(0,1,0,0));
134 
135  /* l = 2...5 kj = 0 lj = 1,3,6,10 lk = 2,4,7,11*/
136  InvClv(0,2,0,1) -= InvClv(0,2,0,0)*InvClv(0,0,0,1);
137  InvClv(1,0,0,1) -= InvClv(1,0,0,0)*InvClv(0,0,0,1);
138  InvClv(1,1,0,1) -= InvClv(1,1,0,0)*InvClv(0,0,0,1);
139  InvClv(1,2,0,1) -= InvClv(1,2,0,0)*InvClv(0,0,0,1);
140 
141  /* k = 2 kj = 1 */
142  InvClv(0,2,0,2) -= norm(InvClv(0,2,0,0));
143 
144  /* l = 3...5 kj = 1 lj = 3,6,10 lk = 5,8,12*/
145  InvClv(1,0,0,2) -= InvClv(1,0,0,0)*InvClv(0,0,0,2);
146  InvClv(1,1,0,2) -= InvClv(1,1,0,0)*InvClv(0,0,0,2);
147  InvClv(1,2,0,2) -= InvClv(1,2,0,0)*InvClv(0,0,0,2);
148 
149  /* k = 3 kj = 3 */
150  InvClv(1,0,1,0) -= norm(InvClv(1,0,0,0));
151 
152  /* l = 4 kj = 3 lj = 6,10 lk = 9,13*/
153  InvClv(1,1,1,0) -= InvClv(1,1,0,0)*InvClv(0,0,1,0);
154  InvClv(1,2,1,0) -= InvClv(1,2,0,0)*InvClv(0,0,1,0);
155 
156  /* k = 4 kj = 6 */
157  InvClv(1,1,1,1) -= norm(InvClv(1,1,0,0));
158 
159  /* l = 5 kj = 6 lj = 10 lk = 14*/
160  InvClv(1,2,1,1) -= InvClv(1,2,0,0)*InvClv(0,0,1,1);
161 
162  /* k = 5 kj = 10 */
163  InvClv(1,2,1,2) -= norm(InvClv(1,2,0,0));
164 
165  /* j = 1 */
166  InvClv(0,1,0,1) = sqrt(InvClv(0,1,0,1));
167  tmp1 = 1. / InvClv(0,1,0,1);
168  InvClv(0,2,0,1) *= tmp1;
169  InvClv(1,0,0,1) *= tmp1;
170  InvClv(1,1,0,1) *= tmp1;
171  InvClv(1,2,0,1) *= tmp1;
172 
173  /* k = 2 kj = 2 */
174  InvClv(0,2,0,2) -= norm(InvClv(0,2,0,1));
175 
176  /* l = 3...5 kj = 2 lj = 4,7,11 lk = 5,8,12*/
177  InvClv(1,0,0,2) -= InvClv(1,0,0,1)*InvClv(0,1,0,2);
178  InvClv(1,1,0,2) -= InvClv(1,1,0,1)*InvClv(0,1,0,2);
179  InvClv(1,2,0,2) -= InvClv(1,2,0,1)*InvClv(0,1,0,2);
180 
181  /* k = 3 kj = 4 */
182  InvClv(1,0,1,0) -= norm(InvClv(1,0,0,1));
183 
184  /* l = 4 kj = 4 lj = 7,11 lk = 9,13*/
185  InvClv(1,1,1,0) -= InvClv(1,1,0,1)*InvClv(0,1,1,0);
186  InvClv(1,2,1,0) -= InvClv(1,2,0,1)*InvClv(0,1,1,0);
187 
188  /* k = 4 kj = 7 */
189  InvClv(1,1,1,1) -= norm(InvClv(1,1,0,1));
190 
191  /* l = 5 kj = 7 lj = 11 lk = 14*/
192  InvClv(1,2,1,1) -= InvClv(1,2,0,1)*InvClv(0,1,1,1);
193 
194  /* k = 5 kj = 11 */
195  InvClv(1,2,1,2) -= norm(InvClv(1,2,0,1));
196 
197  /* j = 2 */
198  InvClv(0,2,0,2) = sqrt(InvClv(0,2,0,2));
199  tmp2 = 1. / InvClv(0,2,0,2);
200  InvClv(1,0,0,2) *= tmp2;
201  InvClv(1,1,0,2) *= tmp2;
202  InvClv(1,2,0,2) *= tmp2;
203 
204  /* k = 3 kj = 5 */
205  InvClv(1,0,1,0) -= norm(InvClv(1,0,0,2));
206 
207  /* l = 4 kj = 5 lj = 8,12 lk = 9,13*/
208  InvClv(1,1,1,0) -= InvClv(1,1,0,2)*InvClv(0,2,1,0);
209  InvClv(1,2,1,0) -= InvClv(1,2,0,2)*InvClv(0,2,1,0);
210 
211  /* k = 4 kj = 8 */
212  InvClv(1,1,1,1) -= norm(InvClv(1,1,0,2));
213 
214  /* l = 5 kj = 8 lj = 12 lk = 14*/
215  InvClv(1,2,1,1) -= InvClv(1,2,0,2)*InvClv(0,2,1,1);
216 
217  /* k = 5 kj = 12 */
218  InvClv(1,2,1,2) -= norm(InvClv(1,2,0,2));
219 
220  /* j = 3 */
221  InvClv(1,0,1,0) = sqrt(InvClv(1,0,1,0));
222  tmp3 = 1. / InvClv(1,0,1,0);
223  InvClv(1,1,1,0) *= tmp3;
224  InvClv(1,2,1,0) *= tmp3;
225 
226  /* k = 4 kj = 9 */
227  InvClv(1,1,1,1) -= norm(InvClv(1,1,1,0));
228 
229  /* l = 5 kj = 9 lj = 13 lk = 14*/
230  InvClv(1,2,1,1) -= InvClv(1,2,1,0)*InvClv(1,0,1,1);
231 
232  /* k = 5 kj = 13 */
233  InvClv(1,2,1,2) -= norm(InvClv(1,2,1,0));
234 
235  /* j = 4 */
236  InvClv(1,1,1,1) = sqrt(InvClv(1,1,1,1));
237  tmp4 = 1. / InvClv(1,1,1,1);
238  InvClv(1,2,1,1) *= tmp4;
239 
240  /* k = 5 kj = 14 */
241  InvClv(1,2,1,2) -= norm(InvClv(1,2,1,1));
242 
243  /* j = 5 */
244  InvClv(1,2,1,2) = sqrt(InvClv(1,2,1,2));
245  tmp5 = 1. / InvClv(1,2,1,2);
246 
247 
248  /* Forwards substitute */
249 
250  /* k = 0 */
251  v1_0 = complex<Float>(tmp0,0.);
252  v1_1 = InvClv(0,1,0,0)*(-tmp1*tmp0);
253  v1_2 = (InvClv(0,2,0,0)*tmp0 + InvClv(0,2,0,1)*v1_1)*(-tmp2);
254  v1_3 = (InvClv(1,0,0,0)*tmp0 + InvClv(1,0,0,1)*v1_1 + InvClv(1,0,0,2)*v1_2)*(-tmp3);
255  v1_4 = (InvClv(1,1,0,0)*tmp0 + InvClv(1,1,0,1)*v1_1 + InvClv(1,1,0,2)*v1_2 + InvClv(1,1,1,0)*v1_3)*(-tmp4);
256  v1_5 = (InvClv(1,2,0,0)*tmp0 + InvClv(1,2,0,1)*v1_1 + InvClv(1,2,0,2)*v1_2 + InvClv(1,2,1,0)*v1_3 + InvClv(1,2,1,1)*v1_4)*(-tmp5*tmp5);
257 
258  /* Backwards substitute */
259 
260  sum = v1_4 - InvClv(1,1,1,2)*v1_5;
261  v1_4 = sum*tmp4;
262 
263  sum = v1_3 - InvClv(1,0,1,1)*v1_4 - InvClv(1,0,1,2)*v1_5;
264  v1_3 = sum*tmp3;
265 
266  sum = v1_2 - InvClv(0,2,1,0)*v1_3 - InvClv(0,2,1,1)*v1_4 - InvClv(0,2,1,2)*v1_5;
267  v1_2 = sum*tmp2;
268 
269  sum = v1_1 - InvClv(0,1,0,2)*v1_2 - InvClv(0,1,1,0)*v1_3 - InvClv(0,1,1,1)*v1_4 - InvClv(0,1,1,2)*v1_5;
270  v1_1 = sum*tmp1;
271 
272  sum = v1_0 - InvClv(0,0,0,1)*v1_1 - InvClv(0,0,0,2)*v1_2 - InvClv(0,0,1,0)*v1_3 - InvClv(0,0,1,1)*v1_4 - InvClv(0,0,1,2)*v1_5;
273  v1_0 = sum*tmp0;
274 
275  InvClv(0,0,0,0) = v1_0.real();
276  InvClv(0,1,0,0) = v1_1;
277  InvClv(0,2,0,0) = v1_2;
278  InvClv(1,0,0,0) = v1_3;
279  InvClv(1,1,0,0) = v1_4;
280  InvClv(1,2,0,0) = v1_5;
281 
282  /* k = 1 */
283  v1_1 = complex<Float>(tmp1,0.);
284  v1_2 = InvClv(0,2,0,1)*(-tmp2*tmp1);
285  v1_3 = (InvClv(1,0,0,1)*tmp1 + InvClv(1,0,0,2)*v1_2)*(-tmp3);
286  v1_4 = (InvClv(1,1,0,1)*tmp1 + InvClv(1,1,0,2)*v1_2 + InvClv(1,1,1,0)*v1_3)*(-tmp4);
287  v1_5 = (InvClv(1,2,0,1)*tmp1 + InvClv(1,2,0,2)*v1_2 + InvClv(1,2,1,0)*v1_3 + InvClv(1,2,1,1)*v1_4)*(-tmp5*tmp5);
288 
289  /* Backwards substitute */
290 
291  /* l = 4 */
292  sum = v1_4 - InvClv(1,1,1,2)*v1_5;
293  v1_4 = sum*tmp4;
294 
295  sum = v1_3 - InvClv(1,0,1,1)*v1_4 - InvClv(1,0,1,2)*v1_5;
296  v1_3 = sum*tmp3;
297 
298  sum = v1_2 - InvClv(0,2,1,0)*v1_3 - InvClv(0,2,1,1)*v1_4 - InvClv(0,2,1,2)*v1_5;
299  v1_2 = sum*tmp2;
300 
301  sum = v1_1 - InvClv(0,1,0,2)*v1_2 - InvClv(0,1,1,0)*v1_3 - InvClv(0,1,1,1)*v1_4 - InvClv(0,1,1,2)*v1_5;
302  v1_1 = sum*tmp1;
303 
304  InvClv(0,1,0,1) = v1_1.real();
305  InvClv(0,2,0,1) = v1_2;
306  InvClv(1,0,0,1) = v1_3;
307  InvClv(1,1,0,1) = v1_4;
308  InvClv(1,2,0,1) = v1_5;
309 
310  /* k = 2 */
311  v1_2 = complex<Float>(tmp2,0.);
312  v1_3 = InvClv(1,0,0,2)*(-tmp2*tmp3);
313  v1_4 = (InvClv(1,1,0,2)*tmp2 + InvClv(1,1,1,0)*v1_3)*(-tmp4);
314  v1_5 = (InvClv(1,2,0,2)*tmp2 + InvClv(1,2,1,0)*v1_3 + InvClv(1,2,1,1)*v1_4)*(-tmp5*tmp5);
315 
316  /* Backwards substitute */
317 
318  /* l = 4 */
319  sum = v1_4 - InvClv(1,1,1,2)*v1_5;
320  v1_4 = sum*tmp4;
321 
322  sum = v1_3 - InvClv(1,0,1,1)*v1_4 - InvClv(1,0,1,2)*v1_5;
323  v1_3 = sum*tmp3;
324 
325  sum = v1_2 - InvClv(0,2,1,0)*v1_3 - InvClv(0,2,1,1)*v1_4 - InvClv(0,2,1,2)*v1_5;
326  v1_2 = sum*tmp2;
327 
328  InvClv(0,2,0,2) = v1_2.real();
329  InvClv(1,0,0,2) = v1_3;
330  InvClv(1,1,0,2) = v1_4;
331  InvClv(1,2,0,2) = v1_5;
332 
333  /* k = 3 */
334  v1_3 = complex<Float>(tmp3,0.);
335  v1_4 = InvClv(1,1,1,0)*(-tmp3*tmp4);
336  v1_5 = (InvClv(1,2,1,0)*tmp3 + InvClv(1,2,1,1)*v1_4)*(-tmp5*tmp5);
337 
338  /* Backwards substitute */
339 
340  /* l = 4 */
341  sum = v1_4 - InvClv(1,1,1,2)*v1_5;
342  v1_4 = sum*tmp4;
343 
344  sum = v1_3 - InvClv(1,0,1,1)*v1_4 - InvClv(1,0,1,2)*v1_5;
345  v1_3 = sum*tmp3;
346 
347  InvClv(1,0,1,0) = v1_3.real();
348  InvClv(1,1,1,0) = v1_4;
349  InvClv(1,2,1,0) = v1_5;
350 
351  /* k = 4 */
352  v1_4 = complex<Float>(tmp4,0.);
353  v1_5 = InvClv(1,2,1,1)*(-tmp4*tmp5*tmp5);
354 
355  /* Backwards substitute */
356 
357  /* l = 4 */
358  sum = v1_4 - InvClv(1,1,1,2)*v1_5;
359  v1_4 = sum*tmp4;
360 
361  InvClv(1,1,1,1) = v1_4.real();
362  InvClv(1,2,1,1) = v1_5;
363 
364  /* k = 5 */
365  InvClv(1,2,1,2) = tmp5*tmp5;
366 
367  /* Calculate the product for the first chiral block */
368 
369  //Then we calculate AV = Cinv UV, so [AV = (C^2 + mu^2)^{-1} (Clover -/+ i mu)·Vector]
370  //for in twisted-clover fermions, Cinv keeps (C^2 + mu^2)^{-1}
371 
372  for(int ic_c = 0; ic_c < coarseColor; ic_c++) { //Coarse Color
373  /* Unrolled loop, supports limited values for fineSpin, fineColor and coarseColor */
374  arg.AV(parity,x_cb,ch,0,ic_c) += InvClv(0, 0, 0, 0) * arg.UV(parity, x_cb, ch, 0, ic_c)
375  + InvClv(0, 0, 0, 1) * arg.UV(parity, x_cb, ch, 1, ic_c)
376  + InvClv(0, 0, 0, 2) * arg.UV(parity, x_cb, ch, 2, ic_c)
377  + InvClv(0, 0, 1, 0) * arg.UV(parity, x_cb, ch+1, 0, ic_c)
378  + InvClv(0, 0, 1, 1) * arg.UV(parity, x_cb, ch+1, 1, ic_c)
379  + InvClv(0, 0, 1, 2) * arg.UV(parity, x_cb, ch+1, 2, ic_c);
380 
381  arg.AV(parity,x_cb,ch,1,ic_c) += InvClv(0, 1, 0, 0) * arg.UV(parity, x_cb, ch, 0, ic_c)
382  + InvClv(0, 1, 0, 1) * arg.UV(parity, x_cb, ch, 1, ic_c)
383  + InvClv(0, 1, 0, 2) * arg.UV(parity, x_cb, ch, 2, ic_c)
384  + InvClv(0, 1, 1, 0) * arg.UV(parity, x_cb, ch+1, 0, ic_c)
385  + InvClv(0, 1, 1, 1) * arg.UV(parity, x_cb, ch+1, 1, ic_c)
386  + InvClv(0, 1, 1, 2) * arg.UV(parity, x_cb, ch+1, 2, ic_c);
387 
388  arg.AV(parity,x_cb,ch,2,ic_c) += InvClv(0, 2, 0, 0) * arg.UV(parity, x_cb, ch, 0, ic_c)
389  + InvClv(0, 2, 0, 1) * arg.UV(parity, x_cb, ch, 1, ic_c)
390  + InvClv(0, 2, 0, 2) * arg.UV(parity, x_cb, ch, 2, ic_c)
391  + InvClv(0, 2, 1, 0) * arg.UV(parity, x_cb, ch+1, 0, ic_c)
392  + InvClv(0, 2, 1, 1) * arg.UV(parity, x_cb, ch+1, 1, ic_c)
393  + InvClv(0, 2, 1, 2) * arg.UV(parity, x_cb, ch+1, 2, ic_c);
394 
395  arg.AV(parity,x_cb,ch+1,0,ic_c)+= InvClv(1, 0, 0, 0) * arg.UV(parity, x_cb, ch, 0, ic_c)
396  + InvClv(1, 0, 0, 1) * arg.UV(parity, x_cb, ch, 1, ic_c)
397  + InvClv(1, 0, 0, 2) * arg.UV(parity, x_cb, ch, 2, ic_c)
398  + InvClv(1, 0, 1, 0) * arg.UV(parity, x_cb, ch+1, 0, ic_c)
399  + InvClv(1, 0, 1, 1) * arg.UV(parity, x_cb, ch+1, 1, ic_c)
400  + InvClv(1, 0, 1, 2) * arg.UV(parity, x_cb, ch+1, 2, ic_c);
401 
402  arg.AV(parity,x_cb,ch+1,1,ic_c)+= InvClv(1, 1, 0, 0) * arg.UV(parity, x_cb, ch, 0, ic_c)
403  + InvClv(1, 1, 0, 1) * arg.UV(parity, x_cb, ch, 1, ic_c)
404  + InvClv(1, 1, 0, 2) * arg.UV(parity, x_cb, ch, 2, ic_c)
405  + InvClv(1, 1, 1, 0) * arg.UV(parity, x_cb, ch+1, 0, ic_c)
406  + InvClv(1, 1, 1, 1) * arg.UV(parity, x_cb, ch+1, 1, ic_c)
407  + InvClv(1, 1, 1, 2) * arg.UV(parity, x_cb, ch+1, 2, ic_c);
408 
409  arg.AV(parity,x_cb,ch+1,2,ic_c)+= InvClv(1, 2, 0, 0) * arg.UV(parity, x_cb, ch, 0, ic_c)
410  + InvClv(1, 2, 0, 1) * arg.UV(parity, x_cb, ch, 1, ic_c)
411  + InvClv(1, 2, 0, 2) * arg.UV(parity, x_cb, ch, 2, ic_c)
412  + InvClv(1, 2, 1, 0) * arg.UV(parity, x_cb, ch+1, 0, ic_c)
413  + InvClv(1, 2, 1, 1) * arg.UV(parity, x_cb, ch+1, 1, ic_c)
414  + InvClv(1, 2, 1, 2) * arg.UV(parity, x_cb, ch+1, 2, ic_c);
415  } // Coarse color
416  } // Chirality
417 }
418 
419 
420 /* Cleanup */
421 
422 #undef tri14
423 #undef tri13
424 #undef tri9
425 #undef tri5
426 #undef tmp0
427 #undef tmp1
428 #undef tmp2
429 #undef tmp3
430 #undef tmp4
431 #undef tmp5
432 #undef v1_0
433 #undef v1_1
434 #undef v1_2
435 #undef v1_3
436 #undef v1_4
437 #undef v1_5
438 #undef sum
439 
440 #undef c00_01
441 #undef c00_02
442 #undef c01_02
443 #undef c00_10
444 #undef c01_10
445 #undef c02_10
446 #undef c00_11
447 #undef c01_11
448 #undef c02_11
449 #undef c10_11
450 #undef c00_12
451 #undef c01_12
452 #undef c02_12
453 #undef c10_12
454 #undef c11_12
455 
456 #undef Clv
457 #undef InvClv
458 
459 #endif
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
#define tmp5
Definition: tmc_core.h:19
#define tmp4
Definition: tmc_core.h:18
#define tmp2
Definition: tmc_core.h:16
__host__ __device__ void sum(double &a, double &b)
double sqrt(double)
#define tmp1
Definition: tmc_core.h:15
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
#define tmp0
Definition: tmc_core.h:14
#define tmp3
Definition: tmc_core.h:17