9 template <
typename Float,
typename coarseGauge,
typename fineGauge,
typename fineSpinor,
10 typename fineSpinorTmp,
typename fineClover>
40 CalculateYArg(coarseGauge &
Y, coarseGauge &
X, coarseGauge &
Xinv, fineSpinorTmp &
UV, fineSpinor &
AV,
const fineGauge &
U,
const fineSpinor &
V,
41 const fineClover &
C,
const fineClover &
Cinv,
double kappa,
double mu,
double mu_factor,
const int *x_size_,
const int *xc_size_,
int *geo_bs_,
int spin_bs_)
42 :
Y(
Y),
X(
X),
Xinv(
Xinv),
UV(
UV),
AV(
AV),
U(
U),
V(
V),
C(
C),
Cinv(
Cinv),
spin_bs(spin_bs_),
kappa(static_cast<Float>(
kappa)),
mu(static_cast<Float>(
mu)),
46 errorQuda(
"Gamma basis %d not supported",
V.GammaBasis());
61 template<
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
71 constexpr
int uvSpin = fineSpin * (from_coarse ? 2 : 1);
73 complex<Float> UV[uvSpin][fineColor];
75 for(
int s = 0;
s < uvSpin;
s++) {
76 for(
int c = 0;
c < fineColor;
c++) {
77 UV[
s][
c] =
static_cast<Float
>(0.0);
83 int ghost_idx = ghostFaceIndex<1>(
coord,
arg.x_size,
dim, nFace);
85 for(
int s = 0;
s < fineSpin;
s++) {
86 for(
int ic = 0; ic < fineColor; ic++) {
87 for(
int jc = 0; jc < fineColor; jc++) {
89 UV[
s][ic] +=
arg.U(
dim,
parity, x_cb, ic, jc) * W.Ghost(
dim, 1, (
parity+1)&1, ghost_idx,
s, jc, ic_c);
91 for (
int s_col=0; s_col<fineSpin; s_col++) {
94 W.Ghost(
dim, 1, (
parity+1)&1, ghost_idx, s_col, jc, ic_c);
103 for(
int s = 0;
s < fineSpin;
s++) {
104 for(
int ic = 0; ic < fineColor; ic++) {
105 for(
int jc = 0; jc < fineColor; jc++) {
109 for (
int s_col=0; s_col<fineSpin; s_col++) {
111 UV[s_col*fineSpin+
s][ic] +=
113 W((
parity+1)&1, y_cb, s_col, jc, ic_c);
122 for(
int s = 0;
s < uvSpin;
s++) {
123 for(
int c = 0;
c < fineColor;
c++) {
131 template<
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
134 for (
int x_cb=0; x_cb<
arg.fineVolumeCB; x_cb++) {
135 for (
int ic_c=0; ic_c < coarseColor; ic_c++)
136 computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(
arg,
parity, x_cb, ic_c);
141 template<
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
143 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
144 if (x_cb >=
arg.fineVolumeCB)
return;
147 int ic_c =
blockDim.z*blockIdx.z + threadIdx.z;
148 if (ic_c >= coarseColor)
return;
149 computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(
arg,
parity, x_cb, ic_c);
156 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
159 for(
int s = 0;
s < fineSpin;
s++) {
160 for(
int c = 0;
c < fineColor;
c++) {
161 arg.AV(
parity,x_cb,
s,
c,ic_c) =
static_cast<Float
>(0.0);
165 for(
int s = 0;
s < fineSpin;
s++) {
166 const int s_c =
s/
arg.spin_bs;
170 for(
int s_col = s_c*
arg.spin_bs; s_col < (s_c+1)*
arg.spin_bs; s_col++) {
172 for(
int ic = 0; ic < fineColor; ic++) {
173 for(
int jc = 0; jc < fineColor; jc++) {
175 arg.Cinv(0,
parity, x_cb,
s, s_col, ic, jc) *
arg.V(
parity, x_cb, s_col, jc, ic_c);
183 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
186 for (
int x_cb=0; x_cb<
arg.fineVolumeCB; x_cb++) {
187 for (
int ic_c=0; ic_c < coarseColor; ic_c++)
188 computeAV<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb, ic_c);
193 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
195 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
196 if (x_cb >=
arg.fineVolumeCB)
return;
199 int ic_c =
blockDim.z*blockIdx.z + threadIdx.z;
200 if (ic_c >= coarseColor)
return;
201 computeAV<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb, ic_c);
208 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
214 for(
int s = 0;
s < fineSpin/2;
s++) {
215 for(
int c = 0;
c < fineColor;
c++) {
220 for(
int s = fineSpin/2;
s < fineSpin;
s++) {
221 for(
int c = 0;
c < fineColor;
c++) {
228 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
231 for (
int x_cb=0; x_cb<
arg.fineVolumeCB; x_cb++) {
232 for (
int v=0; v<coarseColor; v++)
233 computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb, v);
238 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
240 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
241 if (x_cb >=
arg.fineVolumeCB)
return;
244 int v =
blockDim.z*blockIdx.z + threadIdx.z;
245 if (v >= coarseColor)
return;
247 computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb, v);
250 #ifdef DYNAMIC_CLOVER 255 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
256 __device__ __host__
inline void applyInvClover(Arg &
arg,
int parity,
int x_cb) {
261 for (
int ch = 0; ch < 2; ch++) {
262 Float diag[6],
tmp[6];
263 complex<Float> tri[15];
267 #define Cl(s1,c1,s2,c2) (arg.C(0, parity, x_cb, s1+2*ch, s2+2*ch, c1, c2)) 269 tri[0] = Cl(0,1,0,0)*Cl(0,0,0,0).real() + Cl(0,1,0,1)*Cl(0,1,0,0) + Cl(0,1,0,2)*Cl(0,2,0,0) + Cl(0,1,1,0)*Cl(1,0,0,0) + Cl(0,1,1,1)*Cl(1,1,0,0) + Cl(0,1,1,2)*Cl(1,2,0,0);
270 tri[1] = Cl(0,2,0,0)*Cl(0,0,0,0).real() + Cl(0,2,0,2)*Cl(0,2,0,0) + Cl(0,2,0,1)*Cl(0,1,0,0) + Cl(0,2,1,0)*Cl(1,0,0,0) + Cl(0,2,1,1)*Cl(1,1,0,0) + Cl(0,2,1,2)*Cl(1,2,0,0);
271 tri[3] = Cl(1,0,0,0)*Cl(0,0,0,0).real() + Cl(1,0,1,0)*Cl(1,0,0,0) + Cl(1,0,0,1)*Cl(0,1,0,0) + Cl(1,0,0,2)*Cl(0,2,0,0) + Cl(1,0,1,1)*Cl(1,1,0,0) + Cl(1,0,1,2)*Cl(1,2,0,0);
272 tri[6] = Cl(1,1,0,0)*Cl(0,0,0,0).real() + Cl(1,1,1,1)*Cl(1,1,0,0) + Cl(1,1,0,1)*Cl(0,1,0,0) + Cl(1,1,0,2)*Cl(0,2,0,0) + Cl(1,1,1,0)*Cl(1,0,0,0) + Cl(1,1,1,2)*Cl(1,2,0,0);
273 tri[10] = Cl(1,2,0,0)*Cl(0,0,0,0).real() + Cl(1,2,1,2)*Cl(1,2,0,0) + Cl(1,2,0,1)*Cl(0,1,0,0) + Cl(1,2,0,2)*Cl(0,2,0,0) + Cl(1,2,1,0)*Cl(1,0,0,0) + Cl(1,2,1,1)*Cl(1,1,0,0);
275 tri[2] = Cl(0,2,0,1)*Cl(0,1,0,1).real() + Cl(0,2,0,2)*Cl(0,2,0,1) + Cl(0,2,0,0)*Cl(0,0,0,1) + Cl(0,2,1,0)*Cl(1,0,0,1) + Cl(0,2,1,1)*Cl(1,1,0,1) + Cl(0,2,1,2)*Cl(1,2,0,1);
276 tri[4] = Cl(1,0,0,1)*Cl(0,1,0,1).real() + Cl(1,0,1,0)*Cl(1,0,0,1) + Cl(1,0,0,0)*Cl(0,0,0,1) + Cl(1,0,0,2)*Cl(0,2,0,1) + Cl(1,0,1,1)*Cl(1,1,0,1) + Cl(1,0,1,2)*Cl(1,2,0,1);
277 tri[7] = Cl(1,1,0,1)*Cl(0,1,0,1).real() + Cl(1,1,1,1)*Cl(1,1,0,1) + Cl(1,1,0,0)*Cl(0,0,0,1) + Cl(1,1,0,2)*Cl(0,2,0,1) + Cl(1,1,1,0)*Cl(1,0,0,1) + Cl(1,1,1,2)*Cl(1,2,0,1);
278 tri[11] = Cl(1,2,0,1)*Cl(0,1,0,1).real() + Cl(1,2,1,2)*Cl(1,2,0,1) + Cl(1,2,0,0)*Cl(0,0,0,1) + Cl(1,2,0,2)*Cl(0,2,0,1) + Cl(1,2,1,0)*Cl(1,0,0,1) + Cl(1,2,1,1)*Cl(1,1,0,1);
280 tri[5] = Cl(1,0,0,2)*Cl(0,2,0,2).real() + Cl(1,0,1,0)*Cl(1,0,0,2) + Cl(1,0,0,0)*Cl(0,0,0,2) + Cl(1,0,0,1)*Cl(0,1,0,2) + Cl(1,0,1,1)*Cl(1,1,0,2) + Cl(1,0,1,2)*Cl(1,2,0,2);
281 tri[8] = Cl(1,1,0,2)*Cl(0,2,0,2).real() + Cl(1,1,1,1)*Cl(1,1,0,2) + Cl(1,1,0,0)*Cl(0,0,0,2) + Cl(1,1,0,1)*Cl(0,1,0,2) + Cl(1,1,1,0)*Cl(1,0,0,2) + Cl(1,1,1,2)*Cl(1,2,0,2);
282 tri[12] = Cl(1,2,0,2)*Cl(0,2,0,2).real() + Cl(1,2,1,2)*Cl(1,2,0,2) + Cl(1,2,0,0)*Cl(0,0,0,2) + Cl(1,2,0,1)*Cl(0,1,0,2) + Cl(1,2,1,0)*Cl(1,0,0,2) + Cl(1,2,1,1)*Cl(1,1,0,2);
284 tri[9] = Cl(1,1,1,0)*Cl(1,0,1,0).real() + Cl(1,1,1,1)*Cl(1,1,1,0) + Cl(1,1,0,0)*Cl(0,0,1,0) + Cl(1,1,0,1)*Cl(0,1,1,0) + Cl(1,1,0,2)*Cl(0,2,1,0) + Cl(1,1,1,2)*Cl(1,2,1,0);
285 tri[13] = Cl(1,2,1,0)*Cl(1,0,1,0).real() + Cl(1,2,1,2)*Cl(1,2,1,0) + Cl(1,2,0,0)*Cl(0,0,1,0) + Cl(1,2,0,1)*Cl(0,1,1,0) + Cl(1,2,0,2)*Cl(0,2,1,0) + Cl(1,2,1,1)*Cl(1,1,1,0);
286 tri[14] = Cl(1,2,1,1)*Cl(1,1,1,1).real() + Cl(1,2,1,2)*Cl(1,2,1,1) + Cl(1,2,0,0)*Cl(0,0,1,1) + Cl(1,2,0,1)*Cl(0,1,1,1) + Cl(1,2,0,2)*Cl(0,2,1,1) + Cl(1,2,1,0)*Cl(1,0,1,1);
288 diag[0] =
arg.mu*
arg.mu + Cl(0,0,0,0).real()*Cl(0,0,0,0).real() +
norm(Cl(0,1,0,0)) +
norm(Cl(0,2,0,0)) +
norm(Cl(1,0,0,0)) +
norm(Cl(1,1,0,0)) +
norm(Cl(1,2,0,0));
289 diag[1] =
arg.mu*
arg.mu + Cl(0,1,0,1).real()*Cl(0,1,0,1).real() +
norm(Cl(0,0,0,1)) +
norm(Cl(0,2,0,1)) +
norm(Cl(1,0,0,1)) +
norm(Cl(1,1,0,1)) +
norm(Cl(1,2,0,1));
290 diag[2] =
arg.mu*
arg.mu + Cl(0,2,0,2).real()*Cl(0,2,0,2).real() +
norm(Cl(0,0,0,2)) +
norm(Cl(0,1,0,2)) +
norm(Cl(1,0,0,2)) +
norm(Cl(1,1,0,2)) +
norm(Cl(1,2,0,2));
291 diag[3] =
arg.mu*
arg.mu + Cl(1,0,1,0).real()*Cl(1,0,1,0).real() +
norm(Cl(0,0,1,0)) +
norm(Cl(0,1,1,0)) +
norm(Cl(0,2,1,0)) +
norm(Cl(1,1,1,0)) +
norm(Cl(1,2,1,0));
292 diag[4] =
arg.mu*
arg.mu + Cl(1,1,1,1).real()*Cl(1,1,1,1).real() +
norm(Cl(0,0,1,1)) +
norm(Cl(0,1,1,1)) +
norm(Cl(0,2,1,1)) +
norm(Cl(1,0,1,1)) +
norm(Cl(1,2,1,1));
293 diag[5] =
arg.mu*
arg.mu + Cl(1,2,1,2).real()*Cl(1,2,1,2).real() +
norm(Cl(0,0,1,2)) +
norm(Cl(0,1,1,2)) +
norm(Cl(0,2,1,2)) +
norm(Cl(1,0,1,2)) +
norm(Cl(1,1,1,2));
299 for (
int j=0; j<6; j++) {
300 diag[j] =
sqrt(diag[j]);
303 for (
int k=j+1; k<6; k++) {
304 int kj = k*(k-1)/2+j;
308 for(
int k=j+1;k<6;k++){
310 diag[k] -= (tri[kj] *
conj(tri[kj])).real();
311 for(
int l=k+1;l<6;l++){
314 tri[lk] -= tri[lj] *
conj(tri[kj]);
320 complex<Float> v1[6];
321 for (
int k=0;k<6;k++) {
322 for(
int l=0;l<k;l++) v1[l] = complex<Float>(0.0, 0.0);
325 v1[k] = complex<Float>(
tmp[k], 0.0);
326 for(
int l=k+1;l<6;l++){
327 complex<Float>
sum = complex<Float>(0.0, 0.0);
328 for(
int j=k;j<l;j++){
330 sum -= tri[lj] * v1[j];
336 v1[5] = v1[5] *
tmp[5];
337 for(
int l=4;l>=k;l--){
338 complex<Float>
sum = v1[l];
339 for(
int j=l+1;j<6;j++){
347 diag[k] = v1[k].real();
348 for(
int l=k+1;l<6;l++){
359 for(
int ic_c = 0; ic_c < coarseColor; ic_c++) {
360 for (
int j=0; j<(fineSpin/2)*fineColor; j++) {
361 int s = j / fineColor, ic = j % fineColor;
365 for (
int k=0; k<j; k++) {
366 const int jk = j*(j-1)/2 + k;
367 const int s_col = k / fineColor, jc = k % fineColor;
369 arg.AV(
parity, x_cb,
s+2*ch, ic, ic_c) += tri[jk] *
arg.UV(
parity, x_cb, s_col+2*ch, jc, ic_c);
372 for (
int k=j+1; k<(fineSpin/2)*fineColor; k++) {
373 int kj = k*(k-1)/2 + j;
374 int s_col = k / fineColor, jc = k % fineColor;
383 #endif // UGLY_DYNCLOV 384 #endif // DYNAMIC_CLOVER 390 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
393 complex<Float>
mu(0.,
arg.mu);
395 for(
int s = 0;
s < fineSpin;
s++) {
396 for(
int c = 0;
c < fineColor;
c++) {
397 for(
int v = 0; v < coarseColor; v++) {
398 arg.UV(
parity,x_cb,
s,
c,v) =
static_cast<Float
>(0.0);
399 arg.AV(
parity,x_cb,
s,
c,v) =
static_cast<Float
>(0.0);
405 for(
int s = 0;
s < fineSpin;
s++) {
406 const int s_c =
s/
arg.spin_bs;
410 for(
int s_col = s_c*
arg.spin_bs; s_col < (s_c+1)*
arg.spin_bs; s_col++) {
412 for(
int ic_c = 0; ic_c < coarseColor; ic_c++) {
413 for(
int ic = 0; ic < fineColor; ic++) {
414 for(
int jc = 0; jc < fineColor; jc++) {
416 arg.C(0,
parity, x_cb,
s, s_col, ic, jc) *
arg.V(
parity, x_cb, s_col, jc, ic_c);
423 for(
int s = 0;
s < fineSpin/2;
s++) {
424 for(
int ic_c = 0; ic_c < coarseColor; ic_c++) {
425 for(
int ic = 0; ic < fineColor; ic++) {
431 for(
int s = fineSpin/2;
s < fineSpin;
s++) {
432 for(
int ic_c = 0; ic_c < coarseColor; ic_c++) {
433 for(
int ic = 0; ic < fineColor; ic++) {
439 #ifndef DYNAMIC_CLOVER 442 for(
int s = 0;
s < fineSpin;
s++) {
443 const int s_c =
s/
arg.spin_bs;
447 for(
int s_col = s_c*
arg.spin_bs; s_col < (s_c+1)*
arg.spin_bs; s_col++) {
449 for(
int ic_c = 0; ic_c < coarseColor; ic_c++) {
450 for(
int ic = 0; ic < fineColor; ic++) {
451 for(
int jc = 0; jc < fineColor; jc++) {
453 arg.Cinv(0,
parity, x_cb,
s, s_col, ic, jc) *
arg.UV(
parity, x_cb, s_col, jc, ic_c);
460 applyInvClover<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb);
464 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
467 for (
int x_cb=0; x_cb<
arg.fineVolumeCB; x_cb++) {
468 computeTMCAV<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb);
473 template<
typename Float,
int fineSpin,
int fineColor,
int coarseColor,
typename Arg>
475 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
476 if (x_cb >=
arg.fineVolumeCB)
return;
479 computeTMCAV<Float,fineSpin,fineColor,coarseColor,Arg>(
arg,
parity, x_cb);
493 template <
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
498 for (
int i=0;
i<coarseSpin*coarseSpin*coarseColor;
i++) vuv[
i] = 0.0;
502 for(
int s = 0;
s < fineSpin;
s++) {
508 int s_c_row =
s/
arg.spin_bs;
516 complex<Float> coupling =
gamma.getrowelem(
s, s_col);
517 int s_c_col = s_col/
arg.spin_bs;
520 for(
int jc_c = 0; jc_c < coarseColor; jc_c++) {
521 for(
int ic = 0; ic < fineColor; ic++) {
526 vuv[(s_c_row*coarseSpin+s_c_row)*coarseColor+jc_c] +=
531 vuv[(s_c_row*coarseSpin+s_c_col)*coarseColor+jc_c] +=
536 vuv[(s_c_row*coarseSpin+s_c_row)*coarseColor+jc_c] +=
541 vuv[(s_c_row*coarseSpin+s_c_col)*coarseColor+jc_c] -=
551 for (
int s_col=0; s_col<fineSpin; s_col++) {
552 for (
int s = 0;
s < fineSpin;
s++) {
554 for(
int jc_c = 0; jc_c < coarseColor; jc_c++) {
555 for(
int ic = 0; ic < fineColor; ic++) {
557 vuv[(
s*coarseSpin+s_col)*coarseColor+jc_c] +=
569 template<
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
576 for(
int d = 0;
d<nDim;
d++) coarse_size *=
arg.xc_size[
d];
579 for(
int d = 0;
d < nDim;
d++) coord_coarse[
d] =
coord[
d]/
arg.geo_bs[
d];
589 int coarse_parity = 0;
590 for (
int d=0;
d<nDim;
d++) coarse_parity += coord_coarse[
d];
592 coord_coarse[0] /= 2;
593 int coarse_x_cb = ((coord_coarse[3]*
arg.xc_size[2]+coord_coarse[2])*
arg.xc_size[1]+coord_coarse[1])*(
arg.xc_size[0]/2) + coord_coarse[0];
597 complex<Float> vuv[coarseSpin*coarseSpin*coarseColor];
598 multiplyVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(vuv,
arg,
parity, x_cb, c_row);
600 for (
int s_row = 0; s_row < coarseSpin; s_row++) {
601 for (
int s_col = 0; s_col < coarseSpin; s_col++) {
603 for(
int c_col = 0; c_col < coarseColor; c_col++) {
604 M.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,
605 vuv[(s_row*coarseSpin+s_col)*coarseColor+c_col]);
613 template<
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
616 for (
int x_cb=0; x_cb<
arg.fineVolumeCB; x_cb++) {
617 for (
int c_row=0; c_row<coarseColor; c_row++)
618 computeVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(
arg,
parity, x_cb, c_row);
623 template<
bool from_coarse,
typename Float,
int dim, QudaDirection dir,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
625 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
626 if (x_cb >=
arg.fineVolumeCB)
return;
629 int c_row =
blockDim.z*blockIdx.z + threadIdx.z;
630 if (c_row >= coarseColor)
return;
631 computeVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(
arg,
parity, x_cb, c_row);
638 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
642 for (
int d=0;
d<4;
d++) {
643 for(
int s_row = 0; s_row < nSpin; s_row++) {
644 for(
int s_col = 0; s_col < nSpin; s_col++) {
646 const Float
sign = (s_row == s_col) ? static_cast<Float>(1.0) :
static_cast<Float
>(-1.0);
648 for(
int ic_c = 0; ic_c <
nColor; ic_c++) {
649 for(
int jc_c = 0; jc_c <
nColor; jc_c++) {
650 Y(
d+4,
parity,x_cb,s_row,s_col,ic_c,jc_c) =
sign*Y(
d,
parity,x_cb,s_row,s_col,ic_c,jc_c);
660 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
663 for (
int x_cb=0; x_cb<
arg.coarseVolumeCB; x_cb++) {
664 computeYreverse<Float,nSpin,nColor,Arg>(
arg,
parity, x_cb);
669 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
671 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
672 if (x_cb >=
arg.coarseVolumeCB)
return;
675 computeYreverse<Float,nSpin,nColor,Arg>(
arg,
parity, x_cb);
685 template<
bool b
idirectional,
typename Float,
int nSpin,
int nColor,
typename Arg>
690 for(
int s_row = 0; s_row < nSpin; s_row++) {
691 for(
int s_col = 0; s_col < nSpin; s_col++) {
694 for(
int ic_c = 0; ic_c <
nColor; ic_c++) {
695 for(
int jc_c = 0; jc_c <
nColor; jc_c++) {
697 Xlocal[((nSpin*s_col+s_row)*
nColor+ic_c)*
nColor+jc_c] =
arg.X(0,
parity,x_cb,s_row, s_col, ic_c, jc_c);
703 for(
int s_row = 0; s_row < nSpin; s_row++) {
704 for(
int s_col = 0; s_col < nSpin; s_col++) {
706 const Float
sign = (s_row == s_col) ? static_cast<Float>(1.0) :
static_cast<Float
>(-1.0);
708 for(
int ic_c = 0; ic_c <
nColor; ic_c++) {
709 for(
int jc_c = 0; jc_c <
nColor; jc_c++) {
712 arg.X(0,
parity,x_cb,s_row,s_col,ic_c,jc_c) =
713 -
arg.kappa*(
arg.Xinv(0,
parity,x_cb,s_row,s_col,ic_c,jc_c)
717 arg.X(0,
parity,x_cb,s_row,s_col,ic_c,jc_c) =
728 template<
bool b
idirectional,
typename Float,
int nSpin,
int nColor,
typename Arg>
731 for (
int x_cb=0; x_cb<
arg.coarseVolumeCB; x_cb++) {
732 computeCoarseLocal<bidirectional,Float,nSpin,nColor,Arg>(
arg,
parity, x_cb);
737 template<
bool b
idirectional,
typename Float,
int nSpin,
int nColor,
typename Arg>
739 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
740 if (x_cb >=
arg.coarseVolumeCB)
return;
743 computeCoarseLocal<bidirectional,Float,nSpin,nColor,Arg>(
arg,
parity, x_cb);
747 template<
bool from_coarse,
typename Float,
int fineSpin,
int coarseSpin,
int fineColor,
int coarseColor,
typename Arg>
755 for(
int d = 0;
d<nDim;
d++) coarse_size *=
arg.xc_size[
d];
758 for (
int d=0;
d<nDim;
d++) coord_coarse[
d] =
coord[
d]/
arg.geo_bs[
d];
760 int coarse_parity = 0;
761 for (
int d=0;
d<nDim;
d++) coarse_parity += coord_coarse[
d];
763 coord_coarse[0] /= 2;
764 int coarse_x_cb = ((coord_coarse[3]*
arg.xc_size[2]+coord_coarse[2])*
arg.xc_size[1]+coord_coarse[1])*(
arg.xc_size[0]/2) + coord_coarse[0];
768 complex<Float>
X[coarseSpin*coarseSpin*coarseColor];
769 for (
int i=0;
i<coarseSpin*coarseSpin*coarseColor;
i++)
X[
i] = 0.0;
773 for(
int s = 0;
s < fineSpin;
s++) {
774 int s_c =
s/
arg.spin_bs;
777 for(
int s_col = s_c*
arg.spin_bs; s_col < (s_c+1)*
arg.spin_bs; s_col++) {
779 for(
int jc_c = 0; jc_c < coarseColor; jc_c++) {
780 for(
int ic = 0; ic < fineColor; ic++) {
781 for(
int jc = 0; jc < fineColor; jc++) {
782 X[ (s_c*coarseSpin + s_c)*coarseColor + jc_c] +=
783 conj(
arg.V(
parity, x_cb,
s, ic, ic_c)) *
arg.C(0,
parity, x_cb,
s, s_col, ic, jc) *
arg.V(
parity, x_cb, s_col, jc, jc_c);
793 for(
int s = 0;
s < fineSpin;
s++) {
794 for(
int s_col = 0; s_col < fineSpin; s_col++) {
796 for(
int jc_c = 0; jc_c <coarseColor; jc_c++) {
797 for(
int ic = 0; ic < fineColor; ic++) {
798 for(
int jc = 0; jc < fineColor; jc++) {
799 X[ (
s*coarseSpin + s_col)*coarseColor + jc_c] +=
800 conj(
arg.V(
parity, x_cb,
s, ic, ic_c)) *
arg.C(0,
parity, x_cb,
s, s_col, ic, jc) *
arg.V(
parity, x_cb, s_col, jc, jc_c);
809 for (
int si = 0; si < coarseSpin; si++) {
810 for (
int sj = 0; sj < coarseSpin; sj++) {
812 for (
int jc = 0; jc < coarseColor; jc++) {
813 arg.X.atomicAdd(0,coarse_parity,coarse_x_cb,si,sj,ic_c,jc,
X[(si*coarseSpin+sj)*coarseColor+jc]);
821 template <
bool from_coarse,
typename Float,
int fineSpin,
int coarseSpin,
int fineColor,
int coarseColor,
typename Arg>
824 for (
int x_cb=0; x_cb<
arg.fineVolumeCB; x_cb++) {
825 for (
int ic_c=0; ic_c<coarseColor; ic_c++) {
826 computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(
arg,
parity, x_cb, ic_c);
832 template <
bool from_coarse,
typename Float,
int fineSpin,
int coarseSpin,
int fineColor,
int coarseColor,
typename Arg>
834 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
835 if (x_cb >=
arg.fineVolumeCB)
return;
837 int ic_c =
blockDim.z*blockIdx.z + threadIdx.z;
838 if (ic_c >= coarseColor)
return;
839 computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(
arg,
parity, x_cb, ic_c);
845 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
848 for (
int x_cb=0; x_cb<
arg.coarseVolumeCB; x_cb++) {
849 for(
int s = 0;
s < nSpin;
s++) {
860 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
862 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
863 if (x_cb >=
arg.coarseVolumeCB)
return;
866 for(
int s = 0;
s < nSpin;
s++) {
874 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
877 const complex<Float>
mu(0.,
arg.mu*
arg.mu_factor);
880 for (
int x_cb=0; x_cb<
arg.coarseVolumeCB; x_cb++) {
881 for(
int s = 0;
s < nSpin/2;
s++) {
886 for(
int s = nSpin/2;
s < nSpin;
s++) {
896 template<
typename Float,
int nSpin,
int nColor,
typename Arg>
898 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
899 if (x_cb >=
arg.coarseVolumeCB)
return;
902 const complex<Float>
mu(0.,
arg.mu*
arg.mu_factor);
904 for(
int s = 0;
s < nSpin/2;
s++) {
905 for(
int ic_c = 0; ic_c <
nColor; ic_c++) {
909 for(
int s = nSpin/2;
s < nSpin;
s++) {
910 for(
int ic_c = 0; ic_c <
nColor; ic_c++) {
930 template <
bool from_coarse,
typename Float,
int fineSpin,
931 int fineColor,
int coarseSpin,
int coarseColor,
typename Arg>
948 long long flops_ = 0;
952 flops_ = 2l *
arg.fineVolumeCB * 8 * fineSpin * coarseColor * fineColor * fineColor * (!from_coarse ? 1 : fineSpin);
957 flops_ = 2l *
arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
961 flops_ = 4l *
arg.fineVolumeCB * 8 * (fineSpin/2) * (fineSpin/2) * (fineSpin/2) * fineColor * fineColor * coarseColor;
965 flops_ = 2l *
arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor / (!from_coarse ? coarseSpin : 1);
969 flops_ = 2l *
arg.fineVolumeCB * 8 * fineSpin * fineSpin * coarseColor * coarseColor * fineColor * fineColor / (!from_coarse ? coarseSpin : 1);
977 flops_ = 2l *
arg.coarseVolumeCB*coarseSpin*coarseSpin*coarseColor*coarseColor*2;
982 flops_ = 2l *
arg.coarseVolumeCB*coarseSpin*coarseColor;
992 long long bytes_ = 0;
995 bytes_ =
arg.UV.Bytes() +
arg.V.Bytes() + 2*
arg.U.Bytes()*coarseColor;
998 bytes_ =
arg.AV.Bytes() +
arg.V.Bytes() + 2*
arg.C.Bytes();
1001 bytes_ =
arg.AV.Bytes() +
arg.V.Bytes();
1004 bytes_ =
arg.AV.Bytes() +
arg.V.Bytes() +
arg.UV.Bytes() + 4*
arg.C.Bytes();
1007 bytes_ =
arg.UV.Bytes() +
arg.V.Bytes();
1010 bytes_ = 2*
arg.X.Bytes() + 2*
arg.C.Bytes() +
arg.V.Bytes();
1013 bytes_ = 4*2*2*
arg.Y.Bytes();
1017 bytes_ = 2*2*
arg.X.Bytes();
1026 unsigned int threads = 0;
1034 threads =
arg.fineVolumeCB;
1040 threads =
arg.coarseVolumeCB;
1069 if (
dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1070 else if (
dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1071 else if (
dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1072 else if (
dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1074 if (
dim==0) ComputeUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1075 else if (
dim==1) ComputeUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1076 else if (
dim==2) ComputeUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1077 else if (
dim==3) ComputeUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1084 if (from_coarse)
errorQuda(
"ComputeAV should only be called from the fine grid");
1085 ComputeAVCPU<Float,fineSpin,fineColor,coarseColor>(
arg);
1089 if (from_coarse)
errorQuda(
"ComputeTMAV should only be called from the fine grid");
1090 ComputeTMAVCPU<Float,fineSpin,fineColor,coarseColor>(
arg);
1094 if (from_coarse)
errorQuda(
"ComputeTMCAV should only be called from the fine grid");
1095 ComputeTMCAVCPU<Float,fineSpin,fineColor,coarseColor>(
arg);
1100 if (
dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1101 else if (
dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1102 else if (
dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1103 else if (
dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1105 if (
dim==0) ComputeVUVCPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1106 else if (
dim==1) ComputeVUVCPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1107 else if (
dim==2) ComputeVUVCPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1108 else if (
dim==3) ComputeVUVCPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor>(
arg);
1115 ComputeCoarseCloverCPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(
arg);
1119 ComputeYReverseCPU<Float,coarseSpin,coarseColor>(
arg);
1123 if (
bidirectional) ComputeCoarseLocalCPU<true,Float,coarseSpin,coarseColor>(
arg);
1124 else ComputeCoarseLocalCPU<false,Float,coarseSpin,coarseColor>(
arg);
1128 AddCoarseDiagonalCPU<Float,coarseSpin,coarseColor>(
arg);
1132 AddCoarseTmDiagonalCPU<Float,coarseSpin,coarseColor>(
arg);
1142 if (
dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1143 else if (
dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1144 else if (
dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1145 else if (
dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1147 if (
dim==0) ComputeUVGPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1148 else if (
dim==1) ComputeUVGPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1149 else if (
dim==2) ComputeUVGPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1150 else if (
dim==3) ComputeUVGPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1157 if (from_coarse)
errorQuda(
"ComputeAV should only be called from the fine grid");
1162 if (from_coarse)
errorQuda(
"ComputeTMAV should only be called from the fine grid");
1167 if (from_coarse)
errorQuda(
"ComputeTMCAV should only be called from the fine grid");
1173 if (
dim==0) ComputeVUVGPU<from_coarse,Float,0,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1174 else if (
dim==1) ComputeVUVGPU<from_coarse,Float,1,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1175 else if (
dim==2) ComputeVUVGPU<from_coarse,Float,2,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1176 else if (
dim==3) ComputeVUVGPU<from_coarse,Float,3,QUDA_BACKWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1178 if (
dim==0) ComputeVUVGPU<from_coarse,Float,0,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1179 else if (
dim==1) ComputeVUVGPU<from_coarse,Float,1,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1180 else if (
dim==2) ComputeVUVGPU<from_coarse,Float,2,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1181 else if (
dim==3) ComputeVUVGPU<from_coarse,Float,3,QUDA_FORWARDS,fineSpin,fineColor,coarseSpin,coarseColor><<<tp.
grid,tp.
block,tp.
shared_bytes>>>(
arg);
1188 ComputeCoarseCloverGPU<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>
1266 else if (
dim == 1)
strcat(Aux,
",dim=1");
1267 else if (
dim == 2)
strcat(Aux,
",dim=2");
1268 else if (
dim == 3)
strcat(Aux,
",dim=3");
1276 strcat(Aux,
"coarse_vol=");
1329 template <
typename Flloat,
typename Gauge,
int n>
1341 for (
int i=0;
i<4;
i++) {
1343 this->dim[
i] =
dim[
i];
1348 template<
typename Float,
int n,
typename Arg>
1355 const int ghost_idx = ghostFaceIndex<0>(
coord,
arg.dim,
d,
arg.nFace);
1360 for(
int j = 0; j<
n; j++) {
1362 for(
int k = 0; k<
n; k++) {
1363 arg.Yhat.Ghost(
d,1-
parity,ghost_idx,
i,j) +=
arg.Y.Ghost(
d,1-
parity,ghost_idx,
i,k) *
conj(
arg.Xinv(0,
parity,x_cb,j,k));
1370 for(
int j = 0; j<
n; j++) {
1372 for(
int k = 0; k<
n; k++) {
1373 arg.Yhat(
d,1-
parity,back_idx,
i,j) +=
arg.Y(
d,1-
parity,back_idx,
i,k) *
conj(
arg.Xinv(0,
parity,x_cb,j,k));
1380 for(
int j = 0; j<
n; j++) {
1382 for(
int k = 0; k<
n; k++) {
1383 arg.Yhat(
d+4,
parity,x_cb,
i,j) +=
arg.Xinv(0,
parity,x_cb,
i,k) *
arg.Y(
d+4,
parity,x_cb,k,j);
1389 template<
typename Float,
int n,
typename Arg>
1392 for (
int d=0;
d<4;
d++) {
1394 for (
int x_cb=0; x_cb<
arg.Y.VolumeCB(); x_cb++) {
1395 for (
int i=0; i<n; i++) computeYhat<Float,n>(
arg,
d, x_cb,
parity,
i);
1401 template<
typename Float,
int n,
typename Arg>
1403 int x_cb =
blockDim.x*blockIdx.x + threadIdx.x;
1404 if (x_cb >=
arg.coarseVolumeCB)
return;
1405 int i_parity =
blockDim.y*blockIdx.y + threadIdx.y;
1406 if (i_parity >= 2*
n)
return;
1407 int d =
blockDim.z*blockIdx.z + threadIdx.z;
1410 int i = i_parity %
n;
1416 template <
typename Float,
int n,
typename Arg>
1423 long long flops()
const {
return 2l *
arg.coarseVolumeCB * 8 *
n *
n * (8*
n-2); }
1424 long long bytes()
const {
return 2l * (
arg.Xinv.Bytes() + 8*
arg.Y.Bytes() + 8*
arg.Yhat.Bytes()); }
1440 CalculateYhatCPU<Float,n,Arg>(
arg);
1485 template<
bool from_coarse,
typename Float,
int fineSpin,
int fineColor,
int coarseSpin,
int coarseColor,
1486 QudaGaugeFieldOrder gOrder,
typename F,
typename Ftmp,
typename coarseGauge,
typename fineGauge,
typename fineClover>
1487 void calculateY(coarseGauge &Y, coarseGauge &
X, coarseGauge &Xinv, Ftmp &UV, F &AV, F &
V, fineGauge &G, fineClover &C, fineClover &Cinv,
1496 if (is_dirac_coarse && fineSpin != 2)
1497 errorQuda(
"Input Dirac operator %d should have nSpin=2, not nSpin=%d\n",
dirac, fineSpin);
1498 if (!is_dirac_coarse && fineSpin != 4)
1499 errorQuda(
"Input Dirac operator %d should have nSpin=4, not nSpin=%d\n",
dirac, fineSpin);
1500 if (!is_dirac_coarse && fineColor != 3)
1501 errorQuda(
"Input Dirac operator %d should have nColor=3, not nColor=%d\n",
dirac, fineColor);
1503 if (G.Ndim() != 4)
errorQuda(
"Number of dimensions not supported");
1507 for (
int i=0;
i<4;
i++) x_size[
i] = v.
X(
i);
1511 for (
int i=0;
i<4;
i++) xc_size[
i] = X_.
X()[
i];
1515 for(
int d = 0;
d < nDim;
d++) geo_bs[
d] = x_size[
d]/xc_size[
d];
1516 int spin_bs =
V.Nspin()/Y.NspinCoarse();
1521 Arg
arg(Y,
X, Xinv, UV, AV, G,
V, C, Cinv,
kappa,
mu,
mu_factor, x_size, xc_size, geo_bs, spin_bs);
1522 CalculateY<from_coarse, Float, fineSpin, fineColor, coarseSpin, coarseColor, Arg> y(
arg,
dirac, v, Y_, X_, Xinv_);
1531 if (bidirectional_links)
printfQuda(
"Doing bi-directional link coarsening\n");
1532 else printfQuda(
"Doing uni-directional link coarsening\n");
1537 const int nFace = 1;
1540 if (&v == &av)
arg.AV.resetGhost(av.
Ghost());
1581 if (bidirectional_links) {
1582 for (
int d = 0;
d < nDim;
d++) {
1585 printfQuda(
"Computing forward %d UV and VUV\n",
d);
1605 for (
int d = 0;
d < nDim;
d++) {
1608 printfQuda(
"Computing backward %d UV and VUV\n",
d);
1624 if ( !bidirectional_links ) {
1641 printfQuda(
"Computing fine->coarse clover term\n");
1645 printfQuda(
"Summing diagonal contribution to coarse clover\n");
1654 arg.mu_factor += 1.;
1693 gCoarse yAccessor(const_cast<GaugeField&>(Y_));
1694 gCoarse yHatAccessor(const_cast<GaugeField&>(Yhat_));
1695 gCoarse xInvAccessor(const_cast<GaugeField&>(Xinv_));
1696 printfQuda(
"Xinv = %e\n", xInvAccessor.norm2(0));
1701 yHatArg
arg(yHatAccessor, yAccessor, xInvAccessor, xc_size,
comm_dim, 1);
__device__ __host__ void multiplyVUV(complex< Float > vuv[], Arg &arg, int parity, int x_cb, int ic_c)
Do a single (AV)^ * UV product, where for preconditioned clover, AV correspond to the clover inverse ...
int comm_dim[QUDA_MAX_DIM]
void setDirection(QudaDirection dir_)
__global__ void ComputeVUVGPU(Arg arg)
void resizeVector(int y, int z)
const char * comm_dim_partitioned_string()
Return a string that defines the comm partitioning (used as a tuneKey)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
const char * AuxString() const
QudaVerbosity getVerbosity()
int comm_dim[QUDA_MAX_DIM]
__device__ __host__ void computeCoarseClover(Arg &arg, int parity, int x_cb, int ic_c)
double mu_factor[QUDA_MAX_MG_LEVEL]
__host__ __device__ ValueType sqrt(ValueType x)
__device__ __host__ void computeUV(Arg &arg, int parity, int x_cb, int ic_c)
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
cudaColorSpinorField * tmp
__global__ void ComputeCoarseCloverGPU(Arg arg)
char * strcpy(char *__dst, const char *__src)
CalculateY(Arg &arg, QudaDiracType dirac, const ColorSpinorField &meta, GaugeField &Y, GaugeField &X, GaugeField &Xinv)
const char * VolString() const
virtual void restore()
Restores the cpuGaugeField.
int xc_size[QUDA_MAX_DIM]
char * strcat(char *__s1, const char *__s2)
__global__ void ComputeTMCAVGPU(Arg arg)
void ComputeTMAVCPU(Arg &arg)
__device__ __host__ void computeAV(Arg &arg, int parity, int x_cb, int ic_c)
unsigned int minThreads() const
void matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision, double mferm)
void ComputeVUVCPU(Arg arg)
enum QudaDirection_s QudaDirection
void AddCoarseTmDiagonalCPU(Arg &arg)
VOLATILE spinorFloat kappa
virtual void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false) const =0
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
unsigned int minThreads() const
__device__ __host__ void computeCoarseLocal(Arg &arg, int parity, int x_cb)
void setComputeType(ComputeType type_)
__host__ __device__ void sum(double &a, double &b)
__global__ void CalculateYhatGPU(Arg arg)
long long BatchInvertMatrix(void *Ainv, void *A, const int n, const int batch, QudaPrecision precision, QudaFieldLocation location)
for(int s=0;s< param.dc.Ls;s++)
void setDimension(int dim_)
enum QudaMatPCType_s QudaMatPCType
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
#define checkLocation(...)
void CalculateYhatCPU(Arg &arg)
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
__global__ void ComputeUVGPU(Arg arg)
void apply(const cudaStream_t &stream)
virtual void backup() const
Backs up the LatticeField.
__device__ __host__ void computeTMCAV(Arg &arg, int parity, int x_cb)
double gamma(double) __attribute__((availability(macosx
virtual void injectGhost(QudaLinkDirection=QUDA_LINK_BACKWARDS)=0
__global__ void AddCoarseDiagonalGPU(Arg arg)
__device__ __host__ void computeYreverse(Arg &arg, int parity, int x_cb)
__global__ void ComputeCoarseLocalGPU(Arg arg)
void * Ghost(const int i)
QudaFieldLocation Location() const
bool advanceTuneParam(TuneParam ¶m) const
enum QudaFieldLocation_s QudaFieldLocation
const ColorSpinorField & meta
CalculateYhatArg(const Gauge &Yhat, const Gauge Y, const Gauge Xinv, const int *dim, const int *comm_dim, int nFace)
void apply(const cudaStream_t &stream)
CalculateYArg(coarseGauge &Y, coarseGauge &X, coarseGauge &Xinv, fineSpinorTmp &UV, fineSpinor &AV, const fineGauge &U, const fineSpinor &V, const fineClover &C, const fineClover &Cinv, double kappa, double mu, double mu_factor, const int *x_size_, const int *xc_size_, int *geo_bs_, int spin_bs_)
__global__ void ComputeYReverseGPU(Arg arg)
void ComputeCoarseCloverCPU(Arg &arg)
virtual void exchangeGhost(QudaLinkDirection=QUDA_LINK_BACKWARDS)=0
void ComputeCoarseLocalCPU(Arg &arg)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
static bool bidirectional_debug
const LatticeField & meta
void ComputeAVCPU(Arg &arg)
__device__ __host__ void computeYhat(Arg &arg, int d, int x_cb, int parity, int i)
void AddCoarseDiagonalCPU(Arg &arg)
QudaGaugeFieldOrder Order() const
void ComputeYReverseCPU(Arg &arg)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
CalculateYhat(Arg &arg, const LatticeField &meta)
__device__ __host__ void computeVUV(Arg &arg, int parity, int x_cb, int c_row)
__global__ void ComputeTMAVGPU(Arg arg)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
void ComputeUVCPU(Arg &arg)
__global__ void AddCoarseTmDiagonalGPU(Arg arg)
__device__ __host__ void computeTMAV(Arg &arg, int parity, int x_cb, int v)
void ComputeTMCAVCPU(Arg &arg)
virtual void copy(const GaugeField &src)=0
int comm_dim_partitioned(int dim)
void calculateY(coarseGauge &Y, coarseGauge &X, coarseGauge &Xinv, Ftmp &UV, F &AV, F &V, fineGauge &G, fineClover &C, fineClover &Cinv, GaugeField &Y_, GaugeField &X_, GaugeField &Xinv_, GaugeField &Yhat_, ColorSpinorField &av, const ColorSpinorField &v, double kappa, double mu, double mu_factor, QudaDiracType dirac, QudaMatPCType matpc)
Calculate the coarse-link field, include the clover field, and its inverse, and finally also compute ...
enum QudaDiracType_s QudaDiracType
virtual bool advanceTuneParam(TuneParam ¶m) const
__global__ void ComputeAVGPU(Arg arg)
bool advanceTuneParam(TuneParam ¶m) const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)