#define max_color_per_block 8

template <typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor,
          typename coarseGauge, typename coarseGaugeAtomic, typename fineGauge, typename fineSpinor,
          typename fineSpinorTmp, typename fineSpinorV, typename fineClover>
struct CalculateYArg {

  // ... member declarations (field accessors, geometry and parameters) elided in this listing ...
  CalculateYArg(coarseGauge &Y, coarseGauge &X,
                coarseGaugeAtomic &Y_atomic, coarseGaugeAtomic &X_atomic,
                fineSpinorTmp &UV, fineSpinor &AV, const fineGauge &U, const fineSpinorV &V,
                const fineClover &C, const fineClover &Cinv, double kappa, double mu, double mu_factor,
                const int *x_size_, const int *xc_size_, int *geo_bs_, int spin_bs_,
                const int *fine_to_coarse, const int *coarse_to_fine, bool bidirectional)
    : Y(Y), X(X), Y_atomic(Y_atomic), X_atomic(X_atomic),
      UV(UV), AV(AV), U(U), V(V), C(C), Cinv(Cinv), spin_bs(spin_bs_), spin_map(),
      kappa(static_cast<Float>(kappa)), mu(static_cast<Float>(mu)), mu_factor(static_cast<Float>(mu_factor)),
      fineVolumeCB(V.VolumeCB()), coarseVolumeCB(X.VolumeCB()),
      fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
      bidirectional(bidirectional), shared_atomic(false), parity_flip(shared_atomic ? true : false),
      aggregates_per_block(1), max_d(nullptr)
  {
    if (V.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS)
      errorQuda("Gamma basis %d not supported", V.GammaBasis());

    for (int i = 0; i < QUDA_MAX_DIM; i++) {
      x_size[i] = x_size_[i];
      xc_size[i] = xc_size_[i];
      geo_bs[i] = geo_bs_[i];
    }
  }
};
/**
   Helper that computes y += a * x for complex arguments.
 */
template <typename Float>
inline __device__ __host__ void caxpy(const complex<Float> &a, const complex<Float> &x, complex<Float> &y)
{
  y += a * x;
}
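/* Illustrative sketch (not part of the original file): caxpy is the building block
   the kernels below use to accumulate small matrix-vector products one term at a
   time, e.g. uv[c] += sum_c' U(c,c') * v(c').  The function and parameter names
   here are hypothetical and shown only to make that usage pattern concrete. */
template <typename Float, int nColor>
inline __device__ __host__ void matvec_accumulate_sketch(const complex<Float> U[nColor][nColor],
                                                         const complex<Float> v[nColor],
                                                         complex<Float> uv[nColor])
{
  for (int c = 0; c < nColor; c++)
    for (int cp = 0; cp < nColor; cp++)
      caxpy(U[c][cp], v[cp], uv[c]); // uv[c] += U[c][cp] * v[cp]
}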
/**
   Computes the intermediate product UV = U * W for a single fine site, where W is
   either V or AV, handling the ghost zone when the hop crosses a partitioned boundary.
 */
template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
          int coarseSpin, int coarseColor, typename Wtype, typename Arg>
__device__ __host__ inline void computeUV(Arg &arg, const Wtype &W, int parity, int x_cb, int ic_c)
{
  int coord[4];
  getCoords(coord, x_cb, arg.x_size, parity);

  constexpr int uvSpin = fineSpin * (from_coarse ? 2 : 1);

  complex<Float> UV[uvSpin][fineColor];

  for (int s = 0; s < uvSpin; s++) {
    for (int c = 0; c < fineColor; c++) {
      UV[s][c] = static_cast<Float>(0.0);
    }
  }

  if ( arg.comm_dim[dim] && (coord[dim] + 1 >= arg.x_size[dim]) ) {
    // the forward neighbour lives in the ghost zone
    int nFace = 1;
    int ghost_idx = ghostFaceIndex<1>(coord, arg.x_size, dim, nFace);

    for (int s = 0; s < fineSpin; s++) {
      for (int ic = 0; ic < fineColor; ic++) {
        for (int jc = 0; jc < fineColor; jc++) {
          if (!from_coarse) {
            caxpy(arg.U(dim, parity, x_cb, ic, jc), W.Ghost(dim, 1, (parity+1)&1, ghost_idx, s, jc, ic_c), UV[s][ic]);
          } else {
            for (int s_col = 0; s_col < fineSpin; s_col++) {
              // on the coarse lattice the link carries spin indices; the forwards links live at dim+4
              caxpy(arg.U(dim + (dir == QUDA_FORWARDS ? 4 : 0), parity, x_cb, s, s_col, ic, jc),
                    W.Ghost(dim, 1, (parity+1)&1, ghost_idx, s_col, jc, ic_c), UV[s_col*fineSpin+s][ic]);
            }
          }
        }
      }
    }

  } else {
    // interior site: index of the forward neighbour on the opposite parity
    int y_cb = linkIndexP1(coord, arg.x_size, dim);

    for (int s = 0; s < fineSpin; s++) {
      for (int ic = 0; ic < fineColor; ic++) {
        for (int jc = 0; jc < fineColor; jc++) {
          if (!from_coarse) {
            caxpy(arg.U(dim, parity, x_cb, ic, jc), W((parity+1)&1, y_cb, s, jc, ic_c), UV[s][ic]);
          } else {
            for (int s_col = 0; s_col < fineSpin; s_col++) {
              caxpy(arg.U(dim + (dir == QUDA_FORWARDS ? 4 : 0), parity, x_cb, s, s_col, ic, jc),
                    W((parity+1)&1, y_cb, s_col, jc, ic_c), UV[s_col*fineSpin+s][ic]);
            }
          }
        }
      }
    }
  }

  for (int s = 0; s < uvSpin; s++) {
    for (int c = 0; c < fineColor; c++) {
      arg.UV(parity, x_cb, s, c, ic_c) = UV[s][c];
    }
  }
}
template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
          int coarseSpin, int coarseColor, typename Arg>
void ComputeUVCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      for (int ic_c = 0; ic_c < coarseColor; ic_c++) // coarse color
        if (dir == QUDA_FORWARDS)
          computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.V, parity, x_cb, ic_c);
        else
          computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.AV, parity, x_cb, ic_c);
    } // c/b volume
  }   // parity
}

template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
          int coarseSpin, int coarseColor, typename Arg>
__global__ void ComputeUVGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.fineVolumeCB) return;

  int parity = blockDim.y*blockIdx.y + threadIdx.y;
  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
  if (ic_c >= coarseColor) return;

  if (dir == QUDA_FORWARDS)
    computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.V, parity, x_cb, ic_c);
  else
    computeUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, arg.AV, parity, x_cb, ic_c);
}
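/* Illustrative sketch (not part of the original file): a possible host-side launch of
   ComputeUVGPU.  The thread grid maps x to the fine checkerboard volume, y to parity
   and z to the coarse color index, matching the index decomposition inside the kernel.
   The launcher name and block sizes are assumptions for illustration only; the real
   code drives these kernels through its own tuning machinery. */
template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
          int coarseSpin, int coarseColor, typename Arg>
void launchComputeUV_sketch(Arg &arg, const cudaStream_t &stream)
{
  dim3 block(128, 2, 1); // (x_cb, parity, ic_c)
  dim3 grid((arg.fineVolumeCB + block.x - 1) / block.x, 1, coarseColor);
  ComputeUVGPU<from_coarse, Float, dim, dir, fineSpin, fineColor, coarseSpin, coarseColor, Arg>
    <<<grid, block, 0, stream>>>(arg);
}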
/**
   Applies the inverse clover term to V for one chiral block: AV = C^{-1} V.
 */
template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
__device__ __host__ inline void computeAV(Arg &arg, int parity, int x_cb, int ch, int ic_c)
{
  constexpr int N = fineSpin * fineColor / 2;
  HMatrix<Float, N> A;

  for (int i = 0; i < N; i++) {
    int s_i = 2 * ch + i / fineColor;
    int c_i = i % fineColor;
    for (int j = 0; j <= i; j++) {
      int s_j = 2 * ch + j / fineColor;
      int c_j = j % fineColor;
#ifndef DYNAMIC_CLOVER
      A(i, j) = arg.Cinv(0, parity, x_cb, s_i, s_j, c_i, c_j);
#else
      A(i, j) = arg.C(0, parity, x_cb, s_i, s_j, c_i, c_j);
#endif
    }
  }

  ColorSpinor<Float, fineColor, fineSpin / 2> V;
  for (int s = 0; s < fineSpin / 2; s++) {
    for (int c = 0; c < fineColor; c++) { V(s, c) = arg.V(parity, x_cb, 2 * ch + s, c, ic_c); }
  }

#ifndef DYNAMIC_CLOVER
  // the inverse clover is stored explicitly, so this is a matrix-vector product
  const auto AV = A * V;
#else
  // invert the clover block on the fly using its Cholesky factorization
  linalg::Cholesky<HMatrix, Float, N> cholesky(A);
  const auto AV = cholesky.backward(cholesky.forward(V));
#endif

  for (int s = 0; s < fineSpin / 2; s++) {
    for (int ic = 0; ic < fineColor; ic++) { arg.AV(parity, x_cb, 2 * ch + s, ic, ic_c) = AV(s, ic); }
  }
}
template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
void ComputeAVCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      for (int ch = 0; ch < 2; ch++) { // chiral block
        for (int ic_c = 0; ic_c < coarseColor; ic_c++) { // coarse color
          computeAV<Float, fineSpin, fineColor, coarseColor>(arg, parity, x_cb, ch, ic_c);
        }
      }
    } // c/b volume
  }   // parity
}

template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
__global__ void ComputeAVGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.fineVolumeCB) return;

  int ch_parity = blockDim.y * blockIdx.y + threadIdx.y;
  if (ch_parity >= 4) return;
  int ch = ch_parity % 2;
  int parity = ch_parity / 2;

  int ic_c = blockDim.z * blockIdx.z + threadIdx.z; // coarse color
  if (ic_c >= coarseColor) return;

  if (ch == 0)
    computeAV<Float, fineSpin, fineColor, coarseColor>(arg, parity, x_cb, 0, ic_c);
  else
    computeAV<Float, fineSpin, fineColor, coarseColor>(arg, parity, x_cb, 1, ic_c);
}
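/* Illustrative sketch (not part of the original file): computeAV packs one chiral
   block of the clover term into an N x N Hermitian matrix with N = (fineSpin/2) *
   fineColor.  The flattened index i maps back to a (spin, color) pair exactly as
   sketched below; the helper name is hypothetical. */
template <int fineSpin, int fineColor>
inline __device__ __host__ void chiral_block_index_sketch(int i, int ch, int &s, int &c)
{
  s = 2 * ch + i / fineColor; // spin runs over the two spins of chiral block ch
  c = i % fineColor;          // color is the fastest-running index
}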
/**
   Multiplies V by the inverse twisted-mass term: the two chiralities pick up the
   factors (1 -/+ i mu)/(1 + mu^2).
 */
template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
__device__ __host__ inline void computeTMAV(Arg &arg, int parity, int x_cb, int v)
{
  complex<Float> fp(1./(1.+arg.mu*arg.mu), -arg.mu/(1.+arg.mu*arg.mu));
  complex<Float> fm(1./(1.+arg.mu*arg.mu), +arg.mu/(1.+arg.mu*arg.mu));

  for (int s = 0; s < fineSpin/2; s++) {
    for (int c = 0; c < fineColor; c++) {
      arg.AV(parity,x_cb,s,c,v) = arg.V(parity,x_cb,s,c,v) * fp;
    }
  }

  for (int s = fineSpin/2; s < fineSpin; s++) {
    for (int c = 0; c < fineColor; c++) {
      arg.AV(parity,x_cb,s,c,v) = arg.V(parity,x_cb,s,c,v) * fm;
    }
  }
}
template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
void ComputeTMAVCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      for (int v = 0; v < coarseColor; v++) // coarse color
        computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, v);
    } // c/b volume
  }   // parity
}

template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
__global__ void ComputeTMAVGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.fineVolumeCB) return;

  int parity = blockDim.y*blockIdx.y + threadIdx.y;
  int v = blockDim.z*blockIdx.z + threadIdx.z; // coarse color
  if (v >= coarseColor) return;

  computeTMAV<Float,fineSpin,fineColor,coarseColor,Arg>(arg, parity, x_cb, v);
}
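/* Illustrative sketch (not part of the original file): for the twisted-mass operator,
   AV = (1 + i mu gamma_5)^{-1} V.  In a basis where gamma_5 = diag(+1,+1,-1,-1) the
   inverse reduces to multiplying the upper (lower) spin components by
   (1 -/+ i mu)/(1 + mu^2), which is exactly what computeTMAV applies.  This helper
   just exposes the two factors; its name is hypothetical. */
template <typename Float>
inline __device__ __host__ void tm_inverse_factors_sketch(Float mu, complex<Float> &fp, complex<Float> &fm)
{
  Float denom = static_cast<Float>(1.0) + mu * mu;
  fp = complex<Float>(static_cast<Float>(1.0) / denom, -mu / denom); // upper chirality
  fm = complex<Float>(static_cast<Float>(1.0) / denom, +mu / denom); // lower chirality
}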
#ifdef DYNAMIC_CLOVER

/**
   Computes the maximum element of the inverse clover (or twisted-clover) block on a
   single site, used to set the scale of the dynamically inverted clover field.
 */
template <typename Float, bool twist, typename Arg>
__device__ __host__ inline Float computeCloverInvMax(Arg &arg, int parity, int x_cb)
{
  Float max = 0.0;

  constexpr int nColor = 3;
  constexpr int nSpin = 4;
  constexpr int N = nColor * nSpin / 2;
  typedef HMatrix<Float, N> Mat;

  for (int ch = 0; ch < 2; ch++) { // chiral block
    Mat A;

    for (int i = 0; i < N; i++) {
      for (int j = 0; j <= i; j++) {
        A(i, j) = arg.C(0, parity, x_cb, 2 * ch + i / nColor, 2 * ch + j / nColor, i % nColor, j % nColor);
      }
    }

    if (twist) { // twisted clover: the matrix to invert is C^2 + mu^2
      A = A.square();
      A += arg.mu * arg.mu;
    }

    // invert via the Cholesky factorization and track the largest element
    linalg::Cholesky<HMatrix, Float, N> cholesky(A);
    Mat Ainv = cholesky.invert();

    Float inv_max = Ainv.max();
    max = max > inv_max ? max : inv_max;
  } // chirality

  return max;
}
template <typename Float, bool twist, typename Arg>
void ComputeCloverInvMaxCPU(Arg &arg)
{
  Float max = 0.0;
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for reduction(max:max)
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      Float max_x = computeCloverInvMax<Float, twist, Arg>(arg, parity, x_cb);
      max = max > max_x ? max : max_x;
    } // c/b volume
  }   // parity
  arg.max_h = max; // host-side result (this member name is elided in the listing and assumed here)
}

template <typename Float, bool twist, typename Arg>
__global__ void ComputeCloverInvMaxGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.fineVolumeCB) return;

  int parity = blockDim.y*blockIdx.y + threadIdx.y;
  arg.max_d[parity + 2 * x_cb] = computeCloverInvMax<Float, twist, Arg>(arg, parity, x_cb);
}
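/* Illustrative sketch (not part of the original file): ComputeCloverInvMaxGPU writes
   one partial maximum per (parity, site) into arg.max_d, so a further reduction over
   that buffer is needed to obtain the single scale factor.  A minimal host-side
   reduction is sketched here assuming the buffer is accessible on the host; the helper
   name is hypothetical and the real code may reduce on the device instead. */
template <typename Float>
Float reduce_max_sketch(const Float *max_buffer, int n)
{
  Float max = 0.0;
  for (int i = 0; i < n; i++) max = max > max_buffer[i] ? max : max_buffer[i];
  return max;
}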
#endif // DYNAMIC_CLOVER

/**
   Applies the inverse twisted-clover term to V for one chiral block:
   AV = (C^2 + mu^2)^{-1} (C -/+ i mu) V.
 */
template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
__device__ __host__ inline void computeTMCAV(Arg &arg, int parity, int x_cb, int ch, int ic_c)
{
  constexpr int N = fineSpin * fineColor / 2;
  HMatrix<Float, N> A;

  for (int i = 0; i < N; i++) {
    int s_i = 2 * ch + i / fineColor;
    int c_i = i % fineColor;
    for (int j = 0; j <= i; j++) {
      int s_j = 2 * ch + j / fineColor;
      int c_j = j % fineColor;
      A(i, j) = arg.C(0, parity, x_cb, s_i, s_j, c_i, c_j);
    }
  }

  complex<Float> mu(0., arg.mu);
  if (ch == 0) mu *= static_cast<Float>(-1.0);

  ColorSpinor<Float, fineColor, fineSpin / 2> V;
  for (int s = 0; s < fineSpin / 2; s++) {
    for (int c = 0; c < fineColor; c++) { V(s, c) = arg.V(parity, x_cb, 2 * ch + s, c, ic_c); }
  }

  // apply the direct twisted-clover term on this chirality: UV = (C -/+ i mu) V
  auto UV = A * V;
  UV += mu * V;

#ifndef DYNAMIC_CLOVER
  // Cinv stores (C^2 + mu^2)^{-1} for twisted clover, so a single product remains
  HMatrix<Float, N> Ainv;
  for (int i = 0; i < N; i++) {
    int s_i = 2 * ch + i / fineColor;
    int c_i = i % fineColor;
    for (int j = 0; j <= i; j++) {
      int s_j = 2 * ch + j / fineColor;
      int c_j = j % fineColor;
      Ainv(i, j) = arg.Cinv(0, parity, x_cb, s_i, s_j, c_i, c_j);
    }
  }
  const auto AV = Ainv * UV;
#else
  // build (C^2 + mu^2) and solve for AV on the fly via its Cholesky factorization
  A = A.square();
  A += arg.mu * arg.mu;
  linalg::Cholesky<HMatrix, Float, N> cholesky(A);
  const auto AV = cholesky.backward(cholesky.forward(UV));
#endif

  for (int s = 0; s < fineSpin / 2; s++)
    for (int c = 0; c < fineColor; c++) arg.AV(parity, x_cb, 2 * ch + s, c, ic_c) = AV(s, c);
}
template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
void ComputeTMCAVCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      for (int ch = 0; ch < 2; ch++) { // chiral block
        for (int ic_c = 0; ic_c < coarseColor; ic_c++) { // coarse color
          computeTMCAV<Float, fineSpin, fineColor, coarseColor, Arg>(arg, parity, x_cb, ch, ic_c);
        }
      }
    } // c/b volume
  }   // parity
}

template <typename Float, int fineSpin, int fineColor, int coarseColor, typename Arg>
__global__ void ComputeTMCAVGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.fineVolumeCB) return;

  int ch_parity = blockDim.y * blockIdx.y + threadIdx.y;
  if (ch_parity >= 4) return;
  int ch = ch_parity % 2;
  int parity = ch_parity / 2;

  int ic_c = blockDim.z * blockIdx.z + threadIdx.z; // coarse color
  if (ic_c >= coarseColor) return;

  if (ch == 0)
    computeTMCAV<Float, fineSpin, fineColor, coarseColor, Arg>(arg, parity, x_cb, 0, ic_c);
  else
    computeTMCAV<Float, fineSpin, fineColor, coarseColor, Arg>(arg, parity, x_cb, 1, ic_c);
}
/**
   Do a single (AV)^dagger * UV product for the given coarse color indices,
   accumulating the result into the coarse-spin matrix vuv.
 */
template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
          int coarseSpin, int coarseColor, typename Arg, typename Gamma>
__device__ __host__ inline void multiplyVUV(complex<Float> vuv[], const Arg &arg, const Gamma &gamma,
                                            int parity, int x_cb, int ic_c, int jc_c)
{
  for (int i = 0; i < coarseSpin*coarseSpin; i++) vuv[i] = 0.0;

  if (!from_coarse) { // the fine level is the top level

    for (int s = 0; s < fineSpin; s++) { // loop over fine spin

      // the spin part of the projector P_mu = (1 +/- gamma_mu) always consists of a
      // diagonal piece and an off-diagonal piece given by gamma

      const int s_c_row = arg.spin_map(s, parity); // coarse spin row index

      // use gamma to obtain the off-diagonal coupling and its column index
      const int s_col = gamma.getcol(s);
      const int s_c_col = arg.spin_map(s_col, parity); // coarse spin column index

      for (int ic = 0; ic < fineColor; ic++) { // sum over fine color
        if (dir == QUDA_BACKWARDS) {
          complex<Float> V = arg.V(parity, x_cb, s, ic, ic_c);

          // diagonal spin
          caxpy(conj(V), arg.UV(parity, x_cb, s, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_row]);

          // off-diagonal spin (positive projector)
          caxpy( gamma.apply(s, conj(V)), arg.UV(parity, x_cb, s_col, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_col]);
        } else {
          complex<Float> AV = arg.AV(parity, x_cb, s, ic, ic_c);

          // diagonal spin
          caxpy(conj(AV), arg.UV(parity, x_cb, s, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_row]);

          // off-diagonal spin (negative projector)
          caxpy(-gamma.apply(s, conj(AV)), arg.UV(parity, x_cb, s_col, ic, jc_c), vuv[s_c_row*coarseSpin+s_c_col]);
        }
      } // fine color
    } // fine spin

  } else { // the fine level is itself a coarse operator

    for (int ic = 0; ic < fineColor; ic++) { // sum over fine color
      for (int s = 0; s < fineSpin; s++) {
        complex<Float> AV = arg.AV(parity, x_cb, s, ic, ic_c);
        for (int s_col = 0; s_col < fineSpin; s_col++) { // chiral block
          complex<Float> UV = arg.UV(parity, x_cb, s_col*fineSpin+s, ic, jc_c);
          caxpy(conj(AV), UV, vuv[s*coarseSpin+s_col]);
        }
      }
    } // fine color

  } // from_coarse
}
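/* Illustrative sketch (not part of the original file): when coarsening from the top
   level, multiplyVUV reduces the four fine spins to two coarse "chiral" spins.  For a
   chirally blocked basis the mapping performed by arg.spin_map is equivalent to the
   sketch below (consecutive groups of fine spins map to one coarse spin); the helper
   name is hypothetical and it ignores the parity argument the real spin_mapper takes. */
template <int fineSpin, int coarseSpin>
inline __device__ __host__ int coarse_spin_sketch(int s)
{
  return s / (fineSpin / coarseSpin); // e.g. fine spins {0,1} -> 0 and {2,3} -> 1
}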
template <typename Arg>
__device__ __host__ inline int virtualThreadIdx(const Arg &arg) {
  constexpr int warp_size = 32;
  int warp_id = threadIdx.x / warp_size;
  int warp_lane = threadIdx.x % warp_size;
  int tx = warp_id * (warp_size / arg.aggregates_per_block) + warp_lane / arg.aggregates_per_block;
  return tx;
}

template <typename Arg>
__device__ __host__ inline int virtualBlockDim(const Arg &arg) {
  int block_dim_x = blockDim.x / arg.aggregates_per_block;
  return block_dim_x;
}

template <typename Arg>
__device__ __host__ inline int coarseIndex(const Arg &arg) {
  constexpr int warp_size = 32;
  int warp_lane = threadIdx.x % warp_size;
  int x_coarse = (arg.coarse_color_wave ? blockIdx.y : blockIdx.x)*arg.aggregates_per_block + warp_lane % arg.aggregates_per_block;
  return x_coarse;
}
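/* Illustrative sketch (not part of the original file): with warp_size = 32 and
   arg.aggregates_per_block = 4, each warp is split into four interleaved groups of 8
   threads.  virtualThreadIdx numbers the threads within a group, virtualBlockDim
   reports the group size, and coarseIndex selects which coarse aggregate the thread
   works on.  The standalone function below reproduces that arithmetic for given
   inputs; all names are hypothetical. */
inline __device__ __host__ void virtual_mapping_sketch(int tid, int aggregates_per_block,
                                                       int &virtual_tid, int &aggregate)
{
  constexpr int warp_size = 32;
  int warp_id   = tid / warp_size;
  int warp_lane = tid % warp_size;
  virtual_tid = warp_id * (warp_size / aggregates_per_block) + warp_lane / aggregates_per_block;
  aggregate   = warp_lane % aggregates_per_block;
}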
template <bool shared_atomic, bool parity_flip, bool from_coarse, typename Float, int dim, QudaDirection dir,
          int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg, typename Gamma>
__device__ __host__ void computeVUV(Arg &arg, const Gamma &gamma, int parity, int x_cb, int c_row, int c_col,
                                    int parity_coarse_, int coarse_x_cb_)
{
  constexpr int nDim = 4;
  int coord[QUDA_MAX_DIM];
  int coord_coarse[QUDA_MAX_DIM];

  getCoords(coord, x_cb, arg.x_size, parity);
  for (int d = 0; d < nDim; d++) coord_coarse[d] = coord[d]/arg.geo_bs[d];

  // check if we are on the edge of the aggregate: if the forward neighbour lies in the
  // same coarse block the contribution goes into X, otherwise into Y
  const bool isDiagonal = ((coord[dim]+1)%arg.x_size[dim])/arg.geo_bs[dim] == coord_coarse[dim] ? true : false;

  int coarse_parity = shared_atomic ? parity_coarse_ : 0;
  if (!shared_atomic) {
    for (int d = 0; d < nDim; d++) coarse_parity += coord_coarse[d];
    coarse_parity &= 1;
    coord_coarse[0] /= 2;
  }
  int coarse_x_cb = shared_atomic ? coarse_x_cb_ : ((coord_coarse[3]*arg.xc_size[2]+coord_coarse[2])*arg.xc_size[1]+coord_coarse[1])*(arg.xc_size[0]/2) + coord_coarse[0];

  complex<Float> vuv[coarseSpin*coarseSpin];
  multiplyVUV<from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor,Arg>(vuv, arg, gamma, parity, x_cb, c_row, c_col);

  const int dim_index = arg.dim_index % arg.Y_atomic.geometry;

  if (shared_atomic) {

#ifdef __CUDA_ARCH__
    // block-local shared-memory tiles, indexed as [c_row][c_col][aggregate][s_row][s_col]
    // (the aggregate extent bounds arg.aggregates_per_block)
    __shared__ complex<storeType> X[max_color_per_block][max_color_per_block][4][coarseSpin][coarseSpin];
    __shared__ complex<storeType> Y[max_color_per_block][max_color_per_block][4][coarseSpin][coarseSpin];

    int x_ = coarse_x_cb%arg.aggregates_per_block;
    int tx = virtualThreadIdx(arg);
    int s_col = tx / coarseSpin;
    int s_row = tx % coarseSpin;

    int c_col_block = c_col % max_color_per_block;
    int c_row_block = c_row % max_color_per_block;

    if (tx < coarseSpin*coarseSpin) {
      Y[c_row_block][c_col_block][x_][s_row][s_col] = 0;
      X[c_row_block][c_col_block][x_][s_row][s_col] = 0;
    }

    __syncthreads();

    if (!isDiagonal) {
      for (int s_row = 0; s_row < coarseSpin; s_row++) {      // chiral row block
        for (int s_col = 0; s_col < coarseSpin; s_col++) {    // chiral column block
          if (gauge::fixed_point<Float,storeType>()) {
            Float scale = arg.Y_atomic.accessor.scale;
            complex<storeType> a(round(scale * vuv[s_row*coarseSpin+s_col].real()),
                                 round(scale * vuv[s_row*coarseSpin+s_col].imag()));
            atomicAdd(&Y[c_row_block][c_col_block][x_][s_row][s_col],a);
          } else {
            atomicAdd(&Y[c_row_block][c_col_block][x_][s_row][s_col],
                      reinterpret_cast<complex<storeType>*>(vuv)[s_row*coarseSpin+s_col]);
          }
        }
      }
    } else {
      for (int s_row = 0; s_row < coarseSpin; s_row++) {      // chiral row block
        for (int s_col = 0; s_col < coarseSpin; s_col++) {    // chiral column block
          vuv[s_row*coarseSpin+s_col] *= -arg.kappa;
          if (gauge::fixed_point<Float,storeType>()) {
            Float scale = arg.X_atomic.accessor.scale;
            complex<storeType> a(round(scale * vuv[s_row*coarseSpin+s_col].real()),
                                 round(scale * vuv[s_row*coarseSpin+s_col].imag()));
            atomicAdd(&X[c_row_block][c_col_block][x_][s_row][s_col],a);
          } else {
            atomicAdd(&X[c_row_block][c_col_block][x_][s_row][s_col],
                      reinterpret_cast<complex<storeType>*>(vuv)[s_row*coarseSpin+s_col]);
          }
        }
      }
    }

    __syncthreads();

    // one thread per (s_row, s_col) flushes the shared tiles to the global atomic fields
    if (tx < coarseSpin*coarseSpin && (parity == 0 || parity_flip == 1) ) {
      arg.Y_atomic.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,Y[c_row_block][c_col_block][x_][s_row][s_col]);

      if (dir == QUDA_BACKWARDS) {
        arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_col,s_row,c_col,c_row,conj(X[c_row_block][c_col_block][x_][s_row][s_col]));
      } else {
        arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,X[c_row_block][c_col_block][x_][s_row][s_col]);
      }

      if (!arg.bidirectional) {
        // when only one direction is computed, fold in the reverse contribution with the appropriate sign
        if (s_row == s_col) arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,X[c_row_block][c_col_block][x_][s_row][s_col]);
        else arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,-X[c_row_block][c_col_block][x_][s_row][s_col]);
      }
    }
#else
    errorQuda("Shared-memory atomic aggregation not supported on CPU");
#endif

  } else { // global atomics

    if (!isDiagonal) {
      for (int s_row = 0; s_row < coarseSpin; s_row++) {      // chiral row block
        for (int s_col = 0; s_col < coarseSpin; s_col++) {    // chiral column block
          arg.Y_atomic.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,vuv[s_row*coarseSpin+s_col]);
        }
      }
    } else {

      for (int s2 = 0; s2 < coarseSpin*coarseSpin; s2++) vuv[s2] *= -arg.kappa;

      if (dir == QUDA_BACKWARDS) {
        for (int s_row = 0; s_row < coarseSpin; s_row++) {    // chiral row block
          for (int s_col = 0; s_col < coarseSpin; s_col++) {  // chiral column block
            arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_col,s_row,c_col,c_row,conj(vuv[s_row*coarseSpin+s_col]));
          }
        }
      } else {
        for (int s_row = 0; s_row < coarseSpin; s_row++) {    // chiral row block
          for (int s_col = 0; s_col < coarseSpin; s_col++) {  // chiral column block
            arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,vuv[s_row*coarseSpin+s_col]);
          }
        }
      }

      if (!arg.bidirectional) {
        for (int s_row = 0; s_row < coarseSpin; s_row++) {    // chiral row block
          for (int s_col = 0; s_col < coarseSpin; s_col++) {  // chiral column block
            const Float sign = (s_row == s_col) ? static_cast<Float>(1.0) : static_cast<Float>(-1.0);
            arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,c_row,c_col,sign*vuv[s_row*coarseSpin+s_col]);
          }
        }
      }
    } // isDiagonal
  } // shared_atomic
}
template <bool from_coarse, typename Float, int dim, QudaDirection dir, int fineSpin, int fineColor,
          int coarseSpin, int coarseColor, typename Arg>
void ComputeVUVCPU(Arg arg)
{
  Gamma<Float, QUDA_DEGRAND_ROSSI_GAMMA_BASIS, dim> gamma;
  constexpr bool shared_atomic = false; // shared-memory aggregation is not used on the CPU
  constexpr bool parity_flip = true;

  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      for (int c_row = 0; c_row < coarseColor; c_row++)
        for (int c_col = 0; c_col < coarseColor; c_col++)
          computeVUV<shared_atomic,parity_flip,from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, gamma, parity, x_cb, c_row, c_col, 0, 0);
    } // c/b volume
  }   // parity
}
template <bool parity_flip, typename Arg>
__device__ inline void getIndicesShared(const Arg &arg, int &parity, int &x_cb, int &parity_coarse,
                                        int &x_coarse_cb, int &c_col, int &c_row)
{
  if (arg.coarse_color_wave) {
    int parity_c_col_block_z = blockDim.y*blockIdx.x + threadIdx.y;
    int c_col_block_z = parity_flip ? (parity_c_col_block_z % arg.coarse_color_grid_z) : (parity_c_col_block_z / 2);
    parity = parity_flip ? (parity_c_col_block_z / arg.coarse_color_grid_z) : (parity_c_col_block_z % 2);
    c_col = c_col_block_z % arg.coarse_color;
    c_row = blockDim.z*(c_col_block_z / arg.coarse_color) + threadIdx.z;
  } else {
    int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
    c_col = parity_flip ? (parity_c_col % arg.coarse_color) : (parity_c_col / 2);
    parity = parity_flip ? (parity_c_col / arg.coarse_color) : (parity_c_col % 2);
    c_row = blockDim.z*blockIdx.z + threadIdx.z;
  }

  int block_dim_x = virtualBlockDim(arg);
  int thread_idx_x = virtualThreadIdx(arg);
  int x_coarse = coarseIndex(arg);

  parity_coarse = x_coarse >= arg.coarseVolumeCB ? 1 : 0;
  x_coarse_cb = x_coarse - parity_coarse*arg.coarseVolumeCB;

  // map from the coarse aggregate back to the fine-grid site this thread handles
  int x_fine = arg.coarse_to_fine[ (x_coarse*2 + parity) * block_dim_x + thread_idx_x];
  x_cb = x_fine - parity*arg.fineVolumeCB;
}

template <bool parity_flip, typename Arg>
__device__ inline void getIndicesGlobal(const Arg &arg, int &parity, int &x_cb, int &parity_coarse,
                                        int &x_coarse_cb, int &c_col, int &c_row)
{
  x_cb = blockDim.x*(arg.coarse_color_wave ? blockIdx.y : blockIdx.x) + threadIdx.x;

  if (arg.coarse_color_wave) {
    int parity_c_col_block_z = blockDim.y*blockIdx.x + threadIdx.y;
    int c_col_block_z = parity_flip ? (parity_c_col_block_z % arg.coarse_color_grid_z) : (parity_c_col_block_z / 2);
    parity = parity_flip ? (parity_c_col_block_z / arg.coarse_color_grid_z) : (parity_c_col_block_z % 2);
    c_col = c_col_block_z % arg.coarse_color;
    c_row = blockDim.z*(c_col_block_z / arg.coarse_color) + threadIdx.z;
  } else {
    int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
    c_col = parity_flip ? (parity_c_col % arg.coarse_color) : (parity_c_col / 2);
    parity = parity_flip ? (parity_c_col / arg.coarse_color) : (parity_c_col % 2);
    c_row = blockDim.z*blockIdx.z + threadIdx.z;
  }

  // the coarse indices are not needed in the global-atomic path
  x_coarse_cb = 0;
  parity_coarse = 0;
}
template <bool shared_atomic, bool parity_flip, bool from_coarse, typename Float, int dim, QudaDirection dir,
          int fineSpin, int fineColor, int coarseSpin, int coarseColor, typename Arg>
__global__ void ComputeVUVGPU(Arg arg)
{
  Gamma<Float, QUDA_DEGRAND_ROSSI_GAMMA_BASIS, dim> gamma;

  int parity, x_cb, parity_coarse, x_coarse_cb, c_col, c_row;
  if (shared_atomic) getIndicesShared<parity_flip>(arg, parity, x_cb, parity_coarse, x_coarse_cb, c_col, c_row);
  else getIndicesGlobal<parity_flip>(arg, parity, x_cb, parity_coarse, x_coarse_cb, c_col, c_row);

  if (parity > 1) return;
  if (c_col >= arg.coarse_color) return;
  if (c_row >= arg.coarse_color) return;
  if (!shared_atomic && x_cb >= arg.fineVolumeCB) return;

  computeVUV<shared_atomic,parity_flip,from_coarse,Float,dim,dir,fineSpin,fineColor,coarseSpin,coarseColor>(arg, gamma, parity, x_cb, c_row, c_col, parity_coarse, x_coarse_cb);
}
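/* Illustrative sketch (not part of the original file): computeVUV locates the coarse
   site that a fine site belongs to by integer division with the geometric block size,
   then builds the coarse checkerboard index exactly as in the function above.  The
   helper below isolates that mapping for a single site using plain int arrays (the
   real code uses int_fastdiv members of arg); all names are hypothetical. */
inline __device__ __host__ int coarse_site_index_sketch(const int coord[4], const int geo_bs[4],
                                                        const int xc_size[4], int &coarse_parity)
{
  int coord_coarse[4];
  coarse_parity = 0;
  for (int d = 0; d < 4; d++) {
    coord_coarse[d] = coord[d] / geo_bs[d];
    coarse_parity += coord_coarse[d];
  }
  coarse_parity &= 1;
  coord_coarse[0] /= 2; // checkerboard the coarse x dimension
  return ((coord_coarse[3]*xc_size[2] + coord_coarse[2])*xc_size[1] + coord_coarse[1])*(xc_size[0]/2) + coord_coarse[0];
}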
/**
   Fills in the reverse (forwards) links from the computed backwards links: the
   chirality-diagonal blocks copy over directly while the off-diagonal blocks pick
   up a minus sign.
 */
template <typename Float, int nSpin, int nColor, typename Arg>
__device__ __host__ void computeYreverse(Arg &arg, int parity, int x_cb, int ic_c, int jc_c)
{
  auto &Y = arg.Y;

  for (int d = 0; d < 4; d++) {
    for (int s_row = 0; s_row < nSpin; s_row++) {   // spin row
      for (int s_col = 0; s_col < nSpin; s_col++) { // spin column
        if (s_row == s_col)
          Y(d+4,parity,x_cb,s_row,s_col,ic_c,jc_c) =  Y(d,parity,x_cb,s_row,s_col,ic_c,jc_c);
        else
          Y(d+4,parity,x_cb,s_row,s_col,ic_c,jc_c) = -Y(d,parity,x_cb,s_row,s_col,ic_c,jc_c);
      }
    }
  } // dimension
}

template <typename Float, int nSpin, int nColor, typename Arg>
void ComputeYReverseCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.coarseVolumeCB; x_cb++) {
      for (int ic_c = 0; ic_c < nColor; ic_c++) {   // color row
        for (int jc_c = 0; jc_c < nColor; jc_c++) { // color column
          computeYreverse<Float,nSpin,nColor,Arg>(arg, parity, x_cb, ic_c, jc_c);
        }
      }
    } // c/b volume
  }   // parity
}

template <typename Float, int nSpin, int nColor, typename Arg>
__global__ void ComputeYReverseGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.coarseVolumeCB) return;

  int parity_jc_c = blockDim.y*blockIdx.y + threadIdx.y; // parity and color column
  if (parity_jc_c >= 2*nColor) return;
  int parity = parity_jc_c / nColor;
  int jc_c = parity_jc_c % nColor;

  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // color row
  if (ic_c >= nColor) return;

  computeYreverse<Float,nSpin,nColor,Arg>(arg, parity, x_cb, ic_c, jc_c);
}
template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
__device__ __host__ void computeCoarseClover(Arg &arg, int parity, int x_cb, int ic_c, int jc_c)
{
  const int nDim = 4;

  int coord[QUDA_MAX_DIM];
  int coord_coarse[QUDA_MAX_DIM];

  getCoords(coord, x_cb, arg.x_size, parity);
  for (int d = 0; d < nDim; d++) coord_coarse[d] = coord[d]/arg.geo_bs[d];

  int coarse_parity = 0;
  for (int d = 0; d < nDim; d++) coarse_parity += coord_coarse[d];
  coarse_parity &= 1;
  coord_coarse[0] /= 2;
  int coarse_x_cb = ((coord_coarse[3]*arg.xc_size[2]+coord_coarse[2])*arg.xc_size[1]+coord_coarse[1])*(arg.xc_size[0]/2) + coord_coarse[0];

  complex<Float> X[coarseSpin*coarseSpin];
  for (int i = 0; i < coarseSpin*coarseSpin; i++) X[i] = 0.0;

  if (!from_coarse) {
    // on the fine lattice the clover term is chirally blocked, so only couple spins
    // within the same chiral (coarse spin) block
    for (int s = 0; s < fineSpin; s++) { // fine spin row
      const int s_c = arg.spin_map(s, parity); // coarse spin
      for (int s_col = s_c*arg.spin_bs; s_col < (s_c+1)*arg.spin_bs; s_col++) { // fine spin column
        for (int ic = 0; ic < fineColor; ic++) {   // fine color row
          for (int jc = 0; jc < fineColor; jc++) { // fine color column
            X[s_c*coarseSpin + s_c] +=
              conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, jc_c);
          }
        }
      }
    }
  } else {
    // on an already coarsened lattice the clover term is a dense spin matrix
    for (int s = 0; s < fineSpin; s++) {               // spin row
      for (int s_col = 0; s_col < fineSpin; s_col++) { // spin column
        for (int ic = 0; ic < fineColor; ic++) {       // fine color row
          for (int jc = 0; jc < fineColor; jc++) {     // fine color column
            X[s*coarseSpin + s_col] +=
              conj(arg.V(parity, x_cb, s, ic, ic_c)) * arg.C(0, parity, x_cb, s, s_col, ic, jc) * arg.V(parity, x_cb, s_col, jc, jc_c);
          }
        }
      }
    }
  }

  for (int si = 0; si < coarseSpin; si++) {
    for (int sj = 0; sj < coarseSpin; sj++) {
      arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,si,sj,ic_c,jc_c,X[si*coarseSpin+sj]);
    }
  }
}
template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
void ComputeCoarseCloverCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.fineVolumeCB; x_cb++) {
      for (int jc_c = 0; jc_c < coarseColor; jc_c++) {   // coarse color column
        for (int ic_c = 0; ic_c < coarseColor; ic_c++) { // coarse color row
          computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg, parity, x_cb, ic_c, jc_c);
        }
      }
    } // c/b volume
  }   // parity
}

template <bool from_coarse, typename Float, int fineSpin, int coarseSpin, int fineColor, int coarseColor, typename Arg>
__global__ void ComputeCoarseCloverGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.fineVolumeCB) return;

  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
  if (parity_c_col >= 2*coarseColor) return;
  int jc_c = parity_c_col % coarseColor; // coarse color column
  int parity = parity_c_col / coarseColor;

  int ic_c = blockDim.z*blockIdx.z + threadIdx.z; // coarse color row
  if (ic_c >= coarseColor) return;

  computeCoarseClover<from_coarse,Float,fineSpin,coarseSpin,fineColor,coarseColor>(arg, parity, x_cb, ic_c, jc_c);
}
// add the identity to the coarse local term X
template <typename Float, int nSpin, int nColor, typename Arg>
void AddCoarseDiagonalCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.coarseVolumeCB; x_cb++) {
      for (int s = 0; s < nSpin; s++) {
        for (int c = 0; c < nColor; c++) {
          arg.X_atomic(0,parity,x_cb,s,s,c,c) += complex<Float>(1.0,0.0);
        }
      }
    } // c/b volume
  }   // parity
}

template <typename Float, int nSpin, int nColor, typename Arg>
__global__ void AddCoarseDiagonalGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.coarseVolumeCB) return;

  int parity = blockDim.y*blockIdx.y + threadIdx.y;

  for (int s = 0; s < nSpin; s++) {
    for (int c = 0; c < nColor; c++) {
      arg.X_atomic(0,parity,x_cb,s,s,c,c) += complex<Float>(1.0,0.0);
    }
  }
}
// add the twisted-mass term i*mu*mu_factor*gamma_5 to the coarse local term X
template <typename Float, int nSpin, int nColor, typename Arg>
void AddCoarseTmDiagonalCPU(Arg &arg)
{
  const complex<Float> mu(0., arg.mu*arg.mu_factor);

  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.coarseVolumeCB; x_cb++) {
      for (int s = 0; s < nSpin/2; s++) {     // upper chirality
        for (int c = 0; c < nColor; c++) {
          arg.X_atomic(0,parity,x_cb,s,s,c,c) += mu;
        }
      }
      for (int s = nSpin/2; s < nSpin; s++) { // lower chirality
        for (int c = 0; c < nColor; c++) {
          arg.X_atomic(0,parity,x_cb,s,s,c,c) -= mu;
        }
      }
    } // c/b volume
  }   // parity
}

template <typename Float, int nSpin, int nColor, typename Arg>
__global__ void AddCoarseTmDiagonalGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.coarseVolumeCB) return;

  int parity = blockDim.y*blockIdx.y + threadIdx.y;

  const complex<Float> mu(0., arg.mu*arg.mu_factor);

  for (int s = 0; s < nSpin/2; s++) {     // upper chirality
    for (int ic_c = 0; ic_c < nColor; ic_c++) {
      arg.X_atomic(0,parity,x_cb,s,s,ic_c,ic_c) += mu;
    }
  }
  for (int s = nSpin/2; s < nSpin; s++) { // lower chirality
    for (int ic_c = 0; ic_c < nColor; ic_c++) {
      arg.X_atomic(0,parity,x_cb,s,s,ic_c,ic_c) -= mu;
    }
  }
}
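/* Illustrative sketch (not part of the original file): the twisted-mass term added to
   the coarse local operator X is i*mu*mu_factor*gamma_5, which in this chirally
   blocked coarse basis is +i*mu*mu_factor on the diagonal of the first nSpin/2 coarse
   spins and -i*mu*mu_factor on the remaining ones, exactly as AddCoarseTmDiagonal
   applies it.  The helper below returns that diagonal entry for a given coarse spin;
   the name is hypothetical. */
template <typename Float, int nSpin>
inline __device__ __host__ complex<Float> coarse_tm_diagonal_sketch(Float mu, Float mu_factor, int s)
{
  const complex<Float> imu(0.0, mu * mu_factor);
  return (s < nSpin / 2) ? imu : -imu;
}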
/**
   Copies a single coarse-link (or local-term) matrix from the atomic fields used for
   accumulation into the final Y and X fields.
 */
template <typename Float, int nSpin, int nColor, typename Arg>
__device__ __host__ void convert(Arg &arg, int parity, int x_cb, int c_row, int c_col)
{
  if (arg.dim_index < 8) { // gauge link Y

    const auto &in = arg.Y_atomic;
    int d_in = arg.dim_index % in.geometry;
    int d_out = arg.dim_index % arg.Y.geometry;

    for (int s_row = 0; s_row < nSpin; s_row++) {   // spin row
      for (int s_col = 0; s_col < nSpin; s_col++) { // spin column
        complex<Float> M = in(d_in,parity,x_cb,s_row,s_col,c_row,c_col);
        arg.Y(d_out,parity,x_cb,s_row,s_col,c_row,c_col) = M;
      }
    }

  } else { // local term X

    const auto &in = arg.X_atomic;
    int d_in = arg.dim_index % in.geometry;
    int d_out = arg.dim_index % arg.X.geometry;

    for (int s_row = 0; s_row < nSpin; s_row++) {   // spin row
      for (int s_col = 0; s_col < nSpin; s_col++) { // spin column
        complex<Float> M = in(d_in,parity,x_cb,s_row,s_col,c_row,c_col);
        arg.X(d_out,parity,x_cb,s_row,s_col,c_row,c_col) = M;
      }
    }
  }
}

template <typename Float, int nSpin, int nColor, typename Arg>
void ConvertCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.coarseVolumeCB; x_cb++) {
      for (int c_row = 0; c_row < nColor; c_row++) {   // color row
        for (int c_col = 0; c_col < nColor; c_col++) { // color column
          convert<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
        }
      }
    } // c/b volume
  }   // parity
}
template <typename Float, int nSpin, int nColor, typename Arg>
__global__ void ConvertGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.coarseVolumeCB) return;

  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
  if (parity_c_col >= 2*nColor) return;

  int c_col = parity_c_col % nColor; // color column
  int parity = parity_c_col / nColor;

  int c_row = blockDim.z*blockIdx.z + threadIdx.z; // color row
  if (c_row >= nColor) return;

  convert<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
}
/**
   Rescales the matrix elements of the computed coarse link Y by arg.rescale.
 */
template <typename Float, int nSpin, int nColor, typename Arg>
__device__ __host__ void rescaleY(Arg &arg, int parity, int x_cb, int c_row, int c_col)
{
  for (int s_row = 0; s_row < nSpin; s_row++) {   // spin row
    for (int s_col = 0; s_col < nSpin; s_col++) { // spin column
      complex<Float> M = arg.Y(arg.dim_index,parity,x_cb,s_row,s_col,c_row,c_col);
      arg.Y(arg.dim_index,parity,x_cb,s_row,s_col,c_row,c_col) = arg.rescale*M;
    }
  }
}

template <typename Float, int nSpin, int nColor, typename Arg>
void RescaleYCPU(Arg &arg)
{
  for (int parity = 0; parity < 2; parity++) {
#pragma omp parallel for
    for (int x_cb = 0; x_cb < arg.coarseVolumeCB; x_cb++) {
      for (int c_row = 0; c_row < nColor; c_row++) {   // color row
        for (int c_col = 0; c_col < nColor; c_col++) { // color column
          rescaleY<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
        }
      }
    } // c/b volume
  }   // parity
}

template <typename Float, int nSpin, int nColor, typename Arg>
__global__ void RescaleYGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.coarseVolumeCB) return;

  int parity_c_col = blockDim.y*blockIdx.y + threadIdx.y;
  if (parity_c_col >= 2*nColor) return;

  int c_col = parity_c_col % nColor; // color column
  int parity = parity_c_col / nColor;

  int c_row = blockDim.z*blockIdx.z + threadIdx.z; // color row
  if (c_row >= nColor) return;

  rescaleY<Float,nSpin,nColor,Arg>(arg, parity, x_cb, c_row, c_col);
}