template <typename Float, typename PreconditionedGauge, typename Gauge, int n>
struct CalculateYhatArg {
  PreconditionedGauge Yhat;   /** output preconditioned coarse link field */
  const Gauge Y;              /** input coarse link field */
  const Gauge Xinv;           /** input inverse coarse clover field */
  int dim[QUDA_MAX_DIM];      /** lattice dimensions */
  int comm_dim[QUDA_MAX_DIM]; /** which dimensions are partitioned between ranks */
  int nFace;                  /** halo depth */
  const int coarseVolumeCB;   /** checkerboarded coarse-grid volume */

  Float *max_h; /** host pointer to the maximum element (compute_max_only pass) */
  Float *max_d; /** device pointer to the maximum element (compute_max_only pass) */

  CalculateYhatArg(const PreconditionedGauge &Yhat, const Gauge Y, const Gauge Xinv,
                   const int *dim, const int *comm_dim, int nFace) :
    Yhat(Yhat), Y(Y), Xinv(Xinv), nFace(nFace), coarseVolumeCB(Y.VolumeCB()),
    max_h(nullptr), max_d(nullptr)
  {
    for (int i = 0; i < 4; i++) {
      this->comm_dim[i] = comm_dim[i];
      this->dim[i] = dim[i];
    }
  }
};
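// Hypothetical construction sketch (accessor names are illustrative, not from
// the original file). nFace would typically be 1, since coarse links couple
// nearest-neighbour sites only:
//
//   CalculateYhatArg<Float, decltype(yHatAcc), decltype(yAcc), n>
//     arg(yHatAcc, yAcc, xInvAcc, dim, comm_dim, 1);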
template <typename Float>
inline __device__ __host__ void caxpy(const complex<Float> &a, const complex<Float> &x, complex<Float> &y)
{
  // fused complex multiply-accumulate, y += a * x, written out in real arithmetic
  y.x += a.x * x.x;
  y.x -= a.y * x.y;
  y.y += a.y * x.x;
  y.y += a.x * x.y;
}
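// Host-side sanity sketch (hypothetical, not part of the original file):
// caxpy should agree with the overloaded complex expression y += a * x.
inline void test_caxpy()
{
  complex<double> a(1.0, 2.0), x(3.0, -1.0), y(0.0, 0.0);
  caxpy(a, x, y); // y == a * x == (5.0, 5.0)
}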
template <typename Float, int n, bool compute_max_only, typename Arg>
inline __device__ __host__ Float computeYhat(Arg &arg, int d, int x_cb, int parity, int i, int j)
{
  constexpr int nDim = 4;
  int coord[nDim];
  getCoords(coord, x_cb, arg.dim, parity);

  const int ghost_idx = ghostFaceIndex<0, nDim>(coord, arg.dim, d, arg.nFace);

  Float yHatMax = 0.0;

  // backwards link: yHat_{ij} = sum_k Y_{ik} conj(Xinv_{jk}) = (Y Xinv^dagger)_{ij},
  // written to the backwards neighbour, which may live in the ghost zone
  if ( arg.comm_dim[d] && (coord[d] - arg.nFace < 0) ) {

    complex<Float> yHat = 0.0;
    for (int k = 0; k < n; k++) {
      caxpy(arg.Y.Ghost(d, 1-parity, ghost_idx, i, k), conj(arg.Xinv(0, parity, x_cb, j, k)), yHat);
    }
    if (compute_max_only) {
      yHatMax = fmax(fabs(yHat.x), fabs(yHat.y));
    } else {
      arg.Yhat.Ghost(d, 1 - parity, ghost_idx, i, j) = yHat;
    }

  } else {

    const int back_idx = linkIndexM1(coord, arg.dim, d);

    complex<Float> yHat = 0.0;
    for (int k = 0; k < n; k++) {
      caxpy(arg.Y(d, 1-parity, back_idx, i, k), conj(arg.Xinv(0, parity, x_cb, j, k)), yHat);
    }
    if (compute_max_only) {
      yHatMax = fmax(fabs(yHat.x), fabs(yHat.y));
    } else {
      arg.Yhat(d, 1 - parity, back_idx, i, j) = yHat;
    }
  }

  { // forwards link: yHat_{ij} = sum_k Xinv_{ik} Y_{kj} = (Xinv Y)_{ij}, stored locally
    complex<Float> yHat = 0.0;
    for (int k = 0; k < n; k++) {
      caxpy(arg.Xinv(0, parity, x_cb, i, k), arg.Y(d+4, parity, x_cb, k, j), yHat);
    }
    if (compute_max_only) {
      yHatMax = fmax(yHatMax, fmax(fabs(yHat.x), fabs(yHat.y)));
    } else {
      arg.Yhat(d + 4, parity, x_cb, i, j) = yHat;
    }
  }

  return yHatMax;
}
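// Summary of computeYhat, per coarse site x and dimension mu (i, j run over
// the n coarse degrees of freedom): directions 0..3 hold backwards links and
// 4..7 forwards links, so the element-wise products above amount to
//
//   Yhat_mu(x - mu) = Y_mu(x - mu) * Xinv(x)^dagger   (backwards)
//   Yhat_{mu+4}(x)  = Xinv(x) * Y_{mu+4}(x)           (forwards)
//
// with the backwards result landing on the neighbouring site (or in the
// ghost zone when that neighbour belongs to another rank).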
template <typename Float, int n, bool compute_max_only, typename Arg>
void CalculateYhatCPU(Arg &arg)
{
  Float max = 0.0;
  for (int d = 0; d < 4; d++) {
    for (int parity = 0; parity < 2; parity++) {
      // the max reduction clause keeps the compute_max_only update race-free
#pragma omp parallel for reduction(max : max)
      for (int x_cb = 0; x_cb < arg.Y.VolumeCB(); x_cb++) {
        for (int i = 0; i < n; i++)
          for (int j = 0; j < n; j++) {
            Float max_x = computeYhat<Float, n, compute_max_only>(arg, d, x_cb, parity, i, j);
            if (compute_max_only) max = max > max_x ? max : max_x;
          }
      }
    }
  }
  if (compute_max_only) *arg.max_h = max;
}
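// Note (editorial assumption, not from the original file): the
// compute_max_only=true instantiation is presumably run as a first pass so
// the global maximum of |Yhat| is known before the elements are stored,
// e.g. to fix the scale of a fixed-point accessor for Yhat.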
template <typename Float, int n, bool compute_max_only, typename Arg>
__global__ void CalculateYhatGPU(Arg arg)
{
  int x_cb = blockDim.x*blockIdx.x + threadIdx.x;
  if (x_cb >= arg.coarseVolumeCB) return;

  int i_parity = blockDim.y*blockIdx.y + threadIdx.y;
  if (i_parity >= 2*n) return;

  int j_d = blockDim.z*blockIdx.z + threadIdx.z;
  if (j_d >= 4*n) return;

  // unpack the fused indices: y spans (parity, row i), z spans (dimension d, column j)
  int i = i_parity % n;
  int parity = i_parity / n;
  int j = j_d % n;
  int d = j_d / n;

  Float max = computeYhat<Float, n, compute_max_only>(arg, d, x_cb, parity, i, j);
  if (compute_max_only) atomicMax(arg.max_d, max);
}
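// Hypothetical launch sketch (block dimensions are illustrative; in practice
// QUDA's autotuner would choose them). When compute_max_only is true,
// arg.max_d is assumed to point at a device scalar pre-initialized to zero.
template <typename Float, int n, bool compute_max_only, typename Arg>
void launchCalculateYhat(Arg &arg)
{
  const dim3 block(64, 2, 2);
  const dim3 grid((arg.coarseVolumeCB + block.x - 1) / block.x,
                  (2 * n + block.y - 1) / block.y,
                  (4 * n + block.z - 1) / block.z);
  CalculateYhatGPU<Float, n, compute_max_only, Arg><<<grid, block>>>(arg);
}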