5 #define DISABLE_GHOST true // do not rename this (it is both a template parameter and a macro) 15 #define MAX_MATRIX_SIZE 4096 24 template <
typename Rotator,
typename Vector,
int fineSpin,
int spinBlockSize,
int coarseSpin,
int nVec>
38 template <
typename... T>
42 fine_to_coarse(fine_to_coarse),
43 coarse_to_fine(coarse_to_fine),
46 nParity(meta.SiteSubset()),
47 nBlockOrtho(n_block_ortho),
50 const Vector Btmp[nVec]{*B...};
52 memcpy(B_array_h, (
void *)Btmp,
sizeof(Btmp));
54 for (
int d = 0; d < V.
Ndim(); d++) geoBlockSize *= geo_bs[d];
55 geoBlockSizeCB = geoBlockSize/2;
56 coarseVolume = meta.
Volume() / geoBlockSize;
58 if (nParity != 2)
errorQuda(
"BlockOrtho only presently supports full fields");
62 template <
int nColor,
typename sumType,
typename real>
64 complex<real> w[nColor])
67 for (
int c = 0; c <
nColor; c++) {
68 dot.x += w[c].real() * v[c].real();
69 dot.x += w[c].imag() * v[c].imag();
70 dot.y += w[c].real() * v[c].imag();
71 dot.y -= w[c].imag() * v[c].real();
75 template <
int nColor,
typename sumType,
typename real>
79 for (
int c = 0; c <
nColor; c++) {
80 nrm += v[c].real() * v[c].real();
81 nrm += v[c].imag() * v[c].imag();
85 template <
typename real,
int nColor>
89 for (
int c = 0; c <
nColor; c++) {
90 v[c].x -= a.real() * w[c].real();
91 v[c].x += a.imag() * w[c].imag();
92 v[c].y -= a.real() * w[c].imag();
93 v[c].y -= a.imag() * w[c].real();
97 template <
typename real,
int nColor>
inline __device__ __host__
void colorScale(complex<real> v[
nColor], real a)
100 for (
int c=0; c<
nColor; c++) v[c] *= a;
103 #ifndef __CUDACC_RTC___ 104 template <
typename sumFloat,
typename Float,
int nSpin,
int spinBlockSize,
int nColor,
int coarseSpin,
int nVec,
typename Arg>
108 #pragma omp parallel for 109 for (
int x_coarse=0; x_coarse<arg.coarseVolume; x_coarse++) {
112 for (
int n = 0; n < arg.nBlockOrtho; n++) {
113 for (
int j = 0; j < nVec; j++) {
115 for (
int i = 0; i < j; i++) {
119 complex<sumFloat>
dot[coarseSpin];
120 for (
int s = 0;
s < coarseSpin;
s++) dot[
s] = 0.0;
124 for (
int b = 0; b < arg.geoBlockSizeCB; b++) {
126 int x = arg.coarse_to_fine[(x_coarse * 2 +
parity) * arg.geoBlockSizeCB + b];
127 int x_cb = x -
parity * arg.fineVolumeCB;
129 complex<Float> v[nSpin][
nColor];
132 for (
int s = 0;
s < nSpin;
s++)
133 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.B[j](
parity, x_cb,
s, c);
135 for (
int s = 0;
s < nSpin;
s++)
136 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.
V(
parity, x_cb,
s, c, j);
139 for (
int s = 0;
s < nSpin;
s++) {
140 complex<Float> vis[
nColor];
141 for (
int c = 0; c <
nColor; c++) vis[c] = arg.
V(
parity, x_cb,
s, c, i);
142 colorInnerProduct<nColor>(dot[arg.spin_map(
s,
parity)], i, v[
s], vis);
151 for (
int b = 0; b < arg.geoBlockSizeCB; b++) {
153 int x = arg.coarse_to_fine[(x_coarse * 2 +
parity) * arg.geoBlockSizeCB + b];
154 int x_cb = x -
parity * arg.fineVolumeCB;
156 complex<Float> v[nSpin][
nColor];
158 for (
int s = 0;
s < nSpin;
s++)
159 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.B[j](
parity, x_cb,
s, c);
161 for (
int s = 0;
s < nSpin;
s++)
162 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.
V(
parity, x_cb,
s, c, j);
164 for (
int s = 0;
s < nSpin;
s++) {
165 complex<Float> vis[
nColor];
166 for (
int c = 0; c <
nColor; c++) vis[c] = arg.
V(
parity, x_cb,
s, c, i);
167 colorScaleSubtract<Float, nColor>(v[
s],
static_cast<complex<Float>
>(dot[arg.spin_map(
s,
parity)]), vis);
170 for (
int s = 0;
s < nSpin;
s++)
171 for (
int c = 0; c <
nColor; c++) arg.
V(
parity, x_cb,
s, c, j) = v[
s][c];
177 sumFloat nrm[coarseSpin] = {};
181 for (
int b = 0; b < arg.geoBlockSizeCB; b++) {
183 int x = arg.coarse_to_fine[(x_coarse * 2 +
parity) * arg.geoBlockSizeCB + b];
184 int x_cb = x -
parity * arg.fineVolumeCB;
186 complex<Float> v[nSpin][
nColor];
188 for (
int s = 0;
s < nSpin;
s++)
189 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.B[j](
parity, x_cb,
s, c);
191 for (
int s = 0;
s < nSpin;
s++)
192 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.
V(
parity, x_cb,
s, c, j);
194 for (
int s = 0;
s < nSpin;
s++) { colorNorm<nColor>(nrm[arg.spin_map(
s,
parity)], v[
s]); }
198 for (
int s = 0; s < coarseSpin; s++) nrm[s] = nrm[s] > 0.0 ? rsqrt(nrm[
s]) : 0.0;
203 for (
int b = 0; b < arg.geoBlockSizeCB; b++) {
205 int x = arg.coarse_to_fine[(x_coarse * 2 +
parity) * arg.geoBlockSizeCB + b];
206 int x_cb = x -
parity * arg.fineVolumeCB;
208 complex<Float> v[nSpin][
nColor];
210 for (
int s = 0; s < nSpin; s++)
211 for (
int c = 0; c <
nColor; c++) v[s][c] = arg.B[j](
parity, x_cb, s, c);
213 for (
int s = 0; s < nSpin; s++)
214 for (
int c = 0; c <
nColor; c++) v[s][c] = arg.
V(
parity, x_cb, s, c, j);
216 for (
int s = 0; s < nSpin; s++) { colorScale<Float, nColor>(v[
s], nrm[arg.spin_map(s,
parity)]); }
218 for (
int s = 0; s < nSpin; s++)
219 for (
int c = 0; c <
nColor; c++) arg.
V(
parity, x_cb, s, c, j) = v[
s][c];
231 template <
int block_size,
typename sumFloat,
typename Float,
int nSpin,
int spinBlockSize,
int nColor,
int coarseSpin,
232 int nVec,
typename Arg>
236 int x_coarse = blockIdx.x;
239 const int gridp = gridDim.x - gridDim.x % arg.swizzle;
241 if (blockIdx.x < gridp) {
243 const int i = blockIdx.x % arg.swizzle;
244 const int j = blockIdx.x / arg.swizzle;
247 x_coarse = i * (gridp / arg.swizzle) + j;
250 int parity = (arg.
nParity == 2) ? threadIdx.y + blockIdx.y*blockDim.y : arg.parity;
251 int x = arg.coarse_to_fine[ (x_coarse*2 + parity) * blockDim.x + threadIdx.x];
252 int x_cb = x - parity*arg.fineVolumeCB;
253 if (x_cb >= arg.fineVolumeCB)
return;
254 int chirality = blockIdx.z;
256 constexpr
int spinBlock = nSpin / coarseSpin;
257 typedef cub::BlockReduce<complex<sumFloat>, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> dotReduce;
258 typedef cub::BlockReduce<sumFloat, block_size, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 2> normReduce;
260 __shared__
typename dotReduce::TempStorage dot_storage;
261 typename normReduce::TempStorage *norm_storage = (
typename normReduce::TempStorage *)&dot_storage;
262 complex<sumFloat> *
dot_ = (complex<sumFloat> *)&dot_storage;
263 sumFloat *nrm_ = (sumFloat *)&dot_storage;
266 typedef typename std::remove_reference<decltype(*arg.B)>::type
Vector;
267 const Vector *
B =
reinterpret_cast<const Vector *
>(
B_array_d);
270 for (
int n = 0; n < arg.nBlockOrtho; n++) {
271 for (
int j = 0; j < nVec; j++) {
273 complex<Float> v[spinBlock][
nColor];
276 for (
int s = 0;
s < spinBlock;
s++)
278 for (
int c = 0; c <
nColor; c++) v[
s][c] = B[j](parity, x_cb, chirality * spinBlock +
s, c);
281 for (
int s = 0;
s < spinBlock;
s++)
283 for (
int c = 0; c <
nColor; c++) v[
s][c] = arg.
V(parity, x_cb, chirality * spinBlock +
s, c, j);
286 for (
int i = 0; i < j; i++) {
288 complex<Float>
dot = 0.0;
291 complex<Float> vi[spinBlock][
nColor];
293 for (
int s = 0;
s < spinBlock;
s++)
295 for (
int c = 0; c <
nColor; c++) vi[
s][c] = arg.
V(parity, x_cb, chirality * spinBlock +
s, c, i);
298 for (
int s = 0;
s < spinBlock;
s++) { colorInnerProduct<nColor>(
dot, i, v[
s], vi[
s]); }
301 dot = dotReduce(dot_storage).Sum(dot);
302 if (threadIdx.x == 0 && threadIdx.y == 0) *dot_ =
dot;
308 for (
int s = 0;
s < spinBlock;
s++) { colorScaleSubtract<Float, nColor>(v[
s],
dot, vi[
s]); }
313 sumFloat nrm =
static_cast<sumFloat
>(0.0);
316 for (
int s = 0;
s < spinBlock;
s++) { colorNorm<nColor>(nrm, v[
s]); }
319 nrm = normReduce(*norm_storage).Sum(nrm);
320 if (threadIdx.x == 0 && threadIdx.y == 0) *nrm_ = nrm;
324 nrm = nrm > 0.0 ? rsqrt(nrm) : 0.0;
327 for (
int s = 0;
s < spinBlock;
s++) { colorScale<Float, nColor>(v[
s], nrm); }
330 for (
int s = 0;
s < spinBlock;
s++)
332 for (
int c = 0; c <
nColor; c++) arg.
V(parity, x_cb, chirality * spinBlock +
s, c, j) = v[
s][c];
BlockOrthoArg(ColorSpinorField &V, const int *fine_to_coarse, const int *coarse_to_fine, int parity, const int *geo_bs, const int n_block_ortho, const ColorSpinorField &meta, T... B)
const spin_mapper< fineSpin, coarseSpin > spin_map
void blockOrthoCPU(Arg &arg)
__device__ __host__ void colorNorm(sumType &nrm, complex< real > v[nColor])
const int * fine_to_coarse
static __constant__ signed char B_array_d[MAX_MATRIX_SIZE]
__device__ __host__ void colorScaleSubtract(complex< real > v[nColor], complex< real > a, complex< real > w[nColor])
const int * coarse_to_fine
__device__ __host__ void colorScale(complex< real > v[nColor], real a)
static signed char B_array_h[MAX_MATRIX_SIZE]
__device__ __host__ void colorInnerProduct(complex< sumType > &dot, int i, complex< real > v[nColor], complex< real > w[nColor])
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
colorspinor::FieldOrderCB< real, Ns, Nc, 1, order > V
int n_block_ortho[QUDA_MAX_MG_LEVEL]
__launch_bounds__(2 *block_size) __global__ void blockOrthoGPU(Arg arg)
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
static void dot(sFloat *res, gFloat *a, sFloat *b)