14 template<
typename real,
int nSpin,
int nColor,
int nVec, QudaFieldOrder order>
21 FillVArg(ColorSpinorField &
V,
const std::vector<ColorSpinorField*> &B,
int v)
22 :
V(
V), B(*(B[v])), v(v) { }
27 template <
typename Float,
int nSpin,
int nColor,
int nVec,
typename Arg>
28 void FillVCPU(Arg &
arg,
int v) {
31 for (
int x_cb=0; x_cb<
arg.V.VolumeCB(); x_cb++) {
32 for (
int s=0;
s<nSpin;
s++) {
43 template <
typename Float,
int nSpin,
int nColor,
int nVec,
typename Arg>
44 __global__
void FillVGPU(Arg
arg,
int v) {
46 int x_cb = threadIdx.x +
blockDim.x*blockIdx.x;
49 for (
int s=0;
s<nSpin;
s++) {
57 template <
typename real,
int nSpin,
int nColor,
int nVec>
58 class FillVLaunch :
public TunableVectorY {
61 const std::vector<ColorSpinorField*> &B;
63 unsigned int minThreads()
const {
return V.VolumeCB(); }
66 FillVLaunch(ColorSpinorField &
V,
const std::vector<ColorSpinorField*> &B,
const int v)
67 : TunableVectorY(2),
V(
V), B(B), v(v) {
70 virtual ~FillVLaunch() { }
72 void apply(
const cudaStream_t &
stream) {
76 FillVArg<real,nSpin,nColor,nVec,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER>
arg(
V,B,v);
77 FillVCPU<real,nSpin,nColor,nVec>(
arg,v);
79 errorQuda(
"Field order not implemented %d",
V.FieldOrder());
83 FillVArg<real,nSpin,nColor,nVec,QUDA_FLOAT2_FIELD_ORDER>
arg(
V,B,v);
84 FillVGPU<real,nSpin,nColor,nVec> <<<tp.grid,tp.block,tp.shared_bytes>>>(
arg,v);
86 errorQuda(
"Field order not implemented %d",
V.FieldOrder());
91 bool advanceTuneParam(TuneParam &
param)
const {
return false; }
93 TuneKey tuneKey()
const {
return TuneKey(
V.VolString(),
typeid(*this).name(), aux); }
95 long long flops()
const {
return 0; }
96 long long bytes()
const {
return 2*
V.Bytes(); }
100 template <
typename real,
int nSpin,
int nColor,
int nVec>
101 void FillV(ColorSpinorField &
V,
const std::vector<ColorSpinorField*> &B) {
102 for (
int v=0; v<nVec; v++) {
103 FillVLaunch<real,nSpin,nColor,nVec>
f(
V,B,v);
109 template <
typename Float,
int nSpin,
int nColor>
110 void FillV(ColorSpinorField &
V,
const std::vector<ColorSpinorField*> &B,
int Nvec) {
112 FillV<Float,nSpin,nColor,2>(
V,B);
113 }
else if (Nvec == 4) {
114 FillV<Float,nSpin,nColor,4>(
V,B);
115 }
else if (Nvec == 8) {
116 FillV<Float,nSpin,nColor,8>(
V,B);
117 }
else if (Nvec == 12) {
118 FillV<Float,nSpin,nColor,12>(
V,B);
119 }
else if (Nvec == 16) {
120 FillV<Float,nSpin,nColor,16>(
V,B);
121 }
else if (Nvec == 20) {
122 FillV<Float,nSpin,nColor,20>(
V,B);
123 }
else if (Nvec == 24) {
124 FillV<Float,nSpin,nColor,24>(
V,B);
125 }
else if (Nvec == 32) {
126 FillV<Float,nSpin,nColor,32>(
V,B);
127 }
else if (Nvec == 48) {
128 FillV<Float,nSpin,nColor,48>(
V,B);
134 template <
typename Float,
int nSpin>
135 void FillV(ColorSpinorField &
V,
const std::vector<ColorSpinorField*> &B,
int Nvec) {
138 if (B[0]->
Ncolor() == 2) {
139 FillV<Float,nSpin,2>(
V,B,Nvec);
140 }
else if(B[0]->
Ncolor() == 3) {
141 FillV<Float,nSpin,3>(
V,B,Nvec);
142 }
else if(B[0]->
Ncolor() == 8) {
143 FillV<Float,nSpin,8>(
V,B,Nvec);
144 }
else if(B[0]->
Ncolor() == 16) {
145 FillV<Float,nSpin,16>(
V,B,Nvec);
146 }
else if(B[0]->
Ncolor() == 24) {
147 FillV<Float,nSpin,24>(
V,B,Nvec);
148 }
else if(B[0]->
Ncolor() == 32) {
149 FillV<Float,nSpin,32>(
V,B,Nvec);
155 template <
typename Float>
156 void FillV(ColorSpinorField &
V,
const std::vector<ColorSpinorField*> &B,
int Nvec) {
157 if (
V.Nspin() == 4) {
158 FillV<Float,4>(
V,B,Nvec);
159 }
else if (
V.Nspin() == 2) {
160 FillV<Float,2>(
V,B,Nvec);
161 #ifdef GPU_STAGGERED_DIRAC 162 }
else if (
V.Nspin() == 1) {
163 FillV<Float,1>(
V,B,Nvec);
170 #endif // GPU_MULTIGRID 176 #ifdef GPU_MULTIGRID_DOUBLE 177 FillV<double>(
V,B,Nvec);
179 errorQuda(
"Double precision multigrid has not been enabled");
182 FillV<float>(
V,B,Nvec);
184 errorQuda(
"Unsupported precision %d",
V.Precision());
187 errorQuda(
"Multigrid has not been built");
195 template <
bool toBlock,
int nVec,
class Complex,
class FieldOrder>
197 const int *geo_map,
const int *geo_bs,
int spin_bs,
198 const cpuColorSpinorField &
V) {
200 int nSpin_coarse =
in.Nspin() / spin_bs;
203 int geoBlockSize = 1;
204 for (
int d=0;
d<
in.Ndim();
d++) geoBlockSize *= geo_bs[
d];
205 int blockSize = geoBlockSize *
in.Ncolor() * spin_bs;
216 for (
int x_cb=0; x_cb<
in.VolumeCB(); x_cb++) {
220 V.LatticeIndex(
x,
i);
225 for (
int d=
in.Ndim()-1;
d>=0;
d--) {
226 y[
d] =
x[
d]%geo_bs[
d];
227 blockOffset *= geo_bs[
d];
232 int offset = geo_map[
i]*nSpin_coarse*nVec*geoBlockSize*
in.Ncolor()*spin_bs;
234 for (
int v=0; v<
in.Nvec(); v++) {
235 for (
int s=0;
s<
in.Nspin();
s++) {
236 for (
int c=0;
c<
in.Ncolor();
c++) {
238 int chirality =
s / spin_bs;
239 int blockSpin =
s % spin_bs;
242 chirality * nVec * geoBlockSize * spin_bs *
in.Ncolor() +
243 v * geoBlockSize * spin_bs *
in.Ncolor() +
244 blockOffset * spin_bs *
in.Ncolor() +
245 blockSpin*
in.Ncolor() +
261 errorQuda(
"Number of elements packed %d does not match expected value %d nvec=%d nspin=%d ncolor=%d",
279 template <
bool toBlock,
int nVec,
class Complex,
class FieldOrder>
281 const int *geo_map,
const int *geo_bs,
int spin_bs,
282 const cpuColorSpinorField &
V) {
284 int geoBlockSize = 1;
285 for (
int d=0;
d<
in.Ndim();
d++) geoBlockSize *= geo_bs[
d];
297 for (
int x_cb=0; x_cb<
in.VolumeCB(); x_cb++) {
301 V.LatticeIndex(
x,
i);
306 for (
int d=
in.Ndim()-1;
d>=0;
d--) {
307 y[
d] =
x[
d]%geo_bs[
d];
308 blockOffset *= geo_bs[
d];
314 int offset = geo_map[
i]*nVec*geoBlockSize*
in.Ncolor();
318 for (
int v=0; v<
in.Nvec(); v++) {
319 for (
int c=0;
c<
in.Ncolor();
c++) {
321 int chirality = (
x[0]+
x[1]+
x[2]+
x[3])%2;
324 chirality * nVec * geoBlockSize *
in.Ncolor() +
325 v * geoBlockSize *
in.Ncolor() +
326 blockOffset *
in.Ncolor() +
341 errorQuda(
"Number of elements packed %d does not match expected value %d nvec=%d ncolor=%d",
354 template <
typename sumFloat,
typename Float,
int N>
355 void blockGramSchmidt(complex<Float> *v,
int nBlocks,
int blockSize) {
357 for (
int b=0;
b<nBlocks;
b++) {
358 for (
int jc=0; jc<N; jc++) {
360 for (
int ic=0; ic<jc; ic++) {
362 complex<Float>
dot = 0.0;
375 sumFloat scale = nrm2 > 0.0 ? 1.0/
sqrt(nrm2) : 0.0;
392 template <
typename sumType,
typename real,
int N>
393 class BlockGramSchmidt :
public Tunable {
398 const ColorSpinorField &meta;
400 unsigned int sharedBytesPerThread()
const {
return 0; }
401 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
404 BlockGramSchmidt(complex<real> *v,
int nBlock,
int blockSize,
const ColorSpinorField &meta)
410 virtual ~BlockGramSchmidt() { }
412 void apply(
const cudaStream_t &
stream) {
415 blockGramSchmidt<sumType, real, N>(v, nBlock,
blockSize);
421 bool advanceTuneParam(TuneParam &
param)
const {
return false; }
423 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
425 long long flops()
const {
return nBlock * N * ((N-1) * (8l + 8l) + 2l) *
blockSize; }
426 long long bytes()
const {
return 2*meta.Bytes(); }
429 template <
bool toBlock,
int N,
typename real,
typename Order>
430 class BlockOrderV :
public Tunable {
432 complex<real> *vBlock;
437 const ColorSpinorField &
V;
439 unsigned int sharedBytesPerThread()
const {
return 0; }
440 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
443 BlockOrderV(complex<real> *vBlock, Order &vOrder,
const int *geo_map,
const int *geo_bs,
int spin_bs,
const ColorSpinorField &
V)
444 : vBlock(vBlock), vOrder(vOrder), geo_map(geo_map), geo_bs(geo_bs), spin_bs(spin_bs),
V(
V) {
448 virtual ~BlockOrderV() { }
450 void apply(
const cudaStream_t &
stream) {
453 blockOrderV<toBlock,N,complex<real>,Order>(vBlock,vOrder,geo_map,geo_bs,spin_bs,
V);
459 bool advanceTuneParam(TuneParam &
param)
const {
return false; }
461 TuneKey tuneKey()
const {
return TuneKey(
V.VolString(),
typeid(*this).name(), aux); }
463 long long flops()
const {
return 0; }
464 long long bytes()
const {
return 2*
V.Bytes(); }
473 template <
typename Out,
typename In,
typename Rotator,
int fineSpin,
int coarseSpin>
474 struct BlockOrthoArg {
476 const int *fine_to_coarse;
477 const int *coarse_to_fine;
478 const spin_mapper<fineSpin,coarseSpin> spin_map;
483 BlockOrthoArg(Rotator &
V,
const int *fine_to_coarse,
const int *coarse_to_fine,
484 int parity,
const ColorSpinorField &meta) :
485 out(
out),
in(
in),
V(
V), fine_to_coarse(fine_to_coarse), coarse_to_fine(coarse_to_fine),
486 spin_map(),
parity(
parity), nParity(meta.SiteSubset()), swizzle(1)
489 BlockOrthoArg(
const BlockOrthoArg<Out,In,Rotator,fineSpin,coarseSpin> &
arg) :
491 fine_to_coarse(
arg.fine_to_coarse), coarse_to_fine(
arg.coarse_to_fine), spin_map(),
496 template <
typename Float,
int nVec,
int fineSpin,
int coarseSpin,
typename Arg>
497 void BlockOrtho(Arg &
arg) {
499 constexpr spinBlocks = fineSpin / coarseSpin;
501 for (
int b=0;
b<nBlocks;
b++) {
502 for (
int s=0;
s<spinBlocks;
s++) {
504 for (
int k=0; k<nVec; k++) {
506 for (
int l=0; l<k; l++) {
507 complex<Float>
dot = 0.0;
520 for (
int parity_coarse=0; parity_coarse<2; parity_coarse++)
521 for (
int x_coarse_cb=0; x_coarse_cb<
arg.out.VolumeCB(); x_coarse_cb++)
522 for (
int s=0;
s<coarseSpin;
s++)
523 for (
int c=0;
c<coarseColor;
c++)
524 arg.out(parity_coarse, x_coarse_cb,
s,
c) = 0.0;
530 for (
int x_cb=0; x_cb<
arg.in.VolumeCB(); x_cb++) {
533 int x_coarse =
arg.fine_to_coarse[
x];
534 int parity_coarse = (x_coarse >=
arg.out.VolumeCB()) ? 1 : 0;
535 int x_coarse_cb = x_coarse - parity_coarse*
arg.out.VolumeCB();
537 for (
int coarse_color_block=0; coarse_color_block<coarseColor; coarse_color_block+=coarse_colors_per_thread) {
538 complex<Float>
tmp[fineSpin*coarse_colors_per_thread];
539 rotateCoarseColor<Float,fineSpin,fineColor,coarseColor,coarse_colors_per_thread>
542 for (
int s=0;
s<fineSpin;
s++) {
543 for (
int coarse_color_local=0; coarse_color_local<coarse_colors_per_thread; coarse_color_local++) {
544 int c = coarse_color_block + coarse_color_local;
545 arg.out(parity_coarse,x_coarse_cb,
arg.spin_map(
s),
c) +=
tmp[
s*coarse_colors_per_thread+coarse_color_local];
556 template<
typename Float,
int nSpin,
int nColor,
int nVec>
557 void BlockOrthogonalize(ColorSpinorField &
V,
const int *geo_bs,
const int *geo_map,
int spin_bs) {
558 complex<Float> *Vblock =
new complex<Float>[
V.Volume()*
V.Nspin()*
V.Ncolor()];
564 VectorField vOrder(const_cast<ColorSpinorField&>(
V));
566 int geo_blocksize = 1;
567 for (
int d = 0;
d <
V.Ndim();
d++) geo_blocksize *= geo_bs[
d];
569 int blocksize = geo_blocksize * vOrder.Ncolor() * spin_bs;
570 int chiralBlocks = (
V.Nspin() == 1) ? 2 : vOrder.Nspin() / spin_bs;
571 int numblocks = (
V.Volume()/geo_blocksize) * chiralBlocks;
572 if (
V.Nspin() == 1) blocksize /= chiralBlocks;
574 printfQuda(
"Block Orthogonalizing %d blocks of %d length and width %d\n", numblocks, blocksize, nVec);
577 BlockOrthoArg<>
arg(
V);
582 BlockOrderV<true,nVec,Float,VectorField> reorder(Vblock, vOrder, geo_map, geo_bs, spin_bs,
V);
585 BlockGramSchmidt<double,Float,nVec> ortho(Vblock, numblocks, blocksize,
V);
588 BlockOrderV<false,nVec,Float,VectorField> reset(Vblock, vOrder, geo_map, geo_bs, spin_bs,
V);
594 errorQuda(
"Unsupported field order %d\n",
V.FieldOrder());
599 template<
typename Float,
int nSpin,
int nColor>
600 void BlockOrthogonalize(ColorSpinorField &
V,
int Nvec,
const int *geo_bs,
const int *geo_map,
int spin_bs) {
602 BlockOrthogonalize<Float,nSpin,nColor,2>(
V, geo_bs, geo_map, spin_bs);
603 }
else if (Nvec == 4) {
604 BlockOrthogonalize<Float,nSpin,nColor,4>(
V, geo_bs, geo_map, spin_bs);
605 }
else if (Nvec == 8) {
606 BlockOrthogonalize<Float,nSpin,nColor,8>(
V, geo_bs, geo_map, spin_bs);
607 }
else if (Nvec == 12) {
608 BlockOrthogonalize<Float,nSpin,nColor,12>(
V, geo_bs, geo_map, spin_bs);
609 }
else if (Nvec == 16) {
610 BlockOrthogonalize<Float,nSpin,nColor,16>(
V, geo_bs, geo_map, spin_bs);
611 }
else if (Nvec == 20) {
612 BlockOrthogonalize<Float,nSpin,nColor,20>(
V, geo_bs, geo_map, spin_bs);
613 }
else if (Nvec == 24) {
614 BlockOrthogonalize<Float,nSpin,nColor,24>(
V, geo_bs, geo_map, spin_bs);
615 }
else if (Nvec == 32) {
616 BlockOrthogonalize<Float,nSpin,nColor,32>(
V, geo_bs, geo_map, spin_bs);
617 }
else if (Nvec == 48) {
618 BlockOrthogonalize<Float,nSpin,nColor,48>(
V, geo_bs, geo_map, spin_bs);
620 errorQuda(
"Unsupported nVec %d\n", Nvec);
624 template<
typename Float,
int nSpin>
626 const int *geo_bs,
const int *geo_map,
int spin_bs) {
627 if (
V.Ncolor()/Nvec == 3) {
628 BlockOrthogonalize<Float,nSpin,3>(
V, Nvec, geo_bs, geo_map, spin_bs);
629 }
else if (
V.Ncolor()/Nvec == 2) {
630 BlockOrthogonalize<Float,nSpin,2>(
V, Nvec, geo_bs, geo_map, spin_bs);
631 }
else if (
V.Ncolor()/Nvec == 8) {
632 BlockOrthogonalize<Float,nSpin,8>(
V, Nvec, geo_bs, geo_map, spin_bs);
633 }
else if (
V.Ncolor()/Nvec == 16) {
634 BlockOrthogonalize<Float,nSpin,16>(
V, Nvec, geo_bs, geo_map, spin_bs);
635 }
else if (
V.Ncolor()/Nvec == 24) {
636 BlockOrthogonalize<Float,nSpin,24>(
V, Nvec, geo_bs, geo_map, spin_bs);
637 }
else if (
V.Ncolor()/Nvec == 32) {
638 BlockOrthogonalize<Float,nSpin,32>(
V, Nvec, geo_bs, geo_map, spin_bs);
639 }
else if (
V.Ncolor()/Nvec == 48) {
640 BlockOrthogonalize<Float,nSpin,48>(
V, Nvec, geo_bs, geo_map, spin_bs);
643 errorQuda(
"Unsupported nColor %d\n",
V.Ncolor()/Nvec);
647 template<
typename Float>
649 const int *geo_bs,
const int *geo_map,
int spin_bs) {
650 if (
V.Nspin() == 4) {
651 BlockOrthogonalize<Float,4>(
V, Nvec, geo_bs, geo_map, spin_bs);
652 }
else if(
V.Nspin() ==2) {
653 BlockOrthogonalize<Float,2>(
V, Nvec, geo_bs, geo_map, spin_bs);
654 }
else if (
V.Nspin() == 1) {
655 BlockOrthogonalize<Float,1>(
V, Nvec, geo_bs, geo_map, 1);
658 errorQuda(
"Unsupported nSpin %d\n",
V.Nspin());
662 #endif // GPU_MULTIGRID 665 const int *geo_bs,
const int *geo_map,
int spin_bs) {
668 #ifdef GPU_MULTIGRID_DOUBLE 669 BlockOrthogonalize<double>(
V, Nvec, geo_bs, geo_map, spin_bs);
671 errorQuda(
"Double precision multigrid has not been enabled");
674 BlockOrthogonalize<float>(
V, Nvec, geo_bs, geo_map, spin_bs);
676 errorQuda(
"Unsupported precision %d\n",
V.Precision());
679 errorQuda(
"Multigrid has not been built");
680 #endif // GPU_MULTIGRID
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
__host__ __device__ ValueType sqrt(ValueType x)
enum QudaFieldOrder_s QudaFieldOrder
std::complex< double > Complex
cudaColorSpinorField * tmp
void checkLength(const ColorSpinorField &a, ColorSpinorField &b)
char * strcpy(char *__dst, const char *__src)
void FillV(ColorSpinorField &V, const std::vector< ColorSpinorField *> &B, int Nvec)
char * index(const char *, int)
for(int s=0;s< param.dc.Ls;s++)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
int int int enum cudaChannelFormatKind f
cpuColorSpinorField * out
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
void BlockOrthogonalize(ColorSpinorField &V, int Nvec, const int *geo_bs, const int *fine_to_coarse, int spin_bs)
Block orthogonnalize the matrix field, where the blocks are defined by lookup tables that map the fin...
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
static void dot(sFloat *res, gFloat *a, sFloat *b)