#if __cplusplus >= 201402L
#include <integer_sequence.hpp> // std::index_sequence support, used below to unpack the B vectors
#endif

  template <typename sumType, typename vFloat, typename bFloat, int nSpin, int spinBlockSize, int nColor_,
            int coarseSpin, int nVec>
  class BlockOrtho : public Tunable {

    // fixed-point storage of the B fields is only instantiated on the fine grid, where nColor = 3;
    // otherwise the effective color count matches the template parameter
    static constexpr int nColor = isFixed<bFloat>::value ? 3 : nColor_;

    typedef typename mapper<vFloat>::type RegType; // register type used for accumulation
    ColorSpinorField &V;                           // packed null-space vectors being orthogonalized
    const std::vector<ColorSpinorField*> B;        // the individual null-space vectors
    const int *fine_to_coarse;                     // fine-site to coarse-block lookup table
    const int *coarse_to_fine;                     // coarse-block to fine-site lookup table
    const int *geo_bs;                             // geometric block size in each dimension
    const int n_block_ortho;                       // number of block-orthogonalization passes
    int geoBlockSize;                              // aggregate geometric block volume
    int nBlock;                                    // total number of blocks (geometric x chiral)
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
    unsigned int minThreads() const { return V.VolumeCB(); }
  public:
    BlockOrtho(ColorSpinorField &V, const std::vector<ColorSpinorField *> B, const int *fine_to_coarse,
               const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho) :
      V(V),
      B(B),
      fine_to_coarse(fine_to_coarse),
      coarse_to_fine(coarse_to_fine),
      geo_bs(geo_bs),
      n_block_ortho(n_block_ortho)
    {
      if (nColor_ != nColor)
        errorQuda("Number of colors %d not supported with this precision %lu\n", nColor_, sizeof(bFloat));

#ifdef JITIFY
      create_jitify_program("kernels/block_orthogonalize.cuh");
#endif

      strcpy(aux, compile_type_str(V));
      strcat(aux, V.AuxString());
      strcat(aux, ",block_size=");

      char geo_str[16];
      geoBlockSize = 1;
      for (int d = 0; d < V.Ndim(); d++) {
        geoBlockSize *= geo_bs[d];
        i32toa(geo_str, geo_bs[d]);
        strcat(aux, geo_str);
        if (d < V.Ndim() - 1) strcat(aux, "x");
      }

      strcat(aux, ",n_block_ortho=");
      char n_ortho_str[8];
      i32toa(n_ortho_str, n_block_ortho);
      strcat(aux, n_ortho_str);

      if (V.Location() == QUDA_CPU_FIELD_LOCATION) strcat(aux, getOmpThreadStr());

      // staggered fields (nSpin == 1) are always treated as having two chiral blocks
      int chiralBlocks = (nSpin == 1) ? 2 : V.Nspin() / spinBlockSize;
      nBlock = (V.Volume() / geoBlockSize) * chiralBlocks;
    }

    virtual ~BlockOrtho() { }
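    /**
       @brief Expand the std::vector of B fields into a parameter pack via the index_sequence, so the
       field accessors can be built into the argument struct, and run the CPU variant of the block
       orthogonalization.
     */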
    template <typename Rotator, typename Vector, std::size_t... S>
    void CPU(const std::vector<ColorSpinorField*> &B, std::index_sequence<S...>) {
      typedef BlockOrthoArg<Rotator,Vector,nSpin,spinBlockSize,coarseSpin,nVec> Arg;
      // the BlockOrthoArg constructor (kernels/block_orthogonalize.cuh) consumes V, the lookup tables,
      // the blocking parameters and the unpacked null-space vectors B[S]...
      Arg arg(V, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho, B[S]...);
      blockOrthoCPU<sumType,RegType,nSpin,spinBlockSize,nColor,coarseSpin,nVec,Arg>(arg);
    }
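    /**
       @brief As above, but for the GPU variant: expand the B vector into a parameter pack, copy the
       host-side B array to device constant memory, and launch the block-orthogonalization kernel with
       the tuned launch configuration.
     */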
    template <typename Rotator, typename Vector, std::size_t... S>
    void GPU(const TuneParam &tp, const cudaStream_t &stream, const std::vector<ColorSpinorField*> &B,
             std::index_sequence<S...>) {
      typedef typename mapper<vFloat>::type RegType;
      typedef BlockOrthoArg<Rotator,Vector,nSpin,spinBlockSize,coarseSpin,nVec> Arg;
      // the BlockOrthoArg constructor (kernels/block_orthogonalize.cuh) consumes V, the lookup tables,
      // the blocking parameters and the unpacked null-space vectors B[S]...
      Arg arg(V, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho, B[S]...);
      arg.swizzle = tp.aux.x;
#ifdef JITIFY
      using namespace jitify::reflection;
      auto instance = program->kernel("quda::blockOrthoGPU")
        .instantiate((int)tp.block.x,Type<sumType>(),Type<RegType>(),nSpin,spinBlockSize,nColor,coarseSpin,nVec,Type<Arg>());
      jitify_error = instance.configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
#else
      // copy the host-side B-field array to device constant memory before launching the kernel
      cudaMemcpyToSymbolAsync(B_array_d, B_array_h, MAX_MATRIX_SIZE, 0, cudaMemcpyHostToDevice, stream);
      LAUNCH_KERNEL_MG_BLOCK_SIZE(blockOrthoGPU,tp,stream,arg,sumType,RegType,nSpin,spinBlockSize,nColor,coarseSpin,nVec,Arg);
#endif
    }
    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      if (V.Location() == QUDA_CPU_FIELD_LOCATION) {
        if (V.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
          // Rotator and Vector are the FieldOrderCB accessor types for V and the B fields in this order
          CPU<Rotator,Vector>(B, std::make_index_sequence<nVec>());
        } else {
          errorQuda("Unsupported field order %d\n", V.FieldOrder());
        }
      } else {
#if __COMPUTE_CAPABILITY__ >= 300
        if (V.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && B[0]->FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
          // Rotator and Vector are the FieldOrderCB accessor types for V and the B fields in this order
          GPU<Rotator,Vector>(tp, stream, B, std::make_index_sequence<nVec>());
        } else {
          errorQuda("Unsupported field order V=%d B=%d\n", V.FieldOrder(), B[0]->FieldOrder());
        }
#else
        errorQuda("GPU block orthogonalization not supported on this GPU architecture");
#endif
      }
    }
    // tune the block-to-SM swizzle factor over the range [1, 2 * #SMs]
    bool advanceAux(TuneParam &param) const
    {
      if (param.aux.x < 2 * deviceProp.multiProcessorCount) {
        param.aux.x++;
        return true;
      } else {
        param.aux.x = 1;
        return false;
      }
    }

    bool advanceTuneParam(TuneParam &param) const {
      return advanceSharedBytes(param) || advanceAux(param);
    }
    TuneKey tuneKey() const { return TuneKey(V.VolString(), typeid(*this).name(), aux); }

    void initTuneParam(TuneParam &param) const { defaultTuneParam(param); }

    /** sets default values for when tuning is disabled */
    void defaultTuneParam(TuneParam &param) const {
      param.block = dim3(geoBlockSize/2, V.SiteSubset(), 1);
      param.grid = dim3((minThreads() + param.block.x - 1) / param.block.x, 1, coarseSpin);
      param.shared_bytes = 0;
      param.aux.x = 1; // swizzle factor
    }
    long long flops() const
    {
      // per pass and per block element: each of the nVec vectors is projected against the vectors
      // before it (inner product plus update) and then normalized
      return n_block_ortho * nBlock * (geoBlockSize / 2) * (spinBlockSize == 0 ? 1 : 2 * spinBlockSize) / 2 * nColor
        * (nVec * ((nVec - 1) * (8l + 8l)) + 6l);
    }

    long long bytes() const
    {
      // the first pass reads the B vectors, subsequent passes re-read V; every pass also reads the
      // already-orthogonalized vectors for the projections and writes V back out
      return nVec * B[0]->Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes()
        + (n_block_ortho - 1) * (V.Bytes() + (nVec - 1) * nVec / 2 * V.Bytes() / nVec + V.Bytes());
    }

    char *saveOut, *saveOutNorm;

    void preTune() { V.backup(); }   // autotuning overwrites V in place, so back it up first
    void postTune() { V.restore(); } // restore the pre-tuning contents of V

  };
  template <typename vFloat, typename bFloat, int nSpin, int spinBlockSize, int nColor, int nVec>
  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho)
  {
    int geo_blocksize = 1;
    for (int d = 0; d < V.Ndim(); d++) geo_blocksize *= geo_bs[d];

    int blocksize = geo_blocksize * V.Ncolor();
    if (spinBlockSize == 0) { blocksize /= 2; } else { blocksize *= spinBlockSize; }
    int chiralBlocks = (spinBlockSize == 0) ? 2 : V.Nspin() / spinBlockSize;
    int numblocks = (V.Volume() / geo_blocksize) * chiralBlocks;
    // number of chiral (coarse spin) components
    constexpr int coarseSpin = (nSpin == 4 || nSpin == 2 || spinBlockSize == 0) ? 2 : 1;

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("Block Orthogonalizing %d blocks of %d length and width %d repeating %d times\n", numblocks,
                 blocksize, nVec, n_block_ortho);

    BlockOrtho<double, vFloat, bFloat, nSpin, spinBlockSize, nColor, coarseSpin, nVec> ortho(
      V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
    ortho.apply(0);
  }
  template <typename vFloat, typename bFloat, int nSpin, int spinBlockSize>
  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int n_block_ortho)
  {
    const int Nvec = B.size();
    if (V.Ncolor()/Nvec == 3) {

      constexpr int nColor = 3;

      if (Nvec == 6) { // e.g. free-field Wilson
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 6>(V, B, fine_to_coarse, coarse_to_fine,
                                                                            geo_bs, n_block_ortho);
      } else if (Nvec == 24) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 24>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else if (Nvec == 32) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else if (Nvec == 48) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 48>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else if (V.Ncolor()/Nvec == 6) {

      constexpr int nColor = 6;

      if (Nvec == 6) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 6>(V, B, fine_to_coarse, coarse_to_fine,
                                                                            geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else if (V.Ncolor()/Nvec == 24) {

      constexpr int nColor = 24;

      if (Nvec == 24) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 24>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else if (Nvec == 32) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else if (V.Ncolor()/Nvec == 32) {

      constexpr int nColor = 32;

      if (Nvec == 32) {
        BlockOrthogonalize<vFloat, bFloat, nSpin, spinBlockSize, nColor, 32>(V, B, fine_to_coarse, coarse_to_fine,
                                                                             geo_bs, n_block_ortho);
      } else {
        errorQuda("Unsupported nVec %d\n", Nvec);
      }

    } else {
      errorQuda("Unsupported nColor %d\n", V.Ncolor()/Nvec);
    }
  }
  template <typename vFloat, typename bFloat>
  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
  {
    if (V.Nspin() == 2 && spin_bs == 1) { // coarsening coarse fermions
      BlockOrthogonalize<vFloat, bFloat, 2, 1>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
#ifdef GPU_WILSON_DIRAC
    } else if (V.Nspin() == 4 && spin_bs == 2) { // coarsening Wilson-like fermions
      BlockOrthogonalize<vFloat, bFloat, 4, 2>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
#endif
#ifdef GPU_STAGGERED_DIRAC
    } else if (V.Nspin() == 1 && spin_bs == 1) { // coarsening staggered fermions
      BlockOrthogonalize<vFloat, bFloat, 1, 1>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, n_block_ortho);
#endif
    } else {
      errorQuda("Unsupported nSpin %d and spinBlockSize %d combination.\n", V.Nspin(), spin_bs);
    }
  }
#endif // GPU_MULTIGRID

  void BlockOrthogonalize(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B, const int *fine_to_coarse,
                          const int *coarse_to_fine, const int *geo_bs, const int spin_bs, const int n_block_ortho)
  {
#ifdef GPU_MULTIGRID
    if (V.Precision() == QUDA_DOUBLE_PRECISION && B[0]->Precision() == QUDA_DOUBLE_PRECISION) {
#ifdef GPU_MULTIGRID_DOUBLE
      BlockOrthogonalize<double>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
#else
      errorQuda("Double precision multigrid has not been enabled");
#endif
    } else if (V.Precision() == QUDA_SINGLE_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) {
      BlockOrthogonalize<float, float>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
    } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_SINGLE_PRECISION) {
#if QUDA_PRECISION & 2
      BlockOrthogonalize<short, float>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
#else
      errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
    } else if (V.Precision() == QUDA_HALF_PRECISION && B[0]->Precision() == QUDA_HALF_PRECISION) {
#if QUDA_PRECISION & 2
      BlockOrthogonalize<short, short>(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
#else
      errorQuda("QUDA_PRECISION=%d does not enable half precision", QUDA_PRECISION);
#endif
    } else {
      errorQuda("Unsupported precision combination V=%d B=%d\n", V.Precision(), B[0]->Precision());
    }
#else
    errorQuda("Multigrid has not been built");
#endif // GPU_MULTIGRID
  }
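
  // Illustrative only: a minimal sketch of how the public entry point above is typically driven from
  // multigrid setup code. The field V, the null-space vectors B, and the two lookup tables are assumed
  // to have been created elsewhere (e.g. by the Transfer class); the function name and the 4^4 blocking
  // chosen here are hypothetical, not part of the original file.
  inline void example_block_ortho_usage(ColorSpinorField &V, const std::vector<ColorSpinorField *> &B,
                                        const int *fine_to_coarse, const int *coarse_to_fine)
  {
    const int geo_bs[4] = {4, 4, 4, 4}; // geometric aggregate size in (x, y, z, t)
    const int spin_bs = 2;              // 2 for Wilson-like fermions, 1 for staggered or coarse fields
    const int n_block_ortho = 2;        // repeat the block Gram-Schmidt twice for improved orthogonality
    BlockOrthogonalize(V, B, fine_to_coarse, coarse_to_fine, geo_bs, spin_bs, n_block_ortho);
  }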