// Identity conversion for a plain double accumulator.
// Part of the set() overload family that maps accumulator types to the
// plain-double representation used when writing out reduction results.
// The input is read-only, so it is taken by const reference (binds both
// lvalues and rvalues; backward compatible with all existing callers).
__host__ __device__ inline double set(const double &x) { return x; }
// Identity conversion for a double2 accumulator (returned by value).
// The input is read-only, so it is taken by const reference.
__host__ __device__ inline double2 set(const double2 &x) { return x; }
// Identity conversion for a double3 accumulator (returned by value).
// The input is read-only, so it is taken by const reference.
__host__ __device__ inline double3 set(const double3 &x) { return x; }
// In-place accumulation a += b for plain doubles.
// b is never modified, so it is taken by const reference (backward
// compatible: non-const lvalues still bind, and rvalues now bind too).
__host__ __device__ inline void sum(double &a, const double &b) { a += b; }
// In-place component-wise accumulation a += b for double2.
// b is never modified, so it is taken by const reference.
__host__ __device__ inline void sum(double2 &a, const double2 &b)
{
  a.x += b.x;
  a.y += b.y;
}
// In-place component-wise accumulation a += b for double3.
// b is never modified, so it is taken by const reference.
__host__ __device__ inline void sum(double3 &a, const double3 &b)
{
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
}
// Collapse an extended-precision doubledouble accumulator to a plain
// double by taking its high-order (head) word.
// NOTE(review): head() constness is not visible here, so the parameter
// stays a non-const reference as in the original.
__host__ __device__ inline double set(doubledouble &dd) { return dd.head(); }
// Collapse a doubledouble2 accumulator to a plain double2, keeping only
// the high-order (head) word of each component.
__host__ __device__ inline double2 set(doubledouble2 &dd)
{
  return make_double2(dd.x.head(), dd.y.head());
}
// Collapse a doubledouble3 accumulator to a plain double3, keeping only
// the high-order (head) word of each component.
__host__ __device__ inline double3 set(doubledouble3 &dd)
{
  return make_double3(dd.x.head(), dd.y.head(), dd.z.head());
}
// Accumulate the head words of a doubledouble2 into a plain double2,
// component by component (the low-order tail words are discarded).
__host__ __device__ inline void sum(double2 &acc, doubledouble2 &val)
{
  acc.x += val.x.head();
  acc.y += val.y.head();
}
// Accumulate the head words of a doubledouble3 into a plain double3,
// component by component (the low-order tail words are discarded).
__host__ __device__ inline void sum(double3 &acc, doubledouble3 &val)
{
  acc.x += val.x.head();
  acc.y += val.y.head();
  acc.z += val.z.head();
}
// Device-global counter, zero-initialized at module load.
// NOTE(review): presumably counts thread blocks that have completed their
// partial reduction so the last block can perform the final pass (compare
// the isLastBlockDone flag declared elsewhere in this file) - confirm.
static __device__ unsigned int count = 0;
36 template <
int NXZ,
typename ReduceType,
typename SpinorX,
typename SpinorY,
37 typename SpinorZ,
typename SpinorW,
typename Reducer>
50 for (
int i=0;
i<NXZ; ++
i)
65 #define MAX_MATRIX_SIZE 4096 75 template<
int k,
int NXZ,
typename FloatN,
int M,
typename ReduceType,
typename Arg>
78 constexpr
int kmod = k;
85 FloatN
x[M],
y[M],
z[M],
w[M];
94 for (
int l=0; l < NXZ; l++) {
101 for (
int j=0; j<M; j++)
arg.r(
sum[l],
x[j],
y[j],
z[j],
w[j], k, l);
114 #ifdef WARP_MULTI_REDUCE 115 template<
typename ReduceType,
typename FloatN,
int M,
int NXZ,
116 typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename Reducer>
118 template<
int block_size,
typename ReduceType,
typename FloatN,
int M,
int NXZ,
119 typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename Reducer>
123 unsigned int i = blockIdx.x*
blockDim.x + threadIdx.x;
124 unsigned int k = blockIdx.y*
blockDim.y + threadIdx.y;
125 unsigned int parity = blockIdx.z;
127 if (k >=
arg.NYW)
return;
129 vector_type<ReduceType,NXZ>
sum;
132 case 0: compute< 0,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
133 #if MAX_MULTI_BLAS_N >= 2 134 case 1: compute< 1,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
135 #if MAX_MULTI_BLAS_N >= 3 136 case 2: compute< 2,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
137 #if MAX_MULTI_BLAS_N >= 4 138 case 3: compute< 3,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
139 #if MAX_MULTI_BLAS_N >= 5 140 case 4: compute< 4,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
141 #if MAX_MULTI_BLAS_N >= 6 142 case 5: compute< 5,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
143 #if MAX_MULTI_BLAS_N >= 7 144 case 6: compute< 6,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
145 #if MAX_MULTI_BLAS_N >= 8 146 case 7: compute< 7,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
147 #if MAX_MULTI_BLAS_N >= 9 148 case 8: compute< 8,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
149 #if MAX_MULTI_BLAS_N >= 10 150 case 9: compute< 9,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
151 #if MAX_MULTI_BLAS_N >= 11 152 case 10: compute<10,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
153 #if MAX_MULTI_BLAS_N >= 12 154 case 11: compute<11,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
155 #if MAX_MULTI_BLAS_N >= 13 156 case 12: compute<12,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
157 #if MAX_MULTI_BLAS_N >= 14 158 case 13: compute<13,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
159 #if MAX_MULTI_BLAS_N >= 15 160 case 14: compute<14,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
161 #if MAX_MULTI_BLAS_N >= 16 162 case 15: compute<15,NXZ,FloatN,M,ReduceType>(
sum,
arg,
i,
parity);
break;
180 #ifdef WARP_MULTI_REDUCE 181 ::quda::warp_reduce<vector_type<ReduceType,NXZ> >(
arg,
sum,
arg.NYW*
parity + k);
183 ::quda::reduce<block_size, vector_type<ReduceType,NXZ> >(
arg,
sum,
arg.NYW*
parity + k);
188 template<
typename doubleN,
typename ReduceType,
typename FloatN,
int M,
int NXZ,
189 typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename Reducer>
192 const TuneParam &tp,
const cudaStream_t &
stream) {
194 if(tp.grid.x > (
unsigned int)
deviceProp.maxGridSize[0])
198 #ifdef WARP_MULTI_REDUCE 199 multiReduceKernel<ReduceType,FloatN,M,NXZ><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
204 #if (defined(_MSC_VER) && defined(_WIN64) || defined(__LP64__)) 213 for (
int i=0;
i<NXZ;
i++) {
214 for (
int j=0; j<
arg.NYW; j++) {
223 template<
unsigned... digits>
224 struct to_chars {
static const char value[]; };
226 template<
unsigned... digits>
227 const char to_chars<digits...>
::value[] = {(
'0' + digits)..., 0};
229 template<
unsigned rem,
unsigned... digits>
230 struct explode : explode<rem / 10, rem % 10, digits...> {};
232 template<
unsigned... digits>
233 struct explode<0, digits...> : to_chars<digits...> {};
236 template<
unsigned num>
239 template<
int NXZ,
typename doubleN,
typename ReduceType,
typename FloatN,
int M,
typename SpinorX,
240 typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename Reducer>
252 std::vector<ColorSpinorField*> &
y, &
w;
259 TuneParam next(
param);
260 advanceBlockDim(next);
261 int nthreads = next.block.x * next.block.y * next.block.z;
272 Reducer &r,
int NYW,
int length,
int nParity, std::vector<ColorSpinorField*> &
y, std::vector<ColorSpinorField*> &
w)
273 :
NYW(
NYW),
arg(
X, Y,
Z, W, r,
NYW,
length/
nParity),
nParity(
nParity),
result(
result),
277 char name[TuneKey::name_n];
279 strcat(name, std::to_string(
NYW).c_str());
281 char aux[TuneKey::aux_n];
282 strcpy(aux,
"policy_kernel,");
289 multiReduceLaunch<doubleN,ReduceType,FloatN,M,NXZ>(
result,
arg,tp,
stream);
293 #ifdef WARP_MULTI_REDUCE 301 bool advanceBlockDim(TuneParam &
param)
const {
315 bool rtn = Tunable::advanceGridDim(
param);
321 Tunable::initTuneParam(
param);
328 Tunable::defaultTuneParam(
param);
358 template <
typename T>
368 template <
typename doubleN,
typename ReduceType,
typename RegType,
typename StoreType,
typename yType,
369 int M,
int NXZ,
template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
typename write,
typename T>
370 void multiReduceCuda(doubleN result[],
const reduce::coeff_array<T> &
a,
const reduce::coeff_array<T> &
b,
const reduce::coeff_array<T> &
c,
371 std::vector<ColorSpinorField*>&
x, std::vector<ColorSpinorField*>&
y,
372 std::vector<ColorSpinorField*>&
z, std::vector<ColorSpinorField*>&
w,
375 const int NYW =
y.size();
377 int nParity =
x[0]->SiteSubset();
378 memset(result, 0, NXZ*NYW*
sizeof(doubleN));
380 const int N_MAX = NXZ > NYW ? NXZ : NYW;
381 const int N_MIN = NXZ < NYW ? NXZ : NYW;
384 static_assert(
MAX_MULTI_BLAS_N <= 16,
"MAX_MULTI_BLAS_N exceeds maximum size 16");
390 for (
int i=0;
i<N_MIN;
i++) {
392 if (!
x[
i]->isNative()) {
393 warningQuda(
"Reductions on non-native fields are not supported\n");
398 typedef typename scalar<RegType>::type Float;
404 if (
a.data &&
a.use_const) {
408 for (
int i=0;
i<NXZ;
i++)
for (
int j=0; j<NYW; j++)
412 Amatrix_h =
reinterpret_cast<signed char*
>(
const_cast<T*
>(
a.data));
415 if (
b.data &&
b.use_const) {
419 for (
int i=0;
i<NXZ;
i++)
for (
int j=0; j<NYW; j++)
423 Bmatrix_h =
reinterpret_cast<signed char*
>(
const_cast<T*
>(
b.data));
426 if (
c.data &&
c.use_const) {
430 for (
int i=0;
i<NXZ;
i++)
for (
int j=0; j<NYW; j++)
434 Cmatrix_h =
reinterpret_cast<signed char*
>(
const_cast<T*
>(
c.data));
439 if (
typeid(StoreType) !=
typeid(yType)) {
444 multi::SpinorTexture<RegType,StoreType,M,0>
X[NXZ];
446 multi::SpinorTexture<RegType,StoreType,M,2>
Z[NXZ];
449 for (
int i=0;
i<NXZ;
i++) {
450 X[
i].set(*dynamic_cast<cudaColorSpinorField *>(
x[
i]));
451 Z[
i].set(*dynamic_cast<cudaColorSpinorField *>(
z[
i]));
453 for (
int i=0;
i<NYW;
i++) {
454 Y[
i].set(*dynamic_cast<cudaColorSpinorField *>(
y[
i]));
455 W[
i].set(*dynamic_cast<cudaColorSpinorField *>(
w[
i]));
459 bool is_norm =
false;
462 for (
int i=0;
i<NXZ;
i++) {
471 Reducer<NXZ, ReduceType, Float2, RegType> r(
a,
b,
c, NYW);
474 multi::SpinorTexture<RegType,StoreType,M,0>,
475 multi::Spinor<RegType,yType,M,write::Y,1>,
476 multi::SpinorTexture<RegType,StoreType,M,2>,
477 multi::Spinor<RegType,StoreType,M,write::W,3>,
479 reduce(result,
X, Y,
Z, W, r, NYW,
length,
x[0]->SiteSubset(),
y,
w);
#define qudaMemcpy(dst, src, count, kind)
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
void * getHostReduceBuffer()
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaDeviceProp deviceProp
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
#define QUDA_MAX_MULTI_REDUCE
Maximum number of simultaneous reductions that can take place. This number may be increased if needed...
std::complex< double > Complex
char * Ynorm_h[MAX_MULTI_BLAS_N]
char * Wnorm_h[MAX_MULTI_BLAS_N]
char * Y_h[MAX_MULTI_BLAS_N]
bool advanceGridDim(TuneParam &param) const
Parameter struct for generic multi-blas kernel.
char * strcpy(char *__dst, const char *__src)
void multiReduceLaunch(doubleN result[], MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > &arg, const TuneParam &tp, const cudaStream_t &stream)
char * strcat(char *__s1, const char *__s2)
static __shared__ bool isLastBlockDone
void * getMappedHostReduceBuffer()
static signed char * Bmatrix_h
std::vector< ColorSpinorField * > & w
coeff_array(const T *data, bool use_const)
void defaultTuneParam(TuneParam &param) const
__global__ void multiReduceKernel(MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg)
cudaStream_t * getStream()
char * W_h[MAX_MULTI_BLAS_N]
__host__ __device__ void sum(double &a, double &b)
static struct quda::blas::@4 blasStrings
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
virtual bool advanceSharedBytes(TuneParam &param) const
static __device__ unsigned int count
std::vector< ColorSpinorField * > & y
cudaEvent_t * getReduceEvent()
unsigned int sharedBytesPerBlock(const TuneParam &param) const
static signed char * Cmatrix_h
void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
void * memset(void *__b, int __c, size_t __len)
static signed char * Amatrix_h
void initTuneParam(TuneParam &param) const
SpinorY Y[MAX_MULTI_BLAS_N]
unsigned int sharedBytesPerThread() const
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
void apply(const cudaStream_t &stream)
MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r, int NYW, int length)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
unsigned int maxBlockSize() const
MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[], SpinorZ Z[], SpinorW W[], Reducer &r, int NYW, int length, int nParity, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &w)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
static const char value[]
SpinorW W[MAX_MULTI_BLAS_N]
__device__ void compute(vector_type< ReduceType, NXZ > &sum, Arg &arg, int idx, int parity)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void multiReduceCuda(doubleN result[], const reduce::coeff_array< T > &a, const reduce::coeff_array< T > &b, const reduce::coeff_array< T > &c, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int length)