15 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead 19 #define MAX_MATRIX_SIZE 4096 28 #if CUDA_VERSION < 9000 46 template <
int NXZ,
typename ReduceType,
typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
57 MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r,
int NYW,
int length) :
62 for (
int i = 0; i < NXZ; ++i) {
67 for (
int i = 0; i <
NYW; ++i) {
74 #ifdef WARP_MULTI_REDUCE 75 template <
typename ReduceType,
typename FloatN,
int M,
int NXZ,
typename Arg>
77 template <
int block_size,
typename ReduceType,
typename FloatN,
int M,
int NXZ,
typename Arg>
81 #if CUDA_VERSION >= 9000 84 Arg &arg = *((
Arg *)arg_buffer);
86 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
87 unsigned int k = blockIdx.y * blockDim.y + threadIdx.y;
88 unsigned int parity = blockIdx.z;
90 if (k >= arg.NYW)
return;
94 while (idx < arg.length) {
96 FloatN x[M], y[M], z[M], w[M];
98 arg.Y[k].load(y, idx, parity);
99 arg.W[k].load(w, idx, parity);
105 for (
int l = 0; l < NXZ; l++) {
106 arg.X[l].load(x, idx, parity);
107 arg.Z[l].load(z, idx, parity);
112 for (
int j = 0; j < M; j++) arg.r(sum[l], x[j], y[j], z[j], w[j], k, l);
117 arg.Y[k].save(y, idx, parity);
118 arg.W[k].save(w, idx, parity);
120 idx += gridDim.x * blockDim.x;
123 #ifdef WARP_MULTI_REDUCE 124 ::quda::warp_reduce<vector_type<ReduceType, NXZ>>(
arg,
sum, arg.NYW * parity + k);
126 ::quda::reduce<block_size, vector_type<ReduceType, NXZ>>(
arg,
sum, arg.NYW * parity + k);
132 const bool use_const;
134 coeff_array(
const T *data,
bool use_const) : data(data), use_const(use_const) {}
/**
   Base functor class for the multi-reduction kernels.  A concrete
   reduction (e.g. Dot, Cdot, CdotCopy) derives from this and
   implements operator(), which is invoked once per (x,y,z,w)
   vector element inside the kernel's M-loop.
 */
template <int NXZ, typename ReduceType, typename Float2, typename FloatN> struct MultiReduceFunctor {

  //! pre-computation routine called before the "M-loop"
  virtual __device__ __host__ void pre() { ; }

  //! where the reduction is usually computed, along with any auxiliary operations
  // fix: the original declared this `__device__ __host__ __host__`; the duplicated
  // __host__ execution-space qualifier is redundant and has been removed.
  virtual __device__ __host__ void operator()(
      ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      = 0; // NOTE(review): assumed pure virtual — the terminator after the parameter list is
           // not visible in this chunk; derived functors provide the implementation — confirm

  //! post-computation routine called after the "M-loop"
  virtual __device__ __host__ void post(ReduceType &sum) { ; }
};
/**
   Accumulate the real dot product of two double-precision complex
   numbers packed as double2 (x = real part, y = imaginary part):
   sum += a.x*b.x + a.y*b.y.  Each product is promoted to ReduceType
   before accumulation.
 */
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
{
  // two separate accumulations preserve the original rounding order
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
}
/**
   Accumulate the real dot product of two single-precision complex
   numbers packed as float2 (x = real part, y = imaginary part):
   sum += a.x*b.x + a.y*b.y.  Each product is promoted to ReduceType
   before accumulation.
 */
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
{
  // two separate accumulations preserve the original rounding order
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
}
/**
   Accumulate the real dot product of two pairs of single-precision
   complex numbers packed as float4 ((x,y) and (z,w) are the two
   complex values): sum += a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w.
   Each product is promoted to ReduceType before accumulation.
 */
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
{
  // component-by-component accumulation preserves the original rounding order
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
  sum += static_cast<ReduceType>(a.z) * static_cast<ReduceType>(b.z);
  sum += static_cast<ReduceType>(a.w) * static_cast<ReduceType>(b.w);
}
178 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
188 ReduceType &
sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
190 dot_<ReduceType>(
sum, x, y);
/**
   Accumulate the complex dot product conj(a) * b for two
   double-precision complex numbers packed as double2
   (x = real part, y = imaginary part).  The result is accumulated
   into the two components of sum: sum.x gets the real part,
   sum.y the imaginary part.
 */
template <typename ReduceType>
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
{
  // NOTE(review): this typedef line was not visible in the reviewed chunk; it is
  // assumed to come from the scalar<> trait used elsewhere in this file — confirm
  typedef typename scalar<ReduceType>::type scalar;
  // Re(conj(a)*b) = a.x*b.x + a.y*b.y
  sum.x += static_cast<scalar>(a.x) * static_cast<scalar>(b.x);
  sum.x += static_cast<scalar>(a.y) * static_cast<scalar>(b.y);
  // Im(conj(a)*b) = a.x*b.y - a.y*b.x
  sum.y += static_cast<scalar>(a.x) * static_cast<scalar>(b.y);
  sum.y -= static_cast<scalar>(a.y) * static_cast<scalar>(b.x);
}
/**
   Accumulate the complex dot product conj(a) * b for two
   single-precision complex numbers packed as float2
   (x = real part, y = imaginary part).  The result is accumulated
   into the two components of sum: sum.x gets the real part,
   sum.y the imaginary part.
 */
template <typename ReduceType>
__device__ __host__ void cdot_(ReduceType &sum, const float2 &a, const float2 &b)
{
  // NOTE(review): this typedef line was not visible in the reviewed chunk; it is
  // assumed to come from the scalar<> trait used elsewhere in this file — confirm
  typedef typename scalar<ReduceType>::type scalar;
  // Re(conj(a)*b) = a.x*b.x + a.y*b.y
  sum.x += static_cast<scalar>(a.x) * static_cast<scalar>(b.x);
  sum.x += static_cast<scalar>(a.y) * static_cast<scalar>(b.y);
  // Im(conj(a)*b) = a.x*b.y - a.y*b.x
  sum.y += static_cast<scalar>(a.x) * static_cast<scalar>(b.y);
  sum.y -= static_cast<scalar>(a.y) * static_cast<scalar>(b.x);
}
/**
   Accumulate the complex dot products conj(a) * b for two pairs of
   single-precision complex numbers packed as float4 ((x,y) and (z,w)
   are the two complex values).  Both pairs contribute to the same
   accumulator: sum.x gets the real parts, sum.y the imaginary parts.
 */
template <typename ReduceType>
__device__ __host__ void cdot_(ReduceType &sum, const float4 &a, const float4 &b)
{
  // NOTE(review): this typedef line was not visible in the reviewed chunk; it is
  // assumed to come from the scalar<> trait used elsewhere in this file — confirm
  typedef typename scalar<ReduceType>::type scalar;
  // real part: a.x*b.x + a.y*b.y from the first complex pair, a.z*b.z + a.w*b.w from the second
  sum.x += static_cast<scalar>(a.x) * static_cast<scalar>(b.x);
  sum.x += static_cast<scalar>(a.y) * static_cast<scalar>(b.y);
  sum.x += static_cast<scalar>(a.z) * static_cast<scalar>(b.z);
  sum.x += static_cast<scalar>(a.w) * static_cast<scalar>(b.w);
  // imaginary part: a.x*b.y - a.y*b.x from the first pair, a.z*b.w - a.w*b.z from the second
  sum.y += static_cast<scalar>(a.x) * static_cast<scalar>(b.y);
  sum.y -= static_cast<scalar>(a.y) * static_cast<scalar>(b.x);
  sum.y += static_cast<scalar>(a.z) * static_cast<scalar>(b.w);
  sum.y -= static_cast<scalar>(a.w) * static_cast<scalar>(b.z);
}
230 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
240 ReduceType &
sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
242 cdot_<ReduceType>(
sum, x, y);
248 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
258 ReduceType &
sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
260 cdot_<ReduceType>(
sum, x, y);
static int flops()
total number of input and output streams
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
This is where the reduction is usually computed, along with any auxiliary operations.
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
Cdot(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
This is where the reduction is usually computed, along with any auxiliary operations.
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
__host__ __device__ void sum(double &a, double &b)
SpinorW W[MAX_MULTI_BLAS_N]
coeff_array(const T *data, bool use_const)
SpinorY Y[MAX_MULTI_BLAS_N]
static signed char * Bmatrix_h
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
Parameter struct for generic multi-blas kernel.
scalar< Float2 >::type real
static int flops()
total number of input and output streams
static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE]
static int flops()
total number of input and output streams
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Dot(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
__global__ void multiReduceKernel(Arg arg_)
CdotCopy(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
static signed char * Amatrix_h
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
scalar< Float2 >::type real
static signed char * Cmatrix_h
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
This is where the reduction is usually computed, along with any auxiliary operations.
MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r, int NYW, int length)
scalar< Float2 >::type real