// NOTE(review): extraction-garbled fragment — original source line numbers
// ("12", "16", "25", "42", ...) are fused into the text and the constructor
// body statements are elided; code tokens below are left untouched.
// BLAS_SPINOR trims ghost functions from the Spinor class to shrink kernel
// parameter space; MAX_MATRIX_SIZE bounds the __constant__ coefficient
// buffers declared elsewhere in this file (Amatrix_d/Bmatrix_d/Cmatrix_d).
12 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead 16 #define MAX_MATRIX_SIZE 4096 25 #if CUDA_VERSION < 9000 42 template <
int NXZ,          // compile-time number of X/Z input vectors
typename SpinorX,
typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename Functor>
// Parameter struct constructor for the generic multi-blas kernel; Y/W member
// arrays are fixed-size (MAX_MULTI_BLAS_N) while NYW is a runtime count.
52 MultiBlasArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Functor f,
int NYW,
int length) :
// Copy loops (bodies elided in this fragment): first over the NXZ inputs,
// then over the NYW outputs.
57 for (
int i = 0; i < NXZ; ++i) {
61 for (
int i = 0; i <
NYW; ++i) {
// Generic multi-blas kernel: x dimension strides over the vector length,
// the y dimension selects the output vector index k, and blockIdx.z carries
// the field parity.
// NOTE(review): extraction-garbled fragment — original line numbers ("73",
// "75", ...) are fused into the text and some lines/braces are missing;
// code tokens are left untouched.
73 template <
typename FloatN,  // vector word type each load/save operates on
int M,            // number of FloatN words processed per site
int NXZ,          // compile-time number of X/Z input vectors
typename Arg> __global__
void multiBlasKernel(
Arg arg_)
// For CUDA >= 9000 the kernel argument is re-read from the __constant__
// arg_buffer rather than the by-value parameter (works around the kernel
// parameter-space limit noted at the top of the file).
75 #if CUDA_VERSION >= 9000 78 Arg &arg = *((
Arg *)arg_buffer);
82 unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; // site index
83 unsigned int k = blockIdx.y * blockDim.y + threadIdx.y; // Y/W vector index
84 unsigned int parity = blockIdx.z;
// Guard: the y grid may overshoot the runtime NYW count.
87 if (k >= arg.NYW)
return;
// Grid-stride loop over sites (stride applied at the bottom of the loop).
89 while (idx < arg.length) {
91 FloatN x[M], y[M], z[M], w[M];
92 arg.Y[k].load(y, idx, parity);
93 arg.W[k].load(w, idx, parity);
// Accumulate contributions from each of the NXZ input vectors into y/w.
96 for (
int l = 0; l < NXZ; l++) {
97 arg.X[l].load(x, idx, parity);
98 arg.Z[l].load(z, idx, parity);
// Apply the functor word-by-word; (k, l) selects the coefficient.
101 for (
int j = 0; j < M; j++) arg.f(x[j], y[j], z[j], w[j], k, l);
103 arg.Y[k].save(y, idx, parity);
104 arg.W[k].save(w, idx, parity);
106 idx += gridDim.x * blockDim.x;
// Thin wrapper pairing a coefficient pointer with a use_const flag
// (presumably selects reading coefficients from the __constant__-memory
// matrices rather than the host copies — confirm against the functors).
// NOTE(review): leading "114" is a fused original line number.
114 coeff_array(
const T *data,
bool use_const) : data(data), use_const(use_const) {}
// Pre-computation hook invoked before the main loop; the default is a no-op.
120 virtual __device__ __host__
void init() { ; }
// Per-word application of the blas operation; i indexes the Y/W output
// vector, j the X/Z input vector. Declaration only in this fragment —
// the body is elided.
123 virtual __device__ __host__
void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
// Complex axpy helpers — presumably y += a * x with the complex scalar a
// packed into .x/.y components (bodies elided in this fragment — confirm
// against the full source). The float4 overload handles two packed complex
// values per word; float2/double2 handle one.
131 __device__ __host__
inline void _caxpy(
const float2 &a,
const float4 &x, float4 &y)
143 __device__ __host__
inline void _caxpy(
const float2 &a,
const float2 &x, float2 &y)
151 __device__ __host__
inline void _caxpy(
const double2 &a,
const double2 &x, double2 &y)
159 template <
int NXZ,
typename Float2,
typename FloatN>
// multicaxpy functor: applies coefficient (j, i) to x and accumulates into
// y via _caxpy; the coefficient matrix is indexed with stride NYW.
// NOTE(review): the device/host preprocessor guard separating the
// __constant__ read (Amatrix_d) from the host read (Amatrix_h) is elided in
// this fragment, which is why two definitions of `a` appear back to back.
168 __device__ __host__
inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
171 Float2 *a =
reinterpret_cast<Float2 *
>(
Amatrix_d);
174 Float2 *a =
reinterpret_cast<Float2 *
>(
Amatrix_h);
175 _caxpy(a[NYW * j + i], x, y);
186 template <
int NXZ,
typename Float2,
typename FloatN>
// multicaxpyz functor: same coefficient indexing as multicaxpy above, but
// the _caxpy result is accumulated into w instead of y.
// NOTE(review): the device/host preprocessor guard around the Amatrix_d /
// Amatrix_h reads is elided in this fragment (duplicate `a` definitions).
195 __device__ __host__
inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
198 Float2 *a =
reinterpret_cast<Float2 *
>(
Amatrix_d);
202 Float2 *a =
reinterpret_cast<Float2 *
>(
Amatrix_h);
204 _caxpy(a[NYW * j + i], x, w);
215 template <
int NXZ,
typename Float2,
typename FloatN>
// multi_axpyBzpcx functor: the constructor copies the per-output coefficient
// arrays a, b, c into member storage (loop below); the visible operator line
// computes w = b[i] * x + c[i] * w. NOTE(review): any additional update
// (presumably the a[i] term on y, per the functor name) is elided in this
// fragment — confirm against the full source.
228 for (
int i = 0; i <
NYW; i++) {
229 this->a[i] = a.
data[i];
230 this->b[i] = b.data[i];
231 this->c[i] = c.data[i];
234 __device__ __host__
inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
237 w = b[i] * x + c[i] * w;
246 template <
int NXZ,
typename Float2,
typename FloatN>
// multi_caxpyBxpz functor: reads two coefficient matrices (a from Amatrix,
// b from Bmatrix); the actual computation lines are elided in this fragment.
// NOTE(review): the device/host preprocessor guard separating the
// __constant__ (_d) and host (_h) matrix reads is elided, so both
// definitions of `a` and `b` appear back to back.
258 __device__ __host__
inline void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w,
const int i,
const int j)
261 Float2 *a =
reinterpret_cast<Float2 *
>(
Amatrix_d);
262 Float2 *b =
reinterpret_cast<Float2 *
>(
Bmatrix_d);
266 Float2 *a =
reinterpret_cast<Float2 *
>(
Amatrix_h);
267 Float2 *b =
reinterpret_cast<Float2 *
>(
Bmatrix_h);
virtual __device__ __host__ void init()
Pre-computation routine invoked before the main loop.
multi_axpyBzpcx_(const coeff_array< double > &a, const coeff_array< double > &b, const coeff_array< double > &c, int NYW)
static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]
SpinorY Y[MAX_MULTI_BLAS_N]
Parameter struct for the generic multi-blas kernel.
static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]
multicaxpyz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
multi_caxpyBxpz_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
int flops()
total number of input and output streams
__device__ __host__ void _caxpy(const float2 &a, const float4 &x, float4 &y)
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations are performed
coeff_array(const T *data, bool use_const)
scalar< Float2 >::type real
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations are performed
static signed char * Bmatrix_h
static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]
__global__ void multiBlasKernel(Arg arg_)
Generic multi-blas kernel with four loads and up to four stores.
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations are performed
scalar< Float2 >::type real
int flops()
total number of input and output streams
__device__ __host__ void operator()(FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations are performed
static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE]
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
SpinorW W[MAX_MULTI_BLAS_N]
MultiBlasArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Functor f, int NYW, int length)
multicaxpy_(const coeff_array< Complex > &a, const coeff_array< Complex > &b, const coeff_array< Complex > &c, int NYW)
int flops()
total number of input and output streams
int flops()
total number of input and output streams
static signed char * Amatrix_h
static signed char * Cmatrix_h