QUDA 1.0.0
multi_reduce_core.cuh
#pragma once

#include <blas_helper.cuh>
#include <cub_helper.cuh>

//#define WARP_MULTI_REDUCE

namespace quda
{

  namespace blas
  {

#define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead
#include <texture.h>

    // storage for matrix coefficients
#define MAX_MATRIX_SIZE 4096
    static __constant__ signed char Amatrix_d[MAX_MATRIX_SIZE];
    static __constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE];
    static __constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE];

    static signed char *Amatrix_h;
    static signed char *Bmatrix_h;
    static signed char *Cmatrix_h;

#if CUDA_VERSION < 9000
    // as a performance workaround we put the argument struct into
    // __constant__ memory to prevent the compiler from spilling
    // registers on older CUDA
    static __constant__ signed char arg_buffer[MAX_MATRIX_SIZE];
#endif
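
    // On CUDA < 9.0 the host side is expected to stage the argument struct
    // into arg_buffer before launching the kernel below. A minimal sketch of
    // that step, assuming the launch wrapper has the fully-typed arg in hand
    // (the real call site lives in QUDA's multi-reduce launch code, not here):
    //
    //   cudaMemcpyToSymbol(arg_buffer, &arg, sizeof(arg));
    //   // ... then launch multiReduceKernel as usual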

    /**
       Parameter struct for generic multi-blas kernel.
    */
    template <int NXZ, typename ReduceType, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW,
        typename Reducer>
    struct MultiReduceArg : public ReduceArg<vector_type<ReduceType, NXZ>> {

      const int NYW;
      SpinorX X[NXZ];
      SpinorY Y[MAX_MULTI_BLAS_N];
      SpinorZ Z[NXZ];
      SpinorW W[MAX_MULTI_BLAS_N];
      Reducer r;
      const int length;
      MultiReduceArg(SpinorX X[NXZ], SpinorY Y[], SpinorZ Z[NXZ], SpinorW W[], Reducer r, int NYW, int length) :
          NYW(NYW),
          r(r),
          length(length)
      {
        for (int i = 0; i < NXZ; ++i) {
          this->X[i] = X[i];
          this->Z[i] = Z[i];
        }

        for (int i = 0; i < NYW; ++i) {
          this->Y[i] = Y[i];
          this->W[i] = W[i];
        }
      }
    };
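
    // Illustrative construction of the parameter struct (the spinor accessor
    // types and the reducer are assumed to be provided by the calling front
    // end; this is a sketch, not the actual call site):
    //
    //   Reducer r(a, b, c, NYW);
    //   MultiReduceArg<NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer>
    //       arg(X, Y, Z, W, r, NYW, length);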

#ifdef WARP_MULTI_REDUCE
    template <typename ReduceType, typename FloatN, int M, int NXZ, typename Arg>
#else
    template <int block_size, typename ReduceType, typename FloatN, int M, int NXZ, typename Arg>
#endif
    __global__ void multiReduceKernel(Arg arg_)
    {
#if CUDA_VERSION >= 9000
      Arg &arg = arg_;
#else
      Arg &arg = *((Arg *)arg_buffer);
#endif
      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
      unsigned int k = blockIdx.y * blockDim.y + threadIdx.y;
      unsigned int parity = blockIdx.z;

      if (k >= arg.NYW) return; // safe since k are different thread blocks

      vector_type<ReduceType, NXZ> sum;

      while (idx < arg.length) {

        FloatN x[M], y[M], z[M], w[M];

        arg.Y[k].load(y, idx, parity);
        arg.W[k].load(w, idx, parity);

        // Each NYW owns its own thread.
        // The NXZ's are all in the same thread block,
        // so they can share the same memory.
#pragma unroll
        for (int l = 0; l < NXZ; l++) {
          arg.X[l].load(x, idx, parity);
          arg.Z[l].load(z, idx, parity);

          arg.r.pre();

#pragma unroll
          for (int j = 0; j < M; j++) arg.r(sum[l], x[j], y[j], z[j], w[j], k, l);

          arg.r.post(sum[l]);
        }

        arg.Y[k].save(y, idx, parity);
        arg.W[k].save(w, idx, parity);

        idx += gridDim.x * blockDim.x;
      }

#ifdef WARP_MULTI_REDUCE
      ::quda::warp_reduce<vector_type<ReduceType, NXZ>>(arg, sum, arg.NYW * parity + k);
#else
      ::quda::reduce<block_size, vector_type<ReduceType, NXZ>>(arg, sum, arg.NYW * parity + k);
#endif
    } // multiReduceKernel
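
    // Index mapping recap: threadIdx.x/blockIdx.x sweep the vector sites in a
    // grid-stride loop, the y dimension enumerates the NYW right-hand vectors,
    // and blockIdx.z selects the parity. A hypothetical launch, with grid and
    // block shapes chosen only for illustration:
    //
    //   dim3 block(block_size, NYW, 1);
    //   dim3 grid(gridSize, 1, nParity);
    //   multiReduceKernel<block_size, ReduceType, FloatN, M, NXZ><<<grid, block>>>(arg);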

    template <typename T> struct coeff_array {
      const T *data;
      const bool use_const;
      coeff_array() : data(nullptr), use_const(false) {}
      coeff_array(const T *data, bool use_const) : data(data), use_const(use_const) {}
    };

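    // Usage sketch (names are illustrative): the front end wraps an optional
    // host-side coefficient array and records whether it should be staged
    // through the __constant__ matrix buffers declared above:
    //
    //   coeff_array<Complex> a(a_host, true); // a_host: hypothetical host pointer
    //   coeff_array<Complex> b;               // empty: functor takes no b coefficients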

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN> struct MultiReduceFunctor {

      //! pre-computation routine called before the "M-loop"
      virtual __device__ __host__ void pre() { ; }

      //! where the reduction is usually computed and any auxiliary operations
      virtual __device__ __host__ void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
          = 0;

      //! post-computation routine called after the "M-loop"
      virtual __device__ __host__ void post(ReduceType &sum) { ; }
    };

    /**
       Returns the real dot product of x and y.
    */
    template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
    {
      sum += (ReduceType)a.x * (ReduceType)b.x;
      sum += (ReduceType)a.y * (ReduceType)b.y;
    }

    template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
    {
      sum += (ReduceType)a.x * (ReduceType)b.x;
      sum += (ReduceType)a.y * (ReduceType)b.y;
    }

    template <typename ReduceType> __device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
    {
      sum += (ReduceType)a.x * (ReduceType)b.x;
      sum += (ReduceType)a.y * (ReduceType)b.y;
      sum += (ReduceType)a.z * (ReduceType)b.z;
      sum += (ReduceType)a.w * (ReduceType)b.w;
    }
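
    // All three overloads accumulate the Euclidean product of the packed real
    // components, sum += a.x*b.x + a.y*b.y (+ a.z*b.z + a.w*b.w for float4);
    // casting to ReduceType lets the accumulation run in higher precision
    // than the operands.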

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
    struct Dot : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
      typedef typename scalar<Float2>::type real;
      const int NYW;
      Dot(const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
          NYW(NYW)
      {
        ;
      }
      __device__ __host__ void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      {
        dot_<ReduceType>(sum, x, y);
      }
      static int streams() { return 2; }
      static int flops() { return 2; }
    };

    /**
       Returns the complex dot product of x and y.
    */
    template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
    {
      typedef typename scalar<ReduceType>::type scalar;
      sum.x += (scalar)a.x * (scalar)b.x;
      sum.x += (scalar)a.y * (scalar)b.y;
      sum.y += (scalar)a.x * (scalar)b.y;
      sum.y -= (scalar)a.y * (scalar)b.x;
    }

    template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const float2 &a, const float2 &b)
    {
      typedef typename scalar<ReduceType>::type scalar;
      sum.x += (scalar)a.x * (scalar)b.x;
      sum.x += (scalar)a.y * (scalar)b.y;
      sum.y += (scalar)a.x * (scalar)b.y;
      sum.y -= (scalar)a.y * (scalar)b.x;
    }

    template <typename ReduceType> __device__ __host__ void cdot_(ReduceType &sum, const float4 &a, const float4 &b)
    {
      typedef typename scalar<ReduceType>::type scalar;
      sum.x += (scalar)a.x * (scalar)b.x;
      sum.x += (scalar)a.y * (scalar)b.y;
      sum.x += (scalar)a.z * (scalar)b.z;
      sum.x += (scalar)a.w * (scalar)b.w;
      sum.y += (scalar)a.x * (scalar)b.y;
      sum.y -= (scalar)a.y * (scalar)b.x;
      sum.y += (scalar)a.z * (scalar)b.w;
      sum.y -= (scalar)a.w * (scalar)b.z;
    }
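
    // Sign convention: the imaginary part accumulates a.x*b.y - a.y*b.x, so
    // each overload computes sum += conj(a) * b, the standard sesquilinear
    // inner product, with the float4 overload handling two packed complex
    // numbers at once.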

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
    struct Cdot : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
      typedef typename scalar<Float2>::type real;
      const int NYW;
      Cdot(const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
          NYW(NYW)
      {
        ;
      }
      __device__ __host__ inline void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      {
        cdot_<ReduceType>(sum, x, y);
      }
      static int streams() { return 2; }
      static int flops() { return 4; }
    };

    template <int NXZ, typename ReduceType, typename Float2, typename FloatN>
    struct CdotCopy : public MultiReduceFunctor<NXZ, ReduceType, Float2, FloatN> {
      typedef typename scalar<Float2>::type real;
      const int NYW;
      CdotCopy(const coeff_array<Complex> &a, const coeff_array<Complex> &b, const coeff_array<Complex> &c, int NYW) :
          NYW(NYW)
      {
        ;
      }
      __device__ __host__ inline void operator()(
          ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
      {
        cdot_<ReduceType>(sum, x, y);
        if (i == j) w = y;
      }
      static int streams() { return 2; }
      static int flops() { return 4; }
    };
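
    // CdotCopy fuses two operations: it accumulates the same conj(x)*y dot
    // products as Cdot, and on the diagonal entries of the tile (i == j) it
    // also copies y into w, avoiding a separate copy pass over the vectors.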

  } // namespace blas

} // namespace quda