QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Classes | Functions | Variables
reduce_core_cub.h File Reference

Go to the source code of this file.

Classes

struct  ReduceArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer >
 
class  ReduceCuda< doubleN, ReduceType, ReduceSimpleType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer >
 

Functions

__host__ __device__ void zero (double &x)
 
__host__ __device__ void zero (double2 &x)
 
__host__ __device__ void zero (double3 &x)
 
__device__ void copytoshared (double *s, const int i, const double x, const int block)
 
__device__ void copytoshared (double *s, const int i, const double2 x, const int block)
 
__device__ void copytoshared (double *s, const int i, const double3 x, const int block)
 
__device__ void copytoshared (volatile double *s, const int i, const double x, const int block)
 
__device__ void copytoshared (volatile double *s, const int i, const double2 x, const int block)
 
__device__ void copytoshared (volatile double *s, const int i, const double3 x, const int block)
 
__device__ void copyfromshared (double &x, const double *s, const int i, const int block)
 
__device__ void copyfromshared (double2 &x, const double *s, const int i, const int block)
 
__device__ void copyfromshared (double3 &x, const double *s, const int i, const int block)
 
template<typename ReduceType , typename ReduceSimpleType >
__device__ void add (ReduceType &sum, ReduceSimpleType *s, const int i, const int block)
 
template<>
__device__ void add< double, double > (double &sum, double *s, const int i, const int block)
 
template<>
__device__ void add< double2, double > (double2 &sum, double *s, const int i, const int block)
 
template<>
__device__ void add< double3, double > (double3 &sum, double *s, const int i, const int block)
 
template<typename ReduceType , typename ReduceSimpleType >
__device__ void add (ReduceSimpleType *s, const int i, const int j, const int block)
 
template<typename ReduceType , typename ReduceSimpleType >
__device__ void add (volatile ReduceSimpleType *s, const int i, const int j, const int block)
 
template<>
__device__ void add< double, double > (double *s, const int i, const int j, const int block)
 
template<>
__device__ void add< double, double > (volatile double *s, const int i, const int j, const int block)
 
template<>
__device__ void add< double2, double > (double *s, const int i, const int j, const int block)
 
template<>
__device__ void add< double2, double > (volatile double *s, const int i, const int j, const int block)
 
template<>
__device__ void add< double3, double > (double *s, const int i, const int j, const int block)
 
template<>
__device__ void add< double3, double > (volatile double *s, const int i, const int j, const int block)
 
__host__ __device__ void zero (doublesingle &x)
 
__host__ __device__ void zero (doublesingle2 &x)
 
__host__ __device__ void zero (doublesingle3 &x)
 
__device__ void copytoshared (doublesingle *s, const int i, const doublesingle x, const int block)
 
__device__ void copytoshared (doublesingle *s, const int i, const doublesingle2 x, const int block)
 
__device__ void copytoshared (doublesingle *s, const int i, const doublesingle3 x, const int block)
 
__device__ void copytoshared (volatile doublesingle *s, const int i, const doublesingle x, const int block)
 
__device__ void copytoshared (volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
 
__device__ void copytoshared (volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
 
__device__ void copyfromshared (doublesingle &x, const doublesingle *s, const int i, const int block)
 
__device__ void copyfromshared (doublesingle2 &x, const doublesingle *s, const int i, const int block)
 
__device__ void copyfromshared (doublesingle3 &x, const doublesingle *s, const int i, const int block)
 
template<>
__device__ void add< doublesingle, doublesingle > (doublesingle &sum, doublesingle *s, const int i, const int block)
 
template<>
__device__ void add< doublesingle2, doublesingle > (doublesingle2 &sum, doublesingle *s, const int i, const int block)
 
template<>
__device__ void add< doublesingle3, doublesingle > (doublesingle3 &sum, doublesingle *s, const int i, const int block)
 
template<>
__device__ void add< doublesingle, doublesingle > (doublesingle *s, const int i, const int j, const int block)
 
template<>
__device__ void add< doublesingle, doublesingle > (volatile doublesingle *s, const int i, const int j, const int block)
 
template<>
__device__ void add< doublesingle2, doublesingle > (doublesingle *s, const int i, const int j, const int block)
 
template<>
__device__ void add< doublesingle2, doublesingle > (volatile doublesingle *s, const int i, const int j, const int block)
 
template<>
__device__ void add< doublesingle3, doublesingle > (doublesingle *s, const int i, const int j, const int block)
 
template<>
__device__ void add< doublesingle3, doublesingle > (volatile doublesingle *s, const int i, const int j, const int block)
 
template<int block_size, typename ReduceType , typename ReduceSimpleType , typename FloatN , int M, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename SpinorV , typename Reducer >
__global__ void reduceKernel (ReduceArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > arg)
 
template<typename doubleN , typename ReduceType , typename ReduceSimpleType , typename FloatN , int M, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename SpinorV , typename Reducer >
doubleN reduceLaunch (ReduceArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > &arg, const TuneParam &tp, const cudaStream_t &stream)
 
template<typename doubleN , typename ReduceType , typename ReduceSimpleType , template< typename ReducerType, typename Float, typename FloatN > class Reducer, int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
doubleN reduceCuda (const double2 &a, const double2 &b, cudaColorSpinorField &x, cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w, cudaColorSpinorField &v)
 

Variables

__device__ unsigned int count = 0
 
__shared__ bool isLastBlockDone
 

Function Documentation

template<typename ReduceType , typename ReduceSimpleType >
__device__ void add ( ReduceType &  sum,
ReduceSimpleType *  s,
const int  i,
const int  block 
)

Definition at line 21 of file reduce_core_cub.h.

template<typename ReduceType , typename ReduceSimpleType >
__device__ void add ( ReduceSimpleType *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 30 of file reduce_core_cub.h.

template<typename ReduceType , typename ReduceSimpleType >
__device__ void add ( volatile ReduceSimpleType *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 32 of file reduce_core_cub.h.

template<>
__device__ void add< double, double > ( double &  sum,
double *  s,
const int  i,
const int  block 
)

Definition at line 22 of file reduce_core_cub.h.

template<>
__device__ void add< double, double > ( double *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 34 of file reduce_core_cub.h.

template<>
__device__ void add< double, double > ( volatile double *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 36 of file reduce_core_cub.h.

template<>
__device__ void add< double2, double > ( double2 &  sum,
double *  s,
const int  i,
const int  block 
)

Definition at line 24 of file reduce_core_cub.h.

template<>
__device__ void add< double2, double > ( double *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 39 of file reduce_core_cub.h.

template<>
__device__ void add< double2, double > ( volatile double *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 41 of file reduce_core_cub.h.

template<>
__device__ void add< double3, double > ( double3 &  sum,
double *  s,
const int  i,
const int  block 
)

Definition at line 26 of file reduce_core_cub.h.

template<>
__device__ void add< double3, double > ( double *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 44 of file reduce_core_cub.h.

template<>
__device__ void add< double3, double > ( volatile double *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 46 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle, doublesingle > ( doublesingle &  sum,
doublesingle *  s,
const int  i,
const int  block 
)

Definition at line 70 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle, doublesingle > ( doublesingle *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 77 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle, doublesingle > ( volatile doublesingle *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 79 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle2, doublesingle > ( doublesingle2 &  sum,
doublesingle *  s,
const int  i,
const int  block 
)

Definition at line 72 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle2, doublesingle > ( doublesingle *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 82 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle2, doublesingle > ( volatile doublesingle *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 84 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle3, doublesingle > ( doublesingle3 &  sum,
doublesingle *  s,
const int  i,
const int  block 
)

Definition at line 74 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle3, doublesingle > ( doublesingle *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 87 of file reduce_core_cub.h.

template<>
__device__ void add< doublesingle3, doublesingle > ( volatile doublesingle *  s,
const int  i,
const int  j,
const int  block 
)

Definition at line 89 of file reduce_core_cub.h.

__device__ void copyfromshared ( double &  x,
const double *  s,
const int  i,
const int  block 
)

Definition at line 14 of file reduce_core_cub.h.

__device__ void copyfromshared ( double2 &  x,
const double *  s,
const int  i,
const int  block 
)

Definition at line 15 of file reduce_core_cub.h.

__device__ void copyfromshared ( double3 &  x,
const double *  s,
const int  i,
const int  block 
)

Definition at line 17 of file reduce_core_cub.h.

__device__ void copyfromshared ( doublesingle &  x,
const doublesingle *  s,
const int  i,
const int  block 
)

Definition at line 64 of file reduce_core_cub.h.

__device__ void copyfromshared ( doublesingle2 &  x,
const doublesingle *  s,
const int  i,
const int  block 
)

Definition at line 65 of file reduce_core_cub.h.

__device__ void copyfromshared ( doublesingle3 &  x,
const doublesingle *  s,
const int  i,
const int  block 
)

Definition at line 67 of file reduce_core_cub.h.

__device__ void copytoshared ( double *  s,
const int  i,
const double  x,
const int  block 
)

Definition at line 4 of file reduce_core_cub.h.

__device__ void copytoshared ( double *  s,
const int  i,
const double2  x,
const int  block 
)

Definition at line 5 of file reduce_core_cub.h.

__device__ void copytoshared ( double *  s,
const int  i,
const double3  x,
const int  block 
)

Definition at line 7 of file reduce_core_cub.h.

__device__ void copytoshared ( volatile double *  s,
const int  i,
const double  x,
const int  block 
)

Definition at line 9 of file reduce_core_cub.h.

__device__ void copytoshared ( volatile double *  s,
const int  i,
const double2  x,
const int  block 
)

Definition at line 10 of file reduce_core_cub.h.

__device__ void copytoshared ( volatile double *  s,
const int  i,
const double3  x,
const int  block 
)

Definition at line 12 of file reduce_core_cub.h.

__device__ void copytoshared ( doublesingle *  s,
const int  i,
const doublesingle  x,
const int  block 
)

Definition at line 53 of file reduce_core_cub.h.

__device__ void copytoshared ( doublesingle *  s,
const int  i,
const doublesingle2  x,
const int  block 
)

Definition at line 54 of file reduce_core_cub.h.

__device__ void copytoshared ( doublesingle *  s,
const int  i,
const doublesingle3  x,
const int  block 
)

Definition at line 56 of file reduce_core_cub.h.

__device__ void copytoshared ( volatile doublesingle *  s,
const int  i,
const doublesingle  x,
const int  block 
)

Definition at line 58 of file reduce_core_cub.h.

__device__ void copytoshared ( volatile doublesingle *  s,
const int  i,
const doublesingle2  x,
const int  block 
)

Definition at line 59 of file reduce_core_cub.h.

__device__ void copytoshared ( volatile doublesingle *  s,
const int  i,
const doublesingle3  x,
const int  block 
)

Definition at line 61 of file reduce_core_cub.h.

template<typename doubleN , typename ReduceType , typename ReduceSimpleType , template< typename ReducerType, typename Float, typename FloatN > class Reducer, int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
doubleN reduceCuda ( const double2 &  a,
const double2 &  b,
cudaColorSpinorField &  x,
cudaColorSpinorField &  y,
cudaColorSpinorField &  z,
cudaColorSpinorField &  w,
cudaColorSpinorField &  v 
)

Driver for generic reduction routine with two loads.

Parameters
ReduceType
siteUnroll - if this is true, then one site corresponds to exactly one thread

Definition at line 559 of file reduce_core_cub.h.

template<int block_size, typename ReduceType , typename ReduceSimpleType , typename FloatN , int M, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename SpinorV , typename Reducer >
__global__ void reduceKernel ( ReduceArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer >  arg)

Generic reduction kernel with up to four loads and three saves.

Definition at line 119 of file reduce_core_cub.h.

template<typename doubleN , typename ReduceType , typename ReduceSimpleType , typename FloatN , int M, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename SpinorV , typename Reducer >
doubleN reduceLaunch ( ReduceArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > &  arg,
const TuneParam &  tp,
const cudaStream_t &  stream 
)

Generic reduction kernel launcher.

Definition at line 296 of file reduce_core_cub.h.

__host__ __device__ void zero ( double &  x)

Definition at line 1 of file reduce_core_cub.h.

__host__ __device__ void zero ( double2 &  x)

Definition at line 2 of file reduce_core_cub.h.

__host__ __device__ void zero ( double3 &  x)

Definition at line 3 of file reduce_core_cub.h.

__host__ __device__ void zero ( doublesingle &  x)

Definition at line 50 of file reduce_core_cub.h.

__host__ __device__ void zero ( doublesingle2 &  x)

Definition at line 51 of file reduce_core_cub.h.

__host__ __device__ void zero ( doublesingle3 &  x)

Definition at line 52 of file reduce_core_cub.h.

Variable Documentation

__device__ unsigned int count = 0

Definition at line 93 of file reduce_core_cub.h.

__shared__ bool isLastBlockDone

Definition at line 94 of file reduce_core_cub.h.