#include <launch_kernel.cuh>

Include dependency graph for multi_reduce_core.cuh:

This graph shows which files directly or indirectly include this file:

Classes
struct	MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >
	Parameter struct for generic multi-blas kernel. More...

struct	detail::to_chars< digits >

struct	detail::explode< rem, digits >

struct	detail::explode< 0, digits... >

struct	num_to_string< num >

class	MultiReduceCuda< NXZ, doubleN, ReduceType, FloatN, M, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer >

struct	coeff_array< T >

Namespaces
	detail

Macros
#define	MAX_MATRIX_SIZE 4096

Functions
__host__ __device__ double	set (double &x)

__host__ __device__ double2	set (double2 &x)

__host__ __device__ double3	set (double3 &x)

__host__ __device__ void	sum (double &a, double &b)

__host__ __device__ void	sum (double2 &a, double2 &b)

__host__ __device__ void	sum (double3 &a, double3 &b)

template<int k, int NXZ, typename FloatN , int M, typename ReduceType , typename Arg >
__device__ void	compute (vector_type< ReduceType, NXZ > &sum, Arg &arg, int idx, int parity)

template<int block_size, typename ReduceType , typename FloatN , int M, int NXZ, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename Reducer >
__global__ void	multiReduceKernel (MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg)

template<typename doubleN , typename ReduceType , typename FloatN , int M, int NXZ, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename Reducer >
void	multiReduceLaunch (doubleN result[], MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > &arg, const TuneParam &tp, const cudaStream_t &stream)

template<typename doubleN , typename ReduceType , typename RegType , typename StoreType , typename yType , int M, int NXZ, template< int MXZ, typename ReducerType, typename Float, typename FloatN > class Reducer, typename write , typename T >
void	multiReduceCuda (doubleN result[], const reduce::coeff_array< T > &a, const reduce::coeff_array< T > &b, const reduce::coeff_array< T > &c, std::vector< ColorSpinorField > &x, std::vector< ColorSpinorField > &y, std::vector< ColorSpinorField > &z, std::vector< ColorSpinorField > &w, int length)

Variables
static __device__ unsigned int	count = 0

static __shared__ bool	isLastBlockDone

static __constant__ signed char	Amatrix_d [MAX_MATRIX_SIZE]

static __constant__ signed char	Bmatrix_d [MAX_MATRIX_SIZE]

static __constant__ signed char	Cmatrix_d [MAX_MATRIX_SIZE]

static signed char *	Amatrix_h

static signed char *	Bmatrix_h

static signed char *	Cmatrix_h

Macro Definition Documentation

◆ MAX_MATRIX_SIZE

#define MAX_MATRIX_SIZE 4096

Definition at line 65 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

Function Documentation

◆ compute()

template<int k, int NXZ, typename FloatN , int M, typename ReduceType , typename Arg >

__device__ void compute	(	vector_type< ReduceType, NXZ > &	sum,
		Arg &	arg,
		int	idx,
		int	parity
	)

inline

Definition at line 76 of file multi_reduce_core.cuh.

References quda::arg(), blockDim, gridDim, idx, parity, sum(), w, x, y, and z.

Here is the call graph for this function:

◆ multiReduceCuda()

template<typename doubleN , typename ReduceType , typename RegType , typename StoreType , typename yType , int M, int NXZ, template< int MXZ, typename ReducerType, typename Float, typename FloatN > class Reducer, typename write , typename T >

void multiReduceCuda	(	doubleN	result[],
		const reduce::coeff_array< T > &	a,
		const reduce::coeff_array< T > &	b,
		const reduce::coeff_array< T > &	c,
		std::vector< ColorSpinorField *> &	x,
		std::vector< ColorSpinorField *> &	y,
		std::vector< ColorSpinorField *> &	z,
		std::vector< ColorSpinorField *> &	w,
		int	length
	)

Definition at line 370 of file multi_reduce_core.cuh.

References a, Amatrix_d, Amatrix_h, b, quda::blas::blasStrings, Bmatrix_d, Bmatrix_h, quda::blas::bytes, c, checkCudaError, checkSpinor(), Cmatrix_d, Cmatrix_h, errorQuda, quda::blas::flops, quda::blas::getStream(), fused_exterior_ndeg_tm_dslash_cuda_gen::i, length, MAX_MATRIX_SIZE, MAX_MULTI_BLAS_N, memset(), QUDA_MAX_MULTI_REDUCE, quda::reduce(), strcat(), strcpy(), V, w, warningQuda, X, x, y, Z, and z.

Here is the call graph for this function:

◆ multiReduceKernel()

template<int block_size, typename ReduceType , typename FloatN , int M, int NXZ, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename Reducer >

__global__ void multiReduceKernel ( MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > arg )

Definition at line 121 of file multi_reduce_core.cuh.

References quda::arg(), blockDim, fused_exterior_ndeg_tm_dslash_cuda_gen::i, parity, and sum().

Referenced by multiReduceLaunch().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ multiReduceLaunch()

template<typename doubleN , typename ReduceType , typename FloatN , int M, int NXZ, typename SpinorX , typename SpinorY , typename SpinorZ , typename SpinorW , typename Reducer >

void multiReduceLaunch	(	doubleN	result[],
		MultiReduceArg< NXZ, ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, Reducer > &	arg,
		const TuneParam &	tp,
		const cudaStream_t &	stream
	)

Definition at line 190 of file multi_reduce_core.cuh.

References quda::arg(), deviceProp, errorQuda, quda::blas::getHostReduceBuffer(), quda::blas::getMappedHostReduceBuffer(), quda::blas::getReduceEvent(), fused_exterior_ndeg_tm_dslash_cuda_gen::i, LAUNCH_KERNEL_LOCAL_PARITY, multiReduceKernel(), quda::qudaEventQuery(), quda::qudaEventRecord(), qudaMemcpy, stream, and sum().

Here is the call graph for this function:

◆ set() [1/3]

__host__ __device__ double set ( double & x )

inline

Definition at line 1 of file multi_reduce_core.cuh.

References x.

◆ set() [2/3]

__host__ __device__ double2 set ( double2 & x )

inline

Definition at line 2 of file multi_reduce_core.cuh.

References x.

◆ set() [3/3]

__host__ __device__ double3 set ( double3 & x )

inline

Definition at line 3 of file multi_reduce_core.cuh.

References x.

◆ sum() [1/3]

__host__ __device__ void sum	(	double &	a,
		double &	b
	)

inline

Definition at line 4 of file multi_reduce_core.cuh.

References a, and b.

Here is the caller graph for this function:

◆ sum() [2/3]

__host__ __device__ void sum	(	double2 &	a,
		double2 &	b
	)

inline

Definition at line 5 of file multi_reduce_core.cuh.

References a, and b.

◆ sum() [3/3]

__host__ __device__ void sum	(	double3 &	a,
		double3 &	b
	)

inline

Definition at line 6 of file multi_reduce_core.cuh.

References a, and b.

Variable Documentation

◆ Amatrix_d

__constant__ signed char Amatrix_d[MAX_MATRIX_SIZE]

static

Definition at line 66 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

◆ Amatrix_h

signed char* Amatrix_h

static

Definition at line 70 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

◆ Bmatrix_d

__constant__ signed char Bmatrix_d[MAX_MATRIX_SIZE]

static

Definition at line 67 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

◆ Bmatrix_h

signed char* Bmatrix_h

static

Definition at line 71 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

◆ Cmatrix_d

__constant__ signed char Cmatrix_d[MAX_MATRIX_SIZE]

static

Definition at line 68 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

◆ Cmatrix_h

signed char* Cmatrix_h

static

Definition at line 72 of file multi_reduce_core.cuh.

Referenced by multiReduceCuda().

◆ count

__device__ unsigned int count = 0

static

Definition at line 19 of file multi_reduce_core.cuh.

◆ isLastBlockDone

__shared__ bool isLastBlockDone

static

Definition at line 20 of file multi_reduce_core.cuh.

Classes

Namespaces

Macros

Functions

Variables

Macro Definition Documentation

◆ MAX_MATRIX_SIZE

Function Documentation

◆ compute()

◆ multiReduceCuda()

◆ multiReduceKernel()

◆ multiReduceLaunch()

◆ set() [1/3]

◆ set() [2/3]

◆ set() [3/3]

◆ sum() [1/3]

◆ sum() [2/3]

◆ sum() [3/3]

Variable Documentation

◆ Amatrix_d

◆ Amatrix_h

◆ Bmatrix_d

◆ Bmatrix_h

◆ Cmatrix_d

◆ Cmatrix_h

◆ count

◆ isLastBlockDone