// Convert a completed reduction result to its plain-double form.
// Identity for the scalar double case: simply returns the input value.
__host__ __device__ inline double set(double &x)
{
  return x;
}
// Convert a completed reduction result to its plain-double form.
// Identity for double2: returns a copy of the two-component value.
__host__ __device__ inline double2 set(double2 &x)
{
  return x;
}
// Convert a completed reduction result to its plain-double form.
// Identity for double3: returns a copy of the three-component value.
__host__ __device__ inline double3 set(double3 &x)
{
  return x;
}
// Convert a completed reduction result to its plain-double form.
// Identity for double4: returns a copy of the four-component value.
__host__ __device__ inline double4 set(double4 &x)
{
  return x;
}
// In-place accumulation for scalar doubles: a += b.
// NOTE(review): b is never modified and could be taken by const reference.
__host__ __device__ inline void sum(double &a, double &b)
{
  a += b;
}
// Component-wise in-place accumulation for double2: a.{x,y} += b.{x,y}.
__host__ __device__ inline void sum(double2 &a, double2 &b)
{
  a.x += b.x;
  a.y += b.y;
}
// Component-wise in-place accumulation for double3: a.{x,y,z} += b.{x,y,z}.
__host__ __device__ inline void sum(double3 &a, double3 &b)
{
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
}
// Component-wise in-place accumulation for double4: a.{x,y,z,w} += b.{x,y,z,w}.
__host__ __device__ inline void sum(double4 &a, double4 &b)
{
  a.x += b.x;
  a.y += b.y;
  a.z += b.z;
  a.w += b.w;
}
// Truncate an extended-precision (double-double) reduction result to a
// plain double by keeping only its high-order word via head().
__host__ __device__ inline double set(doubledouble &a)
{
  return a.head();
}
// Truncate each component of a doubledouble2 to plain double precision,
// packing the two head() words into a double2.
__host__ __device__ inline double2 set(doubledouble2 &a)
{
  return make_double2(a.x.head(), a.y.head());
}
// Truncate each component of a doubledouble3 to plain double precision,
// packing the three head() words into a double3.
__host__ __device__ inline double3 set(doubledouble3 &a)
{
  return make_double3(a.x.head(), a.y.head(), a.z.head());
}
// Accumulate a doubledouble2 into a double2, component-wise, keeping only
// the high-order (head) word of each extended-precision component.
__host__ __device__ inline void sum(double2 &a, doubledouble2 &b)
{
  a.x += b.x.head();
  a.y += b.y.head();
}
// Accumulate a doubledouble3 into a double3, component-wise, keeping only
// the high-order (head) word of each extended-precision component.
__host__ __device__ inline void sum(double3 &a, doubledouble3 &b)
{
  a.x += b.x.head();
  a.y += b.y.head();
  a.z += b.z.head();
}
// Device-global counter, zero-initialised at module load.
// NOTE(review): presumably the block-completion counter of the classic
// two-pass ("last block done") reduction pattern, paired with the
// isLastBlockDone shared flag declared elsewhere -- confirm in the
// reduction kernel, which is not fully visible here.
__device__ static unsigned int count = 0;
24 template <
typename ReduceType,
typename SpinorX,
typename SpinorY,
25 typename SpinorZ,
typename SpinorW,
typename SpinorV,
typename Reducer>
41 template <
int block_size,
typename ReduceType,
typename FloatN,
int M,
typename SpinorX,
42 typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename SpinorV,
typename Reducer>
45 unsigned int i = blockIdx.x*
blockDim.x + threadIdx.x;
46 unsigned int parity = blockIdx.y;
52 while (
i <
arg.length) {
53 FloatN
x[M],
y[M],
z[M],
w[M], v[M];
63 for (
int j=0; j<M; j++)
arg.r(
sum,
x[j],
y[j],
z[j],
w[j], v[j]);
76 ::quda::reduce<block_size, ReduceType>(
arg,
sum,
parity);
83 template <
typename doubleN,
typename ReduceType,
typename FloatN,
int M,
typename SpinorX,
84 typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename SpinorV,
typename Reducer>
86 const TuneParam &tp,
const cudaStream_t &
stream) {
87 if (tp.grid.x > (
unsigned int)
deviceProp.maxGridSize[0])
93 #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) 101 doubleN cpu_sum =
set(((ReduceType*)
h_reduce)[0]);
102 if (tp.grid.y==2)
sum(cpu_sum, ((ReduceType*)
h_reduce)[1]);
107 template <
typename doubleN,
typename ReduceType,
typename FloatN,
int M,
typename SpinorX,
108 typename SpinorY,
typename SpinorZ,
typename SpinorW,
typename SpinorV,
typename Reducer>
128 TuneParam next(
param);
129 advanceBlockDim(next);
130 int nthreads = next.block.x * next.block.y * next.block.z;
139 const size_t *
bytes,
const size_t *norm_bytes) :
172 Tunable::initTuneParam(
param);
177 Tunable::defaultTuneParam(
param);
195 return ((
arg.r.streams()-2)*base_bytes + 2*extra_bytes)*
arg.length*
nParity;
202 template <
typename doubleN,
typename ReduceType,
typename RegType,
typename StoreType,
typename zType,
203 int M,
template <
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
204 int writeX,
int writeY,
int writeZ,
int writeW,
int writeV>
206 ColorSpinorField &
x, ColorSpinorField &
y,
207 ColorSpinorField &
z, ColorSpinorField &
w,
208 ColorSpinorField &v,
int length) {
213 warningQuda(
"Device reductions on non-native fields is not supported\n");
221 if (
typeid(StoreType) !=
typeid(zType)) {
226 size_t bytes[] = {
x.Bytes(),
y.Bytes(),
z.Bytes(),
w.Bytes()};
227 size_t norm_bytes[] = {
x.NormBytes(),
y.NormBytes(),
z.NormBytes(),
w.NormBytes()};
235 typedef typename scalar<RegType>::type Float;
238 Reducer<ReduceType, Float2, RegType> r((Float2)vec2(
a), (Float2)vec2(
b));
241 int partitions = (
x.IsComposite() ?
x.CompositeDim() : 1) * (
x.SiteSubset());
248 Reducer<ReduceType, Float2, RegType> >
249 reduce(
value,
X, Y,
Z, W,
V, r,
length, partitions,
bytes, norm_bytes);
266 template <
typename ReduceType,
typename Float2,
int writeX,
int writeY,
int writeZ,
267 int writeW,
int writeV,
typename SpinorX,
typename SpinorY,
typename SpinorZ,
268 typename SpinorW,
typename SpinorV,
typename Reducer>
269 ReduceType
genericReduce(SpinorX &
X, SpinorY &Y, SpinorZ &
Z, SpinorW &W, SpinorV &
V, Reducer r) {
275 for (
int x=0;
x<
X.VolumeCB();
x++) {
277 for (
int s=0;
s<
X.Nspin();
s++) {
278 for (
int c=0;
c<
X.Ncolor();
c++) {
279 Float2 X2 = make_Float2<Float2>(
X(
parity,
x,
s,
c) );
280 Float2 Y2 = make_Float2<Float2>( Y(
parity,
x,
s,
c) );
281 Float2 Z2 = make_Float2<Float2>(
Z(
parity,
x,
s,
c) );
282 Float2 W2 = make_Float2<Float2>( W(
parity,
x,
s,
c) );
283 Float2 V2 = make_Float2<Float2>(
V(
parity,
x,
s,
c) );
284 r(
sum, X2, Y2, Z2, W2, V2);
// Primary template mapping a scalar type and a width N to a vector type.
// Deliberately defines no members; explicit specialisations (presumably
// declared elsewhere in this file -- not visible in this excerpt) supply
// the actual type mapping.
template <typename, int N> struct vector {
};
303 template <
typename ReduceType,
typename Float,
typename zFloat,
int nSpin,
int nColor,
QudaFieldOrder order,
304 int writeX,
int writeY,
int writeZ,
int writeW,
int writeV,
typename R>
306 ColorSpinorField &
w, ColorSpinorField &v,
R r) {
307 colorspinor::FieldOrderCB<Float,nSpin,nColor,1,order>
X(
x), Y(
y), W(
w),
V(v);
308 colorspinor::FieldOrderCB<zFloat,nSpin,nColor,1,order>
Z(
z);
310 return genericReduce<ReduceType,Float2,writeX,writeY,writeZ,writeW,writeV>(
X, Y,
Z, W,
V, r);
313 template <
typename ReduceType,
typename Float,
typename zFloat,
int nSpin,
QudaFieldOrder order,
314 int writeX,
int writeY,
int writeZ,
int writeW,
int writeV,
typename R>
316 ColorSpinorField &
w, ColorSpinorField &v,
R r) {
318 if (
x.Ncolor() == 2) {
319 value = genericReduce<ReduceType,Float,zFloat,nSpin,2,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
320 }
else if (
x.Ncolor() == 3) {
321 value = genericReduce<ReduceType,Float,zFloat,nSpin,3,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
322 }
else if (
x.Ncolor() == 4) {
323 value = genericReduce<ReduceType,Float,zFloat,nSpin,4,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
324 }
else if (
x.Ncolor() == 6) {
325 value = genericReduce<ReduceType,Float,zFloat,nSpin,6,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
326 }
else if (
x.Ncolor() == 8) {
327 value = genericReduce<ReduceType,Float,zFloat,nSpin,8,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
328 }
else if (
x.Ncolor() == 12) {
329 value = genericReduce<ReduceType,Float,zFloat,nSpin,12,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
330 }
else if (
x.Ncolor() == 16) {
331 value = genericReduce<ReduceType,Float,zFloat,nSpin,16,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
332 }
else if (
x.Ncolor() == 20) {
333 value = genericReduce<ReduceType,Float,zFloat,nSpin,20,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
334 }
else if (
x.Ncolor() == 24) {
335 value = genericReduce<ReduceType,Float,zFloat,nSpin,24,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
336 }
else if (
x.Ncolor() == 32) {
337 value = genericReduce<ReduceType,Float,zFloat,nSpin,32,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
338 }
else if (
x.Ncolor() == 72) {
339 value = genericReduce<ReduceType,Float,zFloat,nSpin,72,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
340 }
else if (
x.Ncolor() == 576) {
341 value = genericReduce<ReduceType,Float,zFloat,nSpin,576,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
344 errorQuda(
"nColor = %d not implemeneted",
x.Ncolor());
349 template <
typename ReduceType,
typename Float,
typename zFloat,
QudaFieldOrder order,
350 int writeX,
int writeY,
int writeZ,
int writeW,
int writeV,
typename R>
351 ReduceType
genericReduce(ColorSpinorField &
x, ColorSpinorField &
y, ColorSpinorField &
z, ColorSpinorField &
w, ColorSpinorField &v,
R r) {
354 if (
x.Nspin() == 4) {
355 value = genericReduce<ReduceType,Float,zFloat,4,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
356 }
else if (
x.Nspin() == 2) {
357 value = genericReduce<ReduceType,Float,zFloat,2,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
358 #ifdef GPU_STAGGERED_DIRAC 359 }
else if (
x.Nspin() == 1) {
360 value = genericReduce<ReduceType,Float,zFloat,1,order,writeX,writeY,writeZ,writeW,writeV,R>(
x,
y,
z,
w, v, r);
363 errorQuda(
"nSpin = %d not implemeneted",
x.Nspin());
368 template <
typename doubleN,
typename ReduceType,
typename Float,
typename zFloat,
369 int writeX,
int writeY,
int writeZ,
int writeW,
int writeV,
typename R>
371 ColorSpinorField &
w, ColorSpinorField &v,
R r) {
375 value = genericReduce<ReduceType,Float,zFloat,QUDA_SPACE_SPIN_COLOR_FIELD_ORDER,writeX,writeY,writeZ,writeW,writeV,R>
378 warningQuda(
"CPU reductions not implemeneted for %d field order",
x.FieldOrder());
#define qudaMemcpy(dst, src, count, kind)
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
virtual bool advanceSharedBytes(TuneParam ¶m) const
bool commAsyncReduction()
cudaError_t qudaEventQuery(cudaEvent_t &event)
Wrapper around cudaEventQuery or cuEventQuery.
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
doubleN reduceLaunch(ReductionArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > &arg, const TuneParam &tp, const cudaStream_t &stream)
enum QudaFieldOrder_s QudaFieldOrder
static __shared__ bool isLastBlockDone
__global__ void reduceKernel(ReductionArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > arg)
void checkLength(const ColorSpinorField &a, ColorSpinorField &b)
char * strcpy(char *__dst, const char *__src)
ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer &r, int length, int nParity, const size_t *bytes, const size_t *norm_bytes)
char * strcat(char *__s1, const char *__s2)
void apply(const cudaStream_t &stream)
void initTuneParam(TuneParam ¶m) const
complex< double > make_Complex(const double2 &a)
cudaStream_t * getStream()
static cudaEvent_t reduceEnd
const size_t * norm_bytes_
ReductionArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r, int length)
static struct quda::blas::@4 blasStrings
ReductionArg< ReduceType, SpinorX, SpinorY, SpinorZ, SpinorW, SpinorV, Reducer > arg
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
doubleN reduceCuda(const double2 &a, const double2 &b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, ColorSpinorField &v, int length)
void zero(ColorSpinorField &a)
#define LAUNCH_KERNEL(kernel, tp, stream, arg,...)
unsigned int sharedBytesPerThread() const
static __device__ unsigned int count
__host__ __device__ void sum(double &a, double &b)
ReduceType genericReduce(SpinorX &X, SpinorY &Y, SpinorZ &Z, SpinorW &W, SpinorV &V, Reducer r)
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
static QudaSumFloat * h_reduce
void defaultTuneParam(TuneParam ¶m) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static QudaSumFloat * hd_reduce