__host__ __device__ void zero(double &x) { x = 0.0; }
__host__ __device__ void zero(double2 &x) { x.x = 0.0; x.y = 0.0; }
__host__ __device__ void zero(double3 &x) { x.x = 0.0; x.y = 0.0; x.z = 0.0; }
// copy a (possibly multi-component) value into shared memory, one component every 'block' entries
__device__ void copytoshared(double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile double *s, const int i, const double x, const int block) { s[i] = x; }
__device__ void copytoshared(volatile double *s, const int i, const double2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(volatile double *s, const int i, const double3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copyfromshared(double &x, const double *s, const int i, const int block) { x = s[i]; }
__device__ void copyfromshared(double2 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(double3 &x, const double *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }
// accumulate a shared-memory entry into the running sum, component by component
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceType &sum, ReduceSimpleType *s, const int i, const int block) { }

template<> __device__ void add<double2,double>(double2 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<double3,double>(double3 &sum, double *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }
// in-place accumulation of shared-memory entry j into entry i
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(ReduceSimpleType *s, const int i, const int j, const int block) { }
template<typename ReduceType, typename ReduceSimpleType>
__device__ void add(volatile ReduceSimpleType *s, const int i, const int j, const int block) { }
template<> __device__ void add<double,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; }

template<> __device__ void add<double2,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<double2,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }

template<> __device__ void add<double3,double>(double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<double3,double>(volatile double *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
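// Usage sketch (illustrative only; assumes a power-of-two block size and a double2
// accumulator, neither of which is required by the kernel below): the overloads above
// let a block-level tree reduction be written once, independent of how many components
// the accumulator has:
//
//   extern __shared__ double sdata[];                 // 2*blockDim.x doubles for a double2
//   copytoshared(sdata, threadIdx.x, sum, blockDim.x);
//   __syncthreads();
//   for (int off = blockDim.x/2; off > 0; off /= 2) {
//     if (threadIdx.x < off) add<double2>(sdata, threadIdx.x, threadIdx.x + off, blockDim.x);
//     __syncthreads();
//   }
//   if (threadIdx.x == 0) copyfromshared(sum, sdata, 0, blockDim.x);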
#if (__COMPUTE_CAPABILITY__ < 130)
// double-single ("doublesingle") emulation of the same helpers, used on pre-sm_13
// devices that lack native double precision
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }
__device__ void copytoshared(doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; s[i+2*block] = x.z; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle2 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y; }
__device__ void copytoshared(volatile doublesingle *s, const int i, const doublesingle3 x, const int block)
{ s[i].a.x = x.x.a.x; s[i].a.y = x.x.a.y; s[i+block].a.x = x.y.a.x; s[i+block].a.y = x.y.a.y;
  s[i+2*block].a.x = x.z.a.x; s[i+2*block].a.y = x.z.a.y; }
__device__ void copyfromshared(doublesingle2 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }
__device__ void copyfromshared(doublesingle3 &x, const doublesingle *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; x.z = s[i+2*block]; }

template<> __device__ void add<doublesingle2,doublesingle>(doublesingle2 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; }
template<> __device__ void add<doublesingle3,doublesingle>(doublesingle3 &sum, doublesingle *s, const int i, const int block)
{ sum.x += s[i]; sum.y += s[i+block]; sum.z += s[i+2*block]; }

template<> __device__ void add<doublesingle2,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<doublesingle2,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; }
template<> __device__ void add<doublesingle3,doublesingle>(doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
template<> __device__ void add<doublesingle3,doublesingle>(volatile doublesingle *s, const int i, const int j, const int block)
{ s[i] += s[j]; s[i+block] += s[j+block]; s[i+2*block] += s[j+2*block]; }
#endif // __COMPUTE_CAPABILITY__ < 130
__device__ unsigned int count = 0;
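// 'count' tallies how many thread blocks have published their partial sum; the block that
// brings it to gridDim.x-1 knows every other partial is visible (after __threadfence) and
// performs the final reduction.  A generic sketch of this pattern, with illustrative names
// that are not part of this file:
//
//   __device__ unsigned int blocks_done = 0;
//
//   __global__ void sum_kernel(const double *in, double *partial, double *out, int n) {
//     double s = 0.0;
//     for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x)
//       s += in[i];
//     // ... reduce s across the block into partial[blockIdx.x] ...
//     __shared__ bool last;
//     if (threadIdx.x == 0) {
//       __threadfence();
//       last = (atomicInc(&blocks_done, gridDim.x) == gridDim.x - 1);
//     }
//     __syncthreads();
//     if (last) {
//       // ... reduce partial[0..gridDim.x) across this block into out[0], reset blocks_done ...
//     }
//   }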
template <typename ReduceType, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
struct ReduceArg {
  SpinorX X;
  SpinorY Y;
  SpinorZ Z;
  SpinorW W;
  SpinorV V;
  Reducer r;
  ReduceType *partial;   // one entry per thread block
  ReduceType *complete;  // final reduced value written by the last block
  int length;
  ReduceArg(SpinorX X, SpinorY Y, SpinorZ Z, SpinorW W, SpinorV V, Reducer r,
            ReduceType *partial, ReduceType *complete, int length)
    : X(X), Y(Y), Z(Z), W(W), V(V), r(r), partial(partial), complete(complete), length(length) { ; }
};
template <int block_size, typename ReduceType, typename ReduceSimpleType,
          typename FloatN, int M, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
__global__ void reduceKernel(ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg) {
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  // each thread accumulates a private partial sum over its grid-strided share of the sites
  ReduceType sum;
  zero(sum);
  while (i < arg.length) {
    FloatN x[M], y[M], z[M], w[M], v[M];
    arg.X.load(x, i);
    arg.Y.load(y, i);
    arg.Z.load(z, i);
    arg.W.load(w, i);
    arg.V.load(v, i);

#if (__COMPUTE_CAPABILITY__ >= 200)
    arg.r.pre();
#endif

    for (int j=0; j<M; j++) arg.r(sum, x[j], y[j], z[j], w[j], v[j]);

#if (__COMPUTE_CAPABILITY__ >= 200)
    arg.r.post(sum);
#endif

    arg.X.save(x, i);
    arg.Y.save(y, i);
    arg.Z.save(z, i);
    arg.W.save(w, i);
    arg.V.save(v, i);

    i += gridSize;
  }
#ifndef CUB_REDUCTION
  // first pass: reduce the per-thread sums within this thread block using shared memory
  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;
  __shared__ bool isLastBlockDone;
  if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
  __syncthreads();

  // the first warp rakes over the partial sums of the remaining warps
  if (tid < warpSize) {
    for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

    // warp-synchronous reduction of the remaining warpSize values
    volatile ReduceSimpleType *sv = s;
    copytoshared(sv, 0, sum, block_size);

    if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
    if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
    if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
    if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
    if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }

    // thread 0 publishes this block's partial result and bumps the completion counter
    if (tid == 0) {
      ReduceType blocksum;
      copyfromshared(blocksum, s, 0, block_size);
      arg.partial[blockIdx.x] = blocksum;

      __threadfence();  // make the partial result visible before signalling completion

      unsigned int value = atomicInc(&count, gridDim.x);
      isLastBlockDone = (value == (gridDim.x-1));
    }
  }
  __syncthreads();
  // second pass: the last block reduces the per-block partial sums to the final result
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;
    zero(sum);
    while (i < gridDim.x) {
      sum += arg.partial[i];
      i += block_size;
    }

    extern __shared__ ReduceSimpleType sdata[];
    ReduceSimpleType *s = sdata + tid;
    if (tid >= warpSize) copytoshared(s, 0, sum, block_size);
    __syncthreads();

    if (tid < warpSize) {
      for (int i=warpSize; i<block_size; i+=warpSize) { add<ReduceType>(sum, s, i, block_size); }

      volatile ReduceSimpleType *sv = s;
      copytoshared(sv, 0, sum, block_size);

      if (block_size >= 32) { add<ReduceType>(sv, 0, 16, block_size); }
      if (block_size >= 16) { add<ReduceType>(sv, 0, 8, block_size); }
      if (block_size >= 8) { add<ReduceType>(sv, 0, 4, block_size); }
      if (block_size >= 4) { add<ReduceType>(sv, 0, 2, block_size); }
      if (block_size >= 2) { add<ReduceType>(sv, 0, 1, block_size); }
    }

    // thread 0 writes the final result and resets the counter for the next reduction
    if (threadIdx.x == 0) {
      ReduceType tmp;
      copyfromshared(tmp, s, 0, block_size);
      arg.complete[0] = tmp;
      count = 0;
    }
  }
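  // Note on the reductions above: the first warp rakes the other warps' sums out of shared
  // memory and then folds its own 32 values with the fixed-offset cascade (16, 8, 4, 2, 1).
  // No __syncthreads() is needed inside the cascade because the participating threads belong
  // to a single warp and execute in lock-step on the hardware this file targets; the volatile
  // pointer keeps the compiler from caching the shared-memory values in registers between steps.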
#else // use cub::BlockReduce for the intra-block reduction instead
  typedef cub::BlockReduce<ReduceType, block_size> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  __shared__ bool isLastBlockDone;

  sum = BlockReduce(temp_storage).Sum(sum);

  if (threadIdx.x == 0) {
    arg.partial[blockIdx.x] = sum;
    __threadfence();
    unsigned int value = atomicInc(&count, gridDim.x);
    isLastBlockDone = (value == (gridDim.x-1));
  }
  __syncthreads();

  // the last block reduces the per-block partials to the final result
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;
    zero(sum);
    while (i < gridDim.x) { sum += arg.partial[i]; i += block_size; }
    sum = BlockReduce(temp_storage).Sum(sum);
    if (threadIdx.x == 0) { arg.complete[0] = sum; count = 0; }
  }
#endif // CUB_REDUCTION
}
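// Minimal standalone sketch of the cub::BlockReduce pattern used above (illustrative only;
// the kernel and buffer names here are assumptions, not part of this file):
//
//   #include <cub/cub.cuh>
//
//   template <int BLOCK>
//   __global__ void block_sum(const double *in, double *out, int n) {
//     typedef cub::BlockReduce<double, BLOCK> BlockReduce;
//     __shared__ typename BlockReduce::TempStorage tmp;
//     int i = blockIdx.x*BLOCK + threadIdx.x;
//     double v = (i < n) ? in[i] : 0.0;
//     double s = BlockReduce(tmp).Sum(v);        // aggregate is valid in thread 0 only
//     if (threadIdx.x == 0) out[blockIdx.x] = s;
//   }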
// launch the reduction kernel for the block size chosen by the autotuner
template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename SpinorV, typename Reducer>
doubleN reduceLaunch(ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> &arg,
                     const TuneParam &tp, const cudaStream_t &stream) {
  switch (tp.block.x) {
  case 32:
    reduceKernel<32,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 64:
    reduceKernel<64,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 96:
    reduceKernel<96,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 128:
    reduceKernel<128,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 160:
    reduceKernel<160,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 192:
    reduceKernel<192,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 224:
    reduceKernel<224,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 256:
    reduceKernel<256,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 288:
    reduceKernel<288,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 320:
    reduceKernel<320,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 352:
    reduceKernel<352,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 384:
    reduceKernel<384,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 416:
    reduceKernel<416,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 448:
    reduceKernel<448,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 480:
    reduceKernel<480,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 512:
    reduceKernel<512,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 544:
    reduceKernel<544,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 576:
    reduceKernel<576,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 608:
    reduceKernel<608,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 640:
    reduceKernel<640,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 672:
    reduceKernel<672,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 704:
    reduceKernel<704,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 736:
    reduceKernel<736,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 768:
    reduceKernel<768,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 800:
    reduceKernel<800,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 832:
    reduceKernel<832,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 864:
    reduceKernel<864,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 896:
    reduceKernel<896,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 928:
    reduceKernel<928,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 960:
    reduceKernel<960,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 992:
    reduceKernel<992,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  case 1024:
    reduceKernel<1024,ReduceType,ReduceSimpleType,FloatN,M>
      <<< tp.grid, tp.block, tp.shared_bytes, stream >>>(arg);
    break;
  default:
    errorQuda("Reduction not implemented for %d threads", tp.block.x);
  }
  // if the result buffer lives in mapped host memory we only need to wait for the kernel;
  // otherwise copy the result back explicitly
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
  if (deviceProp.canMapHostMemory) {
    cudaEventRecord(reduceEnd, stream);
    while (cudaSuccess != cudaEventQuery(reduceEnd)) { ; }
  } else
#endif
  { cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType), cudaMemcpyDeviceToHost); }

  doubleN cpu_sum;
  zero(cpu_sum);
  cpu_sum += ((ReduceType*)h_reduce)[0];

  // combine the partial results over all GPUs / processes
  const int Nreduce = sizeof(doubleN)/sizeof(double);
  reduceDoubleArray((double*)&cpu_sum, Nreduce);

  return cpu_sum;
}
template <typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
          int M, typename SpinorX, typename SpinorY, typename SpinorZ,
          typename SpinorW, typename SpinorV, typename Reducer>
class ReduceCuda : public Tunable {

private:
  ReduceArg<ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg;
  doubleN &result;

  // host backups of the field data, used to restore state between tuning launches
  char *X_h, *Y_h, *Z_h, *W_h, *V_h;
  char *Xnorm_h, *Ynorm_h, *Znorm_h, *Wnorm_h, *Vnorm_h;
  // shared memory: one ReduceType entry per thread, but never less than two warps' worth
  unsigned int sharedBytesPerThread() const { return sizeof(ReduceType); }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    return 2*warpSize*sizeof(ReduceType);
  }

  // keep the shared-memory request tied to the block size instead of tuning it independently
  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // look ahead to the next candidate block size
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
    return false;
  }
public:
  ReduceCuda(doubleN &result, SpinorX &X, SpinorY &Y, SpinorZ &Z,
             SpinorW &W, SpinorV &V, Reducer &r, int length) :
    arg(X, Y, Z, W, V, r, (ReduceType*)d_reduce, (ReduceType*)hd_reduce, length),
    result(result), X_h(0), Y_h(0), Z_h(0), W_h(0), V_h(0),
    Xnorm_h(0), Ynorm_h(0), Znorm_h(0), Wnorm_h(0), Vnorm_h(0)
  { }

  TuneKey tuneKey() const {
    std::stringstream vol, aux;
    vol << blasConstants.x[0] << "x";
    vol << blasConstants.x[1] << "x";
    vol << blasConstants.x[2] << "x";
    vol << blasConstants.x[3];
    aux << "stride=" << blasConstants.stride << ",prec=" << arg.X.Precision();
    return TuneKey(vol.str(), typeid(arg.r).name(), aux.str());
  }
  void apply(const cudaStream_t &stream) {
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    result = reduceLaunch<doubleN,ReduceType,ReduceSimpleType,FloatN,M>(arg, tp, stream);
  }

  // back up the field data before tuning ...
  void preTune() {
    size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*arg.X.Stride();
    size_t norm_bytes = (arg.X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*arg.length : 0;
    arg.X.save(&X_h, &Xnorm_h, bytes, norm_bytes);
    arg.Y.save(&Y_h, &Ynorm_h, bytes, norm_bytes);
    arg.Z.save(&Z_h, &Znorm_h, bytes, norm_bytes);
    arg.W.save(&W_h, &Wnorm_h, bytes, norm_bytes);
    arg.V.save(&V_h, &Vnorm_h, bytes, norm_bytes);
  }

  // ... and restore it afterwards
  void postTune() {
    size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*arg.X.Stride();
    size_t norm_bytes = (arg.X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*arg.length : 0;
    arg.X.load(&X_h, &Xnorm_h, bytes, norm_bytes);
    arg.Y.load(&Y_h, &Ynorm_h, bytes, norm_bytes);
    arg.Z.load(&Z_h, &Znorm_h, bytes, norm_bytes);
    arg.W.load(&W_h, &Wnorm_h, bytes, norm_bytes);
    arg.V.load(&V_h, &Vnorm_h, bytes, norm_bytes);
  }
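  // Tuning note: tuneLaunch() times apply() under different launch configurations, and
  // preTune()/postTune() above bracket those trial runs by saving and restoring the field
  // data, so reducers that also write to their operands do not corrupt the fields while
  // being benchmarked.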
  long long flops() const { return arg.r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*arg.length*M; }
  long long bytes() const {
    size_t bytes = arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
    return arg.r.streams()*bytes*arg.length;
  }
};
// driver: dispatch on field precision and spin, build the accessors and launch the reduction
template <typename doubleN, typename ReduceType, typename ReduceSimpleType,
          template <typename ReducerType, typename Float, typename FloatN> class Reducer,
          int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
doubleN reduceCuda(const double2 &a, const double2 &b, cudaColorSpinorField &x,
                   cudaColorSpinorField &y, cudaColorSpinorField &z, cudaColorSpinorField &w,
                   cudaColorSpinorField &v) {
  // for full (even-odd) fields, reduce each parity separately and add the results
  if (x.SiteSubset() == QUDA_FULL_SITE_SUBSET) {
    doubleN even =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
                 writeY,writeZ,writeW,writeV,siteUnroll>
      (a, b, x.Even(), y.Even(), z.Even(), w.Even(), v.Even());
    doubleN odd =
      reduceCuda<doubleN,ReduceType,ReduceSimpleType,Reducer,writeX,
                 writeY,writeZ,writeW,writeV,siteUnroll>
      (a, b, x.Odd(), y.Odd(), z.Odd(), w.Odd(), v.Odd());
    return even + odd;
  }
  if (!x.isNative()) warningQuda("Reductions on non-native fields is not supported\n");

  // stash the field geometry used to build the tuning key
  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = x.X()[d];
  blasConstants.stride = x.Stride();

  int reduce_length = siteUnroll ? x.RealLength() : x.Length();

  doubleN value;
  zero(value);

  if (x.Precision() == QUDA_DOUBLE_PRECISION) {
    if (x.Nspin() == 4) { // Wilson-like fields: 12 complex numbers per site
      const int M = siteUnroll ? 12 : 1;
      Spinor<double2,double2,double2,M,writeX> X(x);
      Spinor<double2,double2,double2,M,writeY> Y(y);
      Spinor<double2,double2,double2,M,writeZ> Z(z);
      Spinor<double2,double2,double2,M,writeW> W(w);
      Spinor<double2,double2,double2,M,writeV> V(v);
      Reducer<ReduceType, double2, double2> r(a,b);
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
        Spinor<double2,double2,double2,M,writeX>, Spinor<double2,double2,double2,M,writeY>,
        Spinor<double2,double2,double2,M,writeZ>, Spinor<double2,double2,double2,M,writeW>,
        Spinor<double2,double2,double2,M,writeV>, Reducer<ReduceType,double2,double2> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
    } else if (x.Nspin() == 1) { // staggered fields: 3 complex numbers per site
      const int M = siteUnroll ? 3 : 1;
      Spinor<double2,double2,double2,M,writeX> X(x);
      Spinor<double2,double2,double2,M,writeY> Y(y);
      Spinor<double2,double2,double2,M,writeZ> Z(z);
      Spinor<double2,double2,double2,M,writeW> W(w);
      Spinor<double2,double2,double2,M,writeV> V(v);
      Reducer<ReduceType, double2, double2> r(a,b);
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,double2,M,
        Spinor<double2,double2,double2,M,writeX>, Spinor<double2,double2,double2,M,writeY>,
        Spinor<double2,double2,double2,M,writeZ>, Spinor<double2,double2,double2,M,writeW>,
        Spinor<double2,double2,double2,M,writeV>, Reducer<ReduceType,double2,double2> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
    } else {
      errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
    }
  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
    if (x.Nspin() == 4) { // wilson
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
      const int M = siteUnroll ? 6 : 1;
      Spinor<float4,float4,float4,M,writeX,0> X(x);
      Spinor<float4,float4,float4,M,writeY,1> Y(y);
      Spinor<float4,float4,float4,M,writeZ,2> Z(z);
      Spinor<float4,float4,float4,M,writeW,3> W(w);
      Spinor<float4,float4,float4,M,writeV,4> V(v);
      Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,M,
        Spinor<float4,float4,float4,M,writeX,0>, Spinor<float4,float4,float4,M,writeY,1>,
        Spinor<float4,float4,float4,M,writeZ,2>, Spinor<float4,float4,float4,M,writeW,3>,
        Spinor<float4,float4,float4,M,writeV,4>, Reducer<ReduceType,float2,float4> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(4*M));
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else if (x.Nspin() == 1) { // staggered
#ifdef GPU_STAGGERED_DIRAC
      const int M = siteUnroll ? 3 : 1;
      Spinor<float2,float2,float2,M,writeX,0> X(x);
      Spinor<float2,float2,float2,M,writeY,1> Y(y);
      Spinor<float2,float2,float2,M,writeZ,2> Z(z);
      Spinor<float2,float2,float2,M,writeW,3> W(w);
      Spinor<float2,float2,float2,M,writeV,4> V(v);
      Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,M,
        Spinor<float2,float2,float2,M,writeX,0>, Spinor<float2,float2,float2,M,writeY,1>,
        Spinor<float2,float2,float2,M,writeZ,2>, Spinor<float2,float2,float2,M,writeW,3>,
        Spinor<float2,float2,float2,M,writeV,4>, Reducer<ReduceType,float2,float2> >
        reduce(value, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else {
      errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
    }
  } else { // half precision
    if (x.Nspin() == 4) { // wilson
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
      Spinor<float4,float4,short4,6,writeX,0> X(x);
      Spinor<float4,float4,short4,6,writeY,1> Y(y);
      Spinor<float4,float4,short4,6,writeZ,2> Z(z);
      Spinor<float4,float4,short4,6,writeW,3> W(w);
      Spinor<float4,float4,short4,6,writeV,4> V(v);
      Reducer<ReduceType, float2, float4> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float4,6,
        Spinor<float4,float4,short4,6,writeX,0>, Spinor<float4,float4,short4,6,writeY,1>,
        Spinor<float4,float4,short4,6,writeZ,2>, Spinor<float4,float4,short4,6,writeW,3>,
        Spinor<float4,float4,short4,6,writeV,4>, Reducer<ReduceType,float2,float4> >
        reduce(value, X, Y, Z, W, V, r, y.Volume());
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else if (x.Nspin() == 1) { // staggered
#ifdef GPU_STAGGERED_DIRAC
      Spinor<float2,float2,short2,3,writeX,0> X(x);
      Spinor<float2,float2,short2,3,writeY,1> Y(y);
      Spinor<float2,float2,short2,3,writeZ,2> Z(z);
      Spinor<float2,float2,short2,3,writeW,3> W(w);
      Spinor<float2,float2,short2,3,writeV,4> V(v);
      Reducer<ReduceType, float2, float2> r(make_float2(a.x, a.y), make_float2(b.x, b.y));
      ReduceCuda<doubleN,ReduceType,ReduceSimpleType,float2,3,
        Spinor<float2,float2,short2,3,writeX,0>, Spinor<float2,float2,short2,3,writeY,1>,
        Spinor<float2,float2,short2,3,writeZ,2>, Spinor<float2,float2,short2,3,writeW,3>,
        Spinor<float2,float2,short2,3,writeV,4>, Reducer<ReduceType,float2,float2> >
        reduce(value, X, Y, Z, W, V, r, y.Volume());
      reduce.apply(*getBlasStream());
#else
      errorQuda("blas has not been built for Nspin=%d fields", x.Nspin());
#endif
    } else {
      errorQuda("ERROR: nSpin=%d is not supported\n", x.Nspin());
    }
  }
  blas_flops += Reducer<ReduceType,double2,double2>::flops()*(unsigned long long)x.RealLength();

  return value;
}
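// Call-chain summary (for orientation): reduceCuda() dispatches on field precision and spin
// and builds the Spinor accessors and the Reducer functor; ReduceCuda::apply() asks the
// autotuner for a launch configuration; reduceLaunch() selects the matching reduceKernel
// instantiation for that block size; and the kernel accumulates per-thread sums, reduces
// them per block, with the last block combining the per-block partials into the final
// doubleN result.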