// NOTE(review): the region below is extraction-damaged. Two alternative
// precision blocks (double-double vs plain double definitions of the
// QudaSumFloat reduction types) and the #if guard that lowers
// MAX_MULTI_BLAS_N on pre-sm_30 devices were fused onto one line, and the
// signature of checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
// — which owns the length/stride compatibility checks below — is missing.
// Restore the original structure from the upstream file before compiling.
23 #define QudaSumFloat doubledouble 24 #define QudaSumFloat2 doubledouble2 25 #define QudaSumFloat3 doubledouble3 31 #define QudaSumFloat double 32 #define QudaSumFloat2 double2 33 #define QudaSumFloat3 double3 37 #if (__COMPUTE_CAPABILITY__ < 300) 38 #undef MAX_MULTI_BLAS_N 39 #define MAX_MULTI_BLAS_N 2 43 if (
// Reject operands whose logical lengths differ.
a.Length() !=
b.Length())
44 errorQuda(
"lengths do not match: %lu %lu",
a.Length(),
b.Length());
// Reject operands whose memory strides differ.
45 if (
a.Stride() !=
b.Stride())
46 errorQuda(
"strides do not match: %d %d",
a.Stride(),
b.Stride());
// Autotuner cache type: maps a kernel's TuneKey to its best-found launch
// parameters.
typedef std::map<TuneKey, TuneParam> map;
/**
   @brief Compile-time write-back policy for the four spinor arguments
   (x, y, z, w) of a multi-blas/multi-reduce kernel; each flag is 0 or 1.

   NOTE(review): the `struct write {` header line and the closing brace were
   lost in extraction; they are reconstructed here from the `write<0,0,0,0>`
   instantiations visible later in this file — confirm against the original.
*/
template <int writeX, int writeY, int writeZ, int writeW> struct write {
  static constexpr int X = writeX; // write-back flag for x
  static constexpr int Y = writeY; // write-back flag for y
  static constexpr int Z = writeZ; // write-back flag for z
  static constexpr int W = writeW; // write-back flag for w
};
82 #define BLAS_SPINOR // do not include ghost functions in Spinor class to reduce parameter space overhead 94 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
//! Pre-computation hook called before the "M-loop"; default is a no-op.
virtual __device__ __host__ void pre() {}
//! Main reduction body: accumulates the per-site contribution of the
//! (i, j)-th vector pair into sum; implemented by each concrete reducer.
// NOTE(review): the original declaration carried a duplicated __host__
// execution-space qualifier (`__device__ __host__ __host__`); the redundant
// copy is removed here.
virtual __device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j) = 0;
//! Post-computation hook called after the "M-loop"; default is a no-op.
virtual __device__ __host__ void post(ReduceType &sum) {}
/**
   @brief Accumulate the real dot product of two double2 operands into sum.

   Components are promoted to ReduceType before multiplying, so the
   accumulation runs in the (possibly wider) reduction precision.
   NOTE(review): closing brace lost in extraction; restored here.
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
{
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
}
/**
   @brief Accumulate the real dot product of two float2 operands into sum,
   promoting each component to ReduceType first.
   NOTE(review): closing brace lost in extraction; restored here.
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float2 &a, const float2 &b)
{
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
}
/**
   @brief Accumulate the real dot product of two float4 operands into sum,
   promoting each component to ReduceType first.  Accumulation order
   (x, y, z, w) matches the other overloads.
   NOTE(review): closing brace lost in extraction; restored here.
*/
template <typename ReduceType>
__device__ __host__ void dot_(ReduceType &sum, const float4 &a, const float4 &b)
{
  sum += static_cast<ReduceType>(a.x) * static_cast<ReduceType>(b.x);
  sum += static_cast<ReduceType>(a.y) * static_cast<ReduceType>(b.y);
  sum += static_cast<ReduceType>(a.z) * static_cast<ReduceType>(b.z);
  sum += static_cast<ReduceType>(a.w) * static_cast<ReduceType>(b.w);
}
131 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the coefficient arrays are not used by a plain dot product;
// only the multi-blas width NYW is retained.
Dot(const reduce::coeff_array<Complex> &a, const reduce::coeff_array<Complex> &b,
    const reduce::coeff_array<Complex> &c, int NYW) : NYW(NYW) {}
// Accumulate the real inner product of x and y into sum; z, w and the tile
// indices (i, j) are unused by this reducer.
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
{
  dot_<ReduceType>(sum, x, y);
}
// Multi-vector real dot product: fills result with the x.size()*y.size()
// real inner products between the vector sets x and y.
// NOTE(review): the dispatch scaffolding is missing from this extract — the
// eight multiReduceCuda instantiations below (template argument NXZ = 1..8)
// are presumably the cases of a switch on the vector-set size, each guarded
// by #if MAX_MULTI_BLAS_N >= NXZ; verify against the original file.
142 void reDotProduct(
double* result, std::vector<ColorSpinorField*>&
x, std::vector<ColorSpinorField*>&
y){
148 reduce::multiReduceCuda<1,double,QudaSumFloat,Dot,0,0,0,0,false>
149 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
152 reduce::multiReduceCuda<2,double,QudaSumFloat,Dot,0,0,0,0,false>
153 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
156 reduce::multiReduceCuda<3,double,QudaSumFloat,Dot,0,0,0,0,false>
157 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
160 reduce::multiReduceCuda<4,double,QudaSumFloat,Dot,0,0,0,0,false>
161 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
164 reduce::multiReduceCuda<5,double,QudaSumFloat,Dot,0,0,0,0,false>
165 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
168 reduce::multiReduceCuda<6,double,QudaSumFloat,Dot,0,0,0,0,false>
169 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
172 reduce::multiReduceCuda<7,double,QudaSumFloat,Dot,0,0,0,0,false>
173 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
176 reduce::multiReduceCuda<8,double,QudaSumFloat,Dot,0,0,0,0,false>
177 (result, make_double2(0.0, 0.0), make_double2(0.0, 0.0),
x,
y,
x,
y);
// Total number of scalar values to be globally reduced across the grid.
217 const int Nreduce =
x.size()*
y.size();
// Complex dot-product accumulators, one overload per vector type —
// presumably the complex analogue of dot_ above (accumulating the complex
// inner product of a and b into the two-component sum) — verify against the
// original file.
// NOTE(review): the function bodies were lost in extraction; only the
// signatures survive below. Restore the bodies from the upstream source.
225 template<
typename ReduceType>
226 __device__ __host__
void cdot_(ReduceType &
sum,
const double2 &
a,
const double2 &
b) {
234 template<
typename ReduceType>
235 __device__ __host__
void cdot_(ReduceType &
sum,
const float2 &
a,
const float2 &
b) {
243 template<
typename ReduceType>
244 __device__ __host__
void cdot_(ReduceType &
sum,
const float4 &
a,
const float4 &
b) {
256 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the coefficient arrays are not used by the complex dot
// product; only the multi-blas width NYW is retained.
Cdot(const reduce::coeff_array<Complex> &a, const reduce::coeff_array<Complex> &b,
     const reduce::coeff_array<Complex> &c, int NYW) : NYW(NYW) {}
// Accumulate the complex inner product of x and y into sum; z, w and the
// tile indices (i, j) are unused by this reducer.
__device__ __host__ inline void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
{
  cdot_<ReduceType>(sum, x, y);
}
267 template <
int NXZ,
typename ReduceType,
typename Float2,
typename FloatN>
// Constructor: the coefficient arrays are not used; only the multi-blas
// width NYW is retained.
CdotCopy(const reduce::coeff_array<Complex> &a, const reduce::coeff_array<Complex> &b,
         const reduce::coeff_array<Complex> &c, int NYW) : NYW(NYW) {}
// Accumulate the complex inner product of x and y into sum; additionally,
// on the diagonal of the tile (i == j), copy y into w.
__device__ __host__ inline void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
{
  cdot_<ReduceType>(sum, x, y);
  if (i == j) w = y;
}
280 template <
template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerDiagonal,
typename writeDiagonal,
281 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerOffDiagonal,
typename writeOffDiagonal>
283 std::vector<ColorSpinorField*>&
z, std::vector<ColorSpinorField*>&
w,
int i_idx,
int j_idx,
bool hermitian,
unsigned int tile_size) {
285 if (
y.size() > tile_size)
289 Complex* result1 = &result[
x.size()*(
y.size()/2)];
290 std::vector<ColorSpinorField*>
y0(
y.begin(),
y.begin() +
y.size()/2);
291 std::vector<ColorSpinorField*>
y1(
y.begin() +
y.size()/2,
y.end());
292 std::vector<ColorSpinorField*> w0(
w.begin(),
w.begin() +
w.size()/2);
293 std::vector<ColorSpinorField*> w1(
w.begin() +
w.size()/2,
w.end());
294 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0,
x,
y0,
z, w0, i_idx, 2*j_idx+0, hermitian, tile_size);
295 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1,
x,
y1,
z, w1, i_idx, 2*j_idx+1, hermitian, tile_size);
299 double2* cdot =
new double2[
x.size()*
y.size()];
302 if (
x.size() <= tile_size && hermitian) {
303 if (j_idx < i_idx) {
return; }
306 reduce::coeff_array<Complex>
a,
b,
c;
308 if (
x.size() <= tile_size) {
311 reduce::multiReduceCuda<1,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
312 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
314 #if MAX_MULTI_BLAS_N >= 2 316 reduce::multiReduceCuda<2,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
317 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
319 #if MAX_MULTI_BLAS_N >= 3 321 reduce::multiReduceCuda<3,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
322 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
324 #if MAX_MULTI_BLAS_N >= 4 326 reduce::multiReduceCuda<4,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
327 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
329 #if MAX_MULTI_BLAS_N >= 5 331 reduce::multiReduceCuda<5,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
332 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
334 #if MAX_MULTI_BLAS_N >= 6 336 reduce::multiReduceCuda<6,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
337 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
339 #if MAX_MULTI_BLAS_N >= 7 341 reduce::multiReduceCuda<7,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
342 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
344 #if MAX_MULTI_BLAS_N >= 8 346 reduce::multiReduceCuda<8,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
347 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
349 #if MAX_MULTI_BLAS_N >= 9 351 reduce::multiReduceCuda<9,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
352 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
354 #if MAX_MULTI_BLAS_N >= 10 356 reduce::multiReduceCuda<10,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
357 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
359 #if MAX_MULTI_BLAS_N >= 11 361 reduce::multiReduceCuda<11,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
362 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
364 #if MAX_MULTI_BLAS_N >= 12 366 reduce::multiReduceCuda<12,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
367 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
369 #if MAX_MULTI_BLAS_N >= 13 371 reduce::multiReduceCuda<13,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
372 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
374 #if MAX_MULTI_BLAS_N >= 14 376 reduce::multiReduceCuda<14,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
377 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
379 #if MAX_MULTI_BLAS_N >= 15 381 reduce::multiReduceCuda<15,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
382 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
384 #if MAX_MULTI_BLAS_N >= 16 386 reduce::multiReduceCuda<16,double2,QudaSumFloat2,ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal,false>
387 (cdot,
a,
b,
c,
x,
y,
z,
w, i_idx, j_idx );
411 Complex* result0 = &tmpmajor[0];
412 Complex* result1 = &tmpmajor[(
x.size()/2)*
y.size()];
413 std::vector<ColorSpinorField*> x0(
x.begin(),
x.begin() +
x.size()/2);
414 std::vector<ColorSpinorField*> x1(
x.begin() +
x.size()/2,
x.end());
415 std::vector<ColorSpinorField*> z0(
z.begin(),
z.begin() +
z.size()/2);
416 std::vector<ColorSpinorField*> z1(
z.begin() +
z.size()/2,
z.end());
418 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result0, x0,
y, z0,
w, 2*i_idx+0, j_idx, hermitian, tile_size);
419 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>(result1, x1,
y, z1,
w, 2*i_idx+1, j_idx, hermitian, tile_size);
421 const unsigned int xlen0 =
x.size()/2;
422 const unsigned int xlen1 =
x.size() - xlen0;
423 const unsigned int ylen =
y.size();
426 int count = 0, count0 = 0, count1 = 0;
427 for (
unsigned int i = 0;
i < ylen;
i++)
429 for (
unsigned int j = 0; j < xlen0; j++)
430 result[
count++] = result0[count0++];
431 for (
unsigned int j = 0; j < xlen1; j++)
432 result[
count++] = result1[count1++];
439 if (
x.size() <= tile_size)
441 const unsigned int xlen =
x.size();
442 const unsigned int ylen =
y.size();
443 for (
unsigned int j = 0; j < xlen; j++)
444 for (
unsigned int i = 0;
i < ylen;
i++)
445 result[
i*xlen+j] =
Complex(cdot[j*ylen +
i].
x, cdot[j*ylen+
i].
y);
452 template <
template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerDiagonal,
453 typename writeDiagonal,
454 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerOffDiagonal,
455 typename writeOffDiagonal>
// Shorthand for a set of spinor-field pointers used throughout the tuner.
typedef std::vector<ColorSpinorField*> vec;
492 if (
x.size()==1 ||
y.size()==1 ) {
497 for (
unsigned int tile_size=1; tile_size <=
max_tile_size; tile_size++) {
498 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
507 unsigned int max_count = 0;
509 while (tile_size_tmp != 1) { tile_size_tmp = tile_size_tmp >> 1; max_count++; }
511 for (
unsigned int i = 0;
i < max_count;
i++) { tile_size_tmp = tile_size_tmp << 1; }
515 for (
unsigned int tile_size=1; tile_size <=
max_tile_size && tile_size <=
x.size() &&
516 (tile_size <=
y.size() ||
y.size()==1) ; tile_size*=2) {
517 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
537 multiReduce_recurse<ReducerDiagonal,writeDiagonal,ReducerOffDiagonal,writeOffDiagonal>
545 if (
x.size()==1 ||
y.size()==1 ) {
584 return TuneKey(
x[0]->VolString(),
typeid(*this).name(),
aux);
// Flop count is not tracked for this tuning wrapper; report zero.
long long flops() const { return 0; }
// Byte-traffic count is not tracked for this tuning wrapper; report zero.
long long bytes() const { return 0; }
595 if (
x.size() == 0 ||
y.size() == 0)
errorQuda(
"vector.size() == 0");
597 for (
unsigned int i = 0;
i <
x.size()*
y.size();
i++) result_tmp[
i] = 0.0;
602 TileSizeTune<Cdot,write<0,0,0,0>,
Cdot,
write<0,0,0,0> > tile(result_tmp,
x,
y,
x,
y,
false);
606 const int Nreduce = 2*
x.size()*
y.size();
610 const unsigned int xlen =
x.size();
611 const unsigned int ylen =
y.size();
612 for (
unsigned int j = 0; j < xlen; j++)
613 for (
unsigned int i = 0;
i < ylen;
i++)
614 result[j*ylen+
i] = result_tmp[
i*xlen + j];
620 if (
x.size() == 0 ||
y.size() == 0)
errorQuda(
"vector.size() == 0");
621 if (
x.size() !=
y.size())
errorQuda(
"Cannot call Hermitian block dot product on non-square inputs");
624 for (
unsigned int i = 0;
i <
x.size()*
y.size();
i++) result_tmp[
i] = 0.0;
626 TileSizeTune<Cdot,write<0,0,0,0>,
Cdot,
write<0,0,0,0> > tile(result_tmp,
x,
y,
x,
y,
true,
false);
630 const int Nreduce = 2*
x.size()*
y.size();
634 const unsigned int xlen =
x.size();
635 const unsigned int ylen =
y.size();
636 for (
unsigned int j = 0; j < xlen; j++)
637 for (
unsigned int i = j;
i < ylen;
i++) {
638 result[j*ylen+
i] = result_tmp[
i*xlen + j];
639 result[
i*ylen+j] =
conj(result_tmp[
i*xlen + j]);
647 if (
x.size() == 0 ||
y.size() == 0)
errorQuda(
"vector.size() == 0");
648 if (
x.size() !=
y.size())
errorQuda(
"Cannot call Hermitian block A-norm dot product on non-square inputs");
651 for (
unsigned int i = 0;
i <
x.size()*
y.size();
i++) result_tmp[
i] = 0.0;
653 TileSizeTune<Cdot,write<0,0,0,0>,
Cdot,
write<0,0,0,0> > tile(result_tmp,
x,
y,
x,
y,
true,
true);
657 const int Nreduce = 2*
x.size()*
y.size();
661 const unsigned int xlen =
x.size();
662 const unsigned int ylen =
y.size();
663 for (
unsigned int j = 0; j < xlen; j++)
664 for (
unsigned int i = j;
i < ylen;
i++) {
665 result[j*ylen+
i] = result_tmp[
i*xlen + j];
666 result[
i*ylen+j] =
conj(result_tmp[
i*xlen + j]);
674 std::vector<ColorSpinorField*>&
z){
677 if (
x.size() == 0 ||
y.size() == 0)
errorQuda(
"vector.size() == 0");
678 if (
y.size() !=
z.size())
errorQuda(
"Cannot copy input y of size %lu into z of size %lu\n",
y.size(),
z.size());
681 for (
unsigned int i = 0;
i <
x.size()*
y.size();
i++) result_tmp[
i] = 0.0;
684 TileSizeTune<CdotCopy,write<0,0,0,1>,
Cdot,
write<0,0,0,0> > tile(result_tmp,
x,
y,
x,
y,
true);
688 const int Nreduce = 2*
x.size()*
y.size();
692 const unsigned int xlen =
x.size();
693 const unsigned int ylen =
y.size();
694 for (
unsigned int j = 0; j < xlen; j++)
695 for (
unsigned int i = 0;
i < ylen;
i++)
696 result[j*ylen+
i] = result_tmp[
i*xlen + j];
700 errorQuda(
"cDotProductCopy not enabled");
static int flops()
total number of input and output streams
static void checkSpinor(const ColorSpinorField &a, const ColorSpinorField &b)
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
void disableProfileCount()
__device__ __host__ void cdot_(ReduceType &sum, const double2 &a, const double2 &b)
QudaVerbosity getVerbosity()
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
unsigned int max_tile_size
Complex cDotProduct(ColorSpinorField &, ColorSpinorField &)
void cDotProductCopy(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b, std::vector< ColorSpinorField *> &c)
Computes the matrix of inner products between the vector set a and the vector set b...
virtual __device__ __host__ void pre()
pre-computation routine called before the "M-loop"
std::complex< double > Complex
void reduceDoubleArray(double *, const int len)
char * strcpy(char *__dst, const char *__src)
double reDotProduct(ColorSpinorField &x, ColorSpinorField &y)
double y0(double) __attribute__((availability(macosx
char * strcat(char *__s1, const char *__s2)
void multiReduce_recurse(Complex *result, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &y, std::vector< ColorSpinorField *> &z, std::vector< ColorSpinorField *> &w, int i_idx, int j_idx, bool hermitian, unsigned int tile_size)
void initTuneParam(TuneParam ¶m) const
Dot(const reduce::coeff_array< Complex > &a, const reduce::coeff_array< Complex > &b, const reduce::coeff_array< Complex > &c, int NYW)
void defaultTuneParam(TuneParam ¶m) const
void enableProfileCount()
cudaStream_t * getStream()
CdotCopy(const reduce::coeff_array< Complex > &a, const reduce::coeff_array< Complex > &b, const reduce::coeff_array< Complex > &c, int NYW)
char aux_tmp[quda::TuneKey::aux_n]
double y1(double) __attribute__((availability(macosx
virtual __device__ __host__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)=0
where the reduction is usually computed and any auxiliary operations
__host__ __device__ void sum(double &a, double &b)
unsigned int sharedBytesPerThread() const
bool advanceTuneParam(TuneParam ¶m) const
std::vector< ColorSpinorField * > vec
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void hDotProduct_Anorm(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
cudaEvent_t * getReduceEvent()
void setPolicyTuning(bool)
static struct @7 blasStrings
scalar< Float2 >::type real
Cdot(const reduce::coeff_array< Complex > &a, const reduce::coeff_array< Complex > &b, const reduce::coeff_array< Complex > &c, int NYW)
void apply(const cudaStream_t &stream)
std::map< TuneKey, TuneParam > map
static int flops()
total number of input and output streams
TileSizeTune(Complex *result, vec &x, vec &y, vec &z, vec &w, bool hermitian, bool Anorm=false)
void hDotProduct(Complex *result, std::vector< ColorSpinorField *> &a, std::vector< ColorSpinorField *> &b)
Computes the matrix of inner products between the vector set a and the vector set b...
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
bool advanceAux(TuneParam ¶m) const
static int flops()
total number of input and output streams
virtual void initTuneParam(TuneParam ¶m) const
const map & getTuneCache()
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void u64toa(char *buffer, uint64_t value)
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
__device__ __host__ void dot_(ReduceType &sum, const double2 &a, const double2 &b)
scalar< Float2 >::type real
virtual __device__ __host__ void post(ReduceType &sum)
post-computation routine called after the "M-loop"
__device__ __host__ void operator()(ReduceType &sum, FloatN &x, FloatN &y, FloatN &z, FloatN &w, const int i, const int j)
where the reduction is usually computed and any auxiliary operations
virtual void defaultTuneParam(TuneParam ¶m) const
scalar< Float2 >::type real