/**
 * Dispatch layer for the multi-reduction: picks a concrete instantiation of
 * the inner multiReduceCuda<...> and the length argument passed to it.
 *
 * NOTE(review): this span looks like a scraped source listing. Original line
 * numbers are fused into the text and several original lines are not visible
 * (the declarations of x, y, z and w, the tests that select between the
 * instantiations below, the #else/#endif directives and the closing braces).
 * The comments here describe only what is visible; confirm the rest against
 * the real file.
 *
 * @tparam NXZ        integer forwarded unchanged to the inner call
 * @tparam doubleN    element type of the result array
 * @tparam ReduceType forwarded unchanged to the inner call
 * @tparam Reducer    reducer class template, forwarded unchanged
 * @tparam write      forwarded unchanged to the inner call
 * @tparam siteUnroll selects RealLength() over Length() and the per-branch
 *                    value of M
 * @tparam T          element type of the coefficient arrays a, b and c
 *
 * @param result forwarded to the inner call
 * @param a      coefficient array, forwarded to the inner call
 * @param b      coefficient array, forwarded to the inner call
 * @param c      coefficient array, forwarded to the inner call
 */
6 template<
int NXZ,
typename doubleN,
typename ReduceType,
7 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class Reducer,
typename write,
bool siteUnroll,
typename T>
8 void multiReduceCuda(doubleN result[],
const reduce::coeff_array<T> &
a,
const reduce::coeff_array<T> &
b,
const reduce::coeff_array<T> &
c,
// NYW is the number of entries in y. It is not used in the visible lines.
11 const int NYW =
y.size();
// With siteUnroll the reduction length comes from RealLength(), otherwise from Length().
13 int reduce_length = siteUnroll ?
x[0]->RealLength() :
x[0]->Length();
// double2 instantiation, compiled only with Wilson, domain-wall or multigrid support.
// M is 12 with siteUnroll, else 1; siteUnroll together with Nspin()==2 is reported as an error.
// The length passed on is reduce_length/(2*M).
17 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID) 18 const int M = siteUnroll ? 12 : 1;
19 if (
x[0]->
Nspin() == 2 && siteUnroll)
errorQuda(
"siteUnroll not supported for nSpin==2");
20 multiReduceCuda<doubleN,ReduceType,double2,double2,double2,M,NXZ,Reducer,write>
21 (result,
a,
b,
c,
x,
y,
z,
w, reduce_length/(2*M));
// Presumably the fallback arm of the build guard above (the directive is not visible here).
23 errorQuda(
"blas has not been built for Nspin=%d fields",
x[0]->
Nspin());
25 }
else if (
x[0]->
Nspin() == 1) {
// Nspin()==1 case: double2 instantiation behind the staggered build flag; M is 3 with siteUnroll, else 1.
26 #ifdef GPU_STAGGERED_DIRAC 27 const int M = siteUnroll ? 3 : 1;
28 multiReduceCuda<doubleN,ReduceType,double2,double2,double2,M,NXZ,Reducer,write>
29 (result,
a,
b,
c,
x,
y,
z,
w, reduce_length/(2*M));
// float4 instantiation behind the Wilson / domain-wall flags; M is 6 with siteUnroll, else 1.
// The length is divided by 4*M here rather than 2*M.
// NOTE(review): the test that selects this path is not visible — presumably a single-precision check; confirm.
36 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 37 const int M = siteUnroll ? 6 : 1;
38 multiReduceCuda<doubleN,ReduceType,float4,float4,float4,M,NXZ,Reducer,write>
39 (result,
a,
b,
c,
x,
y,
z,
w, reduce_length/(4*M));
41 errorQuda(
"blas has not been built for Nspin=%d fields",
x[0]->
Nspin());
// float2 instantiation behind the staggered / multigrid flags; M is 3 with siteUnroll, else 1.
// As above, siteUnroll together with Nspin()==2 is reported as an error.
44 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID) 45 const int M = siteUnroll ? 3 : 1;
46 if (
x[0]->
Nspin() == 2 && siteUnroll)
errorQuda(
"siteUnroll not supported for nSpin==2");
47 multiReduceCuda<doubleN,ReduceType,float2,float2,float2,M,NXZ,Reducer,write>
48 (result,
a,
b,
c,
x,
y,
z,
w, reduce_length/(2*M));
50 errorQuda(
"blas has not been built for Nspin=%d fields",
x[0]->
Nspin());
// float4/short4/short4 instantiation; the length passed on is x[0]->Volume(), not reduce_length.
// NOTE(review): the definition of M used here is not visible in this span.
// Presumably this is the half-precision path — confirm against the real file.
55 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 57 multiReduceCuda<doubleN,ReduceType,float4,short4,short4,M,NXZ,Reducer,write>
58 (result,
a,
b,
c,
x,
y,
z,
w,
x[0]->Volume());
60 errorQuda(
"blas has not been built for Nspin=%d fields",
x[0]->
Nspin());
62 }
else if(
x[0]->
Nspin() == 1) {
// Nspin()==1 counterpart: float2/short2/short2 instantiation behind the staggered flag,
// again with x[0]->Volume() as the length. The definition of M is not visible here either.
63 #ifdef GPU_STAGGERED_DIRAC 65 multiReduceCuda<doubleN,ReduceType,float2,short2,short2,M,NXZ,Reducer,write>
66 (result,
a,
b,
c,
x,
y,
z,
w,
x[0]->Volume());
68 errorQuda(
"blas has not been built for Nspin=%d fields",
x[0]->
Nspin());
/**
 * Overload taking two reducer/write pairs, one named "diagonal" and one
 * "off-diagonal". It forwards to a single-reducer multiReduceCuda with one
 * pair or the other.
 *
 * When x[0] and y[0] report the same Precision(), the unqualified
 * multiReduceCuda is called and the caller's siteUnroll is passed through.
 * The two mixed::multiReduceCuda calls further down always pass `true` for
 * siteUnroll.
 *
 * NOTE(review): this span looks like a scraped source listing. The test that
 * chooses between the diagonal and off-diagonal pair, the else arm that
 * presumably leads to the mixed:: calls, and the declarations of x, y, z and
 * w are not visible here; confirm against the real file.
 *
 * @tparam NXZ                 integer forwarded unchanged
 * @tparam doubleN             element type of the result array
 * @tparam ReduceType          forwarded unchanged
 * @tparam ReducerDiagonal     reducer class template for the diagonal calls
 * @tparam writeDiagonal       write type paired with ReducerDiagonal
 * @tparam ReducerOffDiagonal  reducer class template for the off-diagonal calls
 * @tparam writeOffDiagonal    write type paired with ReducerOffDiagonal
 * @tparam siteUnroll          forwarded only on the equal-precision path
 * @tparam T                   element type of the coefficient arrays a, b and c
 *
 * @param result forwarded to the selected call
 * @param a      coefficient array, forwarded
 * @param b      coefficient array, forwarded
 * @param c      coefficient array, forwarded
 */
76 template<
int NXZ,
typename doubleN,
typename ReduceType,
77 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerDiagonal,
typename writeDiagonal,
78 template <
int MXZ,
typename ReducerType,
typename Float,
typename FloatN>
class ReducerOffDiagonal,
typename writeOffDiagonal,
79 bool siteUnroll,
typename T>
80 void multiReduceCuda(doubleN result[],
const reduce::coeff_array<T> &
a,
const reduce::coeff_array<T> &
b,
const reduce::coeff_array<T> &
c,
// Equal precision of x[0] and y[0]: use the unqualified multiReduceCuda.
84 if (
x[0]->Precision()==
y[0]->Precision()) {
// Diagonal reducer/write pair, with the caller's siteUnroll.
86 multiReduceCuda<NXZ,doubleN,ReduceType,ReducerDiagonal,writeDiagonal,siteUnroll,T>(result,
a,
b,
c,
x,
y,
z,
w);
// Off-diagonal reducer/write pair, with the caller's siteUnroll.
88 multiReduceCuda<NXZ,doubleN,ReduceType,ReducerOffDiagonal,writeOffDiagonal,siteUnroll,T>(result,
a,
b,
c,
x,
y,
z,
w);
// mixed:: variant with the diagonal pair; siteUnroll is hard-wired to true.
92 mixed::multiReduceCuda<NXZ,doubleN,ReduceType,ReducerDiagonal,writeDiagonal,true,T>(result,
a,
b,
c,
x,
y,
z,
w);
// mixed:: variant with the off-diagonal pair; siteUnroll is hard-wired to true.
94 mixed::multiReduceCuda<NXZ,doubleN,ReduceType,ReducerOffDiagonal,writeOffDiagonal,true,T>(result,
a,
b,
c,
x,
y,
z,
w);
std::vector< ColorSpinorField * > CompositeColorSpinorField
void multiReduceCuda(doubleN result[], const reduce::coeff_array< T > &a, const reduce::coeff_array< T > &b, const reduce::coeff_array< T > &c, CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z, CompositeColorSpinorField &w)