QUDA  0.9.0
multi_reduce_core.h
Go to the documentation of this file.
2 
// Precision / nSpin dispatcher for the multi-field BLAS reductions: picks the
// register/storage vector types (double2, float4, float2, short4, short2) and
// the per-thread site-unroll factor M from the first field's precision and
// spin, then forwards to the concretely-typed multiReduceCuda instantiation.
// NOTE(review): this is a Doxygen text dump — the signature's continuation
// lines (original lines 8-10, presumably the x/y/z/w CompositeColorSpinorField
// parameters per the reference at the bottom of the dump) are not visible here.
6 template<int NXZ, typename doubleN, typename ReduceType,
7  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write, bool siteUnroll, typename T>
8  void multiReduceCuda(doubleN result[], const reduce::coeff_array<T> &a, const reduce::coeff_array<T> &b, const reduce::coeff_array<T> &c,
// Number of y/w-side fields; NYW is not referenced in the visible portion of
// this function — presumably consumed by the elided code. TODO confirm.
11  const int NYW = y.size();
12 
// With site unrolling each thread processes a whole site, so the kernel length
// is the real-field length; otherwise the padded total length is used.
13  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
14 
15  if (x[0]->Precision() == QUDA_DOUBLE_PRECISION) {
16  if (x[0]->Nspin() == 4 || x[0]->Nspin() == 2) { // wilson
17 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_MULTIGRID)
// M=12: a full nSpin=4 site is 24 reals = 12 double2 vectors per thread.
18  const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do
// nSpin==2 (coarse/multigrid) fields share this branch but only without
// site unrolling, since their site size differs from the M=12 assumption.
19  if (x[0]->Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
20  multiReduceCuda<doubleN,ReduceType,double2,double2,double2,M,NXZ,Reducer,write>
// Length is expressed in double2 units: 2 reals per vector, M vectors per thread.
21  (result, a, b, c, x, y, z, w, reduce_length/(2*M));
22 #else
23  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
24 #endif
25  } else if (x[0]->Nspin() == 1) {
26 #ifdef GPU_STAGGERED_DIRAC
// M=3: a staggered site is 6 reals = 3 double2 vectors per thread.
27  const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do
28  multiReduceCuda<doubleN,ReduceType,double2,double2,double2,M,NXZ,Reducer,write>
29  (result, a, b, c, x, y, z, w, reduce_length/(2*M));
30 #else
31  errorQuda("blas has not been built for Nspin=%d field", x[0]->Nspin());
32 #endif
33  } else { errorQuda("nSpin=%d is not supported\n", x[0]->Nspin()); }
34  } else if (x[0]->Precision() == QUDA_SINGLE_PRECISION) {
35  if (x[0]->Nspin() == 4) { // wilson
36 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
// M=6: 24 reals per site = 6 float4 vectors per thread.
37  const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do
38  multiReduceCuda<doubleN,ReduceType,float4,float4,float4,M,NXZ,Reducer,write>
// float4 path: 4 reals per vector, hence the /(4*M) length.
39  (result, a, b, c, x, y, z, w, reduce_length/(4*M));
40 #else
41  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
42 #endif
43  } else if(x[0]->Nspin() == 1 || x[0]->Nspin() == 2) { // staggered
44 #if defined(GPU_STAGGERED_DIRAC) || defined(GPU_MULTIGRID)
// M=3: 6 reals per staggered site = 3 float2 vectors per thread.
45  const int M = siteUnroll ? 3 : 1;
// As in double precision, nSpin==2 cannot be combined with site unrolling.
46  if (x[0]->Nspin() == 2 && siteUnroll) errorQuda("siteUnroll not supported for nSpin==2");
47  multiReduceCuda<doubleN,ReduceType,float2,float2,float2,M,NXZ,Reducer,write>
48  (result, a, b, c, x, y, z, w, reduce_length/(2*M));
49 #else
50  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
51 #endif
52  } else { errorQuda("nSpin=%d is not supported\n", x[0]->Nspin()); }
53  } else { // half precision
// Half precision always unrolls the full site (short storage with float
// compute), so the launch length is the site count, not reduce_length.
54  if (x[0]->Nspin() == 4) { // wilson
55 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
56  const int M = 6;
// short4 storage, float4 arithmetic.
57  multiReduceCuda<doubleN,ReduceType,float4,short4,short4,M,NXZ,Reducer,write>
58  (result, a, b, c, x, y, z, w, x[0]->Volume());
59 #else
60  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
61 #endif
62  } else if(x[0]->Nspin() == 1) { // staggered
63 #ifdef GPU_STAGGERED_DIRAC
64  const int M = 3;
// short2 storage, float2 arithmetic.
65  multiReduceCuda<doubleN,ReduceType,float2,short2,short2,M,NXZ,Reducer,write>
66  (result, a, b, c, x, y, z, w, x[0]->Volume());
67 #else
68  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
69 #endif
70  } else { errorQuda("nSpin=%d is not supported\n", x[0]->Nspin()); }
71  }
72 
73  return;
74 }
75 
// Tile dispatcher: selects the diagonal or off-diagonal reducer for one (i,j)
// tile of the multi-reduction, and routes mixed-precision (x vs y) cases to
// the mixed:: namespace overload (which is always site-unrolled).
// NOTE(review): Doxygen dump artifact — original lines 81-82 (the remainder of
// the signature and, presumably, the declarations/derivation of the tile
// indices i and j) are not visible in this fragment; verify against the
// original header.
76 template<int NXZ, typename doubleN, typename ReduceType,
77  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerDiagonal, typename writeDiagonal,
78  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class ReducerOffDiagonal, typename writeOffDiagonal,
79  bool siteUnroll, typename T>
80  void multiReduceCuda(doubleN result[], const reduce::coeff_array<T> &a, const reduce::coeff_array<T> &b, const reduce::coeff_array<T> &c,
83 
// Uniform precision: use the plain overload with the caller's siteUnroll.
84  if (x[0]->Precision()==y[0]->Precision()) {
85  if (i==j) { // we are on the diagonal so invoke the diagonal reducer
86  multiReduceCuda<NXZ,doubleN,ReduceType,ReducerDiagonal,writeDiagonal,siteUnroll,T>(result, a, b, c, x, y, z, w);
87  } else { // we are off the diagonal so invoke the off-diagonal reducer
88  multiReduceCuda<NXZ,doubleN,ReduceType,ReducerOffDiagonal,writeOffDiagonal,siteUnroll,T>(result, a, b, c, x, y, z, w);
89  }
// Mixed precision: the mixed:: overload is invoked with siteUnroll forced
// to true regardless of the caller's template argument.
90  } else {
91  if (i==j) { // we are on the diagonal so invoke the diagonal reducer
92  mixed::multiReduceCuda<NXZ,doubleN,ReduceType,ReducerDiagonal,writeDiagonal,true,T>(result, a, b, c, x, y, z, w);
93  } else { // we are off the diagonal so invoke the off-diagonal reducer
94  mixed::multiReduceCuda<NXZ,doubleN,ReduceType,ReducerOffDiagonal,writeOffDiagonal,true,T>(result, a, b, c, x, y, z, w);
95  }
96  }
97 
98 }
#define errorQuda(...)
Definition: util_quda.h:90
int Nspin
Definition: blas_test.cu:45
#define b
std::vector< ColorSpinorField * > CompositeColorSpinorField
void multiReduceCuda(doubleN result[], const reduce::coeff_array< T > &a, const reduce::coeff_array< T > &b, const reduce::coeff_array< T > &c, CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z, CompositeColorSpinorField &w)
int int int w
const void * c
#define a