// QUDA 0.9.0
// multi_reduce_mixed_core.h
// Go to the documentation of this file.
1 namespace mixed {
2 
6  template<int NXZ, typename doubleN, typename ReduceType,
7  template <int MXZ, typename ReducerType, typename Float, typename FloatN> class Reducer, typename write, bool siteUnroll, typename T>
8  void multiReduceCuda(doubleN result[], const reduce::coeff_array<T> &a, const reduce::coeff_array<T> &b, const reduce::coeff_array<T> &c,
11  const int NYW = y.size();
12 
13  assert(siteUnroll==true);
14  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();
15 
16  if (y[0]->Precision() == QUDA_DOUBLE_PRECISION && x[0]->Precision() == QUDA_SINGLE_PRECISION) {
17 
18  if (x[0]->Nspin() == 4) { // wilson
19 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
20  const int M = 12; // determines how much work per thread to do
21  multiReduceCuda<doubleN,ReduceType,double2,float4,double2,M,NXZ,Reducer,write>
22  (result, a, b, c, x, y, z, w, reduce_length/(2*M));
23 #else
24  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
25 #endif
26  } else if (x[0]->Nspin() == 1) {
27 #ifdef GPU_STAGGERED_DIRAC
28  const int M = 3; // determines how much work per thread to do
29  multiReduceCuda<doubleN,ReduceType,double2,float2,double2,M,NXZ,Reducer,write>
30  (result, a, b, c, x, y, z, w, reduce_length/(2*M));
31 #else
32  errorQuda("blas has not been built for Nspin=%d field", x[0]->Nspin());
33 #endif
34  } else { errorQuda("nSpin=%d is not supported\n", x[0]->Nspin()); }
35 
36  } else if (y[0]->Precision() == QUDA_DOUBLE_PRECISION && x[0]->Precision() == QUDA_HALF_PRECISION) {
37 
38  if (x[0]->Nspin() == 4) { // wilson
39 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
40  const int M = 6; // determines how much work per thread to do
41  multiReduceCuda<doubleN,ReduceType,double2,short4,double2,M,NXZ,Reducer,write>
42  (result, a, b, c, x, y, z, w, reduce_length/(4*M));
43 #else
44  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
45 #endif
46  } else if(x[0]->Nspin() == 1 || x[0]->Nspin() == 2) { // staggered
47 #if defined(GPU_STAGGERED_DIRAC)
48  const int M = 3;
49  multiReduceCuda<doubleN,ReduceType,double2,short2,double2,M,NXZ,Reducer,write>
50  (result, a, b, c, x, y, z, w, reduce_length/(2*M));
51 #else
52  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
53 #endif
54  } else { errorQuda("nSpin=%d is not supported\n", x[0]->Nspin()); }
55 
56  } else if (y[0]->Precision() == QUDA_SINGLE_PRECISION && x[0]->Precision() == QUDA_HALF_PRECISION) {
57 
58  if (x[0]->Nspin() == 4) { // wilson
59 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
60  const int M = 6;
61  multiReduceCuda<doubleN,ReduceType,float4,short4,float4,M,NXZ,Reducer,write>
62  (result, a, b, c, x, y, z, w, x[0]->Volume());
63 #else
64  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
65 #endif
66  } else if(x[0]->Nspin() == 1) { // staggered
67 #ifdef GPU_STAGGERED_DIRAC
68  const int M = 3;
69  multiReduceCuda<doubleN,ReduceType,float2,short2,float2,M,NXZ,Reducer,write>
70  (result, a, b, c, x, y, z, w, x[0]->Volume());
71 #else
72  errorQuda("blas has not been built for Nspin=%d fields", x[0]->Nspin());
73 #endif
74  } else { errorQuda("nSpin=%d is not supported\n", x[0]->Nspin()); }
75 
76  } else {
77  errorQuda("Precision combination x=%d y=%d not supported\n", x[0]->Precision(), y[0]->Precision());
78  }
79 
80  return;
81  }
82 
83 } // namespace mixed
// Symbols referenced by the listing above (cross-reference notes from the generated documentation):
//   #define errorQuda(...)
//       Definition: util_quda.h:90
//   void multiReduceCuda(doubleN result[], const reduce::coeff_array< T > &a, const reduce::coeff_array< T > &b, const reduce::coeff_array< T > &c, CompositeColorSpinorField &x, CompositeColorSpinorField &y, CompositeColorSpinorField &z, CompositeColorSpinorField &w)
//   int Nspin
//       Definition: blas_test.cu:45
//   #define b
//   std::vector< ColorSpinorField * > CompositeColorSpinorField
//   int int int w
//   const void * c
//   #define a