QUDA  1.0.0
clover_quda.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <clover_field.h>
5 #include <gauge_field.h>
6 #include <gauge_field_order.h>
7 #include <clover_field_order.h>
8 
9 namespace quda {
10 
11 #ifdef GPU_CLOVER_DIRAC
12 
13  template<typename Float, typename Clover, typename Fmunu>
14  struct CloverArg {
15  int threads; // number of active threads required
16  int X[4]; // grid dimensions
17  Float cloverCoeff;
18 
19  Clover clover;
20  Fmunu f;
21 
22  CloverArg(Clover &clover, Fmunu& f, const GaugeField &meta, double cloverCoeff)
23  : threads(meta.VolumeCB()), cloverCoeff(cloverCoeff), clover(clover), f(f)
24  {
25  for(int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
26  }
27  };
28 
29  /*
30  Put into clover order
31  Upper-left block (chirality index 0)
32  / \
33  | 1 + c*I*(F[0,1] - F[2,3]) , c*I*(F[1,2] - F[0,3]) + c*(F[0,2] + F[1,3]) |
34  | |
35  | c*I*(F[1,2] - F[0,3]) - c*(F[0,2] + F[1,3]), 1 - c*I*(F[0,1] - F[2,3]) |
36  | |
37  \ /
38 
39  /
40  | 1 - c*I*(F[0] - F[5]), -c*I*(F[2] - F[3]) - c*(F[1] + F[4])
41  |
42  | -c*I*(F[2] -F[3]) + c*(F[1] + F[4]), 1 + c*I*(F[0] - F[5])
43  |
44  \
45 
46  Lower-right block (chirality index 1)
47 
48  / \
49  | 1 - c*I*(F[0] + F[5]), -c*I*(F[2] + F[3]) - c*(F[1] - F[4]) |
50  | |
51  | -c*I*(F[2]+F[3]) + c*(F[1]-F[4]), 1 + c*I*(F[0] + F[5]) |
52  \ /
53  */
54  // Core routine for constructing clover term from field strength
55  template<typename Float, typename Arg>
56  __device__ __host__ void cloverComputeCore(Arg &arg, int x_cb, int parity){
57 
58  constexpr int nColor = 3;
59  constexpr int nSpin = 4;
60  constexpr int N = nColor*nSpin/2;
61  typedef complex<Float> Complex;
62  typedef Matrix<Complex,nColor> Link;
63 
64  // Load the field-strength tensor from global memory
65  Link F[6];
66 #pragma unroll
67  for (int i=0; i<6; ++i) F[i] = arg.f(i, x_cb, parity);
68 
69  Complex I(0.0,1.0);
70  Complex coeff(0.0,arg.cloverCoeff);
71  Link block1[2], block2[2];
72  block1[0] = coeff*(F[0]-F[5]); // (18 + 6*9=) 72 floating-point ops
73  block1[1] = coeff*(F[0]+F[5]); // 72 floating-point ops
74  block2[0] = arg.cloverCoeff*(F[1]+F[4] - I*(F[2]-F[3])); // 126 floating-point ops
75  block2[1] = arg.cloverCoeff*(F[1]-F[4] - I*(F[2]+F[3])); // 126 floating-point ops
76 
77  // This uses lots of unnecessary memory
78 #pragma unroll
79  for (int ch=0; ch<2; ++ch) {
80  HMatrix<Float,N> A;
81  // c = 0(1) => positive(negative) chiral block
82  // Compute real diagonal elements
83 #pragma unroll
84  for(int i=0; i<N/2; ++i){
85  A(i+0,i+0) = 1.0 - block1[ch](i,i).real();
86  A(i+3,i+3) = 1.0 + block1[ch](i,i).real();
87  }
88 
89  // Compute off diagonal components
90  // First row
91  A(1,0) = -block1[ch](1,0);
92  // Second row
93  A(2,0) = -block1[ch](2,0);
94  A(2,1) = -block1[ch](2,1);
95  // Third row
96  A(3,0) = block2[ch](0,0);
97  A(3,1) = block2[ch](0,1);
98  A(3,2) = block2[ch](0,2);
99  // Fourth row
100  A(4,0) = block2[ch](1,0);
101  A(4,1) = block2[ch](1,1);
102  A(4,2) = block2[ch](1,2);
103  A(4,3) = block1[ch](1,0);
104  // Fifth row
105  A(5,0) = block2[ch](2,0);
106  A(5,1) = block2[ch](2,1);
107  A(5,2) = block2[ch](2,2);
108  A(5,3) = block1[ch](2,0);
109  A(5,4) = block1[ch](2,1);
110  A *= static_cast<Float>(0.5);
111 
112  arg.clover(x_cb, parity, ch) = A;
113  } // ch
114  // 84 floating-point ops
115 
116  return;
117  }
118 
119 
120  template<typename Float, typename Clover, typename Fmunu>
121  __global__ void cloverComputeKernel(CloverArg<Float,Clover,Fmunu> arg){
122  int x_cb = threadIdx.x + blockIdx.x*blockDim.x;
123  int parity = threadIdx.y + blockIdx.y*blockDim.y;
124  if (x_cb >= arg.threads) return;
125  cloverComputeCore<Float>(arg, x_cb, parity);
126  }
127 
128  template<typename Float, typename Clover, typename Fmunu>
129  void cloverComputeCPU(CloverArg<Float,Clover,Fmunu> arg){
130  for (int parity = 0; parity<2; parity++) {
131  for (int x_cb=0; x_cb<arg.threads; x_cb++){
132  cloverComputeCore<Float>(arg, x_cb, parity);
133  }
134  }
135  }
136 
137 
138  template<typename Float, typename Clover, typename Fmunu>
139  class CloverCompute : TunableVectorY {
140  CloverArg<Float,Clover,Fmunu> arg;
141  const GaugeField &meta;
142  const QudaFieldLocation location;
143 
144  private:
145  unsigned int sharedBytesPerThread() const { return 0; }
146  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
147 
148  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
149  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
150  unsigned int minThreads() const { return arg.threads; }
151 
152  public:
153  CloverCompute(CloverArg<Float,Clover,Fmunu> &arg, const GaugeField &meta, QudaFieldLocation location)
154  : TunableVectorY(2), arg(arg), meta(meta), location(location) {
155  writeAuxString("threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,sizeof(Float));
156  }
157 
158  virtual ~CloverCompute() {}
159 
160  void apply(const cudaStream_t &stream) {
161  if(location == QUDA_CUDA_FIELD_LOCATION){
162  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
163  cloverComputeKernel<<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
164  } else { // run the CPU code
165  cloverComputeCPU(arg);
166  }
167  }
168 
169  TuneKey tuneKey() const {
170  return TuneKey(meta.VolString(), typeid(*this).name(), aux);
171  }
172 
173  long long flops() const { return 2*arg.threads*480ll; }
174  long long bytes() const { return 2*arg.threads*(6*arg.f.Bytes() + arg.clover.Bytes()); }
175  };
176 
177 
178 
179  template<typename Float, typename Clover, typename Fmunu>
180  void computeClover(Clover clover, Fmunu f, const GaugeField &meta, Float cloverCoeff, QudaFieldLocation location){
181  CloverArg<Float,Clover,Fmunu> arg(clover, f, meta, cloverCoeff);
182  CloverCompute<Float,Clover,Fmunu> cloverCompute(arg, meta, location);
183  cloverCompute.apply(0);
184  checkCudaError();
186  }
187 
188  template<typename Float>
189  void computeClover(CloverField &clover, const GaugeField &f, Float cloverCoeff, QudaFieldLocation location){
190  if (f.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
191  if (clover.isNative()) {
192  typedef typename clover_mapper<Float>::type C;
193  computeClover(C(clover,0), gauge::FloatNOrder<Float,18,2,18>(f), f, cloverCoeff, location);
194  } else {
195  errorQuda("Clover field order %d not supported", clover.Order());
196  } // clover order
197  } else {
198  errorQuda("Fmunu field order %d not supported", f.Precision());
199  }
200  }
201 
202 #endif
203 
204  void computeClover(CloverField &clover, const GaugeField& f, double cloverCoeff, QudaFieldLocation location){
205 
206 #ifdef GPU_CLOVER_DIRAC
207  if(clover.Precision() != f.Precision()){
208  errorQuda("Fmunu precision %d must match gauge precision %d", clover.Precision(), f.Precision());
209  }
210 
211  if (clover.Precision() == QUDA_DOUBLE_PRECISION){
212  computeClover<double>(clover, f, cloverCoeff, location);
213  } else if(clover.Precision() == QUDA_SINGLE_PRECISION) {
214  computeClover<float>(clover, f, cloverCoeff, location);
215  } else {
216  errorQuda("Precision %d not supported", clover.Precision());
217  }
218  return;
219 #else
220  errorQuda("Clover has not been built");
221 #endif
222 
223  }
224 
225 } // namespace quda
226 
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
Main header file for host and device accessors to CloverFields.
QudaGaugeParam param
Definition: pack_test.cpp:17
#define qudaDeviceSynchronize()
const int nColor
Definition: covdev_test.cpp:75
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
clover_mapper< Float, length >::type C
Definition: dslash_quda.cu:476
Main header file for host and device accessors to GaugeFields.
int X[4]
Definition: covdev_test.cpp:70
std::complex< double > Complex
Definition: quda_internal.h:46
CloverArg(ColorSpinorField &out, const ColorSpinorField &in, const CloverField &clover, bool inverse, int parity, RegType kappa=0.0, RegType mu=0.0, RegType epsilon=0.0, bool dagger=false, QudaTwistGamma5Type twist=QUDA_TWIST_GAMMA5_INVALID)
Definition: dslash_quda.cu:493
enum QudaFieldLocation_s QudaFieldLocation
colorspinor_mapper< Float, nSpin, nColor >::type F
Definition: dslash_quda.cu:475
unsigned long long flops
Definition: blas_quda.cu:22
const int parity
Definition: dslash_quda.cu:484
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define checkCudaError()
Definition: util_quda.h:161
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)
Definition: clover_quda.cu:204
unsigned long long bytes
Definition: blas_quda.cu:23