QUDA  0.9.0
clover_invert.cu
Go to the documentation of this file.
1 #include <tune_quda.h>
2 #include <clover_field_order.h>
3 #include <complex_quda.h>
4 #include <launch_kernel.cuh>
5 #include <atomic.cuh>
6 #include <cub_helper.cuh>
7 #include <quda_matrix.h>
8 #include <linalg.cuh>
9 
10 namespace quda {
11 
12  using namespace clover;
13 
14 #ifdef GPU_CLOVER_DIRAC
15 
16  template <typename Float, typename Clover>
17  struct CloverInvertArg : public ReduceArg<double2> {
18  const Clover clover;
19  Clover inverse;
20  bool computeTraceLog;
21  bool twist;
22  Float mu2;
23  CloverInvertArg(Clover &inverse, const Clover &clover, bool computeTraceLog=0) :
24  ReduceArg<double2>(), inverse(inverse), clover(clover), computeTraceLog(computeTraceLog),
25  twist(clover.Twisted()), mu2(clover.Mu2()) { }
26  };
27 
31  template <typename Float, typename Arg, bool computeTrLog, bool twist>
32  __device__ __host__ inline double cloverInvertCompute(Arg &arg, int x_cb, int parity) {
33 
34  constexpr int nColor = 3;
35  constexpr int nSpin = 4;
36  constexpr int N = nColor*nSpin/2;
37  typedef HMatrix<Float,N> Mat;
38  double trlogA = 0.0;
39 
40  for (int ch=0; ch<2; ch++) {
41  Mat A = arg.clover(x_cb, parity, ch);
42  A *= static_cast<Float>(2.0); // factor of two is inherent to QUDA clover storage
43 
44  if (twist) { // Compute (T^2 + mu2) first, then invert
45  A = A.square();
46  A += arg.mu2;
47  }
48 
49  // compute the Colesky decomposition
51 
52  // Accumulate trlogA
53  if (computeTrLog) for (int j=0; j<N; j++) trlogA += 2.0*log(cholesky.D(j));
54 
55  Mat Ainv = static_cast<Float>(0.5)*cholesky.invert(); // return full inverse
56  arg.inverse(x_cb, parity, ch) = Ainv;
57  }
58 
59  return trlogA;
60  }
61 
62  template <typename Float, typename Arg, bool computeTrLog, bool twist>
63  void cloverInvert(Arg &arg) {
64  for (int parity=0; parity<2; parity++) {
65  for (int x=0; x<arg.clover.volumeCB; x++) {
66  // should make this thread safe if we ever apply threads to cpu code
67  double trlogA = cloverInvertCompute<Float,Arg,computeTrLog,twist>(arg, x, parity);
68  if (computeTrLog) {
69  if (parity) arg.result_h[0].y += trlogA;
70  else arg.result_h[0].x += trlogA;
71  }
72  }
73  }
74  }
75 
76  template <int blockSize, typename Float, typename Arg, bool computeTrLog, bool twist>
77  __global__ void cloverInvertKernel(Arg arg) {
78  int idx = blockIdx.x*blockDim.x + threadIdx.x;
79  int parity = threadIdx.y;
80  double trlogA_parity = 0.0;
81  if (idx < arg.clover.volumeCB)
82  trlogA_parity = cloverInvertCompute<Float,Arg,computeTrLog,twist>(arg, idx, parity);
83  double2 trlogA = parity ? make_double2(0.0,trlogA_parity) : make_double2(trlogA_parity, 0.0);
84 
85  if (computeTrLog) reduce2d<blockSize,2>(arg, trlogA);
86  }
87 
88  template <typename Float, typename Arg>
89  class CloverInvert : TunableLocalParity {
90  Arg arg;
91  const CloverField &meta; // used for meta data only
92  const QudaFieldLocation location;
93 
94  private:
95  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory
96  unsigned int minThreads() const { return arg.clover.volumeCB; }
97 
98  public:
99  CloverInvert(Arg &arg, const CloverField &meta, QudaFieldLocation location)
100  : arg(arg), meta(meta), location(location) {
101  writeAuxString("stride=%d,prec=%lu,trlog=%s,twist=%s", arg.clover.stride, sizeof(Float),
102  arg.computeTraceLog ? "true" : "false", arg.twist ? "true" : "false");
103  }
104  virtual ~CloverInvert() { ; }
105 
106  void apply(const cudaStream_t &stream) {
107  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
108  arg.result_h[0] = make_double2(0.,0.);
109  if (location == QUDA_CUDA_FIELD_LOCATION) {
110  if (arg.computeTraceLog) {
111  if (arg.twist) {
112  errorQuda("Not instantiated");
113  } else {
114  LAUNCH_KERNEL_LOCAL_PARITY(cloverInvertKernel, tp, stream, arg, Float, Arg, true, false);
115  }
116  } else {
117  if (arg.twist) {
118  cloverInvertKernel<1,Float,Arg,false,true> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
119  } else {
120  cloverInvertKernel<1,Float,Arg,false,false> <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
121  }
122  }
123  } else {
124  if (arg.computeTraceLog) {
125  if (arg.twist) {
126  cloverInvert<Float, Arg, true, true>(arg);
127  } else {
128  cloverInvert<Float, Arg, true, false>(arg);
129  }
130  } else {
131  if (arg.twist) {
132  cloverInvert<Float, Arg, false, true>(arg);
133  } else {
134  cloverInvert<Float, Arg, false, false>(arg);
135  }
136  }
137  }
138  }
139 
140  TuneKey tuneKey() const {
141  return TuneKey(meta.VolString(), typeid(*this).name(), aux);
142  }
143 
144  long long flops() const { return 0; }
145  long long bytes() const { return 2*arg.clover.volumeCB*(arg.inverse.Bytes() + arg.clover.Bytes()); }
146 
147  void preTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.save(); }
148  void postTune() { if (arg.clover.clover == arg.inverse.clover) arg.inverse.load(); }
149 
150  };
151 
152  template <typename Float, typename Clover>
153  void cloverInvert(Clover inverse, const Clover clover, bool computeTraceLog,
154  double* const trlog, const CloverField &meta, QudaFieldLocation location) {
155  CloverInvertArg<Float,Clover> arg(inverse, clover, computeTraceLog);
156  CloverInvert<Float,CloverInvertArg<Float,Clover>> invert(arg, meta, location);
157  invert.apply(0);
158 
159  if (arg.computeTraceLog) {
161  comm_allreduce_array((double*)arg.result_h, 2);
162  trlog[0] = arg.result_h[0].x;
163  trlog[1] = arg.result_h[0].y;
164  }
165  }
166 
167  template <typename Float>
168  void cloverInvert(const CloverField &clover, bool computeTraceLog, QudaFieldLocation location) {
169 
170  if (clover.isNative()) {
171  typedef typename clover_mapper<Float>::type C;
172  cloverInvert<Float>(C(clover, 1), C(clover, 0), computeTraceLog,
173  clover.TrLog(), clover, location);
174  } else {
175  errorQuda("Clover field %d order not supported", clover.Order());
176  }
177 
178  }
179 
180 #endif
181 
182  // this is the function that is actually called, from here on down we instantiate all required templates
183  void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location) {
184 
185 #ifdef GPU_CLOVER_DIRAC
186  if (clover.Precision() == QUDA_HALF_PRECISION && clover.Order() > 4)
187  errorQuda("Half precision not supported for order %d", clover.Order());
188 
189  if (clover.Precision() == QUDA_DOUBLE_PRECISION) {
190  cloverInvert<double>(clover, computeTraceLog, location);
191  } else if (clover.Precision() == QUDA_SINGLE_PRECISION) {
192  cloverInvert<float>(clover, computeTraceLog, location);
193  } else {
194  errorQuda("Precision %d not supported", clover.Precision());
195  }
196 #else
197  errorQuda("Clover has not been built");
198 #endif
199  }
200 
201 } // namespace quda
dim3 dim3 blockDim
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
Compute Cholesky decomposition of A. By default, we use a modified Cholesky which avoids the division...
Definition: linalg.cuh:38
cudaStream_t * stream
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:296
const char * VolString() const
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
Main header file for host and device accessors to CloverFields.
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
const int nColor
Definition: covdev_test.cpp:77
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
Definition: quda_matrix.h:65
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__host__ __device__ ValueType log(ValueType x)
Definition: complex_quda.h:90
enum QudaFieldLocation_s QudaFieldLocation
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43