QUDA  0.9.0
momentum.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field_order.h>
5 #include <launch_kernel.cuh>
6 #include <cub_helper.cuh>
7 
8 namespace quda {
9 
10 using namespace gauge;
11 
12 #ifdef GPU_GAUGE_TOOLS
13 
14  template <typename Mom>
15  struct MomActionArg : public ReduceArg<double> {
16  int threads; // number of active threads required
17  Mom mom;
18  int X[4]; // grid dimensions
19 
20  MomActionArg(const Mom &mom, const GaugeField &meta)
21  : ReduceArg<double>(), mom(mom) {
22  threads = meta.VolumeCB();
23  for(int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
24  }
25  };
26 
27  template<int blockSize, typename Float, typename Mom>
28  __global__ void computeMomAction(MomActionArg<Mom> arg){
29  int x = threadIdx.x + blockIdx.x*blockDim.x;
30  int parity = threadIdx.y;
31  double action = 0.0;
32 
33  if(x < arg.threads) {
34  // loop over direction
35  for (int mu=0; mu<4; mu++) {
36  Float v[10];
37  arg.mom.load(v, x, mu, parity);
38 
39  double local_sum = 0.0;
40  for (int j=0; j<6; j++) local_sum += v[j]*v[j];
41  for (int j=6; j<9; j++) local_sum += 0.5*v[j]*v[j];
42  local_sum -= 4.0;
43  action += local_sum;
44  }
45  }
46 
47  // perform final inter-block reduction and write out result
48  reduce2d<blockSize,2>(arg, action);
49  }
50 
51  template<typename Float, typename Mom>
52  class MomAction : TunableLocalParity {
53  MomActionArg<Mom> &arg;
54  const GaugeField &meta;
55 
56  private:
57  unsigned int minThreads() const { return arg.threads; }
58 
59  public:
60  MomAction(MomActionArg<Mom> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
61  virtual ~MomAction () { }
62 
63  void apply(const cudaStream_t &stream){
64  if (meta.Location() == QUDA_CUDA_FIELD_LOCATION){
65  arg.result_h[0] = 0.0;
66  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
68  } else {
69  errorQuda("CPU not supported yet\n");
70  }
71  }
72 
73  TuneKey tuneKey() const {
74  std::stringstream aux;
75  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
76  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
77  }
78 
79  long long flops() const { return 4*2*arg.threads*23; }
80  long long bytes() const { return 4*2*arg.threads*arg.mom.Bytes(); }
81  };
82 
83  template<typename Float, typename Mom>
84  void momAction(const Mom mom, const GaugeField& meta, double &action) {
85  MomActionArg<Mom> arg(mom, meta);
86  MomAction<Float,Mom> momAction(arg, meta);
87 
88  momAction.apply(0);
90 
91  comm_allreduce((double*)arg.result_h);
92  action = arg.result_h[0];
93  }
94 
95  template<typename Float>
96  double momAction(const GaugeField& mom) {
97  double action = 0.0;
98 
99  if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
100  if (mom.Reconstruct() == QUDA_RECONSTRUCT_10) {
101  momAction<Float>(FloatNOrder<Float,10,2,10>(mom), mom, action);
102  } else {
103  errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
104  }
105  } else {
106  errorQuda("Gauge Field order %d not supported", mom.Order());
107  }
108 
109  return action;
110  }
111 #endif
112 
113  double computeMomAction(const GaugeField& mom) {
114  double action = 0.0;
115 #ifdef GPU_GAUGE_TOOLS
116  if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
117  action = momAction<double>(mom);
118  } else if(mom.Precision() == QUDA_SINGLE_PRECISION) {
119  action = momAction<float>(mom);
120  } else {
121  errorQuda("Precision %d not supported", mom.Precision());
122  }
123 #else
124  errorQuda("%s not build", __func__);
125 #endif
126  return action;
127  }
128 
129 
130 #ifdef GPU_GAUGE_TOOLS
131  template<typename Float, typename Mom, typename Force>
132  struct UpdateMomArg {
133  int threads;
134  Mom mom;
135  Float coeff;
136  Force force;
137  int X[4]; // grid dimensions
138  UpdateMomArg(Mom &mom, const Float &coeff, Force &force, GaugeField &meta)
139  : threads(meta.VolumeCB()), mom(mom), coeff(coeff), force(force) {
140  for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
141  }
142  };
143 
144  template<typename Float, typename Mom, typename Force>
145  __global__ void UpdateMomKernel(UpdateMomArg<Float, Mom, Force> arg) {
146  int x = blockIdx.x*blockDim.x + threadIdx.x;
147  int parity = threadIdx.y;
148  Matrix<complex<Float>,3> m, f;
149  while(x<arg.threads){
150  for (int d=0; d<4; d++) {
151  arg.mom.load(reinterpret_cast<Float*>(m.data), x, d, parity);
152  arg.force.load(reinterpret_cast<Float*>(f.data), x, d, parity);
153 
154  m = m + arg.coeff * f;
155  makeAntiHerm(m);
156 
157  arg.mom.save(reinterpret_cast<Float*>(m.data), x, d, parity);
158  }
159 
160  x += gridDim.x*blockDim.x;
161  }
162  return;
163  } // UpdateMom
164 
165 
166  template<typename Float, typename Mom, typename Force>
167  class UpdateMom : TunableLocalParity {
168  UpdateMomArg<Float, Mom, Force> &arg;
169  const GaugeField &meta;
170 
171  private:
172  unsigned int minThreads() const { return arg.threads; }
173 
174  public:
175  UpdateMom(UpdateMomArg<Float,Mom,Force> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
176  virtual ~UpdateMom () { }
177 
178  void apply(const cudaStream_t &stream){
179  if(meta.Location() == QUDA_CUDA_FIELD_LOCATION){
180  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
181  UpdateMomKernel<Float,Mom,Force><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
182  } else {
183  errorQuda("CPU not supported yet\n");
184  }
185  }
186 
187  TuneKey tuneKey() const {
188  std::stringstream aux;
189  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
190  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
191  }
192 
193  void preTune() { arg.mom.save();}
194  void postTune() { arg.mom.load();}
195  long long flops() const { return 4*2*arg.threads*(36+42); }
196  long long bytes() const { return 4*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
197  };
198 
199  template<typename Float, typename Mom, typename Force>
200  void updateMomentum(Mom mom, Float coeff, Force force, GaugeField &meta) {
201  UpdateMomArg<Float,Mom,Force> arg(mom, coeff, force, meta);
202  UpdateMom<Float,Mom,Force> update(arg, meta);
203  update.apply(0);
204  }
205 
206  template <typename Float>
207  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force) {
208  if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
209  errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());
210 
211  if (force.Reconstruct() == QUDA_RECONSTRUCT_10) {
212  updateMomentum<Float>(FloatNOrder<Float, 18, 2, 11>(mom), static_cast<Float>(coeff),
213  FloatNOrder<Float, 18, 2, 11>(force), force);
214  } else if (force.Reconstruct() == QUDA_RECONSTRUCT_NO) {
215  updateMomentum<Float>(FloatNOrder<Float, 18, 2, 11>(mom), static_cast<Float>(coeff),
216  FloatNOrder<Float, 18, 2, 18>(force), force);
217  } else {
218  errorQuda("Unsupported force reconstruction: %d", force.Reconstruct());
219  }
220 
221  }
222 #endif // GPU_GAUGE_TOOLS
223 
224  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force) {
225 #ifdef GPU_GAUGE_TOOLS
226  if(mom.Order() != QUDA_FLOAT2_GAUGE_ORDER)
227  errorQuda("Unsupported output ordering: %d\n", mom.Order());
228 
229  if (mom.Precision() != force.Precision())
230  errorQuda("Mixed precision not supported: %d %d\n", mom.Precision(), force.Precision());
231 
232  if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
233  updateMomentum<double>(mom, coeff, force);
234  } else {
235  errorQuda("Unsupported precision: %d", mom.Precision());
236  }
237 
238  checkCudaError();
239 #else
240  errorQuda("%s not built", __func__);
241 #endif // GPU_GAUGE_TOOLS
242 
243  return;
244  }
245 
246 
247 #ifdef GPU_GAUGE_TOOLS
248 
249  template<typename Float, typename Force, typename Gauge>
250  struct ApplyUArg {
251  int threads;
252  Force force;
253  Gauge U;
254  int X[4]; // grid dimensions
255  ApplyUArg(Force &force, Gauge &U, GaugeField &meta)
256  : threads(meta.VolumeCB()), force(force), U(U) {
257  for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
258  }
259  };
260 
261  template<typename Float, typename Force, typename Gauge>
262  __global__ void ApplyUKernel(ApplyUArg<Float,Force,Gauge> arg) {
263  int x = blockIdx.x*blockDim.x + threadIdx.x;
264  int parity = threadIdx.y;
265  Matrix<complex<Float>,3> f, u;
266 
267  while (x<arg.threads) {
268  for (int d=0; d<4; d++) {
269  arg.force.load(reinterpret_cast<Float*>(f.data), x, d, parity);
270  arg.U.load(reinterpret_cast<Float*>(u.data), x, d, parity);
271 
272  f = u * f;
273 
274  arg.force.save(reinterpret_cast<Float*>(f.data), x, d, parity);
275  }
276 
277  x += gridDim.x*blockDim.x;
278  }
279 
280  return;
281  } // ApplyU
282 
283 
284  template<typename Float, typename Force, typename Gauge>
285  class ApplyU : TunableLocalParity {
286  ApplyUArg<Float, Force, Gauge> &arg;
287  const GaugeField &meta;
288 
289  private:
290  unsigned int minThreads() const { return arg.threads; }
291 
292  public:
293  ApplyU(ApplyUArg<Float,Force,Gauge> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
294  virtual ~ApplyU () { }
295 
296  void apply(const cudaStream_t &stream){
297  if(meta.Location() == QUDA_CUDA_FIELD_LOCATION){
298  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
299  ApplyUKernel<Float,Force,Gauge><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
300  } else {
301  errorQuda("CPU not supported yet\n");
302  }
303  }
304 
305  TuneKey tuneKey() const {
306  std::stringstream aux;
307  aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
308  return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
309  }
310 
311  void preTune() { arg.force.save();}
312  void postTune() { arg.force.load();}
313  long long flops() const { return 4*2*arg.threads*198; }
314  long long bytes() const { return 4*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
315  };
316 
317  template<typename Float, typename Force, typename Gauge>
318  void applyU(Force force, Gauge U, GaugeField &meta) {
319  ApplyUArg<Float,Force,Gauge> arg(force, U, meta);
320  ApplyU<Float,Force,Gauge> applyU(arg, meta);
321  applyU.apply(0);
323  }
324  template <typename Float>
325  void applyU(GaugeField &force, GaugeField &U) {
326  if (force.Reconstruct() != QUDA_RECONSTRUCT_NO)
327  errorQuda("Force field with reconstruct %d not supported", force.Reconstruct());
328 
329  if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
330  applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 18>(U), force);
331  } else if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
332  applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 12>(U), force);
333  } else {
334  errorQuda("Unsupported gauge reconstruction: %d", U.Reconstruct());
335  }
336 
337  }
338 #endif // GPU_GAUGE_TOOLS
339 
340  void applyU(GaugeField &force, GaugeField &U) {
341 #ifdef GPU_GAUGE_TOOLS
342  if(force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
343  errorQuda("Unsupported output ordering: %d\n", force.Order());
344 
345  if (force.Precision() != U.Precision())
346  errorQuda("Mixed precision not supported: %d %d\n", force.Precision(), U.Precision());
347 
348  if (force.Precision() == QUDA_DOUBLE_PRECISION) {
349  applyU<double>(force, U);
350  } else {
351  errorQuda("Unsupported precision: %d", force.Precision());
352  }
353 
354  checkCudaError();
355 #else
356  errorQuda("%s not built", __func__);
357 #endif // GPU_GAUGE_TOOLS
358 
359  return;
360  }
361 
362 } // namespace quda
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
const char * VolString() const
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
Definition: momentum.cu:113
T data[N *N]
Definition: quda_matrix.h:74
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
int int int enum cudaChannelFormatKind f
Main header file for host and device accessors to GaugeFields.
void applyU(GaugeField &force, GaugeField &U)
Definition: momentum.cu:340
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
QudaFieldLocation Location() const
int VolumeCB() const
unsigned long long flops
Definition: blas_quda.cu:42
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force)
Definition: momentum.cu:224
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
Accessor routine for CloverFields in native field order.
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:636
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
#define checkCudaError()
Definition: util_quda.h:129
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:281
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
const int * X() const