momentum.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <launch_kernel.cuh>
#include <cub_helper.cuh>
#include <fstream>

namespace quda {

  using namespace gauge;

  bool forceMonitor() {
    static bool init = false;
    static bool monitor = false;
    if (!init) {
      char *path = getenv("QUDA_RESOURCE_PATH");
      char *enable_force_monitor = getenv("QUDA_ENABLE_FORCE_MONITOR");
      if (path && enable_force_monitor && strcmp(enable_force_monitor, "1") == 0) monitor = true;
      init = true;
    }
    return monitor;
  }
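
  // Illustrative shell usage (not part of this file): force monitoring is
  // enabled only when both environment variables are set, e.g.
  //   export QUDA_RESOURCE_PATH=/path/to/resource/dir
  //   export QUDA_ENABLE_FORCE_MONITOR=1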

  static std::stringstream force_stream;
  static long long force_count = 0;
  static long long force_flush = 1000; // how many force samples we accumulate before flushing

  void flushForceMonitor() {
    if (!forceMonitor() || comm_rank() != 0) return;

    static std::string path = std::string(getenv("QUDA_RESOURCE_PATH"));
    static char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");

    std::ofstream force_file;
    static long long count = 0;
    if (count == 0) {
      path += (profile_fname ? std::string("/") + profile_fname + "_force.tsv" : std::string("/force.tsv"));
      force_file.open(path.c_str());
      force_file << "Force\tL1\tL2\tdt" << std::endl;
    } else {
      force_file.open(path.c_str(), std::ios_base::app);
    }
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Flushing force monitor data to %s\n", path.c_str());
    force_file << force_stream.str();

    force_file.flush();
    force_file.close();

    // empty the stream buffer
    force_stream.clear();
    force_stream.str(std::string());

    count++;
  }

  void forceRecord(double2 &force, double dt, const char *fname) {
    qudaDeviceSynchronize();
    comm_allreduce_max_array((double*)&force, 2);

    if (comm_rank()==0) {
      force_stream << fname << "\t" << std::setprecision(5) << force.x << "\t"
                   << std::setprecision(5) << force.y << "\t"
                   << std::setprecision(5) << dt << std::endl;
      if (++force_count % force_flush == 0) flushForceMonitor();
    }
  }
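
  // Each record appended above is one tab-separated row matching the
  // "Force\tL1\tL2\tdt" header written by flushForceMonitor(): the calling
  // function's name, the globally reduced maximum L1 and L2 force norms,
  // and the integrator step size dt.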

#ifdef GPU_GAUGE_TOOLS

  template <typename Mom>
  struct MomActionArg : public ReduceArg<double> {
    int threads; // number of active threads required
    Mom mom;
    int X[4]; // grid dimensions

    MomActionArg(const Mom &mom, const GaugeField &meta)
      : ReduceArg<double>(), mom(mom) {
      threads = meta.VolumeCB();
      for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
    }
  };

  template<int blockSize, typename Float, typename Mom>
  __global__ void computeMomAction(MomActionArg<Mom> arg){
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int parity = threadIdx.y;
    double action = 0.0;

    while (x < arg.threads) {
      // loop over direction
      for (int mu=0; mu<4; mu++) {
        // FIXME: should understand what this does exactly and clean up (matches MILC)
        complex<Float> v_[5];
        arg.mom.load(v_, x, mu, parity);
        Float v[10];
        for (int i=0; i<5; i++) {
          v[2*i+0] = v_[i].real();
          v[2*i+1] = v_[i].imag();
        }

        double local_sum = 0.0;
        for (int j=0; j<6; j++) local_sum += v[j]*v[j];
        for (int j=6; j<9; j++) local_sum += 0.5*v[j]*v[j];
        local_sum -= 4.0;
        action += local_sum;
      }

      x += blockDim.x*gridDim.x;
    }

    // perform final inter-block reduction and write out result
    reduce2d<blockSize,2>(arg, action);
  }
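
  // Reading of the inner loop above (assumption: this describes the usual
  // RECONSTRUCT_10 anti-hermitian packing; the kernel itself just matches
  // MILC): v[0..5] hold the three complex off-diagonal elements, counted
  // with full weight; v[6..8] hold the three imaginary diagonal elements,
  // counted with half weight; v[9] never enters the sum; and the 4.0
  // subtraction is the MILC normalization of the per-link contribution.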

  template<typename Float, typename Mom>
  class MomAction : TunableLocalParity {
    MomActionArg<Mom> &arg;
    const GaugeField &meta;

  private:
    bool tuneGridDim() const { return true; }

  public:
    MomAction(MomActionArg<Mom> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
    virtual ~MomAction () { }

    void apply(const cudaStream_t &stream){
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION){
        arg.result_h[0] = 0.0;
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        LAUNCH_KERNEL_LOCAL_PARITY(computeMomAction, tp, stream, arg, Float, Mom);
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }

    long long flops() const { return 4*2*arg.threads*23; }
    long long bytes() const { return 4*2*arg.threads*arg.mom.Bytes(); }
  };

  template<typename Float, typename Mom>
  void momAction(const Mom mom, const GaugeField& meta, double &action) {
    MomActionArg<Mom> arg(mom, meta);
    MomAction<Float,Mom> momAction(arg, meta);

    momAction.apply(0);
    qudaDeviceSynchronize();

    comm_allreduce((double*)arg.result_h);
    action = arg.result_h[0];
  }

  template<typename Float>
  double momAction(const GaugeField& mom) {
    double action = 0.0;

    if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      if (mom.Reconstruct() == QUDA_RECONSTRUCT_10) {
        momAction<Float>(FloatNOrder<Float,10,2,10>(mom), mom, action);
      } else {
        errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
      }
    } else {
      errorQuda("Gauge Field order %d not supported", mom.Order());
    }

    return action;
  }
#endif

  double computeMomAction(const GaugeField& mom) {
    double action = 0.0;
#ifdef GPU_GAUGE_TOOLS
    if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
      action = momAction<double>(mom);
    } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
      action = momAction<float>(mom);
    } else {
      errorQuda("Precision %d not supported", mom.Precision());
    }
#else
    errorQuda("%s not built", __func__);
#endif
    return action;
  }
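
  // Minimal usage sketch (assumption: `mom` is an allocated momentum field
  // in FLOAT2 order with RECONSTRUCT_10, single or double precision):
  //   double action = computeMomAction(mom); // global 1/2 mom^2 action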


#ifdef GPU_GAUGE_TOOLS
  template<typename Float, QudaReconstructType reconstruct_>
  struct UpdateMomArg : public ReduceArg<double2> {
    int threads;
    static constexpr int force_recon = (reconstruct_ == QUDA_RECONSTRUCT_10 ? 11 : 18);
    FloatNOrder<Float,18,2,11> mom;               // accessor for the momentum field
    FloatNOrder<Float,18,2,force_recon> force;    // accessor for the force field
    Float coeff;
    int X[4]; // grid dimensions on mom
    int E[4]; // grid dimensions on force (possibly extended)
    int border[4]; // radius of the extended border on the force field
    UpdateMomArg(GaugeField &mom, const Float &coeff, GaugeField &force)
      : threads(mom.VolumeCB()), mom(mom), coeff(coeff), force(force) {
      for (int dir=0; dir<4; ++dir) {
        X[dir] = mom.X()[dir];
        E[dir] = force.X()[dir];
        border[dir] = force.R()[dir];
      }
    }
  };

  /**
     @brief Functor for finding the component-wise maximum over double2
     fields; each lane of the double2 is reduced separately.
  */
  struct max_reducer2 {
    __device__ __host__ inline double2 operator()(const double2 &a, const double2 &b) {
      return make_double2(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y);
    }
  };

  template <int blockSize, typename Float, typename Arg>
  __global__ void UpdateMomKernel(Arg arg) {
    int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
    int parity = threadIdx.y;
    double2 norm2 = make_double2(0.0,0.0);
    max_reducer2 r;

    while (x_cb<arg.threads) {
      int x[4];
      getCoords(x, x_cb, arg.X, parity);
      for (int d=0; d<4; d++) x[d] += arg.border[d];
      int e_cb = linkIndex(x,arg.E);

#pragma unroll
      for (int d=0; d<4; d++) {
        Matrix<complex<Float>,3> m = arg.mom(d, x_cb, parity);
        Matrix<complex<Float>,3> f = arg.force(d, e_cb, parity);

        // project to traceless anti-hermitian prior to taking norm
        makeAntiHerm(f);

        // compute force norms
        norm2 = r(make_double2(f.L1(), f.L2()), norm2);

        m = m + arg.coeff * f;

        // strictly speaking this shouldn't be needed since the
        // momentum should already be traceless anti-hermitian but at
        // present the unit test will fail without this
        makeAntiHerm(m);
        arg.mom(d, x_cb, parity) = m;
      }

      x_cb += gridDim.x*blockDim.x;
    }

    // perform final inter-block reduction and write out result
    reduce2d<blockSize,2,double2,false,max_reducer2>(arg, norm2, 0);
  } // UpdateMom
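
  // Note on the reduction above: norm2.x accumulates the maximum L1 force
  // norm and norm2.y the maximum L2 force norm seen by each thread;
  // reduce2d combines them across the grid using max_reducer2, so the
  // host-side result holds per-rank maxima that forceRecord() later
  // reduces over ranks.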


  template<typename Float, typename Arg>
  class UpdateMom : TunableLocalParity {
    Arg &arg;
    const GaugeField &meta;

  private:
    bool tuneGridDim() const { return true; }

  public:
    UpdateMom(Arg &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
    virtual ~UpdateMom () { }

    void apply(const cudaStream_t &stream){
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        LAUNCH_KERNEL_LOCAL_PARITY(UpdateMomKernel, tp, stream, arg, Float);
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }

    void preTune() { arg.mom.save(); }
    void postTune() { arg.mom.load(); }
    long long flops() const { return 4*2*arg.threads*(36+42); }
    long long bytes() const { return 4*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
  };

  template<typename Float, QudaReconstructType reconstruct>
  void updateMomentum(GaugeField &mom, Float coeff, GaugeField &force, const char *fname) {
    UpdateMomArg<Float,reconstruct> arg(mom, coeff, force);
    UpdateMom<Float,decltype(arg)> update(arg, force);
    update.apply(0);

    if (forceMonitor()) forceRecord(*((double2*)arg.result_h), arg.coeff, fname);
  }

  template <typename Float>
  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) {
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
      errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());
    if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Force field with order %d not supported", force.Order());

    if (force.Reconstruct() == QUDA_RECONSTRUCT_10) {
      updateMomentum<Float,QUDA_RECONSTRUCT_10>(mom, coeff, force, fname);
    } else if (force.Reconstruct() == QUDA_RECONSTRUCT_NO) {
      updateMomentum<Float,QUDA_RECONSTRUCT_NO>(mom, coeff, force, fname);
    } else {
      errorQuda("Unsupported force reconstruction: %d", force.Reconstruct());
    }
  }
#endif // GPU_GAUGE_TOOLS

  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) {
#ifdef GPU_GAUGE_TOOLS
    if (mom.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d\n", mom.Order());

    if (mom.Precision() != force.Precision())
      errorQuda("Mixed precision not supported: %d %d\n", mom.Precision(), force.Precision());

    if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
      updateMomentum<double>(mom, coeff, force, fname);
    } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
      updateMomentum<float>(mom, coeff, force, fname);
    } else {
      errorQuda("Unsupported precision: %d", mom.Precision());
    }

    checkCudaError();
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS

    return;
  }
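
  // Minimal usage sketch (assumption: `mom` and `force` have matching
  // precision, with mom in RECONSTRUCT_10 and FLOAT2 order; the label
  // string is arbitrary and only appears in the force monitor output):
  //   updateMomentum(mom, dt, force, "illustrative_label");
  // This performs mom += coeff * F, where F is the traceless
  // anti-hermitian projection of the force.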


#ifdef GPU_GAUGE_TOOLS

  template<typename Float, typename Force, typename Gauge>
  struct ApplyUArg {
    int threads;
    Force force;
    Gauge U;
    int X[4]; // grid dimensions
    ApplyUArg(Force &force, Gauge &U, GaugeField &meta)
      : threads(meta.VolumeCB()), force(force), U(U) {
      for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
    }
  };

  template<typename Float, typename Force, typename Gauge>
  __global__ void ApplyUKernel(ApplyUArg<Float,Force,Gauge> arg) {
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int parity = threadIdx.y;
    Matrix<complex<Float>,3> f, u;

    while (x<arg.threads) {
      for (int d=0; d<4; d++) {
        f = arg.force(d, x, parity);
        u = arg.U(d, x, parity);

        f = u * f;

        arg.force(d, x, parity) = f;
      }

      x += gridDim.x*blockDim.x;
    }

    return;
  } // ApplyU


  template<typename Float, typename Force, typename Gauge>
  class ApplyU : TunableLocalParity {
    ApplyUArg<Float, Force, Gauge> &arg;
    const GaugeField &meta;

  private:
    unsigned int minThreads() const { return arg.threads; }

  public:
    ApplyU(ApplyUArg<Float,Force,Gauge> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
    virtual ~ApplyU () { }

    void apply(const cudaStream_t &stream){
      if(meta.Location() == QUDA_CUDA_FIELD_LOCATION){
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        ApplyUKernel<Float,Force,Gauge><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }

    void preTune() { arg.force.save(); }
    void postTune() { arg.force.load(); }
    long long flops() const { return 4*2*arg.threads*198; }
    long long bytes() const { return 4*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
  };

  template<typename Float, typename Force, typename Gauge>
  void applyU(Force force, Gauge U, GaugeField &meta) {
    ApplyUArg<Float,Force,Gauge> arg(force, U, meta);
    ApplyU<Float,Force,Gauge> applyU(arg, meta);
    applyU.apply(0);
    qudaDeviceSynchronize();
  }

  template <typename Float>
  void applyU(GaugeField &force, GaugeField &U) {
    if (force.Reconstruct() != QUDA_RECONSTRUCT_NO)
      errorQuda("Force field with reconstruct %d not supported", force.Reconstruct());

    if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
      applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 18>(U), force);
    } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
      applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 12>(U), force);
    } else {
      errorQuda("Unsupported gauge reconstruction: %d", U.Reconstruct());
    }
  }
#endif // GPU_GAUGE_TOOLS

  void applyU(GaugeField &force, GaugeField &U) {
#ifdef GPU_GAUGE_TOOLS
    if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d\n", force.Order());

    if (force.Precision() != U.Precision())
      errorQuda("Mixed precision not supported: %d %d\n", force.Precision(), U.Precision());

    if (force.Precision() == QUDA_DOUBLE_PRECISION) {
      applyU<double>(force, U);
    } else {
      errorQuda("Unsupported precision: %d", force.Precision());
    }

    checkCudaError();
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS

    return;
  }
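
  // Minimal usage sketch (assumption: `force` is unreconstructed
  // (RECONSTRUCT_NO) in FLOAT2 order, double precision; `gauge` is a
  // hypothetical gauge field name): replaces each link of the force with
  // U_mu(x) * F_mu(x):
  //   applyU(force, gauge);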

} // namespace quda