QUDA v1.1.0
A library for QCD on GPUs
momentum.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <launch_kernel.cuh>
#include <reduce_helper.h>
#include <instantiate.h>
#include <fstream>
#include <sstream> // std::stringstream, used by the force monitor
#include <iomanip> // std::setprecision, used when recording forces

namespace quda {

  bool forceMonitor() {
    static bool init = false;
    static bool monitor = false;
    if (!init) {
      char *path = getenv("QUDA_RESOURCE_PATH");
      char *enable_force_monitor = getenv("QUDA_ENABLE_FORCE_MONITOR");
      if (path && enable_force_monitor && strcmp(enable_force_monitor, "1") == 0) monitor = true;
      init = true;
    }
    return monitor;
  }
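
  // A minimal usage sketch: force monitoring is controlled entirely by
  // the two environment variables read above, e.g.
  //
  //   export QUDA_RESOURCE_PATH=/path/to/writable/dir
  //   export QUDA_ENABLE_FORCE_MONITOR=1
  //
  // after which forceMonitor() reports true for the remainder of the
  // run and force statistics accumulate for periodic flushing to disk.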

  static std::stringstream force_stream;
  static long long force_count = 0;
  static long long force_flush = 1000; // how many force samples we accumulate before flushing

  void flushForceMonitor() {
    if (!forceMonitor() || comm_rank() != 0) return;

    static std::string path = std::string(getenv("QUDA_RESOURCE_PATH"));
    static char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");

    std::ofstream force_file;
    static long long count = 0;
    if (count == 0) {
      path += (profile_fname ? std::string("/") + profile_fname + "_force.tsv" : std::string("/force.tsv"));
      force_file.open(path.c_str());
      force_file << "Force\tL1\tL2\tdt" << std::endl;
    } else {
      force_file.open(path.c_str(), std::ios_base::app);
    }
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Flushing force monitor data to %s\n", path.c_str());
    force_file << force_stream.str();

    force_file.flush();
    force_file.close();

    // empty the stream buffer
    force_stream.clear();
    force_stream.str(std::string());

    count++;
  }

  void forceRecord(double2 &force, double dt, const char *fname) {
    qudaDeviceSynchronize();
    comm_allreduce_max_array((double*)&force, 2);

    if (comm_rank() == 0) {
      force_stream << fname << "\t" << std::setprecision(5) << force.x << "\t"
                   << std::setprecision(5) << force.y << "\t"
                   << std::setprecision(5) << dt << std::endl;
      if (++force_count % force_flush == 0) flushForceMonitor();
    }
  }
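
  // Illustrative force.tsv contents (the label and numbers below are
  // made up for exposition):
  //
  //   Force        L1       L2       dt
  //   gauge_force  0.12345  0.05432  0.01
  //
  // Each row holds the globally reduced maximum L1 and L2 norms of one
  // force field, together with the step size it was applied with.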

  template <typename Float_, int nColor_, QudaReconstructType recon_>
  struct BaseArg {
    using Float = Float_;
    static constexpr int nColor = nColor_;
    static constexpr QudaReconstructType recon = recon_;
    int threads; // number of active threads required
    BaseArg(const GaugeField &meta) :
      threads(meta.VolumeCB()) {}
  };

  template <typename Float, int nColor, QudaReconstructType recon>
  struct MomActionArg : ReduceArg<double>, BaseArg<Float, nColor, recon> {
    typedef typename gauge_mapper<Float, recon>::type Mom;
    const Mom mom;

    MomActionArg(const GaugeField &mom) :
      BaseArg<Float, nColor, recon>(mom),
      mom(mom) {}
  };

  // calculate the momentum contribution to the action. This uses the
  // MILC convention where we subtract 4.0 from each matrix norm in
  // order to increase stability.
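  //
  // For a traceless anti-hermitian matrix m, the per-link sum computed
  // below equals 0.5 * Tr(m^dag m) - 4.0, expanded as
  //
  //   0.5 * sum_i Im(m_ii)^2 + sum_{i<j} |m_ij|^2 - 4.0,
  //
  // since the diagonal entries are purely imaginary and |m_ji| = |m_ij|.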
  template <int blockSize, typename Arg>
  __global__ void computeMomAction(Arg arg) {
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int parity = threadIdx.y;
    double action = 0.0;
    using matrix = Matrix<complex<typename Arg::Float>, Arg::nColor>;

    while (x < arg.threads) {
      // loop over direction
      for (int mu=0; mu<4; mu++) {
        const matrix mom = arg.mom(mu, x, parity);

        double local_sum = 0.5 * mom(0,0).imag() * mom(0,0).imag();
        local_sum += 0.5 * mom(1,1).imag() * mom(1,1).imag();
        local_sum += 0.5 * mom(2,2).imag() * mom(2,2).imag();
        local_sum += mom(0,1).real() * mom(0,1).real();
        local_sum += mom(0,1).imag() * mom(0,1).imag();
        local_sum += mom(0,2).real() * mom(0,2).real();
        local_sum += mom(0,2).imag() * mom(0,2).imag();
        local_sum += mom(1,2).real() * mom(1,2).real();
        local_sum += mom(1,2).imag() * mom(1,2).imag();
        local_sum -= 4.0;

        action += local_sum;
      }

      x += blockDim.x*gridDim.x;
    }

    // perform final inter-block reduction and write out result
    arg.template reduce2d<blockSize,2>(action);
  }

  template <typename Float, int nColor, QudaReconstructType recon>
  class MomAction : TunableLocalParityReduction {
    MomActionArg<Float, nColor, recon> arg;
    const GaugeField &meta;

  public:
    MomAction(const GaugeField &mom, double &action) :
      arg(mom),
      meta(mom)
    {
      apply(0);
      arg.complete(action);
      comm_allreduce(&action);
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      LAUNCH_KERNEL_LOCAL_PARITY(computeMomAction, (*this), tp, stream, arg, decltype(arg));
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
    long long flops() const { return 4*2*arg.threads*23; }
    long long bytes() const { return 4*2*arg.threads*arg.mom.Bytes(); }
  };

  double computeMomAction(const GaugeField &mom) {
    if (!mom.isNative()) errorQuda("Unsupported output ordering: %d\n", mom.Order());
    double action = 0.0;
#ifdef GPU_GAUGE_TOOLS
    instantiate<MomAction, Reconstruct10>(mom, action);
#else
    errorQuda("%s not built", __func__);
#endif
    return action;
  }
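
  // Minimal caller sketch (hypothetical variable names):
  //
  //   GaugeField &mom = ...;                 // reconstruct-10 momentum field
  //   double action = computeMomAction(mom); // summed over the lattice and all ranks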

  template <typename Float_, int nColor, QudaReconstructType recon>
  struct UpdateMomArg : ReduceArg<double2>, BaseArg<Float_, nColor, recon>
  {
    using Float = Float_;
    typename gauge_mapper<Float, QUDA_RECONSTRUCT_10>::type mom;
    typename gauge_mapper<Float, recon>::type force;
    Float coeff;
    int X[4]; // grid dimensions on mom
    int E[4]; // grid dimensions on force (possibly extended)
    int border[4]; // border radius of the (possibly extended) force field
    UpdateMomArg(GaugeField &mom, const Float &coeff, GaugeField &force) :
      BaseArg<Float, nColor, recon>(mom),
      mom(mom),
      force(force),
      coeff(coeff)
    {
      for (int dir=0; dir<4; ++dir) {
        X[dir] = mom.X()[dir];
        E[dir] = force.X()[dir];
        border[dir] = force.R()[dir];
      }
    }
  };

  /**
     @brief Functor for finding the maximum over a double2 field.
     Each lane of the double2 is evaluated separately. This functor
     is passed to the reduce helper.
  */
  struct max_reducer2 {
    __device__ __host__ inline double2 operator()(const double2 &a, const double2 &b) {
      return make_double2(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y);
    }
  };
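
  // Because the maximum is taken lane-wise, a single reduction pass
  // yields both the maximum L1 norm and the maximum L2 norm of the
  // force, which become the "L1" and "L2" columns that forceRecord()
  // writes out.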

  template <int blockSize, typename Arg>
  __global__ void UpdateMomKernel(Arg arg) {
    int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
    int parity = threadIdx.y;
    double2 norm2 = make_double2(0.0, 0.0);
    max_reducer2 r;

    while (x_cb < arg.threads) {
      int x[4];
      getCoords(x, x_cb, arg.X, parity);
      for (int d=0; d<4; d++) x[d] += arg.border[d];
      int e_cb = linkIndex(x, arg.E);

#pragma unroll
      for (int d=0; d<4; d++) {
        Matrix<complex<typename Arg::Float>,3> m = arg.mom(d, x_cb, parity);
        Matrix<complex<typename Arg::Float>,3> f = arg.force(d, e_cb, parity);

        // project to traceless anti-hermitian prior to taking norm
        makeAntiHerm(f);

        // compute force norms
        norm2 = r(make_double2(f.L1(), f.L2()), norm2);

        m = m + arg.coeff * f;

        // strictly speaking this shouldn't be needed since the
        // momentum should already be traceless anti-hermitian, but at
        // present the unit test will fail without this
        makeAntiHerm(m);
        arg.mom(d, x_cb, parity) = m;
      }

      x_cb += gridDim.x*blockDim.x;
    }

    // perform final inter-block reduction and write out result
    arg.template reduce2d<blockSize,2,false,max_reducer2>(norm2);
  } // UpdateMom

  template <typename Float, int nColor, QudaReconstructType recon>
  class UpdateMom : TunableLocalParityReduction {
    UpdateMomArg<Float, nColor, recon> arg;
    const GaugeField &meta;

  public:
    UpdateMom(GaugeField &force, GaugeField &mom, double coeff, const char *fname) :
      arg(mom, coeff, force),
      meta(force)
    {
      double2 force_max;
      apply(0);
      if (forceMonitor()) {
        arg.complete(force_max);
        forceRecord(force_max, arg.coeff, fname);
      }
    }

    void apply(const qudaStream_t &stream)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        LAUNCH_KERNEL_LOCAL_PARITY(UpdateMomKernel, (*this), tp, stream, arg, decltype(arg));
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
    void preTune() { arg.mom.save(); }
    void postTune() { arg.mom.load(); }
    long long flops() const { return 4*2*arg.threads*(36+42); }
    long long bytes() const { return 4*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
  };

  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
  {
#ifdef GPU_GAUGE_TOOLS
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
      errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());

    checkPrecision(mom, force);
    instantiate<UpdateMom, ReconstructMom>(force, mom, coeff, fname);
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS
  }
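
  // Hypothetical integrator sketch: the kernel performs
  // mom <- mom + coeff * force (after projecting the force to
  // traceless anti-hermitian), so a leapfrog momentum update with
  // step size dt might look like
  //
  //   updateMomentum(mom, dt, force, "gauge_force"); // label is illustrative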

  template <typename Float, int nColor, QudaReconstructType recon>
  struct ApplyUArg : BaseArg<Float, nColor, recon>
  {
    typedef typename gauge_mapper<Float, recon>::type G;
    typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type F;
    F force;
    const G U;
    int X[4]; // grid dimensions
    ApplyUArg(GaugeField &force, const GaugeField &U) :
      BaseArg<Float, nColor, recon>(U),
      force(force),
      U(U)
    {
      for (int dir=0; dir<4; ++dir) X[dir] = U.X()[dir];
    }
  };

  template <typename Arg>
  __global__ void ApplyUKernel(Arg arg)
  {
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    if (x >= arg.threads) return;
    int parity = threadIdx.y + blockIdx.y * blockDim.y;
    using mat = Matrix<complex<typename Arg::Float>, Arg::nColor>;

    for (int d=0; d<4; d++) {
      mat f = arg.force(d, x, parity);
      mat u = arg.U(d, x, parity);

      f = u * f;

      arg.force(d, x, parity) = f;
    }
  } // ApplyU

  template <typename Float, int nColor, QudaReconstructType recon>
  class ApplyU : TunableVectorY {
    ApplyUArg<Float, nColor, recon> arg;
    const GaugeField &meta;

    bool tuneGridDim() const { return false; }
    unsigned int minThreads() const { return arg.threads; }

  public:
    ApplyU(const GaugeField &U, GaugeField &force) :
      TunableVectorY(2),
      arg(force, U),
      meta(U)
    {
      apply(0);
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      qudaLaunchKernel(ApplyUKernel<decltype(arg)>, tp, stream, arg);
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
    void preTune() { arg.force.save(); }
    void postTune() { arg.force.load(); }
    long long flops() const { return 4*2*arg.threads*198; }
    long long bytes() const { return 4*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
  };

  void applyU(GaugeField &force, GaugeField &U)
  {
#ifdef GPU_GAUGE_TOOLS
    if (!force.isNative()) errorQuda("Unsupported output ordering: %d\n", force.Order());
    checkPrecision(force, U);
    instantiate<ApplyU, ReconstructNo12>(U, force);
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS
  }
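
  // Caller sketch: applyU left-multiplies every force link by the
  // corresponding gauge link, f_mu(x) <- U_mu(x) * f_mu(x), e.g.
  //
  //   applyU(force, gauge); // `force` and `gauge` are hypothetical fields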

} // namespace quda