#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <launch_kernel.cuh>
#include <reduce_helper.h>
#include <instantiate.h>
namespace quda {

  bool forceMonitor()
  {
    static bool init = false;
    static bool monitor = false;
    if (!init) {
      char *path = getenv("QUDA_RESOURCE_PATH");
      char *enable_force_monitor = getenv("QUDA_ENABLE_FORCE_MONITOR");
      if (path && enable_force_monitor && strcmp(enable_force_monitor, "1") == 0) monitor = true;
      init = true;
    }
    return monitor;
  }
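  // Illustrative only (not part of the library): to enable force monitoring at
  // run time both environment variables checked above must be set, e.g.
  //
  //   export QUDA_RESOURCE_PATH=/path/to/resource/dir
  //   export QUDA_ENABLE_FORCE_MONITOR=1
  //
  // since the monitor output is written underneath QUDA_RESOURCE_PATH.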
  static std::stringstream force_stream;
  static long long force_count = 0;
  static long long force_flush = 1000; // how many force samples we accumulate before flushing
  void flushForceMonitor()
  {
    if (!forceMonitor() || comm_rank() != 0) return;

    static std::string path = std::string(getenv("QUDA_RESOURCE_PATH"));
    static char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");

    std::ofstream force_file;
    static long long count = 0;
    if (count == 0) {
      path += (profile_fname ? std::string("/") + profile_fname + "_force.tsv" : std::string("/force.tsv"));
      force_file.open(path.c_str());
      force_file << "Force\tL1\tL2\tdt" << std::endl;
    } else {
      force_file.open(path.c_str(), std::ios_base::app);
    }

    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Flushing force monitor data to %s\n", path.c_str());
    force_file << force_stream.str();
    force_file.flush();
    force_file.close();

    // empty the stream buffer
    force_stream.clear();
    force_stream.str(std::string());

    count++;
  }
  void forceRecord(double2 &force, double dt, const char *fname)
  {
    qudaDeviceSynchronize();
    comm_allreduce_max_array((double*)&force, 2);

    if (comm_rank() == 0) {
      force_stream << fname << "\t" << std::setprecision(5) << force.x << "\t"
                   << std::setprecision(5) << force.y << "\t"
                   << std::setprecision(5) << dt << std::endl;
      if (++force_count % force_flush == 0) flushForceMonitor();
    }
  }
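  // For illustration only (the values are made up), a flushed force.tsv might
  // look like
  //
  //   Force   L1          L2          dt
  //   gauge   1.2345e-02  6.7890e-03  0.01
  //
  // where the first column is the label passed in as fname, L1 and L2 are the
  // globally reduced maximum force norms, and dt is the step size supplied to
  // forceRecord().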
  template <typename Float_, int nColor_, QudaReconstructType recon_>
  struct BaseArg {
    using Float = Float_;
    static constexpr int nColor = nColor_;
    static constexpr QudaReconstructType recon = recon_;
    int threads; // number of active threads required
    BaseArg(const GaugeField &meta) :
      threads(meta.VolumeCB()) {}
  };
  template <typename Float, int nColor, QudaReconstructType recon>
  struct MomActionArg : ReduceArg<double>, BaseArg<Float, nColor, recon> {
    typedef typename gauge_mapper<Float, recon>::type Mom;
    const Mom mom;

    MomActionArg(const GaugeField &mom) :
      BaseArg<Float, nColor, recon>(mom),
      mom(mom) {}
  };
  // calculate the momentum contribution to the action.  This uses the
  // MILC convention where we subtract 4.0 from each matrix norm in
  // order to increase stability
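  // For reference: for a traceless anti-Hermitian matrix M (purely imaginary
  // diagonal, M_ji = -conj(M_ij)) the nine terms accumulated below are just
  //
  //   0.5 * Tr(M M^dag) = 0.5 * sum_i Im(M_ii)^2 + sum_{i<j} |M_ij|^2
  //
  // written out element by element, so each link contributes
  // 0.5 * Tr(M M^dag) - 4.0 to the reported action.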
  template <int blockSize, typename Arg>
  __global__ void computeMomAction(Arg arg){
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int parity = threadIdx.y;
    double action = 0.0;
    using matrix = Matrix<complex<typename Arg::Float>, Arg::nColor>;

    while (x < arg.threads) {
      // loop over direction
      for (int mu=0; mu<4; mu++) {
        const matrix mom = arg.mom(mu, x, parity);

        double local_sum = 0.0;
        local_sum  = 0.5 * mom(0,0).imag() * mom(0,0).imag();
        local_sum += 0.5 * mom(1,1).imag() * mom(1,1).imag();
        local_sum += 0.5 * mom(2,2).imag() * mom(2,2).imag();
        local_sum += mom(0,1).real() * mom(0,1).real();
        local_sum += mom(0,1).imag() * mom(0,1).imag();
        local_sum += mom(0,2).real() * mom(0,2).real();
        local_sum += mom(0,2).imag() * mom(0,2).imag();
        local_sum += mom(1,2).real() * mom(1,2).real();
        local_sum += mom(1,2).imag() * mom(1,2).imag();
        local_sum -= 4.0;

        action += local_sum;
      }

      x += blockDim.x*gridDim.x;
    }

    // perform final inter-block reduction and write out result
    arg.template reduce2d<blockSize,2>(action);
  }
  template <typename Float, int nColor, QudaReconstructType recon>
  class MomAction : TunableLocalParityReduction {
    MomActionArg<Float, nColor, recon> arg;
    const GaugeField &meta;

  public:
    MomAction(const GaugeField &mom, double &action) :
      arg(mom),
      meta(mom)
    {
      apply(0);
      arg.complete(action);
      comm_allreduce(&action);
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      LAUNCH_KERNEL_LOCAL_PARITY(computeMomAction, (*this), tp, stream, arg, decltype(arg));
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
    long long flops() const { return 4*2*arg.threads*23; }
    long long bytes() const { return 4*2*arg.threads*arg.mom.Bytes(); }
  };
  double computeMomAction(const GaugeField& mom)
  {
    if (!mom.isNative()) errorQuda("Unsupported output ordering: %d\n", mom.Order());
    double action = 0.0;
#ifdef GPU_GAUGE_TOOLS
    instantiate<MomAction, Reconstruct10>(mom, action);
#else
    errorQuda("%s not built", __func__);
#endif
    return action;
  }
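  // Illustrative usage (hypothetical field name): the return value is the
  // momentum action already summed over the lattice and over all ranks, e.g.
  //
  //   double mom_action = computeMomAction(mom);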
  template<typename Float_, int nColor, QudaReconstructType recon>
  struct UpdateMomArg : ReduceArg<double2>, BaseArg<Float_, nColor, recon>
  {
    using Float = Float_;
    typename gauge_mapper<Float, QUDA_RECONSTRUCT_10>::type mom;
    typename gauge_mapper<Float, recon>::type force;
    Float coeff;
    int X[4]; // grid dimensions on mom
    int E[4]; // grid dimensions on force (possibly extended)
    int border[4]; // border of the (possibly extended) force field
    UpdateMomArg(GaugeField &mom, const Float &coeff, GaugeField &force) :
      BaseArg<Float, nColor, recon>(mom),
      mom(mom), coeff(coeff), force(force)
    {
      for (int dir=0; dir<4; ++dir) {
        X[dir] = mom.X()[dir];
        E[dir] = force.X()[dir];
        border[dir] = force.R()[dir];
      }
    }
  };
  /**
     @brief Functor for finding the maximum over a double2 field.
     Each lane of the double2 is evaluated separately.  This functor
     is passed to the reduce helper.
   */
  struct max_reducer2 {
    __device__ __host__ inline double2 operator()(const double2 &a, const double2 &b) {
      return make_double2(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y);
    }
  };
  template <int blockSize, typename Arg>
  __global__ void UpdateMomKernel(Arg arg) {
    int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
    int parity = threadIdx.y;
    double2 norm2 = make_double2(0.0,0.0);
    max_reducer2 r;

    while (x_cb < arg.threads) {
      int x[4];
      getCoords(x, x_cb, arg.X, parity);
      for (int d=0; d<4; d++) x[d] += arg.border[d];
      int e_cb = linkIndex(x, arg.E);

      for (int d=0; d<4; d++) {
        Matrix<complex<typename Arg::Float>,3> m = arg.mom(d, x_cb, parity);
        Matrix<complex<typename Arg::Float>,3> f = arg.force(d, e_cb, parity);

        // project to traceless anti-hermitian prior to taking norm
        makeAntiHerm(f);

        // compute force norms
        norm2 = r(make_double2(f.L1(), f.L2()), norm2);

        m = m + arg.coeff * f;

        // strictly speaking this shouldn't be needed since the
        // momentum should already be traceless anti-hermitian but at
        // present the unit test will fail without this
        makeAntiHerm(m);
        arg.mom(d, x_cb, parity) = m;
      }

      x_cb += gridDim.x*blockDim.x;
    }

    // perform final inter-block reduction and write out result
    arg.template reduce2d<blockSize,2,false,max_reducer2>(norm2);
  }
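  // Note that the reduction above is instantiated with max_reducer2, so the
  // value returned through arg.complete() is the maximum (L1, L2) force norm
  // over all links rather than a sum; this is the quantity that forceRecord()
  // reports when the force monitor is enabled.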
  template <typename Float, int nColor, QudaReconstructType recon>
  class UpdateMom : TunableLocalParityReduction {
    UpdateMomArg<Float, nColor, recon> arg;
    const GaugeField &meta;

  public:
    UpdateMom(GaugeField &force, GaugeField &mom, double coeff, const char *fname) :
      arg(mom, coeff, force),
      meta(force)
    {
      double2 force_max;
      apply(0);
      if (forceMonitor()) {
        arg.complete(force_max);
        forceRecord(force_max, arg.coeff, fname);
      }
    }

    void apply(const qudaStream_t &stream)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        LAUNCH_KERNEL_LOCAL_PARITY(UpdateMomKernel, (*this), tp, stream, arg, decltype(arg));
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
    void preTune() { arg.mom.save(); }
    void postTune() { arg.mom.load(); }
    long long flops() const { return 4*2*arg.threads*(36+42); }
    long long bytes() const { return 4*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
  };
  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
  {
#ifdef GPU_GAUGE_TOOLS
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
      errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());

    checkPrecision(mom, force);
    instantiate<UpdateMom, ReconstructMom>(force, mom, coeff, fname);
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS
  }
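  // Sketch of how this routine is typically driven from a molecular-dynamics
  // integrator step (illustrative only; the force computation is a placeholder):
  //
  //   computeSomeForce(force, gauge, ...);            // hypothetical force kernel
  //   updateMomentum(mom, -dt, force, "some_force");  // mom <- mom + coeff * force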
  template <typename Float, int nColor, QudaReconstructType recon>
  struct ApplyUArg : BaseArg<Float, nColor, recon>
  {
    typedef typename gauge_mapper<Float,recon>::type G;
    typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type F;
    F force;
    const G U;
    int X[4]; // grid dimensions
    ApplyUArg(GaugeField &force, const GaugeField &U) :
      BaseArg<Float, nColor, recon>(U),
      force(force), U(U)
    {
      for (int dir=0; dir<4; ++dir) X[dir] = U.X()[dir];
    }
  };
  template <typename Arg>
  __global__ void ApplyUKernel(Arg arg)
  {
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    if (x >= arg.threads) return;
    int parity = threadIdx.y + blockIdx.y * blockDim.y;
    using mat = Matrix<complex<typename Arg::Float>,Arg::nColor>;

    for (int d=0; d<4; d++) {
      mat f = arg.force(d, x, parity);
      mat u = arg.U(d, x, parity);

      // left-multiply the force link by the corresponding gauge link
      f = u * f;

      arg.force(d, x, parity) = f;
    }
  }
  template <typename Float, int nColor, QudaReconstructType recon>
  class ApplyU : TunableVectorY {
    ApplyUArg<Float, nColor, recon> arg;
    const GaugeField &meta;

    bool tuneGridDim() const { return false; }
    unsigned int minThreads() const { return arg.threads; }

  public:
    ApplyU(const GaugeField &U, GaugeField &force) :
      TunableVectorY(2),
      arg(force, U),
      meta(U)
    {
      apply(0);
    }

    void apply(const qudaStream_t &stream)
    {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      qudaLaunchKernel(ApplyUKernel<decltype(arg)>, tp, stream, arg);
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), meta.AuxString()); }
    void preTune() { arg.force.save(); }
    void postTune() { arg.force.load(); }
    long long flops() const { return 4*2*arg.threads*198; }
    long long bytes() const { return 4*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
  };
  void applyU(GaugeField &force, GaugeField &U)
  {
#ifdef GPU_GAUGE_TOOLS
    if (!force.isNative()) errorQuda("Unsupported output ordering: %d\n", force.Order());
    checkPrecision(force, U);
    instantiate<ApplyU, ReconstructNo12>(U, force);
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS
  }

} // namespace quda