// (file header and includes elided in this excerpt)

using namespace gauge;

/** @brief Whether we are monitoring the force or not */
bool forceMonitor() {
  static bool init = false;
  static bool monitor = false;
  if (!init) {
    char *path = getenv("QUDA_RESOURCE_PATH");
    char *enable_force_monitor = getenv("QUDA_ENABLE_FORCE_MONITOR");
    // monitoring requires both an output location and the explicit enable flag
    if (path && enable_force_monitor && strcmp(enable_force_monitor, "1") == 0) monitor = true;
    init = true;
  }
  return monitor;
}
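// Usage sketch (illustrative, not part of the original file): monitoring is
// controlled entirely through the environment, e.g. from the shell
//
//   export QUDA_RESOURCE_PATH=/tmp/quda
//   export QUDA_ENABLE_FORCE_MONITOR=1
//
// after which each monitored momentum update appends a record that is
// eventually written to $QUDA_RESOURCE_PATH/force.tsv (see flushForceMonitor()
// below).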
static std::stringstream force_stream;
static long long force_count = 0;
static long long force_flush = 1000; // how many force samples to accumulate before flushing

/** @brief Flush any outstanding force monitoring information */
void flushForceMonitor() {
  if (!forceMonitor() || comm_rank() != 0) return;

  static std::string path = std::string(getenv("QUDA_RESOURCE_PATH"));
  static char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");

  std::ofstream force_file;
  static long long count = 0;
  if (count == 0) {
    // first flush: construct the filename and write the column headers
    path += (profile_fname ? std::string("/") + profile_fname + "_force.tsv" : std::string("/force.tsv"));
    force_file.open(path.c_str());
    force_file << "Force\tL1\tL2\tdt" << std::endl;
  } else {
    force_file.open(path.c_str(), std::ios_base::app);
  }

  force_file << force_stream.str();
  force_file.flush();
  force_file.close();

  // empty the stream buffer ready for the next batch of records
  force_stream.clear();
  force_stream.str(std::string());

  count++;
}
void forceRecord(double2 &force, double dt, const char *fname) {
  qudaDeviceSynchronize();
  // find the global maxima of the L1 and L2 force norms
  comm_allreduce_max_array((double*)&force, 2);

  if (comm_rank() == 0) {
    force_stream << fname << "\t" << std::setprecision(5) << force.x << "\t"
                 << std::setprecision(5) << force.y << "\t"
                 << std::setprecision(5) << dt << std::endl;
    if (++force_count % force_flush == 0) flushForceMonitor();
  }
}
#ifdef GPU_GAUGE_TOOLS

template <typename Mom>
struct MomActionArg : public ReduceArg<double> {
  int threads; // number of active threads
  Mom mom;     // momentum-field accessor
  int X[4];    // grid dimensions

  MomActionArg(const Mom &mom, const GaugeField &meta)
    : ReduceArg<double>(), threads(meta.VolumeCB()), mom(mom) {
    for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
  }
};
template <int blockSize, typename Float, typename Mom>
__global__ void computeMomAction(MomActionArg<Mom> arg) {
  int x = threadIdx.x + blockIdx.x*blockDim.x;
  int parity = threadIdx.y;
  double action = 0.0;
  complex<Float> v_[5];
  Float v[10];

  while (x < arg.threads) {
    // loop over direction
    for (int mu=0; mu<4; mu++) {
      arg.mom.load(v_, x, mu, parity);

      // unpack the five complex numbers into ten reals
      for (int i=0; i<5; i++) {
        v[2*i+0] = v_[i].real();
        v[2*i+1] = v_[i].imag();
      }

      // the six off-diagonal reals enter with unit weight, the three
      // diagonal (imaginary) components with weight 1/2
      double local_sum = 0.0;
      for (int j=0; j<6; j++) local_sum += v[j]*v[j];
      for (int j=6; j<9; j++) local_sum += 0.5*v[j]*v[j];
      action += local_sum;
    }

    x += blockDim.x*gridDim.x;
  }

  // perform final inter-block reduction and write out result
  reduce2d<blockSize, 2>(arg, action);
}
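// Why these weights (reasoning added for this excerpt): for a traceless
// anti-Hermitian momentum matrix M, tr(M^dag M) = 2*sum_{i<j}|M_ij|^2 +
// sum_i |M_ii|^2. The ten packed reals hold the three upper-triangle complex
// entries (v[0..5]) and the imaginary diagonal entries (v[6..8]), so
//
//   local_sum = sum_{i<j}|M_ij|^2 + (1/2)*sum_i|M_ii|^2 = (1/2) tr(M^dag M),
//
// which is exactly the per-link contribution to the 1/2 mom^2 action
// returned by computeMomAction() below.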
template <typename Float, typename Mom>
class MomAction : TunableLocalParity {
  MomActionArg<Mom> &arg;
  const GaugeField &meta;

private:
  bool tuneGridDim() const { return true; }

public:
  MomAction(MomActionArg<Mom> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
  virtual ~MomAction () { }

  void apply(const cudaStream_t &stream) {
    if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
      arg.result_h[0] = 0.0;
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      LAUNCH_KERNEL_LOCAL_PARITY(computeMomAction, tp, stream, arg, Float, Mom);
    } else {
      errorQuda("CPU not supported yet");
    }
  }

  TuneKey tuneKey() const {
    std::stringstream aux;
    aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
    return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
  }

  long long flops() const { return 4*2*arg.threads*23; }
  long long bytes() const { return 4*2*arg.threads*arg.mom.Bytes(); }
};
template <typename Float, typename Mom>
void momAction(const Mom mom, const GaugeField& meta, double &action) {
  MomActionArg<Mom> arg(mom, meta);
  MomAction<Float,Mom> momAction(arg, meta);
  momAction.apply(0);
  qudaDeviceSynchronize();

  comm_allreduce((double*)arg.result_h);
  action = arg.result_h[0];
}
template <typename Float>
double momAction(const GaugeField& mom) {
  double action = 0.0;

  // instantiate with the native accessor for 10-real momentum storage
  // (this dispatch body is a reconstruction of a gap in the listing)
  if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER && mom.Reconstruct() == QUDA_RECONSTRUCT_10) {
    momAction<Float>(FloatNOrder<Float,10,2,10>(mom), mom, action);
  } else {
    errorQuda("Momentum field order %d with reconstruct %d not supported", mom.Order(), mom.Reconstruct());
  }

  return action;
}

#endif // GPU_GAUGE_TOOLS

/** @brief Compute and return the global momentum action 1/2 mom^2 */
double computeMomAction(const GaugeField& mom) {
  double action = 0.0;
#ifdef GPU_GAUGE_TOOLS
  if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
    action = momAction<double>(mom);
  } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
    action = momAction<float>(mom);
  } else {
    errorQuda("Precision %d not supported", mom.Precision());
  }
#else
  errorQuda("%s not built with GPU_GAUGE_TOOLS", __func__);
#endif // GPU_GAUGE_TOOLS
  return action;
}
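// Illustrative host-side usage (sketch; 'mom' is assumed to be a valid
// momentum GaugeField):
//
//   double H_mom = computeMomAction(mom);
//   printfQuda("momentum action = %e\n", H_mom);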
#ifdef GPU_GAUGE_TOOLS

template <typename Float, QudaReconstructType reconstruct_>
struct UpdateMomArg : public ReduceArg<double2> {
  int threads;
  // native accessors for the momentum and force fields (the gauge_mapper
  // typedefs are an assumption of this reconstruction)
  typename gauge_mapper<Float,QUDA_RECONSTRUCT_10>::type mom;
  typename gauge_mapper<Float,reconstruct_>::type force;
  Float coeff;
  int X[4];      // grid dimensions on mom
  int E[4];      // grid dimensions on force (possibly extended)
  int border[4]; // border of the extended force region

  UpdateMomArg(GaugeField &mom, const Float &coeff, GaugeField &force)
    : threads(mom.VolumeCB()), mom(mom), coeff(coeff), force(force) {
    for (int dir=0; dir<4; ++dir) {
      X[dir] = mom.X()[dir];
      E[dir] = force.X()[dir];
      border[dir] = force.R()[dir];
    }
  }
};
/** @brief Functor returning the component-wise maximum of two double2 values */
struct max_reducer2 {
  __device__ __host__ inline double2 operator()(const double2 &a, const double2 &b) {
    return make_double2(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y);
  }
};
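// Note (added): reduce2d() performs a sum reduction by default; passing
// max_reducer2 as the reducer below turns the inter-block reduction into a
// component-wise maximum instead, so the value written out is the largest
// per-link (L1, L2) force norm on the lattice rather than a sum.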
template <int blockSize, typename Float, typename Arg>
__global__ void UpdateMomKernel(Arg arg) {
  int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
  int parity = threadIdx.y;
  double2 norm2 = make_double2(0.0, 0.0);
  max_reducer2 r;

  while (x_cb < arg.threads) {
    // map from the interior momentum site to the possibly extended force site
    int x[4];
    getCoords(x, x_cb, arg.X, parity);
    for (int d=0; d<4; d++) x[d] += arg.border[d];
    int e_cb = linkIndex(x, arg.E);

#pragma unroll
    for (int d=0; d<4; d++) {
      Matrix<complex<Float>,3> m = arg.mom(d, x_cb, parity);
      Matrix<complex<Float>,3> f = arg.force(d, e_cb, parity);

      // project the force to traceless anti-Hermitian prior to taking norms
      makeAntiHerm(f);

      // accumulate the running maxima of the force L1 and L2 norms
      norm2 = r(make_double2(f.L1(), f.L2()), norm2);

      m = m + arg.coeff * f;
      makeAntiHerm(m); // keep the momentum exactly traceless anti-Hermitian
      arg.mom(d, x_cb, parity) = m;
    }

    x_cb += gridDim.x*blockDim.x;
  }

  // perform final inter-block max reduction and write out result
  reduce2d<blockSize, 2, double2, false, max_reducer2>(arg, norm2, 0);
}
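// The update itself is the usual molecular-dynamics momentum step
//
//   p_mu(x) -> p_mu(x) + coeff * F_mu(x),
//
// with coeff absorbing the step size (and sign), and both p and the applied
// force kept in the traceless anti-Hermitian (Lie-algebra) representation.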
template <typename Float, typename Arg>
class UpdateMom : TunableLocalParity {
  Arg &arg;
  const GaugeField &meta;

private:
  bool tuneGridDim() const { return true; }

public:
  UpdateMom(Arg &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
  virtual ~UpdateMom () { }

  void apply(const cudaStream_t &stream) {
    if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      LAUNCH_KERNEL_LOCAL_PARITY(UpdateMomKernel, tp, stream, arg, Float);
    } else {
      errorQuda("CPU not supported yet");
    }
  }

  TuneKey tuneKey() const {
    std::stringstream aux;
    aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
    return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
  }

  // the autotuner may launch the kernel, so back up and restore the momentum
  void preTune() { arg.mom.save(); }
  void postTune() { arg.mom.load(); }
  long long flops() const { return 4*2*arg.threads*(36+42); }
  long long bytes() const { return 4*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
};
template <typename Float, QudaReconstructType reconstruct>
void updateMomentum(GaugeField &mom, Float coeff, GaugeField &force, const char *fname) {
  UpdateMomArg<Float,reconstruct> arg(mom, coeff, force);
  UpdateMom<Float,decltype(arg)> update(arg, force);
  update.apply(0);

  if (forceMonitor()) forceRecord(*((double2*)arg.result_h), arg.coeff, fname);
}
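// Illustrative integrator step (sketch; computeGaugeForce and dt are
// hypothetical caller-side names, not defined in this file):
//
//   computeGaugeForce(force, gauge /*, ... */);   // accumulate dS/dU into 'force'
//   updateMomentum(mom, -dt, force, "gauge");     // p -> p - dt * force
//   flushForceMonitor();                          // optional: write pending records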
template <typename Float>
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) {
  if (mom.Order() != QUDA_FLOAT2_GAUGE_ORDER)
    errorQuda("Momentum field with order %d not supported", mom.Order());
  if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
    errorQuda("Force field with order %d not supported", force.Order());

  if (force.Reconstruct() == QUDA_RECONSTRUCT_10)
    updateMomentum<Float,QUDA_RECONSTRUCT_10>(mom, coeff, force, fname);
  else
    updateMomentum<Float,QUDA_RECONSTRUCT_NO>(mom, coeff, force, fname);
}

#endif // GPU_GAUGE_TOOLS

void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) {
#ifdef GPU_GAUGE_TOOLS
  if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
    updateMomentum<double>(mom, coeff, force, fname);
  } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
    updateMomentum<float>(mom, coeff, force, fname);
  } else {
    errorQuda("Unsupported precision: %d", mom.Precision());
  }
#else
  errorQuda("%s not built with GPU_GAUGE_TOOLS", __func__);
#endif // GPU_GAUGE_TOOLS
}
#ifdef GPU_GAUGE_TOOLS

template <typename Float, typename Force, typename Gauge>
struct ApplyUArg {
  int threads; // number of active threads
  Force force; // force-field accessor
  Gauge U;     // gauge-field accessor
  int X[4];    // grid dimensions

  ApplyUArg(Force &force, Gauge &U, GaugeField &meta)
    : threads(meta.VolumeCB()), force(force), U(U) {
    for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
  }
};
template <typename Float, typename Force, typename Gauge>
__global__ void ApplyUKernel(ApplyUArg<Float,Force,Gauge> arg) {
  int x = blockIdx.x*blockDim.x + threadIdx.x;
  int parity = threadIdx.y;
  Matrix<complex<Float>,3> f, u;

  while (x < arg.threads) {
    for (int d=0; d<4; d++) {
      f = arg.force(d, x, parity);
      u = arg.U(d, x, parity);

      f = u * f;

      arg.force(d, x, parity) = f;
    }

    x += gridDim.x*blockDim.x;
  }
}
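// Note (added): each link of the force is left-multiplied by the
// corresponding gauge link, F_mu(x) -> U_mu(x) F_mu(x). The flops() count
// below (198 per link) matches a single 3x3 complex matrix multiplication,
// which is all this kernel does per direction.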
template <typename Float, typename Force, typename Gauge>
class ApplyU : TunableLocalParity {
  ApplyUArg<Float, Force, Gauge> &arg;
  const GaugeField &meta;

private:
  unsigned int minThreads() const { return arg.threads; }

public:
  ApplyU(ApplyUArg<Float,Force,Gauge> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
  virtual ~ApplyU () { }

  void apply(const cudaStream_t &stream) {
    if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      ApplyUKernel<Float,Force,Gauge><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
    } else {
      errorQuda("CPU not supported yet");
    }
  }

  TuneKey tuneKey() const {
    std::stringstream aux;
    aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
    return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
  }

  void preTune() { arg.force.save(); }
  void postTune() { arg.force.load(); }
  long long flops() const { return 4*2*arg.threads*198; }
  long long bytes() const { return 4*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
};
template <typename Float, typename Force, typename Gauge>
void applyU(Force force, Gauge U, GaugeField &meta) {
  ApplyUArg<Float,Force,Gauge> arg(force, U, meta);
  ApplyU<Float,Force,Gauge> applyU(arg, meta);
  applyU.apply(0);
  qudaDeviceSynchronize();
}
template <typename Float>
void applyU(GaugeField &force, GaugeField &U) {
  // dispatch on the field order/reconstruct to the concrete Force and Gauge
  // accessor types and call applyU<Float,Force,Gauge>() above
  // (instantiation details elided in this excerpt)
}

#endif // GPU_GAUGE_TOOLS

void applyU(GaugeField &force, GaugeField &U) {
#ifdef GPU_GAUGE_TOOLS
  if (force.Precision() == QUDA_DOUBLE_PRECISION) {
    applyU<double>(force, U);
  } else {
    errorQuda("Unsupported precision: %d", force.Precision());
  }
#else
  errorQuda("%s not built with GPU_GAUGE_TOOLS", __func__);
#endif // GPU_GAUGE_TOOLS
}