momentum.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field_order.h>
#include <launch_kernel.cuh>
#include <cub_helper.cuh>
#include <fstream>

namespace quda {

  using namespace gauge;

  bool forceMonitor() {
    static bool init = false;
    static bool monitor = false;
    if (!init) {
      char *path = getenv("QUDA_RESOURCE_PATH");
      char *enable_force_monitor = getenv("QUDA_ENABLE_FORCE_MONITOR");
      if (path && enable_force_monitor && strcmp(enable_force_monitor, "1") == 0) monitor = true;
      init = true;
    }
    return monitor;
  }
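
  // Illustrative shell usage (not part of this file): force monitoring is
  // enabled only when both environment variables are set, e.g.
  //   export QUDA_RESOURCE_PATH=/path/to/resource/dir
  //   export QUDA_ENABLE_FORCE_MONITOR=1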

  static std::stringstream force_stream;
  static long long force_count = 0;
  static long long force_flush = 1000; // how many force samples we accumulate before flushing

  void flushForceMonitor() {
    if (!forceMonitor() || comm_rank() != 0) return;

    static std::string path = std::string(getenv("QUDA_RESOURCE_PATH"));
    static char *profile_fname = getenv("QUDA_PROFILE_OUTPUT_BASE");

    std::ofstream force_file;
    static long long count = 0;
    if (count == 0) {
      path += (profile_fname ? std::string("/") + profile_fname + "_force.tsv" : std::string("/force.tsv"));
      force_file.open(path.c_str());
      force_file << "Force\tL1\tL2\tdt" << std::endl;
    } else {
      force_file.open(path.c_str(), std::ios_base::app);
    }
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Flushing force monitor data to %s\n", path.c_str());
    force_file << force_stream.str();

    force_file.flush();
    force_file.close();

    // empty the stream buffer
    force_stream.clear();
    force_stream.str(std::string());

    count++;
  }

  void forceRecord(double2 &force, double dt, const char *fname) {
    qudaDeviceSynchronize();
    comm_allreduce_max_array((double*)&force, 2);

    if (comm_rank()==0) {
      force_stream << fname << "\t" << std::setprecision(5) << force.x << "\t"
                   << std::setprecision(5) << force.y << "\t"
                   << std::setprecision(5) << dt << std::endl;
      if (++force_count % force_flush == 0) flushForceMonitor();
    }
  }
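
  // Each record appended above is one tab-separated row matching the
  // "Force\tL1\tL2\tdt" header written by flushForceMonitor(): the calling
  // function's name, the globally reduced maximum L1 and L2 force norms,
  // and the integrator step size dt.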

#ifdef GPU_GAUGE_TOOLS

  template <typename Mom>
  struct MomActionArg : public ReduceArg<double> {
    int threads; // number of active threads required
    Mom mom;
    int X[4]; // grid dimensions

    MomActionArg(const Mom &mom, const GaugeField &meta)
      : ReduceArg<double>(), mom(mom) {
      threads = meta.VolumeCB();
      for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
    }
  };

  template<int blockSize, typename Float, typename Mom>
  __global__ void computeMomAction(MomActionArg<Mom> arg){
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int parity = threadIdx.y;
    double action = 0.0;

    while (x < arg.threads) {
      // loop over direction
      for (int mu=0; mu<4; mu++) {
        // FIXME: should understand what this does exactly and clean up (matches MILC)
        complex<Float> v_[5];
        arg.mom.load(v_, x, mu, parity);
        Float v[10];
        for (int i=0; i<5; i++) {
          v[2*i+0] = v_[i].real();
          v[2*i+1] = v_[i].imag();
        }

        double local_sum = 0.0;
        for (int j=0; j<6; j++) local_sum += v[j]*v[j];
        for (int j=6; j<9; j++) local_sum += 0.5*v[j]*v[j];
        local_sum -= 4.0;
        action += local_sum;
      }

      x += blockDim.x*gridDim.x;
    }

    // perform final inter-block reduction and write out result
    reduce2d<blockSize,2>(arg, action);
  }
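
  // Reading of the inner loop above (assumption: this describes the usual
  // RECONSTRUCT_10 anti-hermitian packing; the kernel itself just matches
  // MILC): v[0..5] hold the three complex off-diagonal elements, counted
  // with full weight; v[6..8] hold the three imaginary diagonal elements,
  // counted with half weight; v[9] never enters the sum; and the 4.0
  // subtraction is the MILC normalization of the per-link contribution.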

  template<typename Float, typename Mom>
  class MomAction : TunableLocalParity {
    MomActionArg<Mom> &arg;
    const GaugeField &meta;

  private:
    bool tuneGridDim() const { return true; }

  public:
    MomAction(MomActionArg<Mom> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
    virtual ~MomAction () { }

    void apply(const cudaStream_t &stream){
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION){
        arg.result_h[0] = 0.0;
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        LAUNCH_KERNEL_LOCAL_PARITY(computeMomAction, tp, stream, arg, Float, Mom);
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }

    long long flops() const { return 4*2*arg.threads*23; }
    long long bytes() const { return 4*2*arg.threads*arg.mom.Bytes(); }
  };

  template<typename Float, typename Mom>
  void momAction(const Mom mom, const GaugeField& meta, double &action) {
    MomActionArg<Mom> arg(mom, meta);
    MomAction<Float,Mom> momAction(arg, meta);

    momAction.apply(0);
    qudaDeviceSynchronize();

    comm_allreduce((double*)arg.result_h);
    action = arg.result_h[0];
  }

  template<typename Float>
  double momAction(const GaugeField& mom) {
    double action = 0.0;

    if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
      if (mom.Reconstruct() == QUDA_RECONSTRUCT_10) {
        momAction<Float>(FloatNOrder<Float,10,2,10>(mom), mom, action);
      } else {
        errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
      }
    } else {
      errorQuda("Gauge Field order %d not supported", mom.Order());
    }

    return action;
  }
#endif

  double computeMomAction(const GaugeField& mom) {
    double action = 0.0;
#ifdef GPU_GAUGE_TOOLS
    if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
      action = momAction<double>(mom);
    } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
      action = momAction<float>(mom);
    } else {
      errorQuda("Precision %d not supported", mom.Precision());
    }
#else
    errorQuda("%s not built", __func__);
#endif
    return action;
  }
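
  // Minimal usage sketch (assumption: `mom` is an allocated momentum field
  // in FLOAT2 order with RECONSTRUCT_10, single or double precision):
  //   double action = computeMomAction(mom); // global 1/2 mom^2 action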


#ifdef GPU_GAUGE_TOOLS
  template<typename Float, QudaReconstructType reconstruct_>
  struct UpdateMomArg : public ReduceArg<double2> {
    int threads;
    static constexpr int force_recon = (reconstruct_ == QUDA_RECONSTRUCT_10 ? 11 : 18);
    FloatNOrder<Float,18,2,11> mom;               // accessor for the momentum field
    FloatNOrder<Float,18,2,force_recon> force;    // accessor for the force field
    Float coeff;
    int X[4]; // grid dimensions on mom
    int E[4]; // grid dimensions on force (possibly extended)
    int border[4]; // radius of the extended border on the force field
    UpdateMomArg(GaugeField &mom, const Float &coeff, GaugeField &force)
      : threads(mom.VolumeCB()), mom(mom), coeff(coeff), force(force) {
      for (int dir=0; dir<4; ++dir) {
        X[dir] = mom.X()[dir];
        E[dir] = force.X()[dir];
        border[dir] = force.R()[dir];
      }
    }
  };

  /**
     @brief Functor for finding the component-wise maximum over double2
     fields; each lane of the double2 is reduced separately.
  */
  struct max_reducer2 {
    __device__ __host__ inline double2 operator()(const double2 &a, const double2 &b) {
      return make_double2(a.x > b.x ? a.x : b.x, a.y > b.y ? a.y : b.y);
    }
  };

  template <int blockSize, typename Float, typename Arg>
  __global__ void UpdateMomKernel(Arg arg) {
    int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
    int parity = threadIdx.y;
    double2 norm2 = make_double2(0.0,0.0);
    max_reducer2 r;

    while (x_cb<arg.threads) {
      int x[4];
      getCoords(x, x_cb, arg.X, parity);
      for (int d=0; d<4; d++) x[d] += arg.border[d];
      int e_cb = linkIndex(x,arg.E);

#pragma unroll
      for (int d=0; d<4; d++) {
        Matrix<complex<Float>,3> m = arg.mom(d, x_cb, parity);
        Matrix<complex<Float>,3> f = arg.force(d, e_cb, parity);

        // project to traceless anti-hermitian prior to taking norm
        makeAntiHerm(f);

        // compute force norms
        norm2 = r(make_double2(f.L1(), f.L2()), norm2);

        m = m + arg.coeff * f;

        // strictly speaking this shouldn't be needed since the
        // momentum should already be traceless anti-hermitian but at
        // present the unit test will fail without this
        makeAntiHerm(m);
        arg.mom(d, x_cb, parity) = m;
      }

      x_cb += gridDim.x*blockDim.x;
    }

    // perform final inter-block reduction and write out result
    reduce2d<blockSize,2,double2,false,max_reducer2>(arg, norm2, 0);
  } // UpdateMom
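
  // Note on the reduction above: norm2.x accumulates the maximum L1 force
  // norm and norm2.y the maximum L2 force norm seen by each thread;
  // reduce2d combines them across the grid using max_reducer2, so the
  // host-side result holds per-rank maxima that forceRecord() later
  // reduces over ranks.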


  template<typename Float, typename Arg>
  class UpdateMom : TunableLocalParity {
    Arg &arg;
    const GaugeField &meta;

  private:
    bool tuneGridDim() const { return true; }

  public:
    UpdateMom(Arg &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
    virtual ~UpdateMom () { }

    void apply(const cudaStream_t &stream){
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        LAUNCH_KERNEL_LOCAL_PARITY(UpdateMomKernel, tp, stream, arg, Float);
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }

    void preTune() { arg.mom.save(); }
    void postTune() { arg.mom.load(); }
    long long flops() const { return 4*2*arg.threads*(36+42); }
    long long bytes() const { return 4*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
  };

  template<typename Float, QudaReconstructType reconstruct>
  void updateMomentum(GaugeField &mom, Float coeff, GaugeField &force, const char *fname) {
    UpdateMomArg<Float,reconstruct> arg(mom, coeff, force);
    UpdateMom<Float,decltype(arg)> update(arg, force);
    update.apply(0);

    if (forceMonitor()) forceRecord(*((double2*)arg.result_h), arg.coeff, fname);
  }

  template <typename Float>
  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) {
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
      errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());
    if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Force field with order %d not supported", force.Order());

    if (force.Reconstruct() == QUDA_RECONSTRUCT_10) {
      updateMomentum<Float,QUDA_RECONSTRUCT_10>(mom, coeff, force, fname);
    } else if (force.Reconstruct() == QUDA_RECONSTRUCT_NO) {
      updateMomentum<Float,QUDA_RECONSTRUCT_NO>(mom, coeff, force, fname);
    } else {
      errorQuda("Unsupported force reconstruction: %d", force.Reconstruct());
    }
  }
#endif // GPU_GAUGE_TOOLS

  void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) {
#ifdef GPU_GAUGE_TOOLS
    if (mom.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d\n", mom.Order());

    if (mom.Precision() != force.Precision())
      errorQuda("Mixed precision not supported: %d %d\n", mom.Precision(), force.Precision());

    if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
      updateMomentum<double>(mom, coeff, force, fname);
    } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
      updateMomentum<float>(mom, coeff, force, fname);
    } else {
      errorQuda("Unsupported precision: %d", mom.Precision());
    }

    checkCudaError();
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS

    return;
  }
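
  // Minimal usage sketch (assumption: `mom` and `force` have matching
  // precision, with mom in RECONSTRUCT_10 and FLOAT2 order; the label
  // string is arbitrary and only appears in the force monitor output):
  //   updateMomentum(mom, dt, force, "illustrative_label");
  // This performs mom += coeff * F, where F is the traceless
  // anti-hermitian projection of the force.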


#ifdef GPU_GAUGE_TOOLS

  template<typename Float, typename Force, typename Gauge>
  struct ApplyUArg {
    int threads;
    Force force;
    Gauge U;
    int X[4]; // grid dimensions
    ApplyUArg(Force &force, Gauge &U, GaugeField &meta)
      : threads(meta.VolumeCB()), force(force), U(U) {
      for (int dir=0; dir<4; ++dir) X[dir] = meta.X()[dir];
    }
  };

  template<typename Float, typename Force, typename Gauge>
  __global__ void ApplyUKernel(ApplyUArg<Float,Force,Gauge> arg) {
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int parity = threadIdx.y;
    Matrix<complex<Float>,3> f, u;

    while (x<arg.threads) {
      for (int d=0; d<4; d++) {
        f = arg.force(d, x, parity);
        u = arg.U(d, x, parity);

        f = u * f;

        arg.force(d, x, parity) = f;
      }

      x += gridDim.x*blockDim.x;
    }

    return;
  } // ApplyU


  template<typename Float, typename Force, typename Gauge>
  class ApplyU : TunableLocalParity {
    ApplyUArg<Float, Force, Gauge> &arg;
    const GaugeField &meta;

  private:
    unsigned int minThreads() const { return arg.threads; }

  public:
    ApplyU(ApplyUArg<Float,Force,Gauge> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
    virtual ~ApplyU () { }

    void apply(const cudaStream_t &stream){
      if(meta.Location() == QUDA_CUDA_FIELD_LOCATION){
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        ApplyUKernel<Float,Force,Gauge><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
      } else {
        errorQuda("CPU not supported yet\n");
      }
    }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << "threads=" << arg.threads << ",prec=" << sizeof(Float);
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }

    void preTune() { arg.force.save(); }
    void postTune() { arg.force.load(); }
    long long flops() const { return 4*2*arg.threads*198; }
    long long bytes() const { return 4*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
  };

  template<typename Float, typename Force, typename Gauge>
  void applyU(Force force, Gauge U, GaugeField &meta) {
    ApplyUArg<Float,Force,Gauge> arg(force, U, meta);
    ApplyU<Float,Force,Gauge> applyU(arg, meta);
    applyU.apply(0);
    qudaDeviceSynchronize();
  }

  template <typename Float>
  void applyU(GaugeField &force, GaugeField &U) {
    if (force.Reconstruct() != QUDA_RECONSTRUCT_NO)
      errorQuda("Force field with reconstruct %d not supported", force.Reconstruct());

    if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
      applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 18>(U), force);
    } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
      applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 12>(U), force);
    } else {
      errorQuda("Unsupported gauge reconstruction: %d", U.Reconstruct());
    }
  }
#endif // GPU_GAUGE_TOOLS

  void applyU(GaugeField &force, GaugeField &U) {
#ifdef GPU_GAUGE_TOOLS
    if (force.Order() != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Unsupported output ordering: %d\n", force.Order());

    if (force.Precision() != U.Precision())
      errorQuda("Mixed precision not supported: %d %d\n", force.Precision(), U.Precision());

    if (force.Precision() == QUDA_DOUBLE_PRECISION) {
      applyU<double>(force, U);
    } else {
      errorQuda("Unsupported precision: %d", force.Precision());
    }

    checkCudaError();
#else
    errorQuda("%s not built", __func__);
#endif // GPU_GAUGE_TOOLS

    return;
  }
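
  // Minimal usage sketch (assumption: `force` is unreconstructed
  // (RECONSTRUCT_NO) in FLOAT2 order, double precision; `gauge` is a
  // hypothetical gauge field name): replaces each link of the force with
  // U_mu(x) * F_mu(x):
  //   applyU(force, gauge);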

} // namespace quda