v0.9.0/doc/momentum_8cu_source.html

 #include <quda_internal.h>
 #include <quda_matrix.h>
 #include <tune_quda.h>
 #include <gauge_field_order.h>
 #include <launch_kernel.cuh>
 #include <cub_helper.cuh>

 namespace quda {

 using namespace gauge;

 #ifdef GPU_GAUGE_TOOLS

   /**
      @brief Argument bundle passed by value to the computeMomAction
      kernel.  Deriving from ReduceArg<double> is what lets the kernel
      hand this struct to reduce2d and lets the host read result_h.
   */
   template <typename Mom>
   struct MomActionArg : public ReduceArg<double> {
     int threads; // number of active threads required (set from meta.VolumeCB())
     Mom mom;     // accessor the kernel uses to load the momentum field
     int X[4];    // grid dimensions

     /**
        @param mom  Accessor for the momentum field
        @param meta Field that supplies the checkerboard volume and the
        four lattice dimensions
     */
     MomActionArg(const Mom &mom, const GaugeField &meta)
       : ReduceArg<double>(), threads(meta.VolumeCB()), mom(mom) {
       const int *dims = meta.X();
       for (int d=0; d<4; d++) X[d] = dims[d];
     }
   };

   /**
      @brief Kernel that accumulates the per-site momentum contribution
      and reduces it over the whole launch.

      threadIdx.x/blockIdx.x select the site index and threadIdx.y is
      used as the parity.  For each of the four directions the ten
      loaded reals contribute: the squares of entries 0-5, half the
      squares of entries 6-8, minus a constant 4.  Entry 9 is loaded but
      not used here.
      NOTE(review): the physical meaning of the 10-real layout is defined
      by the accessor, not by this file -- confirm against gauge_field_order.h.

      @param arg Kernel arguments (accessor, thread count, reduction buffers)
   */
   template<int blockSize, typename Float, typename Mom>
   __global__ void computeMomAction(MomActionArg<Mom> arg){
     const int parity = threadIdx.y;
     const int site = threadIdx.x + blockIdx.x*blockDim.x;
     double action = 0.0;

     // out-of-range threads skip the loads but must still reach the
     // reduction below with a zero contribution
     if (site < arg.threads) {
       for (int mu=0; mu<4; mu++) {
         Float p[10];
         arg.mom.load(p, site, mu, parity);

         double link_sum = 0.0;
         int j = 0;
         for ( ; j<6; j++) link_sum += p[j]*p[j];
         for ( ; j<9; j++) link_sum += 0.5*p[j]*p[j];
         link_sum -= 4.0;

         action += link_sum;
       }
     }

     // perform final inter-block reduction and write out result
     reduce2d<blockSize,2>(arg, action);
   }

   /**
      @brief Tunable wrapper that launches the computeMomAction kernel.

      Holds references (not copies) to the kernel argument struct and to
      the field used for location / volume-string queries, so both must
      outlive this object.
   */
   template<typename Float, typename Mom>
   class MomAction : TunableLocalParity {
     MomActionArg<Mom> &arg;
     const GaugeField &meta;

   private:
     unsigned int minThreads() const { return arg.threads; }

   public:
     MomAction(MomActionArg<Mom> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
     virtual ~MomAction () { }

     /**
        @brief Zero the host-side result, pick launch parameters via
        tuneLaunch and launch the kernel on the given stream.  Only
        device-resident fields are supported.
        @param stream Stream the kernel is launched on
     */
     void apply(const cudaStream_t &stream){
       if (meta.Location() == QUDA_CUDA_FIELD_LOCATION){
         arg.result_h[0] = 0.0;
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         LAUNCH_KERNEL_LOCAL_PARITY(computeMomAction, tp, stream, arg, Float, Mom);
       } else {
         errorQuda("CPU not supported yet\n");
       }
     }

     /** @brief Key identifying this kernel instance (volume, type, thread count, precision) to the autotuner */
     TuneKey tuneKey() const {
       std::stringstream aux;
       aux << "threads=" << arg.threads << ",prec="  << sizeof(Float);
       return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
     }

     // The leading 4ll forces 64-bit arithmetic: arg.threads is an int and
     // the product would otherwise be evaluated in int and could overflow
     // for large local volumes before being widened to the return type.
     long long flops() const { return 4ll*2*arg.threads*23; }
     long long bytes() const { return 4ll*2*arg.threads*arg.mom.Bytes(); }
   };

   /**
      @brief Run the momentum-action reduction for a given accessor type.

      Launches the kernel on stream 0, waits for the device, passes the
      host-side result through comm_allreduce and stores element 0 of
      the result buffer in @p action.

      @param mom    Accessor for the momentum field
      @param meta   Field providing volume / location metadata
      @param action Output: the reduced value
   */
   template<typename Float, typename Mom>
   void momAction(const Mom mom, const GaugeField& meta, double &action) {
     MomActionArg<Mom> reduce_arg(mom, meta);
     MomAction<Float,Mom> launcher(reduce_arg, meta);
     launcher.apply(0);

     // the kernel result must be complete before the host buffer is used
     qudaDeviceSynchronize();

     double *host_result = (double*)reduce_arg.result_h;
     comm_allreduce(host_result);

     action = reduce_arg.result_h[0];
   }

   /**
      @brief Precision-templated dispatcher: builds the accessor for the
      momentum field and forwards to the accessor-templated momAction.

      Only QUDA_FLOAT2_GAUGE_ORDER fields with QUDA_RECONSTRUCT_10 are
      accepted; anything else is reported through errorQuda.

      @param mom Momentum field
      @return The reduced action, or 0.0 if no computation was performed
   */
   template<typename Float>
   double momAction(const GaugeField& mom) {
     double action = 0.0;

     if (mom.Order() != QUDA_FLOAT2_GAUGE_ORDER) {
       errorQuda("Gauge Field order %d not supported", mom.Order());
     } else if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) {
       errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
     } else {
       typedef FloatNOrder<Float,10,2,10> MomAccessor;
       momAction<Float>(MomAccessor(mom), mom, action);
     }

     return action;
   }
 #endif

   /**
      @brief Host entry point: compute and return the momentum action
      for the given momentum field.

      Dispatches on the field precision (double or single).  When the
      library is compiled without GPU_GAUGE_TOOLS the call only reports
      an error.

      @param mom Momentum field
      @return The computed action (0.0 if nothing was computed)
   */
   double computeMomAction(const GaugeField& mom) {
     double action = 0.0;
 #ifdef GPU_GAUGE_TOOLS
     if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
       action = momAction<double>(mom);
     } else if(mom.Precision() == QUDA_SINGLE_PRECISION) {
       action = momAction<float>(mom);
     } else {
       errorQuda("Precision %d not supported", mom.Precision());
     }
 #else
     // message wording kept consistent with updateMomentum() and applyU()
     errorQuda("%s not built", __func__);
 #endif
     return action;
   }


 #ifdef GPU_GAUGE_TOOLS
   /**
      @brief Argument bundle passed by value to UpdateMomKernel.
   */
   template<typename Float, typename Mom, typename Force>
   struct UpdateMomArg {
     int threads;  // loop bound for the site index (meta.VolumeCB())
     Mom mom;      // accessor for the momentum field (read and written)
     Float coeff;  // scalar multiplying the force before it is added
     Force force;  // accessor for the force field (read only in the kernel)
     int X[4];     // grid dimensions

     /**
        @param mom   Momentum accessor
        @param coeff Coefficient applied to the force
        @param force Force accessor
        @param meta  Field supplying the checkerboard volume and dimensions
     */
     UpdateMomArg(Mom &mom, const Float &coeff, Force &force, GaugeField &meta)
       : threads(meta.VolumeCB()), mom(mom), coeff(coeff), force(force) {
       const int *dims = meta.X();
       for (int d=0; d<4; d++) X[d] = dims[d];
     }
   };

   /**
      @brief Kernel performing mom <- makeAntiHerm(mom + coeff * force)
      for every site, parity and direction.

      threadIdx.y is used as the parity; the site index starts at the
      global x thread index and advances by gridDim.x*blockDim.x until
      it reaches arg.threads.

      @param arg Kernel arguments (accessors, coefficient, thread count)
   */
   template<typename Float, typename Mom, typename Force>
   __global__ void UpdateMomKernel(UpdateMomArg<Float, Mom, Force> arg) {
     const int parity = threadIdx.y;
     Matrix<complex<Float>,3> m, f;

     for (int x = blockIdx.x*blockDim.x + threadIdx.x; x < arg.threads; x += gridDim.x*blockDim.x) {
       for (int dir=0; dir<4; dir++) {
         Float *mom_reals = reinterpret_cast<Float*>(m.data);
         Float *force_reals = reinterpret_cast<Float*>(f.data);

         arg.mom.load(mom_reals, x, dir, parity);
         arg.force.load(force_reals, x, dir, parity);

         m = m + arg.coeff * f;
         makeAntiHerm(m);

         arg.mom.save(mom_reals, x, dir, parity);
       }
     }
   } // UpdateMom


   /**
      @brief Tunable wrapper that launches UpdateMomKernel.

      Holds references to the kernel argument struct and to the field
      used for location / volume-string queries; both must outlive this
      object.  Because the kernel overwrites the momentum field, the
      accessor's save()/load() are called around tuning (preTune /
      postTune).
   */
   template<typename Float, typename Mom, typename Force>
   class UpdateMom : TunableLocalParity {
     UpdateMomArg<Float, Mom, Force> &arg;
     const GaugeField &meta;

   private:
     unsigned int minThreads() const { return arg.threads; }

   public:
     UpdateMom(UpdateMomArg<Float,Mom,Force> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
     virtual ~UpdateMom () { }

     /**
        @brief Pick launch parameters via tuneLaunch and launch the
        kernel on the given stream.  Only device-resident fields are
        supported.
        @param stream Stream the kernel is launched on
     */
     void apply(const cudaStream_t &stream){
       if(meta.Location() == QUDA_CUDA_FIELD_LOCATION){
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         UpdateMomKernel<Float,Mom,Force><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       } else {
         errorQuda("CPU not supported yet\n");
       }
     }

     /** @brief Key identifying this kernel instance (volume, type, thread count, precision) to the autotuner */
     TuneKey tuneKey() const {
       std::stringstream aux;
       aux << "threads=" << arg.threads << ",prec="  << sizeof(Float);
       return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
     }

     void preTune() { arg.mom.save();}
     void postTune() { arg.mom.load();}

     // The leading 4ll forces 64-bit arithmetic: with an int product,
     // 4*2*threads*78 exceeds INT_MAX once threads is above ~3.44 million
     // (e.g. a 48^3 x 64 local lattice), which is signed overflow.
     long long flops() const { return 4ll*2*arg.threads*(36+42); }
     long long bytes() const { return 4ll*2*arg.threads*(2*arg.mom.Bytes()+arg.force.Bytes()); }
   };

   /**
      @brief Build the kernel arguments for the given accessors and
      launch the momentum update on stream 0.  No device synchronization
      is performed here.

      @param mom   Momentum accessor
      @param coeff Coefficient applied to the force
      @param force Force accessor
      @param meta  Field providing volume / location metadata
   */
   template<typename Float, typename Mom, typename Force>
   void updateMomentum(Mom mom, Float coeff, Force force, GaugeField &meta) {
     UpdateMomArg<Float,Mom,Force> kernel_arg(mom, coeff, force, meta);
     UpdateMom<Float,Mom,Force> launcher(kernel_arg, meta);
     launcher.apply(0);
   }

   /**
      @brief Precision-templated dispatcher for the momentum update:
      selects the force accessor from the force field's reconstruction
      type and forwards to the accessor-templated updateMomentum.

      The momentum field must use QUDA_RECONSTRUCT_10; the force field
      may use QUDA_RECONSTRUCT_10 or QUDA_RECONSTRUCT_NO.

      @param mom   Momentum field, updated in place
      @param coeff Coefficient applied to the force (cast to Float)
      @param force Force field
   */
   template <typename Float>
   void updateMomentum(GaugeField &mom, double coeff, GaugeField &force) {
     typedef FloatNOrder<Float, 18, 2, 11> Recon11;
     typedef FloatNOrder<Float, 18, 2, 18> Recon18;

     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
       errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());

     const Float scale = static_cast<Float>(coeff);

     switch (force.Reconstruct()) {
     case QUDA_RECONSTRUCT_10:
       updateMomentum<Float>(Recon11(mom), scale, Recon11(force), force);
       break;
     case QUDA_RECONSTRUCT_NO:
       updateMomentum<Float>(Recon11(mom), scale, Recon18(force), force);
       break;
     default:
       errorQuda("Unsupported force reconstruction: %d", force.Reconstruct());
     }
   }
 #endif // GPU_GAUGE_TOOLS

   /**
      @brief Host entry point for the momentum update
      mom <- makeAntiHerm(mom + coeff * force).

      Requires a QUDA_FLOAT2_GAUGE_ORDER momentum field, matching
      precision between the two fields, and double precision.  When the
      library is compiled without GPU_GAUGE_TOOLS the call only reports
      an error.

      @param mom   Momentum field, updated in place
      @param coeff Coefficient applied to the force
      @param force Force field
   */
   void updateMomentum(GaugeField &mom, double coeff, GaugeField &force) {
 #ifdef GPU_GAUGE_TOOLS
     const bool native_order = (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER);
     if (!native_order)
       errorQuda("Unsupported output ordering: %d\n", mom.Order());

     const bool same_precision = (mom.Precision() == force.Precision());
     if (!same_precision)
       errorQuda("Mixed precision not supported: %d %d\n", mom.Precision(), force.Precision());

     if (mom.Precision() != QUDA_DOUBLE_PRECISION) {
       errorQuda("Unsupported precision: %d", mom.Precision());
     } else {
       updateMomentum<double>(mom, coeff, force);
     }

     checkCudaError();
 #else
     errorQuda("%s not built", __func__);
 #endif // GPU_GAUGE_TOOLS
   }


 #ifdef GPU_GAUGE_TOOLS

   /**
      @brief Argument bundle passed by value to ApplyUKernel.
   */
   template<typename Float, typename Force, typename Gauge>
   struct ApplyUArg {
     int threads;  // loop bound for the site index (meta.VolumeCB())
     Force force;  // accessor for the force field (read and written)
     Gauge U;      // accessor for the gauge field (read only in the kernel)
     int X[4];     // grid dimensions

     /**
        @param force Force accessor
        @param U     Gauge accessor
        @param meta  Field supplying the checkerboard volume and dimensions
     */
     ApplyUArg(Force &force, Gauge &U, GaugeField &meta)
       : threads(meta.VolumeCB()), force(force), U(U) {
       const int *dims = meta.X();
       for (int d=0; d<4; d++) X[d] = dims[d];
     }
   };

   /**
      @brief Kernel performing force <- U * force (left multiplication
      by the gauge link) for every site, parity and direction.

      threadIdx.y is used as the parity; the site index starts at the
      global x thread index and advances by gridDim.x*blockDim.x until
      it reaches arg.threads.

      @param arg Kernel arguments (accessors and thread count)
   */
   template<typename Float, typename Force, typename Gauge>
   __global__ void ApplyUKernel(ApplyUArg<Float,Force,Gauge> arg) {
     const int parity = threadIdx.y;
     Matrix<complex<Float>,3> f, u;

     for (int x = blockIdx.x*blockDim.x + threadIdx.x; x < arg.threads; x += gridDim.x*blockDim.x) {
       for (int dir=0; dir<4; dir++) {
         Float *force_reals = reinterpret_cast<Float*>(f.data);
         Float *gauge_reals = reinterpret_cast<Float*>(u.data);

         arg.force.load(force_reals, x, dir, parity);
         arg.U.load(gauge_reals, x, dir, parity);

         f = u * f;

         arg.force.save(force_reals, x, dir, parity);
       }
     }
   } // ApplyU


   /**
      @brief Tunable wrapper that launches ApplyUKernel.

      Holds references to the kernel argument struct and to the field
      used for location / volume-string queries; both must outlive this
      object.  Because the kernel overwrites the force field, the
      accessor's save()/load() are called around tuning (preTune /
      postTune).
   */
   template<typename Float, typename Force, typename Gauge>
   class ApplyU : TunableLocalParity {
     ApplyUArg<Float, Force, Gauge> &arg;
     const GaugeField &meta;

   private:
     unsigned int minThreads() const { return arg.threads; }

   public:
     ApplyU(ApplyUArg<Float,Force,Gauge> &arg, const GaugeField &meta) : arg(arg), meta(meta) {}
     virtual ~ApplyU () { }

     /**
        @brief Pick launch parameters via tuneLaunch and launch the
        kernel on the given stream.  Only device-resident fields are
        supported.
        @param stream Stream the kernel is launched on
     */
     void apply(const cudaStream_t &stream){
       if(meta.Location() == QUDA_CUDA_FIELD_LOCATION){
         TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
         ApplyUKernel<Float,Force,Gauge><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
       } else {
         errorQuda("CPU not supported yet\n");
       }
     }

     /** @brief Key identifying this kernel instance (volume, type, thread count, precision) to the autotuner */
     TuneKey tuneKey() const {
       std::stringstream aux;
       aux << "threads=" << arg.threads << ",prec="  << sizeof(Float);
       return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
     }

     void preTune() { arg.force.save();}
     void postTune() { arg.force.load();}

     // The leading 4ll forces 64-bit arithmetic: with an int product,
     // 4*2*threads*198 exceeds INT_MAX once threads is above ~1.35 million
     // (e.g. a 32^3 x 96 local lattice), which is signed overflow.
     long long flops() const { return 4ll*2*arg.threads*198; }
     long long bytes() const { return 4ll*2*arg.threads*(2*arg.force.Bytes()+arg.U.Bytes()); }
   };

   /**
      @brief Build the kernel arguments for the given accessors, launch
      the force <- U * force kernel on stream 0 and wait for the device.

      @param force Force accessor (read and written)
      @param U     Gauge accessor
      @param meta  Field providing volume / location metadata
   */
   template<typename Float, typename Force, typename Gauge>
   void applyU(Force force, Gauge U, GaugeField &meta) {
     ApplyUArg<Float,Force,Gauge> kernel_arg(force, U, meta);
     ApplyU<Float,Force,Gauge> launcher(kernel_arg, meta);

     launcher.apply(0);
     qudaDeviceSynchronize();
   }
   /**
      @brief Precision-templated dispatcher for force <- U * force:
      selects the gauge accessor from the gauge field's reconstruction
      type and forwards to the accessor-templated applyU.

      The force field must use QUDA_RECONSTRUCT_NO; the gauge field may
      use QUDA_RECONSTRUCT_NO or QUDA_RECONSTRUCT_12.

      @param force Force field, updated in place
      @param U     Gauge field
   */
   template <typename Float>
   void applyU(GaugeField &force, GaugeField &U) {
     if (force.Reconstruct() != QUDA_RECONSTRUCT_NO)
       errorQuda("Force field with reconstruct %d not supported", force.Reconstruct());

     if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
       applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 18>(U), force);
     } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
       // this branch previously re-tested QUDA_RECONSTRUCT_NO and was
       // therefore unreachable; the 12-real accessor belongs to reconstruct-12
       applyU<Float>(FloatNOrder<Float, 18, 2, 18>(force), FloatNOrder<Float, 18, 2, 12>(U), force);
     } else {
       errorQuda("Unsupported gauge reconstruction: %d", U.Reconstruct());
     }

   }
 #endif // GPU_GAUGE_TOOLS

   /**
      @brief Host entry point for force <- U * force.

      Requires a QUDA_FLOAT2_GAUGE_ORDER force field, matching precision
      between the two fields, and double precision.  When the library is
      compiled without GPU_GAUGE_TOOLS the call only reports an error.

      @param force Force field, updated in place
      @param U     Gauge field
   */
   void applyU(GaugeField &force, GaugeField &U) {
 #ifdef GPU_GAUGE_TOOLS
     const bool native_order = (force.Order() == QUDA_FLOAT2_GAUGE_ORDER);
     if (!native_order)
       errorQuda("Unsupported output ordering: %d\n", force.Order());

     const bool same_precision = (force.Precision() == U.Precision());
     if (!same_precision)
       errorQuda("Mixed precision not supported: %d %d\n", force.Precision(), U.Precision());

     if (force.Precision() != QUDA_DOUBLE_PRECISION) {
       errorQuda("Unsupported precision: %d", force.Precision());
     } else {
       applyU<double>(force, U);
     }

     checkCudaError();
 #else
     errorQuda("%s not built", __func__);
 #endif // GPU_GAUGE_TOOLS
   }

 } // namespace quda
QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:66

quda::TuneParam
Definition: tune_quda.h:17

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

QUDA_RECONSTRUCT_10
Definition: enum_quda.h:71

mu
double mu
Definition: test_util.cpp:1643

quda::TunableLocalParity
Definition: tune_quda.h:306

LAUNCH_KERNEL_LOCAL_PARITY
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
Definition: launch_kernel.cuh:135

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:297

cub_helper.cuh

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:39

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

quda::LatticeField::VolString
const char * VolString() const
Definition: lattice_field.h:524

quda::TuneParam::shared_bytes
int shared_bytes
Definition: tune_quda.h:22

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:21

quda
Definition: blas_cublas.h:6

gridDim
dim3 gridDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

quda::computeMomAction
double computeMomAction(const GaugeField &mom)
Compute and return the global momentum action 1/2 mom^2.
Definition: momentum.cu:113

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

launch_kernel.cuh

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:20

quda::Matrix::data
T data[N *N]
Definition: quda_matrix.h:74

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

double
double
Definition: CMakeCUDACompilerId.cpp1.ii:8010

f
int int int enum cudaChannelFormatKind f
Definition: CMakeCUDACompilerId.cpp1.ii:2637

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

quda::ReduceArg
Definition: cub_helper.cuh:97

quda::applyU
void applyU(GaugeField &force, GaugeField &U)
Definition: momentum.cu:340

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

tune_quda.h

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

quda::LatticeField::Location
QudaFieldLocation Location() const
Definition: lattice_field.cpp:522

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

dw_dslash_4D_cuda_gen.coeff
def coeff()
Definition: dw_dslash_4D_cuda_gen.py:1099

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

quda::LatticeField::VolumeCB
int VolumeCB() const
Definition: lattice_field.h:425

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

quda::updateMomentum
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force)
Definition: momentum.cu:224

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

quda::clover::FloatNOrder
Accessor routine for CloverFields in native field order.
Definition: clover_field_order.h:367

quda::makeAntiHerm
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
Definition: quda_matrix.h:636

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204

checkCudaError
#define checkCudaError()
Definition: util_quda.h:129

comm_allreduce
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:281

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

d
static __inline__ size_t size_t d
Definition: CMakeCUDACompilerId.cpp1.ii:3019

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:462

parity
QudaParity parity
Definition: covdev_test.cpp:53

quda::Matrix
Definition: quda_matrix.h:68

quda::TuneKey
Definition: tune_key.h:8

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:415

quda_internal.h

quda::GaugeField
Definition: gauge_field.h:123