v0.9.0/doc/qcharge__quda_8cu_source.html

 #include <quda_internal.h>
 #include <quda_matrix.h>
 #include <tune_quda.h>
 #include <gauge_field.h>
 #include <gauge_field_order.h>

 #include <cub/cub.cuh>
 #include <launch_kernel.cuh>
 #include <atomic.cuh>
 #include <cub_helper.cuh>
 #include <index_helper.cuh>

 #ifndef Pi2
 #define Pi2   6.2831853071795864769252867665590
 #endif

 namespace quda {

 #ifdef GPU_GAUGE_TOOLS
   // Argument bundle handed by value to qChargeComputeKernel.
   //
   // Inherits from ReduceArg<double> and adds the field accessor plus the
   // number of threads the kernel has to cover. The Float template parameter
   // is not used inside the struct itself; it only distinguishes
   // instantiations.
   template <typename Float, typename Gauge>
   struct QChargeArg : public ReduceArg<double> {
     int threads; // number of active threads required (taken from Fmunu.Volume())
     Gauge data;  // accessor the kernel calls load() on

     // Members are listed here in declaration order, which is the order C++
     // initializes them in regardless of how the list is written.
     QChargeArg(const Gauge &data, GaugeField &Fmunu) :
       ReduceArg<double>(),
       threads(Fmunu.Volume()),
       data(data)
     {
     }
   };

   // Core routine for computing the topological charge from the field strength.
   //
   // Launch layout: one-dimensional; each thread computes a linear index from
   // threadIdx.x, blockIdx.x and blockDim.x. Indices in [0, arg.threads/2) are
   // handled with parity 0, indices in [arg.threads/2, arg.threads) with
   // parity 1 (after subtracting arg.threads/2). Threads with an index at or
   // beyond arg.threads contribute zero.
   //
   // Each active thread loads six 3x3 complex matrices F[0..5] through
   // arg.data.load() and evaluates, using the .x member of each trace,
   //   ( tr(F[0]*F[5]).x + tr(F[3]*F[2]).x - tr(F[1]*F[4]).x ) / (Pi2*Pi2).
   //
   // reduce<blockSize>() is called unconditionally, outside the bounds check,
   // so every thread of the block reaches it.
   template <int blockSize, typename Float, typename Gauge>
   __global__ void qChargeComputeKernel(QChargeArg<Float, Gauge> arg)
   {
     const int tid = threadIdx.x + blockIdx.x * blockDim.x;

     // Per-thread contribution; stays zero for out-of-range threads.
     double Q = 0.;

     if (tid < arg.threads) {
       // Split the linear index into (parity, index-within-parity).
       const int halfThreads = arg.threads / 2;
       const int parity = (tid >= halfThreads) ? 1 : 0;
       const int site = tid - parity * halfThreads;

       // Load the field-strength tensor from global memory
       Matrix<complex<Float>, 3> F[6];
       for (int comp = 0; comp < 6; ++comp) {
         arg.data.load((Float *)(F[comp].data), site, comp, parity);
       }

       // Trace terms, widened to double before they are combined.
       const double q05 = getTrace(F[0] * F[5]).x;
       const double q14 = getTrace(F[1] * F[4]).x;
       const double q32 = getTrace(F[3] * F[2]).x;

       Q = q05;
       Q += (q32 - q14);
       Q /= (Pi2 * Pi2);
     }

     reduce<blockSize>(arg, Q);
   }

   // Tunable wrapper that launches qChargeComputeKernel.
   //
   // Holds its own copy of the kernel argument, the requested field location
   // and a pointer to the field whose VolString() is used for the tune key.
   // Only QUDA_CUDA_FIELD_LOCATION is supported; any other location raises an
   // error through errorQuda().
   template<typename Float, typename Gauge>
     class QChargeCompute : Tunable {
       QChargeArg<Float,Gauge> arg;
       const QudaFieldLocation location;
       GaugeField *vol;

       private:
       // The tunable reports no extra shared memory per thread or per block.
       unsigned int sharedBytesPerThread() const { return 0; }
       unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

 //      bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
       bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
       unsigned int minThreads() const { return arg.threads; }

       public:
       // Initializers are listed in member declaration order (arg, location,
       // vol), which is the order in which they are actually run.
       QChargeCompute(QChargeArg<Float,Gauge> &arg, GaugeField *vol, QudaFieldLocation location)
         : arg(arg), location(location), vol(vol) {
         writeAuxString("threads=%d,prec=%lu",arg.threads,sizeof(Float));
       }

       virtual ~QChargeCompute() { }

       // Zeroes the host-side result slot, obtains launch parameters from
       // tuneLaunch(), launches the kernel on `stream` and then synchronizes
       // the device. Errors out for non-CUDA locations.
       void apply(const cudaStream_t &stream) {
         if(location == QUDA_CUDA_FIELD_LOCATION){
           arg.result_h[0] = 0.;
           TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
           LAUNCH_KERNEL(qChargeComputeKernel, tp, stream, arg, Float);
           qudaDeviceSynchronize();
         }else{ // run the CPU code
           errorQuda("qChargeComputeKernel not supported on CPU");
 //          qChargeComputeCPU(arg);
         }
       }

       TuneKey tuneKey() const {
         return TuneKey(vol->VolString(), typeid(*this).name(), aux);
       }

       // arg.threads is an int: widen to long long *before* multiplying so the
       // product cannot overflow 32-bit arithmetic on large volumes.
       long long flops() const { return static_cast<long long>(arg.threads)*(3*198+9); }
       long long bytes() const {
         return static_cast<long long>(arg.threads)*(6*18)*static_cast<long long>(sizeof(Float));
       }
     };


   // Runs the charge reduction for one accessor type.
   //
   // Builds the kernel argument from `data` and `Fmunu`, applies the tunable
   // launcher with stream argument 0, checks for CUDA errors, performs
   // comm_allreduce() on the host-side result and stores element 0 of that
   // result in `qChg`.
   template <typename Float, typename Gauge>
   void computeQCharge(const Gauge data, GaugeField &Fmunu, QudaFieldLocation location, Float &qChg)
   {
     QChargeArg<Float, Gauge> reduceArg(data, Fmunu);
     QChargeCompute<Float, Gauge> launcher(reduceArg, &Fmunu, location);

     launcher.apply(0);
     checkCudaError();

     comm_allreduce((double *)reduceArg.result_h);
     qChg = reduceArg.result_h[0];
   }

   // Precision-specific driver: picks the gauge_mapper accessor matching the
   // field's reconstruction type and forwards to the accessor-templated
   // computeQCharge overload.
   //
   // Raises an error through errorQuda() if the field is not in native order
   // or if its reconstruction type is none of NO / 12 / 8. Returns the value
   // written by the inner overload (initialized to 0).
   template <typename Float>
   Float computeQCharge(GaugeField &Fmunu, QudaFieldLocation location)
   {
     if (!Fmunu.isNative()) errorQuda("Topological charge computation only supported on native ordered fields");

     Float res = 0.;

     switch (Fmunu.Reconstruct()) {
     case QUDA_RECONSTRUCT_NO: {
       typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type Gauge;
       computeQCharge<Float>(Gauge(Fmunu), Fmunu, location, res);
       break;
     }
     case QUDA_RECONSTRUCT_12: {
       typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type Gauge;
       computeQCharge<Float>(Gauge(Fmunu), Fmunu, location, res);
       break;
     }
     case QUDA_RECONSTRUCT_8: {
       typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_8>::type Gauge;
       computeQCharge<Float>(Gauge(Fmunu), Fmunu, location, res);
       break;
     }
     default:
       errorQuda("Reconstruction type %d of gauge field not supported", Fmunu.Reconstruct());
     }

     return res;
   }
 #endif

   // Non-template entry point: dispatches on the precision of `Fmunu` to the
   // float or double instantiation of the templated computeQCharge and
   // returns the result as a double.
   //
   // Raises an error through errorQuda() for any other precision, or when the
   // file was compiled without GPU_GAUGE_TOOLS. `charge` starts at 0 and is
   // what gets returned if no branch assigns it.
   double computeQCharge(GaugeField& Fmunu, QudaFieldLocation location){

     double charge = 0;
 #ifdef GPU_GAUGE_TOOLS
     if (Fmunu.Precision() == QUDA_SINGLE_PRECISION){
       charge = computeQCharge<float>(Fmunu, location);
     } else if(Fmunu.Precision() == QUDA_DOUBLE_PRECISION) {
       charge = computeQCharge<double>(Fmunu, location);
     } else {
       errorQuda("Precision %d not supported", Fmunu.Precision());
     }
 #else
     errorQuda("Gauge tools are not built");
 #endif
     return charge;

   }

 } // namespace quda

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:66

quda::TuneParam
Definition: tune_quda.h:17

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:297

cub_helper.cuh

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

quda::LatticeField::VolString
const char * VolString() const
Definition: lattice_field.h:524

Pi2
#define Pi2
Definition: qcharge_quda.cu:14

quda
Definition: blas_cublas.h:6

param
QudaGaugeParam param
Definition: pack_test.cpp:17

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

quda::Tunable
Definition: tune_quda.h:60

launch_kernel.cuh

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:67

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

quda::LatticeField::Volume
int Volume() const
Definition: lattice_field.h:420

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

quda::ReduceArg
Definition: cub_helper.cuh:97

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

tune_quda.h

LAUNCH_KERNEL
#define LAUNCH_KERNEL(kernel, tp, stream, arg,...)
Definition: launch_kernel.cuh:1

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:68

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

quda::getTrace
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305

QudaFieldLocation
enum QudaFieldLocation_s QudaFieldLocation

index_helper.cuh

atomic.cuh

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:355

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

quda::computeQCharge
double computeQCharge(GaugeField &Fmunu, QudaFieldLocation location)
Definition: qcharge_quda.cu:143

quda::gauge_mapper
Definition: gauge_field_order.h:2083

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203

checkCudaError
#define checkCudaError()
Definition: util_quda.h:129

comm_allreduce
void comm_allreduce(double *data)
Definition: comm_mpi.cpp:281

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:462

quda::GaugeField::isNative
bool isNative() const
Definition: gauge_field.cpp:138

parity
QudaParity parity
Definition: covdev_test.cpp:53

gauge_field.h

quda::Matrix
Definition: quda_matrix.h:68

quda::TuneKey
Definition: tune_key.h:8

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

quda_internal.h

quda::GaugeField
Definition: gauge_field.h:123