v0.9.0/doc/pgauge__det__trace_8cu_source.html

 #include <quda_internal.h>
 #include <quda_matrix.h>
 #include <tune_quda.h>
 #include <gauge_field.h>
 #include <gauge_field_order.h>
 #include <launch_kernel.cuh>
 #include <comm_quda.h>
 #include <pgauge_monte.h>
 #include <atomic.cuh>
 #include <cub_helper.cuh>
 #include <index_helper.cuh>

 namespace quda {

 #ifdef GPU_GAUGE_ALG

 /**
    @brief Argument bundle handed to compute_Value.

    Derives from ReduceArg<double2>, whose result_h[0] slot is what
    getValue() returns.  The constructor copies the gauge accessor and
    records the lattice extents taken from the field.

    @tparam Gauge accessor type used by the kernel to load links
  */
 template <typename Gauge>
 struct KernelArg : public ReduceArg<double2> {
   int threads; // number of active threads required (half the product of X[])
   int X[4]; // grid dimensions (with MULTI_GPU: field extent minus the border on both sides)
 #ifdef MULTI_GPU
   int border[4]; // per-direction border width, taken from data.R()
 #endif
   Gauge dataOr;

   /**
      @param dataOr accessor that is copied into this argument struct
      @param data   field queried for X() and, with MULTI_GPU, R()
    */
   KernelArg(const Gauge &dataOr, const cudaGaugeField &data)
     : ReduceArg<double2>(), dataOr(dataOr) {
     int volume = 1;
     for (int d = 0; d < 4; d++) {
 #ifdef MULTI_GPU
       // strip the border from both ends of this direction
       border[d] = data.R()[d];
       X[d] = data.X()[d] - 2*border[d];
 #else
       X[d] = data.X()[d];
 #endif
       volume *= X[d];
     }
     // the kernel takes parity from threadIdx.y, so only half the sites are indexed along x
     threads = volume/2;
   }

   /** @return the first entry of the host-side reduction buffer */
   double2 getValue() { return result_h[0]; }
 };


 /**
    @brief Accumulate the determinant (functiontype == 0) or the trace
    (functiontype == 1) of the four links loaded for each site and
    feed the per-thread sum into reduce2d.

    Launch layout as used here: threadIdx.x / blockIdx.x enumerate the
    sites of one parity, threadIdx.y selects the parity.  Threads whose
    site index is not below arg.threads contribute zero but still
    call reduce2d.

    @tparam blockSize    block size forwarded to reduce2d
    @tparam Float        storage precision of the link elements
    @tparam Gauge        accessor type providing load()
    @tparam NCOLORS      dimension of the link matrices
    @tparam functiontype 0 = determinant, 1 = trace
    @param arg           kernel argument (extents, accessor, reduction buffers)
  */
 template<int blockSize, typename Float, typename Gauge, int NCOLORS, int functiontype>
 __global__ void compute_Value(KernelArg<Gauge> arg){
   const int parity = threadIdx.y;
   int site = threadIdx.x + blockIdx.x*blockDim.x;

   complex<double> acc(0.0, 0.0);
   if (site < arg.threads) {
     // local copy of the extents; enlarged below when MULTI_GPU is defined
     int dims[4];
 #pragma unroll
     for (int d = 0; d < 4; ++d) dims[d] = arg.X[d];

     int coord[4];
     getCoords(coord, site, dims, parity);
 #ifdef MULTI_GPU
     // shift into the bordered lattice and recompute the index there
 #pragma unroll
     for (int d = 0; d < 4; ++d) {
       coord[d] += arg.border[d];
       dims[d] += 2*arg.border[d];
     }
     site = linkIndex(coord, dims);
 #endif
 #pragma unroll
     for (int mu = 0; mu < 4; mu++) {
       Matrix<complex<Float>,NCOLORS> link;
       arg.dataOr.load((Float*)(link.data), site, mu, parity);
       if (functiontype == 0) acc += getDeterminant(link);
       else if (functiontype == 1) acc += getTrace(link);
     }
   }

   // every thread reaches the reduction, including those that skipped the body above
   double2 partial = make_double2(acc.real(), acc.imag());
   reduce2d<blockSize,2>(arg, partial);
 }


 /**
    @brief Tunable wrapper that launches compute_Value and turns the
    reduced sum into an average.

    @tparam Float        storage precision of the gauge field
    @tparam Gauge        accessor type used to load links
    @tparam NCOLORS      dimension of the link matrices
    @tparam functiontype 0 = determinant, 1 = trace
  */
 template<typename Float, typename Gauge, int NCOLORS, int functiontype>
 class CalcFunc : TunableLocalParity {
   KernelArg<Gauge> arg;
   TuneParam tp;
   mutable char aux_string[128]; // used as a label in the autotuner
   private:
   unsigned int minThreads() const { return arg.threads; }

   public:
   CalcFunc(KernelArg<Gauge> &arg) : arg(arg) {}
   ~CalcFunc () { }

   /**
      @brief Tune, launch the kernel, reduce across ranks and normalise.

      After the call arg.result_h[0] holds the summed value divided by
      4 * 2 * arg.threads * comm_size().

      @param stream stream the kernel is launched on
    */
   void apply(const cudaStream_t &stream){
     tp = tuneLaunch(*this, getTuning(), getVerbosity());
     arg.result_h[0] = make_double2(0.0, 0.0);
     LAUNCH_KERNEL_LOCAL_PARITY(compute_Value, tp, stream, arg, Float, Gauge, NCOLORS, functiontype);
     qudaDeviceSynchronize();

     comm_allreduce_array((double*)arg.result_h, 2);

     // Build the divisor in double precision: as an int product,
     // 4*2*threads*comm_size() overflows once it reaches 2^31
     // (e.g. a 64^3x128 local volume on 16 ranks).
     const double norm = 4.0 * 2.0 * static_cast<double>(arg.threads) * static_cast<double>(comm_size());
     arg.result_h[0].x /= norm;
     arg.result_h[0].y /= norm;
   }

   /** @return autotuner key built from the extents, this type and threads/precision */
   TuneKey tuneKey() const {
     std::stringstream vol;
     vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
     // bounded write; the cast makes the %lu argument well-defined wherever size_t is not unsigned long
     snprintf(aux_string, sizeof(aux_string), "threads=%d,prec=%lu", arg.threads, (unsigned long)sizeof(Float));
     return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);

   }

   /** @return flop estimate; non-zero only for NCOLORS == 3 */
   long long flops() const {
     if(NCOLORS==3 && functiontype == 0) return 264LL*2*arg.threads+2LL*tp.block.x;
     else if(NCOLORS==3 && functiontype == 1) return 24LL*2*arg.threads+2LL*tp.block.x;
     else return 0;
   }// Only correct if there is no link reconstruction

   /** @return byte-traffic estimate assuming full NCOLORS x NCOLORS complex links */
   long long bytes() const { return 4LL*NCOLORS * NCOLORS * sizeof(Float)*2*2*arg.threads + tp.block.x * sizeof(double2); }

 };


 /**
    @brief Build the kernel argument, run CalcFunc once on stream 0 and
    return the value left in the reduction buffer.

    When the verbosity is at least QUDA_SUMMARIZE the result and the
    measured time / throughput are printed as well.

    @tparam Float        storage precision of the gauge field
    @tparam NCOLORS      dimension of the link matrices
    @tparam functiontype 0 = determinant, 1 = trace
    @tparam Gauge        accessor type
    @param dataOr accessor for the field
    @param data   the gauge field
    @return value reported by KernelArg::getValue() after the run
  */
 template<typename Float, int NCOLORS, int functiontype, typename Gauge>
 double2 computeValue( Gauge dataOr,  cudaGaugeField& data) {
   TimeProfile profileGenericFunc("GenericFunc", false);
   if (getVerbosity() >= QUDA_SUMMARIZE) profileGenericFunc.TPSTART(QUDA_PROFILE_COMPUTE);

   KernelArg<Gauge> arg(dataOr, data);
   CalcFunc<Float, Gauge, NCOLORS, functiontype> func(arg);
   func.apply(0);

   if (getVerbosity() >= QUDA_SUMMARIZE) {
     if (functiontype == 0) { printfQuda("Determinant: %.16e, %.16e\n", arg.getValue().x, arg.getValue().y); }
     if (functiontype == 1) { printfQuda("Trace: %.16e, %.16e\n", arg.getValue().x, arg.getValue().y); }
   }
   checkCudaError();
   qudaDeviceSynchronize();

   // nothing more to report below the summarize level
   if (getVerbosity() < QUDA_SUMMARIZE) return arg.getValue();

   profileGenericFunc.TPSTOP(QUDA_PROFILE_COMPUTE);
   const double secs = profileGenericFunc.Last(QUDA_PROFILE_COMPUTE);
   double gflops = (func.flops()*1e-9)/(secs);
   double gbytes = func.bytes()/(secs*1e9);
 #ifdef MULTI_GPU
   // report throughput scaled by the number of ranks
   gflops = gflops*comm_size();
   gbytes = gbytes*comm_size();
 #endif
   if (functiontype == 0) {
     printfQuda("Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
   }
   if (functiontype == 1) {
     printfQuda("Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
   }
   return arg.getValue();
 }


 /**
    @brief Pick the accessor matching the field's reconstruction type
    and forward to the accessor-level computeValue with NCOLORS = 3.

    Only native-order fields with QUDA_RECONSTRUCT_NO, _12 or _8 are
    handled; anything else is reported through errorQuda.

    @tparam Float        storage precision of the gauge field
    @tparam functiontype 0 = determinant, 1 = trace
    @param data the gauge field
    @return the computed value, or (0,0) if no branch produced one
  */
 template<typename Float, int functiontype>
 double2 computeValue(cudaGaugeField& data) {

   double2 result = make_double2(0.0,0.0);

   if (!data.isNative()) {
     errorQuda("Invalid Gauge Order\n");
     return result;
   }

   // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
   switch (data.Reconstruct()) {
   case QUDA_RECONSTRUCT_NO: {
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
     result = computeValue<Float, 3, functiontype>(Gauge(data), data);
     break;
   }
   case QUDA_RECONSTRUCT_12: {
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
     result = computeValue<Float, 3, functiontype>(Gauge(data), data);
     break;
   }
   case QUDA_RECONSTRUCT_8: {
     typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
     result = computeValue<Float, 3, functiontype>(Gauge(data), data);
     break;
   }
   default:
     errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
   }
   return result;
 }
 #endif // GPU_GAUGE_ALG

 /**
    @brief Calculate the determinant value of the gauge links.

    Dispatches on the field precision to computeValue with
    functiontype 0.  Without GPU_GAUGE_ALG the call only raises an
    error through errorQuda.

    @param data the gauge field
    @return result of computeValue (x, y components of a double2), or
    (0,0) if no computation ran
  */
 double2 getLinkDeterminant( cudaGaugeField& data) {
   double2 result = make_double2(0.0,0.0);
 #ifdef GPU_GAUGE_ALG
   switch (data.Precision()) {
   case QUDA_SINGLE_PRECISION:
     result = computeValue<float, 0>(data);
     break;
   case QUDA_DOUBLE_PRECISION:
     result = computeValue<double, 0>(data);
     break;
   default:
     errorQuda("Precision %d not supported", data.Precision());
   }
 #else
   errorQuda("Pure gauge code has not been built");
 #endif // GPU_GAUGE_ALG
   return result;
 }

 /**
    @brief Calculate the trace value of the gauge links.

    Dispatches on the field precision to computeValue with
    functiontype 1.  Without GPU_GAUGE_ALG the call only raises an
    error through errorQuda.

    @param data the gauge field
    @return result of computeValue (x, y components of a double2), or
    (0,0) if no computation ran
  */
 double2 getLinkTrace( cudaGaugeField& data) {
   double2 trace = make_double2(0.0,0.0);
 #ifdef GPU_GAUGE_ALG
   switch (data.Precision()) {
   case QUDA_SINGLE_PRECISION:
     trace = computeValue<float, 1>(data);
     break;
   case QUDA_DOUBLE_PRECISION:
     trace = computeValue<double, 1>(data);
     break;
   default:
     errorQuda("Precision %d not supported", data.Precision());
   }
 #else
   errorQuda("Pure gauge code has not been built");
 #endif // GPU_GAUGE_ALG
   return trace;
 }


 } // namespace quda
QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:66

quda::TuneParam
Definition: tune_quda.h:17

blockDim
dim3 dim3 blockDim
Definition: CMakeCUDACompilerId.cpp1.ii:2471

mu
double mu
Definition: test_util.cpp:1643

quda::TunableLocalParity
Definition: tune_quda.h:306

quda::linkIndex
static __device__ __host__ int linkIndex(const int x[], const I X[4])
Definition: index_helper.cuh:46

LAUNCH_KERNEL_LOCAL_PARITY
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
Definition: launch_kernel.cuh:135

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

quda::getLinkDeterminant
double2 getLinkDeterminant(cudaGaugeField &data)
Calculate the Determinant.
Definition: pgauge_det_trace.cu:193

func
const void * func
Definition: CMakeCUDACompilerId.cpp1.ii:2248

pgauge_monte.h

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

cub_helper.cuh

QUDA_SUMMARIZE
Definition: enum_quda.h:236

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

comm_allreduce_array
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:296

quda
Definition: blas_cublas.h:6

comm_quda.h

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

quda::LatticeField::R
const int * R() const
Definition: lattice_field.h:452

comm_size
int comm_size(void)
Definition: comm_mpi.cpp:126

launch_kernel.cuh

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:67

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:20

sum
__host__ __device__ void sum(double &a, double &b)
Definition: multi_reduce_core.cuh:4

quda::Matrix::data
T data[N *N]
Definition: quda_matrix.h:74

quda::QUDA_PROFILE_COMPUTE
Definition: quda_internal.h:172

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

quda::getLinkTrace
double2 getLinkTrace(cudaGaugeField &data)
Calculate the Trace.
Definition: pgauge_det_trace.cu:214

double
double
Definition: CMakeCUDACompilerId.cpp1.ii:8010

quda::cudaGaugeField
Definition: gauge_field.h:298

gauge_field_order.h
Main header file for host and device accessors to GaugeFields.

quda::ReduceArg
Definition: cub_helper.cuh:97

quda::qudaDeviceSynchronize
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
Definition: quda_cuda_api.cpp:277

tune_quda.h

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:68

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

quda::getTrace
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305

index_helper.cuh

atomic.cuh

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:355

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

sprintf
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__

quda::gauge_mapper
Definition: gauge_field_order.h:2083

printfQuda
#define printfQuda(...)
Definition: util_quda.h:84

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

e
return e
Definition: CMakeCUDACompilerId.cpp1.ii:3026

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880

quda::TimeProfile
Definition: quda_internal.h:232

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203

quda::complex< double >
Definition: complex_quda.h:554

checkCudaError
#define checkCudaError()
Definition: util_quda.h:129

quda::getDeterminant
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:312

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:462

quda::GaugeField::isNative
bool isNative() const
Definition: gauge_field.cpp:138

val
static __inline__ enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode int val
Definition: CMakeCUDACompilerId.cpp1.ii:12148

parity
QudaParity parity
Definition: covdev_test.cpp:53

gauge_field.h

quda::Matrix
Definition: quda_matrix.h:68

quda::TuneKey
Definition: tune_key.h:8

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:415

quda_internal.h

quda::getCoords
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)
Definition: index_helper.cuh:129