quda-ref/v0.7.0/clover__trace__quda_8cu_source.html

 #include <quda_internal.h>

 #include <quda_matrix.h>

 #include <tune_quda.h>

 #include <clover_field.h>

 #include <gauge_field.h>

 #include <gauge_field_order.h>


 // The clover accessor header is included inside its own namespace so that the
 // FloatNOrder it provides is reachable as CloverOrder::quda::FloatNOrder and
 // does not collide with the FloatNOrder used unqualified for gauge fields
 // further down in this file.
 // NOTE(review): this presumably relies on clover_field_order.h declaring its
 // contents inside `namespace quda` -- confirm against that header.
 namespace CloverOrder {

   // lets code pulled in by the include below refer to quda names unqualified
   using namespace quda;

 #include <clover_field_order.h>

 } // CloverOrder


 namespace quda {


 #ifdef GPU_CLOVER_DIRAC


   /**
    * Argument bundle handed (by value) to the clover sigma-trace kernel and its
    * host counterpart.
    *
    * clover1 / clover2 : the two clover accessors (one is read per parity)
    * gauge             : accessor the resulting matrix is written through
    * dir1 / dir2       : the pair of direction indices selecting the component
    */
   template<typename Clover1, typename Clover2, typename Gauge>
     struct CloverTraceArg {
       Clover1 clover1;
       Clover2 clover2;
       Gauge gauge;
       int dir1;
       int dir2;

       // Copies every accessor and both direction indices into the bundle.
       CloverTraceArg(Clover1 &cloverA, Clover2 &cloverB, Gauge &out, int mu, int nu)
         : clover1(cloverA),
           clover2(cloverB),
           gauge(out),
           dir1(mu),
           dir2(nu)
       {
       }
     };


   /**
    * Builds one 3x3 complex matrix from the packed clover term at a single
    * site and writes it through the output gauge accessor.
    *
    * Steps:
    *   1. Order the direction pair so that dir1 < dir2, remembering a sign of
    *      -1 if they had to be swapped.
    *   2. Load 72 reals for site x: from arg.clover1 when parity == 0,
    *      otherwise from arg.clover2.
    *   3. Unpack them into two blocks of 6 real diagonal entries plus 15
    *      complex off-diagonal entries each (re-indexed through idtab), applying
    *      the factor of two noted below.
    *   4. Depending on (dir1, dir2), combine those entries into `mat`.  Any
    *      pair not covered by the branches (e.g. dir1 == dir2) leaves `mat`
    *      zero.
    *   5. Multiply by the sign from step 1 and save at (x, 0, parity).
    *
    * NOTE(review): the function name suggests this is the spin trace of
    * sigma_{dir1,dir2} times the clover term -- confirm against the caller.
    *
    * @param arg    accessors and direction indices
    * @param x      checkerboard site index
    * @param parity selects which clover accessor is read and is forwarded to
    *               gauge.save()
    */
   template <typename Float, typename Clover1, typename Clover2, typename Gauge>
     __device__ __host__ void cloverSigmaTraceCompute(CloverTraceArg<Clover1, Clover2, Gauge>& arg, int x, int parity)
     {

       Float A[72];
       typedef typename ComplexTypeId<Float>::Type Complex;

       Matrix<Complex,3> mat;
       setZero(&mat);

       int dir1 = arg.dir1;
       int dir2 = arg.dir2;

       // canonical ordering dir1 < dir2; a swap flips the overall sign
       Float sign = 1;
       if(dir2 < dir1){
         int tmp = dir2;
         dir2 = dir1;
         dir1 = tmp;
         sign = -1;
       }

       Float diag[2][6];
       complex<Float> tri[2][15];
       // maps the storage position of an off-diagonal element to the index used below
       const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
       complex<Float> ctmp;

       // load the clover term into memory
       if(parity==0){
         arg.clover1.load(A,x,parity);
       }else{
         arg.clover2.load(A,x,parity);
       }

       // Keep the scale factor in Float so that single-precision instantiations
       // do not promote every product to double (scaling by two is exact, so the
       // stored values are unchanged).
       const Float two = 2;

       for(int ch=0; ch<2; ++ch){
         // factor of two is inherent to QUDA clover storage
         for (int i=0; i<6; i++) diag[ch][i] = two*A[ch*36+i];
         for (int i=0; i<15; i++) tri[ch][idtab[i]] = complex<Float>(two*A[ch*36+6+2*i], two*A[ch*36+6+2*i+1]);
       }

       if(dir1 == 0){
         if(dir2 == 1){ // X Y
           // diagonal part: purely imaginary entries
           for(int j=0; j<3; ++j){
             mat(j,j).y = diag[0][j+3] + diag[1][j+3] - diag[0][j] - diag[1][j];
           }

           // triangular part: fill (j,k) and its mirror (k,j) together
           int jk=0;
           for(int j=1; j<3; ++j){
             int jk2 = (j+3)*(j+2)/2 + 3;
             for(int k=0; k<j; ++k){
               ctmp = tri[0][jk2] + tri[1][jk2] - tri[0][jk] - tri[1][jk];

               mat(j,k).x = -ctmp.imag();
               mat(j,k).y =  ctmp.real();

               mat(k,j).x =  ctmp.imag();
               mat(k,j).y =  ctmp.real();

               jk++; jk2++;
             }
           }

         }else if(dir2 == 2){ // X Z

           for(int j=0; j<3; ++j){
             int jk = (j+3)*(j+2)/2;
             for(int k=0; k<3; ++k){
               int kj = (k+3)*(k+2)/2 + j;
               ctmp = conj(tri[0][kj]) - tri[0][jk] + conj(tri[1][kj]) - tri[1][jk];
               mat(j,k).x = ctmp.real();
               mat(j,k).y = ctmp.imag();
               jk++;
             }
           }

         }else if(dir2 == 3){ // X T
           for(int j=0; j<3; ++j){
             int jk = (j+3)*(j+2)/2;
             for(int k=0; k<3; ++k){
               int kj = (k+3)*(k+2)/2 + j;
               ctmp = conj(tri[0][kj]) + tri[0][jk] - conj(tri[1][kj]) - tri[1][jk];
               mat(j,k).x = -ctmp.imag();
               mat(j,k).y =  ctmp.real();
               jk++;
             }
           }

         } // dir2 == 3

       }else if(dir1 == 1){
         if(dir2 == 2){ // Y Z
           for(int j=0; j<3; ++j){
             int jk = (j+3)*(j+2)/2;
             for(int k=0; k<3; ++k){
               int kj = (k+3)*(k+2)/2 + j;
               ctmp = conj(tri[0][kj]) + tri[0][jk] + conj(tri[1][kj]) + tri[1][jk];
               mat(j,k).x =  ctmp.imag();
               mat(j,k).y = -ctmp.real();
               jk++;
             }
           }
         }else if(dir2 == 3){ // Y T
           for(int j=0; j<3; ++j){
             int jk = (j+3)*(j+2)/2;
             for(int k=0; k<3; ++k){
               int kj = (k+3)*(k+2)/2 + j;
               ctmp = conj(tri[0][kj]) - tri[0][jk] - conj(tri[1][kj]) + tri[1][jk];
               mat(j,k).x = ctmp.real();
               mat(j,k).y = ctmp.imag();
               jk++;
             }
           }
         } // dir2 == 3
       } // dir1 == 1
       else if(dir1 == 2){
         if(dir2 == 3){
           // diagonal part: purely imaginary entries
           for(int j=0; j<3; ++j){
             mat(j,j).y = diag[0][j] - diag[0][j+3] - diag[1][j] + diag[1][j+3];
           }

           // triangular part: fill (j,k) and its mirror (k,j) together
           int jk=0;
           for(int j=1; j<3; ++j){
             int jk2 = (j+3)*(j+2)/2 + 3;
             for(int k=0; k<j; ++k){
               ctmp = tri[0][jk] - tri[0][jk2] - tri[1][jk] + tri[1][jk2];
               mat(j,k).x = -ctmp.imag();
               mat(j,k).y =  ctmp.real();

               mat(k,j).x = ctmp.imag();
               mat(k,j).y = ctmp.real();
               jk++; jk2++;
             }
           }
         }
       }

       // if dir1 and dir2 were swapped above, multiply by -1
       mat *= sign;

       arg.gauge.save((Float*)(mat.data), x, 0, parity);
     }


   /**
    * Host-side driver: walks over every checkerboard site reported by
    * arg.clover1.volumeCB and evaluates the sigma-trace matrix for parity 1.
    */
   template<typename Float, typename Clover1, typename Clover2, typename Gauge>
     void cloverSigmaTrace(CloverTraceArg<Clover1,Clover2,Gauge> arg)
     {
       int site = 0;
       while (site < arg.clover1.volumeCB) {
         // parity is fixed to 1 here, matching the device kernel
         cloverSigmaTraceCompute<Float>(arg, site, 1);
         ++site;
       }
     }


   /**
    * Device kernel: one thread per checkerboard site on a 1-D launch.
    * Threads whose flat index falls at or beyond arg.clover1.volumeCB do
    * nothing; every other thread evaluates the sigma-trace matrix for
    * parity 1 (odd parity).
    */
   template<typename Float, typename Clover1, typename Clover2, typename Gauge>
     __global__ void cloverSigmaTraceKernel(CloverTraceArg<Clover1,Clover2,Gauge> arg)
     {
       const int site = blockDim.x*blockIdx.x + threadIdx.x;

       if (site < arg.clover1.volumeCB) {
         // odd parity
         cloverSigmaTraceCompute<Float>(arg, site, 1);
       }
     }


   /**
    * Launcher for the clover sigma-trace computation.
    *
    * Holds a copy of the kernel arguments together with the gauge field used
    * for the tuning key and the location (device or host) the work runs on.
    * Although the class derives from Tunable, apply() uses a fixed block size
    * of 128 threads rather than a tuned launch configuration.
    */
   template<typename Float, typename Clover1, typename Clover2, typename Gauge>
     class CloverSigmaTrace : Tunable {
       CloverTraceArg<Clover1,Clover2,Gauge> arg;
       const GaugeField &meta;
       const QudaFieldLocation location;

       private:
       unsigned int sharedBytesPerThread() const { return 0; }
       unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

       bool tuneSharedBytes() const { return false; } // Don't tune the shared memory
       bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
       unsigned int minThreads() const { return arg.clover1.volumeCB; }

       public:
       /**
        * @param arg      kernel argument bundle (copied)
        * @param meta     gauge field whose VolString() feeds the tune key
        * @param location QUDA_CUDA_FIELD_LOCATION runs the kernel, anything
        *                 else runs the host loop
        */
       CloverSigmaTrace(CloverTraceArg<Clover1,Clover2,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
         : arg(arg), meta(meta), location(location) {
         writeAuxString("stride=%d", arg.clover1.stride);
       }
       virtual ~CloverSigmaTrace() {;}

       /**
        * Runs the computation.  On the device the kernel is enqueued on
        * `stream` with one thread per checkerboard site; a failed launch is
        * reported through errorQuda.  On the host the loop runs synchronously
        * and `stream` is not used.
        */
       void apply(const cudaStream_t &stream){
         if(location == QUDA_CUDA_FIELD_LOCATION){
 #if (__COMPUTE_CAPABILITY__ >= 200)
           dim3 blockDim(128, 1, 1);
           // ceil-divide so that a partial final block covers the tail sites
           dim3 gridDim((arg.clover1.volumeCB + blockDim.x - 1)/blockDim.x, 1, 1);
           // honour the caller's stream instead of always using the default one
           cloverSigmaTraceKernel<Float,Clover1,Clover2,Gauge><<<gridDim,blockDim,0,stream>>>(arg);
           // launch-configuration errors are only visible through the last-error state
           cudaError_t launchStatus = cudaGetLastError();
           if(launchStatus != cudaSuccess){
             errorQuda("cloverSigmaTraceKernel launch failed: %s", cudaGetErrorString(launchStatus));
           }
 #else
           errorQuda("cloverSigmaTrace not supported on pre-Fermi architecture");
 #endif
         }else{
           cloverSigmaTrace<Float,Clover1,Clover2,Gauge>(arg);
         }
       }

       TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

       std::string paramString(const TuneParam &param) const { // Don't print the grid dim.
         std::stringstream ps;
         ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
         ps << "shared=" << param.shared_bytes;
         return ps.str();
       }

       long long flops() const { return 0; } // Fix this
       long long bytes() const { return 0; } // Fix this

     }; // CloverSigmaTrace


   /**
    * Packs the accessors and direction indices into a CloverTraceArg, runs the
    * computation on the default stream and blocks until the device is idle.
    *
    * A failure reported by the synchronisation (which is where asynchronous
    * kernel faults surface) is raised through errorQuda instead of being
    * silently dropped.
    *
    * @param clover1, clover2 clover accessors
    * @param gauge            accessor the result is written through
    * @param dir1, dir2       direction indices
    * @param meta             gauge field used for the tuning key
    * @param location         where the computation runs
    */
   template<typename Float, typename Clover1, typename Clover2, typename Gauge>
   void computeCloverSigmaTrace(Clover1 clover1, Clover2 clover2, Gauge gauge, int dir1, int dir2,
                                const GaugeField &meta, QudaFieldLocation location)
   {
     CloverTraceArg<Clover1, Clover2, Gauge> arg(clover1, clover2, gauge, dir1, dir2);
     CloverSigmaTrace<Float,Clover1,Clover2,Gauge> traceCompute(arg, meta, location);
     traceCompute.apply(0);

     cudaError_t syncStatus = cudaDeviceSynchronize();
     if(syncStatus != cudaSuccess){
       errorQuda("cloverSigmaTrace failed: %s", cudaGetErrorString(syncStatus));
     }
   }


   /**
    * Selects the accessor types matching the storage order of the clover and
    * gauge fields (and the gauge reconstruction type) and forwards to the
    * accessor-level computeCloverSigmaTrace.
    *
    * Supported combinations:
    *   clover FLOAT2 + gauge FLOAT2 : reconstruct NO or 12
    *   clover FLOAT2 + gauge FLOAT4 : reconstruct 12
    *   clover FLOAT4 + gauge FLOAT2 : reconstruct NO or 12
    *
    * Every other combination raises errorQuda.  Unknown gauge or clover
    * orders used to fall through without doing anything, leaving the output
    * field unwritten; they are now reported explicitly.
    */
   template<typename Float>
     void computeCloverSigmaTrace(GaugeField& gauge, const CloverField& clover, int dir1, int dir2,
         QudaFieldLocation location){

       if(clover.Order() == QUDA_FLOAT2_CLOVER_ORDER){
         if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
             computeCloverSigmaTrace<Float>(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
                                            CloverOrder::quda::FloatNOrder<Float,72,2>(clover,1),
                                            FloatNOrder<Float, 18, 2, 18>(gauge), dir1, dir2, gauge, location);
           }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeCloverSigmaTrace<Float>(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
                                            CloverOrder::quda::FloatNOrder<Float,72,2>(clover,1),
                                            FloatNOrder<Float, 18, 2, 12>(gauge), dir1, dir2, gauge, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }

         }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeCloverSigmaTrace<Float>(CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
                                            CloverOrder::quda::FloatNOrder<Float,72,2>(clover,1),
                                            FloatNOrder<Float,18,4,12>(gauge),  dir1, dir2, gauge, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }
         }else{
           // previously a silent no-op
           errorQuda("Gauge order %d not supported", gauge.Order());
         }
       }else if(clover.Order() == QUDA_FLOAT4_CLOVER_ORDER){
         if(gauge.Order() == QUDA_FLOAT2_GAUGE_ORDER){
           if(gauge.Reconstruct() == QUDA_RECONSTRUCT_NO){
             computeCloverSigmaTrace<Float>(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0),
                                            CloverOrder::quda::FloatNOrder<Float,72,4>(clover,1),
                                            FloatNOrder<Float,18,2,18>(gauge),  dir1, dir2, gauge, location);
           }else if(gauge.Reconstruct() == QUDA_RECONSTRUCT_12){
             computeCloverSigmaTrace<Float>(CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0),
                                            CloverOrder::quda::FloatNOrder<Float,72,4>(clover,1),
                                            FloatNOrder<Float,18,2,12>(gauge),  dir1, dir2, gauge, location);
           }else{
             errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
           }
         }else if(gauge.Order() == QUDA_FLOAT4_GAUGE_ORDER){
           // no reconstruction type is handled for this order combination
           errorQuda("Reconstruction type %d not supported",gauge.Reconstruct());
         }else{
           // previously a silent no-op
           errorQuda("Gauge order %d not supported", gauge.Order());
         }
       }else{
         // previously a silent no-op
         errorQuda("Clover order %d not supported", clover.Order());
       } // clover order
     }


 #endif


   /**
    * Public entry point: dispatches on the precision of the clover field to
    * the single- or double-precision implementation.  Half precision and any
    * other precision are rejected through errorQuda, as is a build without
    * GPU_CLOVER_DIRAC.
    *
    * @param gauge      field that receives the result
    * @param clover     clover field that is read
    * @param dir1, dir2 direction indices
    * @param location   where the computation runs
    */
   void computeCloverSigmaTrace(GaugeField& gauge, const CloverField& clover, int dir1, int dir2,
       QudaFieldLocation location){

 #ifndef GPU_CLOVER_DIRAC
     errorQuda("Clover has not been built");
 #else
     const QudaPrecision prec = clover.Precision();

     if(prec == QUDA_HALF_PRECISION){
       errorQuda("Half precision not supported\n");
     }

     switch(prec){
       case QUDA_SINGLE_PRECISION:
         computeCloverSigmaTrace<float>(gauge, clover, dir1, dir2, location);
         break;
       case QUDA_DOUBLE_PRECISION:
         computeCloverSigmaTrace<double>(gauge, clover, dir1, dir2, location);
         break;
       default:
         errorQuda("Precision %d not supported", prec);
         break;
     }
 #endif

   }


 } // namespace quda

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:55

quda::TuneParam
Definition: tune_quda.h:16

quda::setZero
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:640

conj
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
Definition: hisq_force_reference2.cpp:231

errorQuda
#define errorQuda(...)
Definition: util_quda.h:73

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:271

QUDA_HALF_PRECISION
Definition: enum_quda.h:48

quda::CloverField
Definition: clover_field.h:32

quda::Complex
std::complex< double > Complex
Definition: eig_variables.h:13

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:30

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:816

testing::internal::string
::std::string string
Definition: gtest.h:1979

mat
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
Definition: staggered_dslash_reference.cpp:136

quda::complex::imag
__host__ __device__ ValueType imag() const volatile

quda::TuneParam::shared_bytes
int shared_bytes
Definition: tune_quda.h:21

clover_field_order.h

quda::ComplexTypeId
Definition: quda_matrix.h:19

param
QudaGaugeParam param
Definition: pack_test.cpp:17

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:176

QUDA_FLOAT2_CLOVER_ORDER
Definition: enum_quda.h:206

quda::Tunable
Definition: tune_quda.h:40

tmp
cudaColorSpinorField * tmp
Definition: staggered_dslash_test.cpp:48

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:56

location
const QudaFieldLocation location
Definition: pack_test.cpp:46

testing::internal::Float
FloatingPoint< float > Float
Definition: gtest.h:7350

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:19

quda::CloverField::Order
QudaCloverFieldOrder Order() const
Definition: clover_field.h:66

quda::Matrix::data
T data[N *N]
Definition: quda_matrix.h:351

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168

quda::FloatNOrder
Definition: clover_field_order.h:56

quda::complex::real
__host__ __device__ ValueType real() const volatile

gauge_field_order.h

QUDA_FLOAT4_CLOVER_ORDER
Definition: enum_quda.h:207

quda::computeCloverSigmaTrace
void computeCloverSigmaTrace(GaugeField &gauge, const CloverField &clover, int dir1, int dir2, QudaFieldLocation location)
Definition: clover_trace_quda.cu:310

clover_field.h

x
int x[4]
Definition: hisq_paths_force_core.h:99

tune_quda.h

quda_matrix.h

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:50

QudaFieldLocation
enum QudaFieldLocation_s QudaFieldLocation

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:342

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:49

QUDA_FLOAT4_GAUGE_ORDER
Definition: enum_quda.h:31

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843

quda::complex
Definition: complex_quda.h:119

parity
const QudaParity parity
Definition: dslash_test.cpp:29

gauge_field.h

gauge
void * gauge[4]
Definition: su3_test.cpp:15

quda::Matrix
Definition: quda_matrix.h:348

quda::TuneKey
Definition: tune_key.h:8

quda_internal.h

quda::GaugeField
Definition: gauge_field.h:118