quda-ref/v0.7.0/extract__gauge__ghost_8cu_source.html

 #include <gauge_field_order.h>


 namespace quda {

   /**
      Argument bundle handed (by value) to the ghost-extraction routines.

      All per-dimension arrays are indexed by the dimension whose face is
      being extracted.  The constructor copies the caller's int arrays into
      the narrower member types below, so values are implicitly narrowed to
      unsigned short / unsigned char / bool on construction.
   */
   template <typename Order, int nDim>
   struct ExtractGhostArg {
     Order order;                // accessor used to load() bulk elements and saveGhost() them
     const unsigned char nFace;  // number of trailing slices extracted in each dimension
     unsigned short X[nDim];     // extent along each dimension; slices X[dim]-nFace .. X[dim]-1 are extracted
     unsigned short A[nDim];     // trip count of the outermost surface loop (index a)
     unsigned short B[nDim];     // trip count of the middle surface loop (index b)
     unsigned short C[nDim];     // trip count of the innermost surface loop (index c)
     int f[nDim][nDim];          // f[dim][0..3]: multipliers applied to (a, b, c, d) to form the linear site index
     bool localParity[nDim];     // added (mod 2) to the parity used when writing the ghost element

     ExtractGhostArg(const Order &order, int nFace, const int *X_, const int *A_,
                     const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_)
       : order(order), nFace(nFace)
     {
       // per-dimension scalars
       for (int mu = 0; mu < nDim; ++mu) {
         X[mu] = X_[mu];
         A[mu] = A_[mu];
         B[mu] = B_[mu];
         C[mu] = C_[mu];
         localParity[mu] = localParity_[mu];
       }

       // index multiplier table
       for (int mu = 0; mu < nDim; ++mu) {
         for (int nu = 0; nu < nDim; ++nu) {
           f[mu][nu] = f_[mu][nu];
         }
       }
     }
   };


   /**
      Host-side ghost extraction.

      For each parity and each dimension, walks the last nFace slices of
      that dimension, loads every site of matching parity from the bulk via
      arg.order.load() and appends it to the ghost buffer via
      arg.order.saveGhost().  The number of elements written per
      (parity, dim) is asserted to equal arg.order.faceVolumeCB[dim].

      The explicit use of f[dim][0..3] and four nested loops means this
      routine is specialized for four dimensions.
   */
   template <typename Float, int length, int nDim, typename Order>
   void extractGhost(ExtractGhostArg<Order,nDim> arg) {
     typedef typename mapper<Float>::type RegType;

     for (int parity=0; parity<2; parity++) {
       for (int dim=0; dim<nDim; dim++) {

         const int *mult = arg.f[dim];                 // index multipliers for (a, b, c, d)
         const int dEnd = arg.X[dim];                  // one past the last slice
         const int dBegin = arg.X[dim] - arg.nFace;    // first of the last nFace slices
         const int ghostParity = (parity + arg.localParity[dim]) & 1;

         // running linear index used for writing into the ghost buffer
         int nWritten = 0;

         // FIXME redefine a, b, c, d such that we always optimize for locality
         for (int d=dBegin; d<dEnd; d++) {             // slices of this dimension
           for (int a=0; a<arg.A[dim]; a++) {          // surface elements of this face
             for (int b=0; b<arg.B[dim]; b++) {
               for (int c=0; c<arg.C[dim]; c++) {

                 // only handle sites belonging to the parity currently being processed
                 if (((a+b+c+d) & 1) != parity) continue;

                 // checkerboarded spacetime index of the site in the bulk
                 const int indexCB = (a*mult[0] + b*mult[1] + c*mult[2] + d*mult[3]) >> 1;

                 RegType u[length];
                 arg.order.load(u, indexCB, dim, parity);              // read from the bulk
                 arg.order.saveGhost(u, nWritten, dim, ghostParity);   // append to the ghost buffer
                 ++nWritten;

               } // c
             } // b
           } // a
         } // d

         //assert(indexDst == arg.nFace*arg.surfaceCB[dim]);
         assert(nWritten == arg.order.faceVolumeCB[dim]);

       } // dim
     } // parity
   }


   /**
      Device-side ghost extraction.

      Launch layout: one-dimensional grid and block; each thread derives a
      single flat index from blockIdx.x, blockDim.x and threadIdx.x and
      reuses it for every (parity, dim) pair.  Threads whose index is not
      below 2*arg.order.faceVolumeCB[dim] do nothing for that dimension.
      No shared memory is used by this kernel.

      The flat index is decomposed as ((d * A + a) * B + b) * C + c, and the
      ghost element is written at position (flat index >> 1).
   */
   template <typename Float, int length, int nDim, typename Order>
   __global__ void extractGhostKernel(ExtractGhostArg<Order,nDim> arg) {
     typedef typename mapper<Float>::type RegType;

     // flat index over both parities of a face; identical for every loop iteration below
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;

     for (int parity=0; parity<2; parity++) {
       for (int dim=0; dim<nDim; dim++) {

         //if (X >= 2*arg.nFace*arg.surfaceCB[dim]) continue;
         if (tid < 2*arg.order.faceVolumeCB[dim]) {

           // peel the coordinates off the flat index: tid = ((d * A + a)*B + b)*C + c
           int rest = tid;
           const int c = rest % arg.C[dim];
           rest /= arg.C[dim];
           const int b = rest % arg.B[dim];
           rest /= arg.B[dim];
           const int a = rest % arg.A[dim];
           // shift d so that it addresses the last nFace slices of this dimension
           const int d = rest / arg.A[dim] + (arg.X[dim] - arg.nFace);

           // we only do the extraction for the parity we are currently working on
           if (((a+b+c+d) & 1) == parity) {
             // checkerboarded spacetime index of the site in the bulk
             const int indexCB =
               (a*arg.f[dim][0] + b*arg.f[dim][1] + c*arg.f[dim][2] + d*arg.f[dim][3]) >> 1;

             RegType u[length];
             arg.order.load(u, indexCB, dim, parity); // load the ghost element from the bulk
             arg.order.saveGhost(u, tid>>1, dim, (parity+arg.localParity[dim])&1);
           }

         }

       } // dim
     } // parity
   }


   /**
      Tunable launcher for extractGhostKernel.

      Holds a copy of the kernel argument and a reference to the gauge field
      (used only for the volume string in the tune key).  The minimum thread
      count reported to the tuner is twice the largest faceVolumeCB over all
      dimensions, so that a single launch covers both parities of the
      largest face.
   */
   template <typename Float, int length, int nDim, typename Order>
   class ExtractGhost : Tunable {
     ExtractGhostArg<Order,nDim> arg;
     int size;                 // minimum number of threads: 2 * max_d faceVolumeCB[d]
     const GaugeField &meta;   // referenced, not owned; must outlive this object

   private:
     // the kernel uses no shared memory
     unsigned int sharedBytesPerThread() const { return 0; }
     unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}

     bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
     unsigned int minThreads() const { return size; }

   public:
     /**
        @param arg  kernel argument; copied into this object
        @param meta gauge field supplying the volume string for the tune key
     */
     ExtractGhost(ExtractGhostArg<Order,nDim> &arg, const GaugeField &meta) : arg(arg), meta(meta) {
       int faceMax = 0;
       for (int d=0; d<nDim; d++)
         faceMax = (arg.order.faceVolumeCB[d] > faceMax )
           ? arg.order.faceVolumeCB[d] : faceMax;
       size = 2 * faceMax; // factor of 2 comes from parity

       writeAuxString("stride=%d", arg.order.stride);
     }

     virtual ~ExtractGhost() { ; }

     /**
        Launch the extraction kernel on the given stream using the launch
        parameters returned by tuneLaunch().  On pre-Fermi builds this
        reports an error instead.
     */
     void apply(const cudaStream_t &stream) {
 #if (__COMPUTE_CAPABILITY__ >= 200)
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       extractGhostKernel<Float, length, nDim, Order>
         <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
 #else
       errorQuda("extractGhost not supported on pre-Fermi architecture");
 #endif
     }

     TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

     std::string paramString(const TuneParam &param) const { // Don't bother printing the grid dim.
       std::stringstream ps;
       ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
       ps << "shared=" << param.shared_bytes;
       return ps.str();
     }

     long long flops() const { return 0; }

     /**
        Bytes moved by one extraction: parity * sites * i/o * vec size.
        The accumulation and product are carried out in long long so the
        result cannot wrap in a narrower intermediate type before being
        widened to the return type.
     */
     long long bytes() const {
       long long sites = 0;
       for (int d=0; d<nDim; d++) sites += arg.order.faceVolumeCB[d];
       return 2 * sites * 2 * static_cast<long long>(arg.order.Bytes());
     }
   };


   /**
      Build the ExtractGhostArg for a four-dimensional gauge field and run
      the extraction on the host or on the device.

      @param order    accessor used to read the bulk and write the ghost
      @param u        gauge field supplying the dimensions and nFace
      @param location QUDA_CPU_FIELD_LOCATION selects the host loop; any
                      other value selects the tuned kernel launch
   */
   template <typename Float, int length, typename Order>
   void extractGhost(Order order, const GaugeField &u, QudaFieldLocation location) {
     const int *X = u.X();
     const int nFace = u.Nface();
     const int nDim = 4;

     // products of extents shared by several entries of the multiplier table
     const int X01 = X[0]*X[1];
     const int X012 = X01*X[2];

     // loop variables: a, b, c with a the most significant and c the least significant
     // A, B, C hold the corresponding trip counts for each face;
     // we need to loop in d as well, d taking the last nFace values of that dimension
     //               X face  Y face  Z face  T face
     int A[nDim] = { X[3],   X[3],   X[3],   X[2] };
     int B[nDim] = { X[2],   X[2],   X[1],   X[1] };
     int C[nDim] = { X[1],   X[0],   X[0],   X[0] };

     // multiplication factors applied to (a, b, c, d) to compute the index in the original memory
     int f[nDim][nDim] = {
       { X012, X01,  X[0], 1    },   // X face
       { X012, X01,  1,    X[0] },   // Y face
       { X012, X[0], 1,    X01  },   // Z face
       { X01,  X[0], 1,    X012 }    // T face
     };

     // set the local processor parity:
     // switch odd and even ghost gauge only when X[dim] is odd and
     // commDim(dim) is greater than 1
     // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
     int localParity[nDim];
     for (int dim=0; dim<nDim; dim++) {
       //localParity[dim] = (X[dim]%2==0 || commDim(dim)) ? 0 : 1;
       if ((X[dim] % 2 == 1) && (commDim(dim) > 1)) {
         localParity[dim] = 1;
       } else {
         localParity[dim] = 0;
       }
     }

     ExtractGhostArg<Order, nDim> arg(order, nFace, X, A, B, C, f, localParity);

     if (location != QUDA_CPU_FIELD_LOCATION) {
       // device path: tuned kernel launch on stream 0
       ExtractGhost<Float,length,nDim,Order> extract(arg, u);
       extract.apply(0);
     } else {
       // host path
       extractGhost<Float,length,nDim,Order>(arg);
     }
   }


   /**
      Select the accessor matching the field's order / reconstruct type and
      forward to the templated extraction.

      The location is taken to be CUDA when the dynamic type of u is
      cudaGaugeField and CPU otherwise.

      NOTE(review): for QUDA_FLOAT2_GAUGE_ORDER and QUDA_FLOAT4_GAUGE_ORDER a
      reconstruct type other than NO / 12 / 8 / 13 / 9 results in no
      extraction and no error; this matches the existing behaviour - confirm
      whether that is intended.

      @param u     gauge field whose ghost zone is extracted
      @param Ghost destination ghost buffers, passed through to the accessor
   */
   template <typename Float>
   void extractGhost(const GaugeField &u, Float **Ghost) {

     const int length = 18;

     const QudaFieldLocation location =
       (typeid(u)==typeid(cudaGaugeField)) ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION;

     switch (u.Order()) {

     case QUDA_FLOAT2_GAUGE_ORDER:
       switch (u.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO:
         // half-precision fat links use the 19-element variant
         if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
           extractGhost<Float,length>(FloatNOrder<Float,length,2,19>(u, 0, Ghost), u, location);
         } else {
           extractGhost<Float,length>(FloatNOrder<Float,length,2,18>(u, 0, Ghost), u, location);
         }
         break;
       case QUDA_RECONSTRUCT_12:
         extractGhost<Float,length>(FloatNOrder<Float,length,2,12>(u, 0, Ghost), u, location);
         break;
       case QUDA_RECONSTRUCT_8:
         extractGhost<Float,length>(FloatNOrder<Float,length,2,8>(u, 0, Ghost), u, location);
         break;
       case QUDA_RECONSTRUCT_13:
         extractGhost<Float,length>(FloatNOrder<Float,length,2,13>(u, 0, Ghost), u, location);
         break;
       case QUDA_RECONSTRUCT_9:
         extractGhost<Float,length>(FloatNOrder<Float,length,2,9>(u, 0, Ghost), u, location);
         break;
       default:
         break; // any other reconstruct type: nothing is done
       }
       break;

     case QUDA_FLOAT4_GAUGE_ORDER:
       switch (u.Reconstruct()) {
       case QUDA_RECONSTRUCT_NO:
         // half-precision fat links use the 19-element variant
         if (typeid(Float)==typeid(short) && u.LinkType() == QUDA_ASQTAD_FAT_LINKS) {
           extractGhost<Float,length>(FloatNOrder<Float,length,1,19>(u, 0, Ghost), u, location);
         } else {
           extractGhost<Float,length>(FloatNOrder<Float,length,1,18>(u, 0, Ghost), u, location);
         }
         break;
       case QUDA_RECONSTRUCT_12:
         extractGhost<Float,length>(FloatNOrder<Float,length,4,12>(u, 0, Ghost), u, location);
         break;
       case QUDA_RECONSTRUCT_8:
         extractGhost<Float,length>(FloatNOrder<Float,length,4,8>(u, 0, Ghost), u, location);
         break;
       case QUDA_RECONSTRUCT_13:
         extractGhost<Float,length>(FloatNOrder<Float,length,4,13>(u, 0, Ghost), u, location);
         break;
       case QUDA_RECONSTRUCT_9:
         extractGhost<Float,length>(FloatNOrder<Float,length,4,9>(u, 0, Ghost), u, location);
         break;
       default:
         break; // any other reconstruct type: nothing is done
       }
       break;

     case QUDA_QDP_GAUGE_ORDER:
 #ifdef BUILD_QDP_INTERFACE
       extractGhost<Float,length>(QDPOrder<Float,length>(u, 0, Ghost), u, location);
 #else
       errorQuda("QDP interface has not been built\n");
 #endif
       break;

     case QUDA_QDPJIT_GAUGE_ORDER:
 #ifdef BUILD_QDPJIT_INTERFACE
       extractGhost<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost), u, location);
 #else
       errorQuda("QDPJIT interface has not been built\n");
 #endif
       break;

     case QUDA_CPS_WILSON_GAUGE_ORDER:
 #ifdef BUILD_CPS_INTERFACE
       extractGhost<Float,length>(CPSOrder<Float,length>(u, 0, Ghost), u, location);
 #else
       errorQuda("CPS interface has not been built\n");
 #endif
       break;

     case QUDA_MILC_GAUGE_ORDER:
 #ifdef BUILD_MILC_INTERFACE
       extractGhost<Float,length>(MILCOrder<Float,length>(u, 0, Ghost), u, location);
 #else
       errorQuda("MILC interface has not been built\n");
 #endif
       break;

     case QUDA_BQCD_GAUGE_ORDER:
 #ifdef BUILD_BQCD_INTERFACE
       extractGhost<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost), u, location);
 #else
       errorQuda("BQCD interface has not been built\n");
 #endif
       break;

     case QUDA_TIFR_GAUGE_ORDER:
 #ifdef BUILD_TIFR_INTERFACE
       extractGhost<Float,length>(TIFROrder<Float,length>(u, 0, Ghost), u, location);
 #else
       errorQuda("TIFR interface has not been built\n");
 #endif
       break;

     default:
       errorQuda("Gauge field %d order not supported", u.Order());
       break;
     }

   }


   /**
      Public entry point: extract the ghost zone of a gauge field.

      Dispatches on the field's precision, reinterpreting the untyped ghost
      pointer array as double**, float** or short** accordingly; any other
      precision is reported through errorQuda.  On builds with
      __COMPUTE_CAPABILITY__ below 200, reconstruct types 9 and 13 are
      rejected up front.

      @param u     gauge field whose ghost zone is extracted
      @param ghost array of destination ghost buffers
   */
   void extractGaugeGhost(const GaugeField &u, void **ghost) {

 #if __COMPUTE_CAPABILITY__ < 200
     if (u.Reconstruct() == QUDA_RECONSTRUCT_13 || u.Reconstruct() == QUDA_RECONSTRUCT_9)
       errorQuda("Reconstruct 9/13 not supported on pre-Fermi architecture");
 #endif

     switch (u.Precision()) {
     case QUDA_DOUBLE_PRECISION:
       extractGhost(u, (double**)ghost);
       break;
     case QUDA_SINGLE_PRECISION:
       extractGhost(u, (float**)ghost);
       break;
     case QUDA_HALF_PRECISION:
       extractGhost(u, (short**)ghost);
       break;
     default:
       errorQuda("Unknown precision type %d", u.Precision());
       break;
     }
   }


 } // namespace quda

commDim
int commDim(int)
Definition: face_buffer.cpp:535

QUDA_RECONSTRUCT_NO
Definition: enum_quda.h:55

quda::QDPOrder
Definition: clover_field_order.h:134

quda::ExtractGhost
Definition: extract_gauge_ghost.cu:114

quda::TuneParam
Definition: tune_quda.h:16

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

quda::MILCOrder
Definition: gauge_field_order.h:752

quda::ExtractGhost::bytes
long long bytes() const
Definition: extract_gauge_ghost.cu:159

quda::ExtractGhostArg::order
Order order
Definition: extract_gauge_ghost.cu:6

errorQuda
#define errorQuda(...)
Definition: util_quda.h:73

QUDA_BQCD_GAUGE_ORDER
Definition: enum_quda.h:36

quda::LatticeField::X
const int * X() const
Definition: lattice_field.h:162

QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:271

quda::extractGaugeGhost
void extractGaugeGhost(const GaugeField &u, void **ghost)
Definition: extract_gauge_ghost.cu:307

QUDA_HALF_PRECISION
Definition: enum_quda.h:48

QUDA_QDP_GAUGE_ORDER
Definition: enum_quda.h:32

quda::GaugeField::Order
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:169

QUDA_FLOAT2_GAUGE_ORDER
Definition: enum_quda.h:30

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:816

testing::internal::string
::std::string string
Definition: gtest.h:1979

quda::TuneParam::shared_bytes
int shared_bytes
Definition: tune_quda.h:21

quda::ExtractGhostArg::f
int f[nDim][nDim]
Definition: extract_gauge_ghost.cu:12

quda::ExtractGhost::paramString
std::string paramString(const TuneParam &param) const
Definition: extract_gauge_ghost.cu:151

quda::GaugeField::Nface
int Nface() const
Definition: gauge_field.h:193

quda::extractGhost
void extractGhost(ExtractGhostArg< Order, nDim > arg)
Definition: extract_gauge_ghost.cu:33

length
int length[]
Definition: gauge_force_test.cpp:41

quda::TuneParam::grid
dim3 grid
Definition: tune_quda.h:20

quda::BQCDOrder
Definition: clover_field_order.h:242

param
QudaGaugeParam param
Definition: pack_test.cpp:17

QUDA_RECONSTRUCT_9
Definition: enum_quda.h:58

quda::LatticeField::Precision
QudaPrecision Precision() const
Definition: lattice_field.h:176

quda::ExtractGhost::apply
void apply(const cudaStream_t &stream)
Definition: extract_gauge_ghost.cu:139

quda::Tunable
Definition: tune_quda.h:40

quda::Tunable::writeAuxString
void writeAuxString(const char *format,...)
Definition: tune_quda.h:138

QUDA_RECONSTRUCT_12
Definition: enum_quda.h:56

location
const QudaFieldLocation location
Definition: pack_test.cpp:46

quda::QDPJITOrder
Definition: clover_field_order.h:171

testing::internal::Float
FloatingPoint< float > Float
Definition: gtest.h:7350

quda::TuneParam::block
dim3 block
Definition: tune_quda.h:19

quda::ExtractGhostArg::B
unsigned short B[nDim]
Definition: extract_gauge_ghost.cu:10

QUDA_MILC_GAUGE_ORDER
Definition: enum_quda.h:35

quda::ExtractGhostArg::nFace
const unsigned char nFace
Definition: extract_gauge_ghost.cu:7

quda::ExtractGhost::~ExtractGhost
virtual ~ExtractGhost()
Definition: extract_gauge_ghost.cu:137

quda::ExtractGhostArg::A
unsigned short A[nDim]
Definition: extract_gauge_ghost.cu:9

dim
int dim
Definition: tm_ndeg_fused_exterior_dslash_core.h:195

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271

quda::ExtractGhostArg::localParity
bool localParity[nDim]
Definition: extract_gauge_ghost.cu:13

quda::GaugeField::Reconstruct
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168

quda::FloatNOrder
Definition: clover_field_order.h:56

quda::cudaGaugeField
Definition: gauge_field.h:216

QUDA_CPS_WILSON_GAUGE_ORDER
Definition: enum_quda.h:34

quda::ExtractGhostArg
Definition: extract_gauge_ghost.cu:5

gauge_field_order.h

quda::LatticeField::VolString
const char * VolString() const
Definition: lattice_field.h:199

quda::ExtractGhostArg::ExtractGhostArg
ExtractGhostArg(const Order &order, int nFace, const int *X_, const int *A_, const int *B_, const int *C_, const int f_[nDim][nDim], const int *localParity_)
Definition: extract_gauge_ghost.cu:14

quda::TIFROrder
Definition: gauge_field_order.h:879

quda::extractGhostKernel
__global__ void extractGhostKernel(ExtractGhostArg< Order, nDim > arg)
Definition: extract_gauge_ghost.cu:78

QUDA_TIFR_GAUGE_ORDER
Definition: enum_quda.h:37

X
int X
Definition: asym_wilson_clover_dslash_dagger_fermi_core.h:394

quda::ExtractGhost::tuneKey
TuneKey tuneKey() const
Definition: extract_gauge_ghost.cu:149

QUDA_RECONSTRUCT_8
Definition: enum_quda.h:57

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:50

QudaFieldLocation
enum QudaFieldLocation_s QudaFieldLocation

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:49

quda::mapper
Definition: register_traits.h:16

quda::GaugeField::LinkType
QudaLinkType LinkType() const
Definition: gauge_field.h:174

QUDA_FLOAT4_GAUGE_ORDER
Definition: enum_quda.h:31

QUDA_QDPJIT_GAUGE_ORDER
Definition: enum_quda.h:33

quda::ExtractGhost::flops
long long flops() const
Definition: extract_gauge_ghost.cu:158

quda::arg
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843

QUDA_RECONSTRUCT_13
Definition: enum_quda.h:59

quda::ExtractGhostArg::X
unsigned short X[nDim]
Definition: extract_gauge_ghost.cu:8

quda::ExtractGhostArg::C
unsigned short C[nDim]
Definition: extract_gauge_ghost.cu:11

getTuning
QudaTune getTuning()
Definition: util_quda.cpp:32

quda::CPSOrder
Definition: gauge_field_order.h:784

QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:270

QUDA_ASQTAD_FAT_LINKS
Definition: enum_quda.h:21

parity
const QudaParity parity
Definition: dslash_test.cpp:29

quda::Tunable::aux
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136

quda::TuneKey
Definition: tune_key.h:8

quda::ExtractGhost::ExtractGhost
ExtractGhost(ExtractGhostArg< Order, nDim > &arg, const GaugeField &meta)
Definition: extract_gauge_ghost.cu:127

quda::GaugeField
Definition: gauge_field.h:118