template <typename Float_, int nColor_, int nSpin_, bool spin_project_ = true>
struct PackArg {

  typedef Float_ Float;
  typedef typename mapper<Float>::type real;

  static constexpr int nColor = nColor_;
  static constexpr int nSpin = nSpin_;

  static constexpr bool spin_project = (nSpin == 4 && spin_project_ ? true : false);
  static constexpr bool spinor_direct_load = false; // false means texture load

  typedef typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load>::type F;

  const F in; // field we are packing

  const int nFace;
  const bool dagger;
  const int parity;        // only used for single-parity fields
  const int nParity;       // number of parities we are working on
  const int threads;       // number of active threads
  const DslashConstant dc; // pre-computed dslash constants for optimized indexing

  const real a, b, c; // twisted-mass scaling, chiral-twist and flavor-twist factors
  const int twist;    // 0 = no twist, 1 = twisted singlet, 2 = twisted doublet

  int threadDimMapLower[4]; // first thread index assigned to dimension i
  int threadDimMapUpper[4]; // one past the last thread index assigned to dimension i

  int_fastdiv blocks_per_dir; // number of blocks per (dimension, direction) pair
  int dim_map[4];             // compacted map from block index to partitioned dimension
  int sites_per_block;        // number of face sites handled by each block

  PackArg(void **ghost, const ColorSpinorField &in, int nFace, bool dagger, int parity, int threads, double a,
          double b, double c) :
    in(in, nFace, nullptr, nullptr, reinterpret_cast<Float **>(ghost)),
    nFace(nFace),
    dagger(dagger),
    parity(parity),
    nParity(in.SiteSubset()),
    threads(threads),
    dc(in.getDslashConstant()),
    a(a),
    b(b),
    c(c),
    twist((a != 0.0 && b != 0.0) ? (c != 0.0 ? 2 : 1) : 0)
  {
    int prev = -1; // previous dimension that was partitioned
    for (int i = 0; i < 4; i++) {
      threadDimMapLower[i] = 0;
      threadDimMapUpper[i] = 0;
      if (!commDim[i]) continue;
      threadDimMapLower[i] = (prev >= 0 ? threadDimMapUpper[prev] : 0);
      threadDimMapUpper[i] = threadDimMapLower[i] + 2 * nFace * dc.ghostFaceCB[i];
      prev = i;
    }
  }
};
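// Worked example (illustration, not part of the original file): suppose only
// dimensions 1 and 3 are partitioned, with nFace = 1 and
// dc.ghostFaceCB = {16, 16, 16, 16}. The loop above then produces
//
//   threadDimMapLower = {0,  0, 0, 32}
//   threadDimMapUpper = {0, 32, 0, 64}
//
// i.e. threads [0, 32) pack the two faces of dimension 1 and threads [32, 64)
// pack the two faces of dimension 3; dimFromFaceIndex() inverts this map to
// recover (dimension, face index) from a flat thread index.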
template <bool dagger, int twist, int dim, QudaPCType pc, typename Arg>
__device__ __host__ void pack(Arg &arg, int ghost_idx, int s, int parity)
{
  typedef typename mapper<typename Arg::Float>::type real;
  typedef ColorSpinor<real, Arg::nColor, Arg::nSpin> Vector;
  constexpr int nFace = 1;

  constexpr int nDim = pc; // QUDA_4D_PC -> 4, QUDA_5D_PC -> 5

  // for 5-d preconditioning the face spans the fifth dimension as well
  const int face_size = nFace * arg.dc.ghostFaceCB[dim] * (pc == QUDA_5D_PC ? arg.dc.Ls : 1);

  int spinor_parity = (arg.nParity == 2) ? parity : 0;

  // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
  const int face_num = (ghost_idx >= face_size) ? 1 : 0;
  ghost_idx -= face_num * face_size;

  // remove const to ensure we have a non-const Ghost accessor
  typedef typename std::remove_const<decltype(arg.in)>::type T;
  T &in = const_cast<T &>(arg.in);

  if (face_num == 0) { // backwards

    int idx = indexFromFaceIndex<nDim, pc, dim, nFace, 0>(ghost_idx, parity, arg);
    constexpr int proj_dir = dagger ? +1 : -1;
    Vector f = arg.in(idx + s * arg.dc.volume_4d_cb, spinor_parity);
    if (twist == 1) {
      f = arg.a * (f + arg.b * f.igamma(4));
    } else if (twist == 2) {
      Vector f1 = arg.in(idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor
      if (s == 0)
        f = arg.a * (f + arg.b * f.igamma(4) + arg.c * f1);
      else
        f = arg.a * (f - arg.b * f.igamma(4) + arg.c * f1);
    }
    if (arg.spin_project) {
      in.Ghost(dim, 0, ghost_idx + s * arg.dc.ghostFaceCB[dim], spinor_parity) = f.project(dim, proj_dir);
    } else {
      in.Ghost(dim, 0, ghost_idx + s * arg.dc.ghostFaceCB[dim], spinor_parity) = f;
    }

  } else { // forwards

    int idx = indexFromFaceIndex<nDim, pc, dim, nFace, 1>(ghost_idx, parity, arg);
    constexpr int proj_dir = dagger ? -1 : +1;
    Vector f = arg.in(idx + s * arg.dc.volume_4d_cb, spinor_parity);
    if (twist == 1) {
      f = arg.a * (f + arg.b * f.igamma(4));
    } else if (twist == 2) {
      Vector f1 = arg.in(idx + (1 - s) * arg.dc.volume_4d_cb, spinor_parity); // load other flavor
      if (s == 0)
        f = arg.a * (f + arg.b * f.igamma(4) + arg.c * f1);
      else
        f = arg.a * (f - arg.b * f.igamma(4) + arg.c * f1);
    }
    if (arg.spin_project) {
      in.Ghost(dim, 1, ghost_idx + s * arg.dc.ghostFaceCB[dim], spinor_parity) = f.project(dim, proj_dir);
    } else {
      in.Ghost(dim, 1, ghost_idx + s * arg.dc.ghostFaceCB[dim], spinor_parity) = f;
    }
  }
}
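// Worked example (illustration, not part of the original file): for the
// twisted doublet (twist == 2) the transform applied above is, per flavor s,
//
//   f_s -> a * ( f_s + (-1)^s i b gamma_5 f_s + c f_{1-s} ),
//
// where igamma(4) multiplies by i gamma_5 and the c term couples the two
// flavors; for the twisted singlet (twist == 1) the flavor-mixing term is
// absent. When spin_project is enabled only the half spinor obtained from
// the (1 +- gamma_dim) projector is stored, halving the communicated volume.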
template <int dim, int nFace = 1, typename Arg>
__device__ __host__ void packStaggered(Arg &arg, int ghost_idx, int s, int parity)
{
  typedef typename mapper<typename Arg::Float>::type real;
  typedef ColorSpinor<real, Arg::nColor, Arg::nSpin> Vector;

  int spinor_parity = (arg.nParity == 2) ? parity : 0;

  // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
  const int face_num = (ghost_idx >= nFace * arg.dc.ghostFaceCB[dim]) ? 1 : 0;
  ghost_idx -= face_num * nFace * arg.dc.ghostFaceCB[dim];

  // remove const to ensure we have a non-const Ghost accessor
  typedef typename std::remove_const<decltype(arg.in)>::type T;
  T &in = const_cast<T &>(arg.in);

  if (face_num == 0) { // backwards
    int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 0>(ghost_idx, parity, arg);
    Vector f = arg.in(idx + s * arg.dc.volume_4d_cb, spinor_parity);
    in.Ghost(dim, 0, ghost_idx + s * arg.dc.ghostFaceCB[dim], spinor_parity) = f;
  } else { // forwards
    int idx = indexFromFaceIndexStaggered<4, QUDA_4D_PC, dim, nFace, 1>(ghost_idx, parity, arg);
    Vector f = arg.in(idx + s * arg.dc.volume_4d_cb, spinor_parity);
    in.Ghost(dim, 1, ghost_idx + s * arg.dc.ghostFaceCB[dim], spinor_parity) = f;
  }
}
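// Note (illustration, not part of the original file): no spin projection is
// possible here since staggered fields have nSpin == 1. nFace = 1 packs the
// one-hop faces of the naive staggered operator, while nFace = 3 packs the
// three-hop faces needed by the improved-staggered (Naik) term, tripling the
// ghost-zone depth.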
template <bool dagger, int twist, QudaPCType pc, typename Arg>
__global__ void packKernel(Arg arg)
{
  const int sites_per_block = arg.sites_per_block;
  int local_tid = threadIdx.x;
  int tid = sites_per_block * blockIdx.x + local_tid;
  int s = blockDim.y * blockIdx.y + threadIdx.y;
  if (s >= arg.dc.Ls) return;

  // parity used for loads and stores; single-parity fields use arg.parity
  int parity = (arg.nParity == 2) ? blockDim.z * blockIdx.z + threadIdx.z : arg.parity;

  while (local_tid < sites_per_block && tid < arg.threads) {

    // determine which dimension we are packing; dimFromFaceIndex also
    // rescales ghost_idx to be relative to that dimension
    int ghost_idx;
    const int dim = dimFromFaceIndex(ghost_idx, tid, arg);

    if (pc == QUDA_5D_PC) { // 5-d checkerboarded field: fold s into the ghost index
      switch (dim) {
      case 0: pack<dagger, twist, 0, pc>(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity); break;
      case 1: pack<dagger, twist, 1, pc>(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity); break;
      case 2: pack<dagger, twist, 2, pc>(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity); break;
      case 3: pack<dagger, twist, 3, pc>(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity); break;
      }
    } else { // 4-d checkerboarded field: keep s separate
      switch (dim) {
      case 0: pack<dagger, twist, 0, pc>(arg, ghost_idx, s, parity); break;
      case 1: pack<dagger, twist, 1, pc>(arg, ghost_idx, s, parity); break;
      case 2: pack<dagger, twist, 2, pc>(arg, ghost_idx, s, parity); break;
      case 3: pack<dagger, twist, 3, pc>(arg, ghost_idx, s, parity); break;
      }
    }

    local_tid += blockDim.x;
    tid += blockDim.x;
  }
}
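// Minimal launch sketch (illustration only -- in QUDA the launch geometry is
// selected by the autotuner rather than hard-coded). The x grid dimension
// covers face sites, y the fifth dimension, z the parities:
//
//   dim3 block(64, 1, 1);
//   arg.sites_per_block = block.x; // hypothetical choice: one pass per thread
//   dim3 grid((arg.threads + arg.sites_per_block - 1) / arg.sites_per_block,
//             arg.dc.Ls, arg.nParity);
//   packKernel<false, 0, QUDA_4D_PC><<<grid, block>>>(arg);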
// shmem variant: each (dimension, direction) pair is statically assigned
// arg.blocks_per_dir blocks along grid x
template <bool dagger, int twist, QudaPCType pc, typename Arg>
__global__ void packShmemKernel(Arg arg)
{
  int local_block_idx = blockIdx.x % arg.blocks_per_dir;
  int dim_dir = blockIdx.x / arg.blocks_per_dir;
  int dir = dim_dir % 2;
  int dim = 0;
  switch (dim_dir / 2) {
  case 0: dim = arg.dim_map[0]; break;
  case 1: dim = arg.dim_map[1]; break;
  case 2: dim = arg.dim_map[2]; break;
  case 3: dim = arg.dim_map[3]; break;
  }

  int local_tid = local_block_idx * blockDim.x + threadIdx.x;

  int s = blockDim.y * blockIdx.y + threadIdx.y;
  if (s >= arg.dc.Ls) return;

  // parity used for loads and stores; single-parity fields use arg.parity
  int parity = (arg.nParity == 2) ? blockDim.z * blockIdx.z + threadIdx.z : arg.parity;

  switch (dim) {
  case 0:
    while (local_tid < arg.dc.ghostFaceCB[0]) {
      int ghost_idx = dir * arg.dc.ghostFaceCB[0] + local_tid;
      if (pc == QUDA_5D_PC)
        pack<dagger, twist, 0, pc>(arg, ghost_idx + s * arg.dc.ghostFace[0], 0, parity);
      else
        pack<dagger, twist, 0, pc>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  case 1:
    while (local_tid < arg.dc.ghostFaceCB[1]) {
      int ghost_idx = dir * arg.dc.ghostFaceCB[1] + local_tid;
      if (pc == QUDA_5D_PC)
        pack<dagger, twist, 1, pc>(arg, ghost_idx + s * arg.dc.ghostFace[1], 0, parity);
      else
        pack<dagger, twist, 1, pc>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  case 2:
    while (local_tid < arg.dc.ghostFaceCB[2]) {
      int ghost_idx = dir * arg.dc.ghostFaceCB[2] + local_tid;
      if (pc == QUDA_5D_PC)
        pack<dagger, twist, 2, pc>(arg, ghost_idx + s * arg.dc.ghostFace[2], 0, parity);
      else
        pack<dagger, twist, 2, pc>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  case 3:
    while (local_tid < arg.dc.ghostFaceCB[3]) {
      int ghost_idx = dir * arg.dc.ghostFaceCB[3] + local_tid;
      if (pc == QUDA_5D_PC)
        pack<dagger, twist, 3, pc>(arg, ghost_idx + s * arg.dc.ghostFace[3], 0, parity);
      else
        pack<dagger, twist, 3, pc>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  }
}
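// Grid-shape note (illustration, not part of the original file): with this
// decomposition grid.x must equal 2 * npartitioned * arg.blocks_per_dir,
// since each (dimension, direction) pair owns arg.blocks_per_dir consecutive
// x-blocks and arg.dim_map[] compacts the partitioned dimensions to the front.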
template <typename Arg>
__global__ void packStaggeredKernel(Arg arg)
{
  const int sites_per_block = arg.sites_per_block;
  int local_tid = threadIdx.x;
  int tid = sites_per_block * blockIdx.x + local_tid;
  int s = blockDim.y * blockIdx.y + threadIdx.y;
  if (s >= arg.dc.Ls) return;

  // parity used for loads and stores; single-parity fields use arg.parity
  int parity = (arg.nParity == 2) ? blockDim.z * blockIdx.z + threadIdx.z : arg.parity;

  while (local_tid < sites_per_block && tid < arg.threads) {

    // determine which dimension we are packing; dimFromFaceIndex also
    // rescales ghost_idx to be relative to that dimension
    int ghost_idx;
    const int dim = dimFromFaceIndex(ghost_idx, tid, arg);

    if (arg.nFace == 1) {
      switch (dim) {
      case 0: packStaggered<0, 1>(arg, ghost_idx, s, parity); break;
      case 1: packStaggered<1, 1>(arg, ghost_idx, s, parity); break;
      case 2: packStaggered<2, 1>(arg, ghost_idx, s, parity); break;
      case 3: packStaggered<3, 1>(arg, ghost_idx, s, parity); break;
      }
    } else if (arg.nFace == 3) {
      switch (dim) {
      case 0: packStaggered<0, 3>(arg, ghost_idx, s, parity); break;
      case 1: packStaggered<1, 3>(arg, ghost_idx, s, parity); break;
      case 2: packStaggered<2, 3>(arg, ghost_idx, s, parity); break;
      case 3: packStaggered<3, 3>(arg, ghost_idx, s, parity); break;
      }
    }

    local_tid += blockDim.x;
    tid += blockDim.x;
  }
}
// shmem variant of the staggered packer, blocked per (dimension, direction)
template <typename Arg>
__global__ void packStaggeredShmemKernel(Arg arg)
{
  int local_block_idx = blockIdx.x % arg.blocks_per_dir;
  int dim_dir = blockIdx.x / arg.blocks_per_dir;
  int dir = dim_dir % 2;
  int dim = 0;
  switch (dim_dir / 2) {
  case 0: dim = arg.dim_map[0]; break;
  case 1: dim = arg.dim_map[1]; break;
  case 2: dim = arg.dim_map[2]; break;
  case 3: dim = arg.dim_map[3]; break;
  }

  int local_tid = local_block_idx * blockDim.x + threadIdx.x;

  int s = blockDim.y * blockIdx.y + threadIdx.y;
  if (s >= arg.dc.Ls) return;

  // parity used for loads and stores; single-parity fields use arg.parity
  int parity = (arg.nParity == 2) ? blockDim.z * blockIdx.z + threadIdx.z : arg.parity;

  switch (dim) {
  case 0:
    while (local_tid < arg.nFace * arg.dc.ghostFaceCB[0]) {
      int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[0] + local_tid;
      if (arg.nFace == 1)
        packStaggered<0, 1>(arg, ghost_idx, s, parity);
      else
        packStaggered<0, 3>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  case 1:
    while (local_tid < arg.nFace * arg.dc.ghostFaceCB[1]) {
      int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[1] + local_tid;
      if (arg.nFace == 1)
        packStaggered<1, 1>(arg, ghost_idx, s, parity);
      else
        packStaggered<1, 3>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  case 2:
    while (local_tid < arg.nFace * arg.dc.ghostFaceCB[2]) {
      int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[2] + local_tid;
      if (arg.nFace == 1)
        packStaggered<2, 1>(arg, ghost_idx, s, parity);
      else
        packStaggered<2, 3>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  case 3:
    while (local_tid < arg.nFace * arg.dc.ghostFaceCB[3]) {
      int ghost_idx = dir * arg.nFace * arg.dc.ghostFaceCB[3] + local_tid;
      if (arg.nFace == 1)
        packStaggered<3, 1>(arg, ghost_idx, s, parity);
      else
        packStaggered<3, 3>(arg, ghost_idx, s, parity);
      local_tid += arg.blocks_per_dir * blockDim.x;
    }
    break;
  }
}
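// End-to-end sketch (illustration only; the real dispatch lives in QUDA's
// dslash framework and the values below are hypothetical):
//
//   using Arg = PackArg<float, 3, 4>; // single precision, 3 colors, 4 spins
//   Arg arg(ghost, in, 1 /*nFace*/, false /*dagger*/, 0 /*parity*/, threads,
//           0.0, 0.0, 0.0);           // a == b == 0 -> twist == 0
//   packKernel<false, 0, QUDA_4D_PC><<<grid, block>>>(arg);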