7 template <
typename Field>
24 volumeCB(a.VolumeCB()),
28 nParity(a.SiteSubset()),
32 X[0] = ((nParity == 1) ? 2 : 1) * a.
X(0);
33 for (
int d=1; d<
nDim; d++) X[d] = a.
X(d);
34 X[4] = (nDim == 5) ? a.
X(4) : 1;
35 for (
int i=0; i<4; i++) {
42 #define MAX_BLOCK_FLOAT_NC 32 47 template <
typename Float,
int Ns,
int Ms,
int Nc,
int Mc,
typename Arg>
48 __device__ __host__ __forceinline__ Float
compute_site_max(
Arg &
arg,
int x_cb,
int parity,
int spinor_parity,
int spin_block,
int color_block,
bool active) {
50 Float thread_max = 0.0;
51 Float site_max = active ? 0.0 : 1.0;
56 constexpr
int max_block_size = 1024;
57 constexpr
int bank_width = 32;
60 constexpr
int thread_width_x = ( (max_block_size / color_spin_threads + bank_width-1) / bank_width) * bank_width;
61 __shared__ Float v[ (Ns/Ms) * (Nc/Mc) * thread_width_x];
62 const auto &rhs = arg.field;
65 for (
int spin_local=0; spin_local<Ms; spin_local++) {
66 int s = spin_block + spin_local;
68 for (
int color_local=0; color_local<Mc; color_local++) {
69 int c = color_block + color_local;
70 complex<Float> z = rhs(spinor_parity, x_cb, s, c);
71 thread_max = thread_max > fabs(z.real()) ? thread_max : fabs(z.real());
72 thread_max = thread_max > fabs(z.imag()) ? thread_max : fabs(z.imag());
75 v[ ( (spin_block/Ms) * (Nc/Mc) + (color_block/Mc)) * blockDim.x + threadIdx.x ] = thread_max;
82 for (
int sc=0; sc<(Ns/Ms) * (Nc/Mc); sc++) {
83 site_max = site_max > v[sc*blockDim.x + threadIdx.x] ? site_max : v[sc*blockDim.x + threadIdx.x];
94 template <
typename Float,
bool block_
float,
int Ns,
int Ms,
int Nc,
int Mc,
int nDim,
int dim,
int dir,
typename Arg>
95 __device__ __host__ __forceinline__
void packGhost(
Arg &
arg,
int x_cb,
int parity,
int spinor_parity,
int spin_block,
int color_block) {
101 const auto &rhs = arg.field;
106 bool active = ( arg.commDim[dim] && ( (dir == 0 && x[dim] < arg.nFace) || (dir == 1 && x[dim] >= arg.X[dim] - arg.nFace) ) );
107 max = compute_site_max<Float,Ns,Ms,Nc,Mc>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block, active);
110 if (dir == 0 && arg.commDim[dim] && x[dim] < arg.nFace) {
111 for (
int spin_local=0; spin_local<Ms; spin_local++) {
112 int s = spin_block + spin_local;
113 for (
int color_local=0; color_local<Mc; color_local++) {
114 int c = color_block + color_local;
115 arg.field.Ghost(dim, 0, spinor_parity, ghostFaceIndex<0, nDim>(x, arg.X, dim, arg.nFace), s, c, 0, max)
116 = rhs(spinor_parity, x_cb, s, c);
121 if (dir == 1 && arg.commDim[dim] && x[dim] >= arg.X[dim] - arg.nFace) {
122 for (
int spin_local=0; spin_local<Ms; spin_local++) {
123 int s = spin_block + spin_local;
124 for (
int color_local=0; color_local<Mc; color_local++) {
125 int c = color_block + color_local;
126 arg.field.Ghost(dim, 1, spinor_parity, ghostFaceIndex<1, nDim>(x, arg.X, dim, arg.nFace), s, c, 0, max)
127 = rhs(spinor_parity, x_cb, s, c);
134 template <
typename Float,
bool block_
float,
int Ns,
int Ms,
int Nc,
int Mc,
int nDim,
typename Arg>
139 for (
int dim=0; dim<4; dim++)
140 for (
int dir=0; dir<2; dir++)
141 for (
int x_cb=0; x_cb<arg.
volumeCB; x_cb++)
142 for (
int spin_block=0; spin_block<Ns; spin_block+=Ms)
143 for (
int color_block=0; color_block<Nc; color_block+=Mc)
147 case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
148 case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
149 case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
150 case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
155 case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
156 case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
157 case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
158 case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
164 template <
typename Float,
bool block_
float,
int Ns,
int Ms,
int Nc,
int Mc,
int nDim,
int dim_threads,
typename Arg>
166 int x_cb = blockIdx.x*blockDim.x + threadIdx.x;
167 int spin_color_block = blockDim.y*blockIdx.y + threadIdx.y;
168 int parity_dim_dir = blockDim.z*blockIdx.z + threadIdx.z;
172 spin_color_block %= (Ns/Ms)*(Nc/Mc);
173 parity_dim_dir %= arg.nParity2dim_threads;
175 const int dim_dir = parity_dim_dir % (2*dim_threads);
176 const int dim0 = dim_dir / 2;
177 const int dir = dim_dir % 2;
178 const int parity = (arg.
nParity == 2) ? (parity_dim_dir / (2*dim_threads) ) : arg.parity;
180 const int spin_block = (spin_color_block / (Nc / Mc)) * Ms;
181 const int color_block = (spin_color_block % (Nc / Mc)) * Mc;
184 for (
int dim=0; dim<4; dim+=dim_threads) {
188 case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
189 case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
190 case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
191 case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,0>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
196 case 0: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,0,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
197 case 1: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,1,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
198 case 2: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,2,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
199 case 3: packGhost<Float,block_float,Ns,Ms,Nc,Mc,nDim,3,1>(
arg, x_cb,
parity, spinor_parity, spin_block, color_block);
break;
const int_fastdiv volumeCB
enum QudaPCType_s QudaPCType
__device__ __host__ __forceinline__ Float compute_site_max(Arg &arg, int x_cb, int parity, int spinor_parity, int spin_block, int color_block, bool active)
PackGhostArg(Field field, const ColorSpinorField &a, int parity, int nFace, int dagger)
__global__ void GenericPackGhostKernel(Arg arg)
__device__ __host__ __forceinline__ void packGhost(Arg &arg, int x_cb, int parity, int spinor_parity, int spin_block, int color_block)
int_fastdiv nParity2dim_threads
static __device__ __host__ void getCoords5(int x[5], int cb_index, const I X[5], int parity, QudaPCType pc_type)
#define MAX_BLOCK_FLOAT_NC
int_fastdiv X[QUDA_MAX_DIM]
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void GenericPackGhost(Arg &arg)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
int comm_dim_partitioned(int dim)
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.