#ifdef GPU_STAGGERED_DIRAC

  enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };

  // Argument struct shared by the interior and exterior outer-product kernels.
  template <typename Float, typename Output, typename InputA, typename InputB> struct StaggeredOprodArg {
    unsigned int length;       // number of sites this kernel iterates over
    int X[4];                  // local lattice dimensions
    unsigned int parity;       // parity of the sites being updated
    unsigned int dir;          // face direction (exterior kernel only)
    unsigned int displacement; // hop length: 1 (fat-link) or 3 (long-link) term
    OprodKernelType kernelType;
    int nFace;
    bool partitioned[4];       // which dimensions are split across processes
    unsigned int ghostOffset[4];
    InputA inA;
    InputB inB;
    Output outA;
    Output outB;
    Float coeff[2];

    StaggeredOprodArg(int parity, int dir, const unsigned int *ghostOffset, int displacement,
                      const OprodKernelType &kernelType, int nFace, const double coeff[2], InputA &inA, InputB &inB,
                      Output &outA, Output &outB, GaugeField &meta) :
      length(meta.VolumeCB()),
      parity(parity),
      dir(dir),
      displacement(displacement),
      kernelType(kernelType),
      nFace(nFace),
      inA(inA),
      inB(inB),
      outA(outA),
      outB(outB)
    {
      this->coeff[0] = coeff[0];
      this->coeff[1] = coeff[1];
      for (int i = 0; i < 4; ++i) this->X[i] = meta.X()[i];
      for (int i = 0; i < 4; ++i) this->ghostOffset[i] = ghostOffset[i];
      for (int i = 0; i < 4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };
  // Recover the full lattice coordinates c[] (and the full linear index idx) from a
  // checkerboard index.  The template parameter records which dimension has even
  // extent, so the parity offset can be folded in with a minimum of integer divisions.
  template <IndexType idxType>
  __device__ inline void coordsFromIndex(int &idx, int c[4], unsigned int cb_idx, int parity, const int X[4])
  {
    const int XYZ = X[2]*X[1]*X[0];
    const int XY = X[1]*X[0];

    idx = 2*cb_idx;
    int x, y, z, t;

    if (idxType == EVEN_X) { // X even
      int aux1 = idx / X[0];
      x = idx - aux1*X[0];
      int aux2 = aux1 / X[1];
      y = aux1 - aux2*X[1];
      t = aux2 / X[2];
      z = aux2 - t*X[2];
      aux1 = (parity + t + z + y) & 1;
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y) { // Y even
      t = idx / XYZ;
      z = (idx / XY) % X[2];
      idx += (parity + t + z) & 1;
      y = (idx / X[0]) % X[1];
      x = idx % X[0];
    } else if (idxType == EVEN_Z) { // Z even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % X[2];
      y = (idx / X[0]) % X[1];
      x = idx % X[0];
    } else { // all dimensions odd
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % X[2];
      y = (idx / X[0]) % X[1];
      x = idx % X[0];
    }

    c[0] = x; c[1] = y; c[2] = z; c[3] = t;
  }
  // Recover the full coordinates x[] of a boundary site from its index within the
  // face of dimension dir.  The face has thickness `displacement`, so the coordinate
  // in the dir direction starts at X[dir] - displacement.
  __device__ inline void coordsFromIndex(int x[4], unsigned int cb_idx, const int X[4], int dir, int displacement,
                                         int parity)
  {
    int Xh[2] = {X[0] / 2, X[1] / 2};

    switch (dir) {
    case 0:
      x[2] = cb_idx / Xh[1] % X[2];
      x[3] = cb_idx / (Xh[1] * X[2]) % X[3];
      x[0] = cb_idx / (Xh[1] * X[2] * X[3]);
      x[0] += (X[0] - displacement);
      x[1] = 2 * (cb_idx % Xh[1]) + ((x[0] + x[2] + x[3] + parity) & 1);
      break;
    case 1:
      x[2] = cb_idx / Xh[0] % X[2];
      x[3] = cb_idx / (Xh[0] * X[2]) % X[3];
      x[1] = cb_idx / (Xh[0] * X[2] * X[3]);
      x[1] += (X[1] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    case 2:
      x[1] = cb_idx / Xh[0] % X[1];
      x[3] = cb_idx / (Xh[0] * X[1]) % X[3];
      x[2] = cb_idx / (Xh[0] * X[1] * X[3]);
      x[2] += (X[2] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    case 3:
      x[1] = cb_idx / Xh[0] % X[1];
      x[2] = cb_idx / (Xh[0] * X[1]) % X[2];
      x[3] = cb_idx / (Xh[0] * X[1] * X[2]);
      x[3] += (X[3] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    }
  }
  // Index of the site displaced from cb_idx by shift[], or -1 if the shift crosses a
  // partitioned boundary (that contribution is handled by the exterior kernel).
  __device__ inline int neighborIndex(unsigned int cb_idx, const int shift[4], const bool partitioned[4], int parity,
                                      const int X[4])
  {
    int full_idx;
    int x[4];
    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    for (int dim = 0; dim < 4; ++dim) {
      if (partitioned[dim])
        if ((x[dim] + shift[dim]) < 0 || (x[dim] + shift[dim]) >= X[dim]) return -1;
    }

    for (int dim = 0; dim < 4; ++dim) {
      x[dim] = shift[dim] ? (x[dim] + shift[dim] + X[dim]) % X[dim] : x[dim];
    }

    return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
  }
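
  // A minimal host-side sketch (not part of the original file) of the indexing
  // convention used above: the checkerboard index of a site is its full
  // lexicographical index (x fastest, t slowest) divided by two, matching the return
  // statement of neighborIndex().  Handy for round-trip checks in host code.
  inline int hostCheckerboardIndex(const int x[4], const int X[4])
  {
    return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  }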
  // Interior kernel: each thread processes one site of the given parity, accumulating
  // the outer product of the local vector with its one-hop (and, for nFace == 3,
  // three-hop) neighbour in each of the four positive directions.
  template <typename real, typename Output, typename InputA, typename InputB>
  __global__ void interiorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    using complex = complex<real>;
    using matrix = Matrix<complex, 3>;

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    complex x[3], y[3], z[3];
    matrix result;

    while (idx < arg.length) {
      arg.inA.load(x, idx);

#pragma unroll
      for (int dim = 0; dim < 4; ++dim) {
        int shift[4] = {0, 0, 0, 0};
        shift[dim] = 1;
        const int first_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
        if (first_nbr_idx >= 0) {
          arg.inB.load(y, first_nbr_idx);
          outerProd(y, x, &result); // color outer product of the two vectors
          matrix tempA = arg.outA(dim, idx, arg.parity);
          result = tempA + result*arg.coeff[0];
          arg.outA(dim, idx, arg.parity) = result;

          if (arg.nFace == 3) {
            shift[dim] = 3;
            const int third_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
            if (third_nbr_idx >= 0) {
              arg.inB.load(z, third_nbr_idx);
              outerProd(z, x, &result);
              matrix tempB = arg.outB(dim, idx, arg.parity);
              result = tempB + result*arg.coeff[1];
              arg.outB(dim, idx, arg.parity) = result;
            }
          }
        }
      } // dim

      idx += gridSize;
    }
  }
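
  // The kernels in this file accumulate the color outer product computed by QUDA's
  // outerProd() helper, which is defined elsewhere.  As a hedged illustration of the
  // operation being accumulated, a standalone sketch might look like the following;
  // the name and the conjugation convention (m(i,j) = a(i) * conj(b(j))) are
  // assumptions for illustration, not taken from the original source.
  template <typename complex_t>
  __device__ __host__ inline void outerProdSketch(const complex_t a[3], const complex_t b[3], complex_t m[3][3])
  {
    for (int i = 0; i < 3; i++)
      for (int j = 0; j < 3; j++) m[i][j] = a[i] * conj(b[j]);
  }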
  // Exterior kernel: each thread updates one site on the face of dimension `dim`,
  // whose positive-direction neighbour now lives in the ghost zone received from the
  // adjacent process.
  template <int dim, typename real, typename Output, typename InputA, typename InputB>
  __global__ void exteriorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB> arg)
  {
    using complex = complex<real>;
    using matrix = Matrix<complex, 3>;

    unsigned int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    complex a[3], b[3];
    matrix result;
    int x[4];

    // displacement selects the one-hop (fat-link) or three-hop (long-link) contribution
    Output &out = (arg.displacement == 1) ? arg.outA : arg.outB;
    real coeff = (arg.displacement == 1) ? arg.coeff[0] : arg.coeff[1];

    while (cb_idx < arg.length) {
      coordsFromIndex(x, cb_idx, arg.X, arg.dir, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);

      matrix inmatrix = out(arg.dir, bulk_cb_idx, arg.parity);
      arg.inA.load(a, bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;
      arg.inB.loadGhost(b, ghost_idx, arg.dir);

      outerProd(b, a, &result); // color outer product of bulk and ghost vectors
      result = inmatrix + result*coeff;
      out(arg.dir, bulk_cb_idx, arg.parity) = result;

      cb_idx += gridSize;
    }
  }
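
  // A small host-side sketch (an assumption, not part of the original file) of how
  // the ghost zone consumed by exteriorOprodKernel is assumed to be sized: each
  // partitioned dimension contributes nFace slices of faceVolumeCB[i] color vectors
  // per parity, and ghostOffset[i] marks where that dimension's buffer starts.
  inline size_t expectedGhostVectors(const int faceVolumeCB[4], const bool partitioned[4], int nFace)
  {
    size_t total = 0;
    for (int i = 0; i < 4; i++)
      if (partitioned[i]) total += size_t(nFace) * faceVolumeCB[i];
    return total;
  }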
  // Tunable wrapper that launches the interior or exterior kernel described by arg.
  template <typename Float, typename Output, typename InputA, typename InputB>
  class StaggeredOprodField : public Tunable {

    StaggeredOprodArg<Float, Output, InputA, InputB> &arg;
    const GaugeField &meta;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.outA.volumeCB; }
    bool tuneGridDim() const { return false; }

  public:
    StaggeredOprodField(StaggeredOprodArg<Float, Output, InputA, InputB> &arg, const GaugeField &meta) :
      arg(arg), meta(meta)
    {
      writeAuxString("threads=%d,prec=%lu,stride=%d", arg.length, sizeof(Complex)/2, arg.inA.Stride());
    }

    virtual ~StaggeredOprodField() {}

    void apply(const cudaStream_t &stream)
    {
      if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
          interiorOprodKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
          if (arg.dir == 0) exteriorOprodKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else {
          errorQuda("Kernel type not supported\n");
        }
      } else { // no CPU fallback is provided
        errorQuda("No CPU support for staggered outer-product calculation\n");
      }
    }

    void preTune() { this->arg.outA.save(); this->arg.outB.save(); }
    void postTune() { this->arg.outA.load(); this->arg.outB.load(); }

    long long flops() const { return 0; }
    long long bytes() const { return 0; }
    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  }; // StaggeredOprodField
  // Exchange the ghost zones of the source field so the exterior kernels can read
  // neighbours living on adjacent processes: pack the faces, gather them, start and
  // complete the communications, then scatter the received faces into the ghost buffers.
  void exchangeGhost(int nFace, cudaColorSpinorField &a, int parity, int dag)
  {
    // all halo buffers live on the device here
    MemoryLocation location[2 * QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
    a.pack(nFace, 1 - parity, dag, Nstream - 1, location, Device);

    qudaDeviceSynchronize();

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) a.gather(nFace, dag, 2*i);
    }

    qudaDeviceSynchronize();

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) a.commsStart(nFace, 2*i, dag);
    }

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        a.commsWait(nFace, 2*i, dag);
        a.scatter(nFace, dag, 2*i);
      }
    }

    qudaDeviceSynchronize();
    a.bufferIndex = (1 - a.bufferIndex);
  }
  // Launch the interior kernel over the local volume, then one exterior kernel per
  // partitioned dimension (one-hop, plus three-hop when nFace == 3), reusing a single
  // argument struct whose kernelType/dir/displacement/length fields are updated in place.
  template <typename Float, typename Output, typename InputA, typename InputB>
  void computeStaggeredOprodCuda(Output outA, Output outB, GaugeField &outFieldA, GaugeField &outFieldB, InputA &inA,
                                 InputB &inB, cudaColorSpinorField &src, int parity, const int faceVolumeCB[4],
                                 const double coeff[2], int nFace)
  {
    unsigned int ghostOffset[4] = {0, 0, 0, 0};
    for (int dir = 0; dir < 4; ++dir) ghostOffset[dir] = src.GhostOffset(dir, 1) / src.FieldOrder();

    StaggeredOprodArg<Float, Output, InputA, InputB> arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, nFace, coeff,
                                                         inA, inB, outA, outB, outFieldA);
    StaggeredOprodField<Float, Output, InputA, InputB> oprod(arg, outFieldA);

    // interior contribution over the full checkerboarded volume
    arg.kernelType = OPROD_INTERIOR_KERNEL;
    arg.length = src.VolumeCB();
    oprod.apply(streams[Nstream - 1]);

    // boundary contributions, one partitioned dimension at a time
    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        arg.kernelType = OPROD_EXTERIOR_KERNEL;
        arg.dir = i;

        // one-hop (fat-link) term
        arg.displacement = 1;
        arg.length = faceVolumeCB[i];
        oprod.apply(streams[Nstream - 1]);

        // three-hop (long-link) term
        if (nFace == 3) {
          arg.displacement = 3;
          arg.length = arg.displacement * faceVolumeCB[i];
          oprod.apply(streams[Nstream - 1]);
        }
      }
    }

    checkCudaError();
  }
#endif // GPU_STAGGERED_DIRAC

  // Host entry point for a single parity: select the source field that supplies the
  // neighbours, exchange its ghost zones, and dispatch on precision.  The construction
  // of the precision-specific spinor and gauge-field accessors (spinorA, spinorB and
  // the ordered views of outA/outB) is not shown here.
  void computeStaggeredOprod(GaugeField &outA, GaugeField &outB, ColorSpinorField &inEven, ColorSpinorField &inOdd,
                             int parity, const double coeff[2], int nFace)
  {
#ifdef GPU_STAGGERED_DIRAC
    ColorSpinorField &inB = (parity & 1) ? inEven : inOdd; // opposite-parity field supplying the neighbours

    if (inB.Precision() == QUDA_DOUBLE_PRECISION) {
      // ... double-precision accessors constructed here ...
      exchangeGhost(nFace, static_cast<cudaColorSpinorField &>(inB), parity, 0);
      computeStaggeredOprodCuda<double>(/* gauge accessors for outA, outB */,
                                        outA, outB, spinorA, spinorB, static_cast<cudaColorSpinorField &>(inB),
                                        parity, inB.GhostFace(), coeff, nFace);
    } else {
      // ... single-precision accessors constructed here ...
      exchangeGhost(nFace, static_cast<cudaColorSpinorField &>(inB), parity, 0);
      computeStaggeredOprodCuda<float>(/* gauge accessors for outA, outB */,
                                       outA, outB, spinorA, spinorB, static_cast<cudaColorSpinorField &>(inB),
                                       parity, inB.GhostFace(), coeff, nFace);
    }
#else // GPU_STAGGERED_DIRAC not defined
    errorQuda("Staggered Outer Product has not been built!");
#endif
  }
  // Two-parity entry point.  For nFace == 1 both parities accumulate into the same
  // field and the odd-parity contribution enters with the opposite sign, hence the
  // negated coefficient on the second call; for nFace == 3 the one-hop and three-hop
  // products go into separate output fields.
  void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
  {
    if (nFace == 1) {
      computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
      double coeff_[2] = {-coeff[0], 0.0};
      computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 1, coeff_, nFace);
    } else if (nFace == 3) {
      computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 0, coeff, nFace);
      computeStaggeredOprod(*out[0], *out[1], in.Even(), in.Odd(), 1, coeff, nFace);
    } else {
      errorQuda("Invalid nFace %d", nFace);
    }
  }
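
  // A hedged usage sketch (not from the original file) of how a fermion-force routine
  // might drive the public entry point.  The field handles and the unit coefficients
  // are illustrative assumptions only.
  inline void exampleStaggeredOprodCall(GaugeField &oneLink, GaugeField &threeLink, ColorSpinorField &quark)
  {
    GaugeField *out[2] = {&oneLink, &threeLink};
    double coeff[2] = {1.0, 1.0};                // assumed unit weights, for illustration
    computeStaggeredOprod(out, quark, coeff, 3); // nFace = 3 covers the one- and three-hop terms
  }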