// NOTE(review): this chunk is a partial extraction — the bare integers at the
// start of lines ("12", "18", "20", ...) are original source line numbers fused
// into the text, and jumps in that numbering (e.g. 27 -> 37) mark missing lines.
// Comments below describe only what the visible fragments establish.
//
// KernelType selects which half of the staggered outer-product computation a
// launch performs: interior (bulk) sites vs. exterior (halo/ghost) sites.
12 #ifdef GPU_STAGGERED_DIRAC 18 enum KernelType {OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL};
// Argument struct passed by value to the oprod kernels. Bundles geometry
// (X[4], ghostOffset[4]), launch mode (kernelType, dir, displacement, parity),
// the two accumulation coefficients, and the field accessors
// (inA/inB inputs, outA/outB outputs).
20 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
21 struct StaggeredOprodArg {
// Per-dimension offsets into the ghost buffer of the exchanged spinor field
// (see exteriorOprodKernel, which indexes ghostOffset[dim] + cb_idx).
26 unsigned int ghostOffset[4];
// Hop distance for the exterior kernel: 1 selects outA/coeff[0],
// 3 selects outB/coeff[1] (see exteriorOprodKernel).
27 unsigned int displacement;
// Constructor: copies scalar parameters, then fills X[] from the gauge-field
// metadata and ghostOffset[] from the caller-supplied array.
// (Several parameters and the start of the initializer list fall in the
// missing lines 41-49 — e.g. kernelType, nFace, meta — TODO confirm against
// the full source.)
37 StaggeredOprodArg(
const unsigned int parity,
38 const unsigned int dir,
39 const unsigned int *ghostOffset,
40 const unsigned int displacement,
43 const double coeff[2],
50 displacement(displacement), kernelType(kernelType), nFace(nFace),
51 inA(inA), inB(inB), outA(outA), outB(outB)
// coeff[] is a raw double[2]; copied element-wise since arrays cannot be
// initialized in the member-initializer list pre-C++11 style used here.
53 this->coeff[0] =
coeff[0];
54 this->coeff[1] =
coeff[1];
// Local lattice dimensions taken from the gauge field's metadata.
55 for(
int i=0;
i<4; ++
i) this->X[
i] = meta.X()[
i];
56 for(
int i=0;
i<4; ++
i) this->ghostOffset[
i] = ghostOffset[
i];
// Interior-kernel coordinate reconstruction, templated on IndexType so the
// integer divisions can be specialized per even dimension (EVEN_Y / EVEN_Z
// branches are visible below; the full branch bodies and the EVEN_X case fall
// in missing lines 77-97 and 99-103 — TODO confirm against the full source).
// Decomposes a checkerboard site index cb_idx into 4-d coordinates given the
// local dimensions X[4] and the parity.
68 template <IndexType
idxType>
70 const unsigned int cb_idx,
const unsigned int parity,
const int X[4])
// Precomputed sub-volume products used by the index division below.
75 const int XYZ =
X[2]*
X[1]*
X[0];
76 const int XY =
X[1]*
X[0];
98 }
else if (idxType ==
EVEN_Y ) {
104 }
else if (idxType ==
EVEN_Z ) {
126 __device__
// Ghost-zone coordinate reconstruction: maps a face checkerboard index cb_idx
// to full 4-d lattice coordinates x[4] on the boundary slab of dimension
// `dir`, offset by (X[dir] - displacement) so the slab sits at the far
// boundary. The branch conditionals on `dir` fall in missing lines
// (129/136/145/153), but the four decompositions below clearly correspond to
// dir = 0, 1, 2, 3 in order, based on which coordinate receives the
// displacement offset. The checkerboarded (halved) coordinate is recovered
// last, using the parity of the other three coordinates plus `parity`.
static void coordsFromIndex(
int x[4],
const unsigned int cb_idx,
const int X[4],
const unsigned int dir,
const int displacement,
const unsigned int parity)
// Halved extents of the two fastest dimensions; only one of the two is used
// per branch, depending on which coordinate is checkerboarded.
128 int Xh[2] = {
X[0]/2,
X[1]/2};
// dir == 0 face (presumably — conditional not visible): x[0] is pinned to the
// boundary slab, x[1] is the checkerboarded coordinate (stride Xh[1]).
131 x[2] = cb_idx/
Xh[1] %
X[2];
132 x[3] = cb_idx/(
Xh[1]*
X[2]) %
X[3];
133 x[0] = cb_idx/(
Xh[1]*
X[2]*
X[3]);
134 x[0] += (
X[0] - displacement);
// Restore full x[1] from its halved value plus the site parity.
135 x[1] = 2*(cb_idx %
Xh[1]) + ((
x[0]+
x[2]+
x[3]+
parity)&1);
// dir == 1 face: x[1] pinned to the boundary, x[0] checkerboarded (Xh[0]).
139 x[2] = cb_idx/
Xh[0] %
X[2];
140 x[3] = cb_idx/(
Xh[0]*
X[2]) %
X[3];
141 x[1] = cb_idx/(
Xh[0]*
X[2]*
X[3]);
142 x[1] += (
X[1] - displacement);
143 x[0] = 2*(cb_idx %
Xh[0]) + ((
x[1]+
x[2]+
x[3]+
parity)&1);
// dir == 2 face: x[2] pinned to the boundary, x[0] checkerboarded.
147 x[1] = cb_idx/
Xh[0] %
X[1];
148 x[3] = cb_idx/(
Xh[0]*
X[1]) %
X[3];
149 x[2] = cb_idx/(
Xh[0]*
X[1]*
X[3]);
150 x[2] += (
X[2] - displacement);
151 x[0] = 2*(cb_idx %
Xh[0]) + ((
x[1]+
x[2]+
x[3]+
parity)&1);
// dir == 3 face: x[3] pinned to the boundary, x[0] checkerboarded.
155 x[1] = cb_idx/
Xh[0] %
X[1];
156 x[2] = cb_idx/(
Xh[0]*
X[1]) %
X[2];
157 x[3] = cb_idx/(
Xh[0]*
X[1]*
X[2]);
158 x[3] += (
X[3] - displacement);
159 x[0] = 2*(cb_idx %
Xh[0]) + ((
x[1]+
x[2]+
x[3]+
parity)&1);
166 __device__ __forceinline__
// Returns the checkerboard index of the site reached by applying `shift` to
// the site at cb_idx, or (per the partitioned[] guard visible below)
// presumably a sentinel when the shift crosses a partitioned boundary — the
// body handling that case sits in missing lines 168-181; TODO confirm.
167 int neighborIndex(
const unsigned int cb_idx,
const int shift[4],
const bool partitioned[4],
const unsigned int parity,
const int X[4]){
// Guard for shifts that leave the local sub-lattice in a comm-partitioned
// dimension (consequent statement not visible in this extraction).
175 if( partitioned[
dim] )
// Standard lexicographic full-lattice index, halved to a checkerboard index.
182 return (((
x[3]*
X[2] +
x[2])*
X[1] +
x[1])*
X[0] +
x[0]) >> 1;
// Bulk (interior) outer-product kernel. For each local site it accumulates
// outer products of inA with neighboring inB spinors into the output fields:
// the first-hop neighbor weighted by coeff[0], and — when nFace == 3 — the
// third-hop neighbor weighted by coeff[1]. Most of the per-direction loop
// body (index computation, the outer product itself, and the stores) sits in
// missing lines; comments below cover only the visible fragments.
186 template<
typename real,
typename Output,
typename InputA,
typename InputB>
187 __global__
void interiorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB>
arg)
// Flat 1-d thread index; bounds handling not visible in this extraction.
189 unsigned int idx = blockIdx.x*
blockDim.x + threadIdx.x;
204 int shift[4] = {0,0,0,0};
// First-hop neighbor: a negative index presumably flags an off-rank site
// handled by the exterior kernel instead — TODO confirm.
207 if(first_nbr_idx >= 0){
208 arg.inB.load(
y, first_nbr_idx);
// Read-modify-write accumulation: result = tempA + result*coeff[0].
211 result = tempA + result*
arg.coeff[0];
// Third-hop (Naik-style) term, only for the 3-hop (nFace == 3) case.
215 if (
arg.nFace == 3) {
218 if(third_nbr_idx >= 0){
219 arg.inB.load(
z, third_nbr_idx);
222 result = tempB + result*
arg.coeff[1];
// Halo (exterior) outer-product kernel, templated on the face dimension `dim`.
// Each thread walks face sites in a grid-stride-style while loop over
// arg.length, pairing the bulk inA value with the ghost-buffer inB value and
// accumulating into the output selected by arg.displacement.
235 template<
int dim,
typename real,
typename Output,
typename InputA,
typename InputB>
236 __global__
void exteriorOprodKernel(StaggeredOprodArg<real, Output, InputA, InputB>
arg)
240 unsigned int cb_idx = blockIdx.x*
blockDim.x + threadIdx.x;
// Route one-hop contributions to outA/coeff[0] and (presumably three-hop —
// see arg.displacement = 3 in the host code) contributions to outB/coeff[1].
248 Output&
out = (
arg.displacement == 1) ?
arg.outA :
arg.outB;
249 real
coeff = (
arg.displacement == 1) ?
arg.coeff[0] :
arg.coeff[1];
252 while(cb_idx<
arg.length){
// Lexicographic bulk index of the face site (coords x[] computed in the
// missing line 253, presumably via coordsFromIndex — TODO confirm).
254 const unsigned int bulk_cb_idx = ((((
x[3]*
arg.X[2] +
x[2])*
arg.X[1] +
x[1])*
arg.X[0] +
x[0]) >> 1);
// Read the current output matrix so the new term can be accumulated onto it.
256 out.load(reinterpret_cast<real*>(inmatrix.
data), bulk_cb_idx,
arg.dir,
arg.parity);
257 arg.inA.load(
a, bulk_cb_idx);
// Neighbor spinor comes from the exchanged ghost zone of dimension `dim`.
259 const unsigned int ghost_idx =
arg.ghostOffset[
dim] + cb_idx;
260 arg.inB.loadGhost(
b, ghost_idx,
arg.dir);
// Accumulate: previous value plus coeff * (outer product computed in the
// missing lines between 260 and 263).
263 result = inmatrix + result*
coeff;
264 out.save(reinterpret_cast<real*>(result.
data), bulk_cb_idx,
arg.dir,
arg.parity);
// Tunable wrapper that launches the interior/exterior oprod kernels with an
// autotuned launch configuration. Holds a reference to the (mutable) argument
// struct so the host driver can retarget it (kernelType, dir, length,
// displacement) between apply() calls.
272 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
273 class StaggeredOprodField :
public Tunable {
276 StaggeredOprodArg<Float,Output,InputA,InputB> &
arg;
277 const GaugeField &meta;
// Neither kernel uses dynamic shared memory.
279 unsigned int sharedBytesPerThread()
const {
return 0; }
280 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
282 unsigned int minThreads()
const {
return arg.outA.volumeCB; }
283 bool tunedGridDim()
const {
return false; }
286 StaggeredOprodField(StaggeredOprodArg<Float,Output,InputA,InputB> &
arg,
const GaugeField &meta)
// Aux string keys the autotuner cache on thread count, precision and stride.
288 writeAuxString(
"threads=%d,prec=%lu,stride=%d",
arg.length,
sizeof(
Complex)/2,
arg.inA.Stride());
294 virtual ~StaggeredOprodField() {}
// Launches the kernel selected by arg.kernelType; the exterior kernel is
// additionally dispatched over the face dimension via the template argument
// so `dim` is a compile-time constant inside the kernel.
296 void apply(
const cudaStream_t &
stream){
300 if (
arg.kernelType == OPROD_INTERIOR_KERNEL) {
301 interiorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
302 }
else if (
arg.kernelType == OPROD_EXTERIOR_KERNEL) {
303 if (
arg.dir == 0) exteriorOprodKernel<0><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
304 else if (
arg.dir == 1) exteriorOprodKernel<1><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
305 else if (
arg.dir == 2) exteriorOprodKernel<2><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
306 else if (
arg.dir == 3) exteriorOprodKernel<3><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
308 errorQuda(
"Kernel type not supported\n");
311 errorQuda(
"No CPU support for staggered outer-product calculation\n");
// Tuning overwrites the accumulated output fields, so back them up before
// trial launches and restore afterwards.
315 void preTune(){ this->arg.outA.save(); this->arg.outB.save(); }
316 void postTune(){ this->arg.outA.load(); this->arg.outB.load(); }
// Flop/byte counts not implemented (returns 0) — tuning keys on time only.
318 long long flops()
const {
return 0; }
319 long long bytes()
const {
return 0; }
320 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux);}
// Host-side halo exchange for the input spinor field: gathers the face data,
// starts and completes the communications, then scatters the received ghosts,
// iterating dimensions in reverse (i = 3..0). Only the even (2*i) directions
// are exchanged here. Statements between the loops (packing, stream sync)
// fall in missing lines of this extraction.
324 void exchangeGhost(
int nFace, cudaColorSpinorField &
a,
int parity,
int dag) {
337 for(
int i=3;
i>=0;
i--){
340 a.gather(nFace, dag, 2*
i);
346 for (
int i=3;
i>=0;
i--) {
348 a.commsStart(nFace, 2*
i, dag);
352 for (
int i=3;
i>=0;
i--) {
354 a.commsWait(nFace, 2*
i, dag);
355 a.scatter(nFace, dag, 2*
i);
// Double-buffered comms: flip the buffer index for the next exchange.
362 a.bufferIndex = (1 -
a.bufferIndex);
// Driver: computes the staggered outer product for one parity. Builds the
// kernel argument struct, runs the interior kernel over the full local
// volume, then (inside the reverse dimension loop) switches to the exterior
// kernel over each face — first with displacement 1 (one-hop, into outA),
// then displacement 3 (three-hop, into outB). The actual oprod.apply(...)
// launches sit in lines missing from this extraction.
366 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
367 void computeStaggeredOprodCuda(Output outA, Output outB, GaugeField& outFieldA, GaugeField& outFieldB, InputA& inA, InputB& inB, cudaColorSpinorField&
src,
368 const unsigned int parity,
const int faceVolumeCB[4],
const double coeff[2],
int nFace)
// Ghost offsets are taken from the source spinor's ghost layout, converted
// from a byte/word offset to a field-order-element offset.
370 unsigned int ghostOffset[4] = {0,0,0,0};
371 for(
int dir=0; dir<4; ++dir) ghostOffset[dir] =
src.GhostOffset(dir,1)/
src.FieldOrder();
374 StaggeredOprodArg<Float,Output,InputA,InputB>
arg(
parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, nFace,
coeff, inA, inB, outA, outB, outFieldA);
375 StaggeredOprodField<Float,Output,InputA,InputB> oprod(
arg, outFieldA);
// Interior pass: one thread per checkerboard site of the source field.
377 arg.kernelType = OPROD_INTERIOR_KERNEL;
378 arg.length =
src.VolumeCB();
381 for(
int i=3;
i>=0;
i--){
384 arg.kernelType = OPROD_EXTERIOR_KERNEL;
// One-hop face contribution.
389 arg.displacement = 1;
390 arg.length = faceVolumeCB[
i];
// Three-hop face contribution: the slab is `displacement` sites deep, hence
// the length scaling.
396 arg.displacement = 3;
397 arg.length =
arg.displacement*faceVolumeCB[
i];
406 #endif // GPU_STAGGERED_DIRAC 409 const unsigned int parity,
const double coeff[2],
int nFace)
411 #ifdef GPU_STAGGERED_DIRAC 429 exchangeGhost(nFace,static_cast<cudaColorSpinorField&>(inB),
parity, 0);
432 outA, outB, spinorA, spinorB, inB,
parity, inB.GhostFace(),
coeff, nFace);
436 exchangeGhost(nFace,static_cast<cudaColorSpinorField&>(inB),
parity, 0);
439 outA, outB, spinorA, spinorB, inB,
parity, inB.GhostFace(),
coeff, nFace);
444 #else // GPU_STAGGERED_DIRAC not defined 445 errorQuda(
"Staggered Outer Product has not been built!");
455 double coeff_[2] = {-
coeff[0],0.0};
457 }
else if (nFace == 3) {
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
int commDimPartitioned(int dir)
void allocateGhostBuffer(int nFace, bool spin_project=true) const
Allocate the ghost buffers.
QudaVerbosity getVerbosity()
void setPackComms(const int *commDim)
std::complex< double > Complex
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
__device__ __host__ void outerProd(const Array< T, N > &a, const Array< T, N > &b, Matrix< T, N > *m)
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param &param)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static unsigned int unsigned int shift
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void setKernelPackT(bool pack)
QudaGaugeFieldOrder Order() const
void computeStaggeredOprod(GaugeField *out[], ColorSpinorField &in, const double coeff[], int nFace)
Compute the outer-product field between the staggered quark field's one and (for HISQ and ASQTAD) thr...
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaPrecision Precision() const