#ifdef GPU_CLOVER_DIRAC

  template <int N>
  void createEventArray(cudaEvent_t (&event)[N], unsigned int flags = cudaEventDefault)
  {
    for (int i = 0; i < N; ++i) cudaEventCreate(&event[i], flags);
  }

  template <int N>
  void destroyEventArray(cudaEvent_t (&event)[N])
  {
    for (int i = 0; i < N; ++i) cudaEventDestroy(event[i]);
  }

  // events marking completion of the pack, gather and scatter stages
  // (array sizes assumed: one event per dimension)
  static cudaEvent_t packEnd;
  static cudaEvent_t gatherEnd[4];
  static cudaEvent_t scatterEnd[4];
  static cudaEvent_t oprodStart;
  static cudaEvent_t oprodEnd;
  void createCloverForceEvents()
  {
    cudaEventCreate(&packEnd, cudaEventDisableTiming);
    createEventArray(gatherEnd, cudaEventDisableTiming);
    createEventArray(scatterEnd, cudaEventDisableTiming);
    cudaEventCreate(&oprodStart, cudaEventDisableTiming);
    cudaEventCreate(&oprodEnd, cudaEventDisableTiming);
  }
  void destroyCloverForceEvents()
  {
    destroyEventArray(gatherEnd);
    destroyEventArray(scatterEnd);
    cudaEventDestroy(packEnd);
    cudaEventDestroy(oprodStart);
    cudaEventDestroy(oprodEnd);
  }
  enum OprodKernelType { OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL };
  template <typename Float, typename Output, typename Gauge,
            typename InputA, typename InputB, typename InputC, typename InputD>
  struct CloverForceArg {
    unsigned int length;          // number of checkerboarded sites covered by the launch
    int X[4];                     // local lattice dimensions
    unsigned int parity;
    unsigned int dir;
    unsigned int ghostOffset[4];
    unsigned int displacement;
    OprodKernelType kernelType;
    bool partitioned[4];          // which dimensions are split across ranks
    InputA inA;
    InputB inB_shift;
    InputC inC;
    InputD inD_shift;
    Gauge gauge;
    Output force;
    Float coeff;
    CloverForceArg(const unsigned int parity, const unsigned int dir, const unsigned int *ghostOffset,
                   const unsigned int displacement, const OprodKernelType kernelType, const double coeff,
                   InputA &inA, InputB &inB_shift, InputC &inC, InputD &inD_shift,
                   Gauge &gauge, Output &force, GaugeField &meta) :
      length(meta.VolumeCB()),
      parity(parity),
      dir(dir),
      displacement(displacement),
      kernelType(kernelType),
      inA(inA), inB_shift(inB_shift), inC(inC), inD_shift(inD_shift),
      gauge(gauge), force(force),
      coeff(coeff)
    {
      for (int i = 0; i < 4; ++i) this->X[i] = meta.X()[i];
      for (int i = 0; i < 4; ++i) this->ghostOffset[i] = ghostOffset[i];
      for (int i = 0; i < 4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };
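  // Note: a single CloverForceArg is reused for every launch; computeCloverForceCuda()
  // below updates kernelType, dir, length and displacement in place between the interior
  // launch and each per-dimension exterior launch.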
  // IndexType records which lattice dimension is known to have even extent
  enum IndexType { EVEN_X = 0, EVEN_Y = 1, EVEN_Z = 2, EVEN_T = 3 };

  template <IndexType idxType>
  static __device__ __forceinline__
  void coordsFromIndex(int &idx, int c[4], const unsigned int cb_idx, const unsigned int parity, const int X[4])
  {
    const int &LX = X[0];
    const int &LY = X[1];
    const int &LZ = X[2];
    const int XYZ = X[2] * X[1] * X[0];
    const int XY = X[1] * X[0];

    idx = 2 * cb_idx; // full lattice index before the parity correction
    int x, y, z, t;

    if (idxType == EVEN_X) { // X even: fewer divisions and modulos
      int aux1 = idx / LX;
      x = idx - aux1 * LX;
      int aux2 = aux1 / LY;
      y = aux1 - aux2 * LY;
      t = aux2 / LZ;
      z = aux2 - t * LZ;
      aux1 = (parity + t + z + y) & 1;
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y) { // Y even
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      idx += (parity + t + z) & 1;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else if (idxType == EVEN_Z) { // Z even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else { // T even
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    }
    c[0] = x; c[1] = y; c[2] = z; c[3] = t;
  }
  // Get the lattice coordinates of a site on the boundary face in direction dir (exterior kernels)
  __device__ inline void coordsFromIndexExterior(int x[4], const unsigned int cb_idx, const int X[4],
                                                 const unsigned int dir, const int displacement,
                                                 const unsigned int parity)
  {
    int Xh[2] = {X[0] / 2, X[1] / 2};
    switch (dir) {
    case 0:
      x[2] = cb_idx / Xh[1] % X[2];
      x[3] = cb_idx / (Xh[1] * X[2]) % X[3];
      x[0] = cb_idx / (Xh[1] * X[2] * X[3]);
      x[0] += (X[0] - displacement);
      x[1] = 2 * (cb_idx % Xh[1]) + ((x[0] + x[2] + x[3] + parity) & 1);
      break;
    case 1:
      x[2] = cb_idx / Xh[0] % X[2];
      x[3] = cb_idx / (Xh[0] * X[2]) % X[3];
      x[1] = cb_idx / (Xh[0] * X[2] * X[3]);
      x[1] += (X[1] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    case 2:
      x[1] = cb_idx / Xh[0] % X[1];
      x[3] = cb_idx / (Xh[0] * X[1]) % X[3];
      x[2] = cb_idx / (Xh[0] * X[1] * X[3]);
      x[2] += (X[2] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    case 3:
      x[1] = cb_idx / Xh[0] % X[1];
      x[2] = cb_idx / (Xh[0] * X[1]) % X[2];
      x[3] = cb_idx / (Xh[0] * X[1] * X[2]);
      x[3] += (X[3] - displacement);
      x[0] = 2 * (cb_idx % Xh[0]) + ((x[1] + x[2] + x[3] + parity) & 1);
      break;
    }
  }
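  // Example (dir == 0): cb_idx enumerates the displacement * (X[1]/2) * X[2] * X[3]
  // checkerboarded sites of the forward x-face, x[0] is placed in [X[0]-displacement, X[0]),
  // and x[1] is reconstructed last with the parity bit so that x[0]+x[1]+x[2]+x[3] has the
  // requested parity.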
  __device__ __forceinline__
  int neighborIndex(const unsigned int cb_idx, const int shift[4], const bool partitioned[4],
                    const unsigned int parity, const int X[4])
  {
    int full_idx;
    int x[4];

    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    // a site shifted off a partitioned boundary has no local neighbour
    for (int dim = 0; dim < 4; ++dim) {
      if (partitioned[dim])
        if ((x[dim] + shift[dim]) < 0 || (x[dim] + shift[dim]) >= X[dim]) return -1;
    }

    for (int dim = 0; dim < 4; ++dim) {
      x[dim] = shift[dim] ? (x[dim] + shift[dim] + X[dim]) % X[dim] : x[dim];
    }
    return (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  }
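  // Both kernels below accumulate, for each direction dim, the spin-traced colour outer
  // products of the forward-shifted spinors with the local ones into the force field;
  // schematically (exact conjugation conventions live in outerProdSpinTrace):
  //
  //   force_dim(x) += coeff * U_dim(x) * [ oprod(B(x+dim), A(x)) + oprod(D(x+dim), C(x)) ]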
  template <typename real, typename Output, typename Gauge,
            typename InputA, typename InputB, typename InputC, typename InputD>
  __global__ void interiorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD> arg)
  {
    typedef complex<real> Complex;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    ColorSpinor<real, 3, 4> A, B_shift, C, D_shift;
    Matrix<Complex, 3> U, result, temp;

    while (idx < arg.length) {
      arg.inA.load(static_cast<Complex *>(A.data), idx);
      arg.inC.load(static_cast<Complex *>(C.data), idx);

      for (int dim = 0; dim < 4; ++dim) {
        int shift[4] = {0, 0, 0, 0};
        shift[dim] = 1;
        const int nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);

        if (nbr_idx >= 0) {
          arg.inB_shift.load(static_cast<Complex *>(B_shift.data), nbr_idx);
          arg.inD_shift.load(static_cast<Complex *>(D_shift.data), nbr_idx);

          B_shift = (B_shift.project(dim, 1)).reconstruct(dim, 1);
          result = outerProdSpinTrace(B_shift, A);

          D_shift = (D_shift.project(dim, -1)).reconstruct(dim, -1);
          result += outerProdSpinTrace(D_shift, C);

          temp = arg.force(dim, idx, arg.parity);
          U = arg.gauge(dim, idx, arg.parity);
          result = temp + U * result * arg.coeff;
          arg.force(dim, idx, arg.parity) = result;
        }
      } // dim

      idx += gridDim.x * blockDim.x;
    }
  } // interiorOprodKernel
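  // Interior sites whose forward neighbour in a partitioned dimension lives on another rank
  // get nbr_idx == -1 from neighborIndex() and are skipped above; their contribution is added
  // by the exterior kernel below once the ghost zones have been exchanged.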
  template <int dim, typename real, typename Output, typename Gauge,
            typename InputA, typename InputB, typename InputC, typename InputD>
  __global__ void exteriorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD> arg)
  {
    typedef complex<real> Complex;
    int cb_idx = blockIdx.x * blockDim.x + threadIdx.x;

    ColorSpinor<real, 3, 4> A, B_shift, C, D_shift;
    ColorSpinor<real, 3, 2> projected_tmp;
    Matrix<Complex, 3> U, result, temp;

    int x[4];
    while (cb_idx < arg.length) {
      coordsFromIndexExterior(x, cb_idx, arg.X, dim, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0]) >> 1);
      arg.inA.load(static_cast<Complex *>(A.data), bulk_cb_idx);
      arg.inC.load(static_cast<Complex *>(C.data), bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset[dim] + cb_idx;

      arg.inB_shift.loadGhost(static_cast<Complex *>(projected_tmp.data), ghost_idx, dim);
      B_shift = projected_tmp.reconstruct(dim, 1);
      result = outerProdSpinTrace(B_shift, A);

      arg.inD_shift.loadGhost(static_cast<Complex *>(projected_tmp.data), ghost_idx, dim);
      D_shift = projected_tmp.reconstruct(dim, -1);
      result += outerProdSpinTrace(D_shift, C);

      temp = arg.force(dim, bulk_cb_idx, arg.parity);
      U = arg.gauge(dim, bulk_cb_idx, arg.parity);
      result = temp + U * result * arg.coeff;
      arg.force(dim, bulk_cb_idx, arg.parity) = result;

      cb_idx += gridDim.x * blockDim.x;
    }
  } // exteriorOprodKernel
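  // The halo spinors are exchanged in spin-projected (two-component) form, so the exterior
  // kernel only needs loadGhost() plus reconstruct(); this is also why bytes() below counts
  // inB_shift and inD_shift at half size for the exterior case.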
  template <typename Float, typename Output, typename Gauge,
            typename InputA, typename InputB, typename InputC, typename InputD>
  class CloverForce : public Tunable {

    CloverForceArg<Float, Output, Gauge, InputA, InputB, InputC, InputD> &arg;
    const GaugeField &meta;
    QudaFieldLocation location; // location of the lattice fields

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    unsigned int minThreads() const { return arg.length; }
    bool tuneGridDim() const { return false; } // grid dimension is fixed by minThreads()

  public:
    CloverForce(CloverForceArg<Float, Output, Gauge, InputA, InputB, InputC, InputD> &arg,
                const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location)
    {
      writeAuxString("prec=%lu,stride=%d", sizeof(Float), arg.inA.Stride());
    }

    virtual ~CloverForce() {}
    void apply(const cudaStream_t &stream)
    {
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

        if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
          interiorOprodKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else if (arg.kernelType == OPROD_EXTERIOR_KERNEL) {
          if (arg.dir == 0)      exteriorOprodKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 1) exteriorOprodKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 2) exteriorOprodKernel<2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          else if (arg.dir == 3) exteriorOprodKernel<3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
        } else {
          errorQuda("Kernel type not supported\n");
        }
      } else { // run the CPU code
        errorQuda("No CPU support for the clover force outer-product calculation\n");
      }
    } // apply

    // the force field is accumulated in place, so back it up while autotuning
    void preTune() { this->arg.force.save(); }
    void postTune() { this->arg.force.load(); }
    long long flops() const
    {
      if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
        // spin project + spin-traced outer product + matrix multiply-add, for each of the four directions
        return ((long long)arg.length) * 4 * (24 + 144 + 234);
      } else {
        return ((long long)arg.length) * (144 + 234); // one direction, halo spinors arrive already projected
      }
    }

    long long bytes() const
    {
      if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
        return ((long long)arg.length) * (arg.inA.Bytes() + arg.inC.Bytes()
            + 4 * (arg.inB_shift.Bytes() + arg.inD_shift.Bytes() + 2 * arg.force.Bytes() + arg.gauge.Bytes()));
      } else {
        return ((long long)arg.length) * (arg.inA.Bytes() + arg.inB_shift.Bytes() / 2 + arg.inC.Bytes()
            + arg.inD_shift.Bytes() / 2 + 2 * arg.force.Bytes() + arg.gauge.Bytes());
      }
    }
    TuneKey tuneKey() const
    {
      char new_aux[TuneKey::aux_n];
      strcpy(new_aux, aux);
      if (arg.kernelType == OPROD_INTERIOR_KERNEL) {
        strcat(new_aux, ",interior");
      } else {
        strcat(new_aux, ",exterior");
        if (arg.dir == 0)      strcat(new_aux, ",dir=0");
        else if (arg.dir == 1) strcat(new_aux, ",dir=1");
        else if (arg.dir == 2) strcat(new_aux, ",dir=2");
        else if (arg.dir == 3) strcat(new_aux, ",dir=3");
      }
      return TuneKey(meta.VolString(), "CloverForce", new_aux);
    }
  }; // class CloverForce
  void exchangeGhost(cudaColorSpinorField &a, int parity, int dag)
  {
    // need to enable packing in the temporal direction to get the spin projector correct
    pushKernelPackT(true);

    qudaDeviceSynchronize();

    MemoryLocation location[2 * QUDA_MAX_DIM] = {Device, Device, Device, Device, Device, Device, Device, Device};
    a.pack(1, 1 - parity, dag, Nstream - 1, location, Device);

    qudaDeviceSynchronize();

    for (int i = 3; i >= 0; i--) { // gather each partitioned dimension into the transfer buffers
      if (commDimPartitioned(i)) a.gather(1, dag, 2 * i);
    }
    qudaDeviceSynchronize();

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) a.commsStart(1, 2 * i, dag);
    }

    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        a.commsWait(1, 2 * i, dag);
        a.scatter(1, dag, 2 * i);
      }
    }
    qudaDeviceSynchronize();

    popKernelPackT(); // restore the previous temporal-packing policy

    a.bufferIndex = (1 - a.bufferIndex); // flip to the freshly filled ghost buffer
  }
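  // exchangeGhost() runs the full halo pipeline for one parity of one field: pack on the
  // device, gather each partitioned dimension into transfer buffers, start and complete the
  // inter-rank transfers, then scatter the received faces into the ghost zone, with device
  // synchronizations between the stages.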
  template <typename Float, typename Output, typename Gauge,
            typename InputA, typename InputB, typename InputC, typename InputD>
  void computeCloverForceCuda(Output force, Gauge gauge, GaugeField &out,
                              InputA &inA, InputB &inB, InputC &inC, InputD &inD,
                              cudaColorSpinorField &src1, cudaColorSpinorField &src2,
                              const unsigned int parity, const int faceVolumeCB[4], const double coeff)
  {
    unsigned int ghostOffset[4] = {0, 0, 0, 0};
    for (int dir = 0; dir < 4; ++dir) {
      if (src1.GhostOffset(dir) != src2.GhostOffset(dir))
        errorQuda("Mismatched ghost offset[%d] %d != %d", dir, src1.GhostOffset(dir), src2.GhostOffset(dir));
      ghostOffset[dir] = (src1.GhostOffset(dir, 1)) / src1.FieldOrder(); // forwards ghost offset
    }

    CloverForceArg<Float, Output, Gauge, InputA, InputB, InputC, InputD>
      arg(parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL, coeff, inA, inB, inC, inD, gauge, force, out);
    CloverForce<Float, Output, Gauge, InputA, InputB, InputC, InputD> oprod(arg, out, QUDA_CUDA_FIELD_LOCATION);

    // interior contribution over the full local checkerboard volume
    arg.kernelType = OPROD_INTERIOR_KERNEL;
    arg.length = src1.VolumeCB();
    oprod.apply(streams[Nstream - 1]);

    // boundary contributions, one partitioned dimension at a time
    for (int i = 3; i >= 0; i--) {
      if (commDimPartitioned(i)) {
        arg.kernelType = OPROD_EXTERIOR_KERNEL;
        arg.dir = i;
        arg.length = faceVolumeCB[i];
        arg.displacement = 1; // forwards displacement
        oprod.apply(streams[Nstream - 1]);
      }
    }
    qudaDeviceSynchronize();
  } // computeCloverForceCuda
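  // All launches above go to the same stream; the trailing qudaDeviceSynchronize()
  // ensures the interior and exterior contributions are complete before the caller
  // reuses the force field or the exchanged ghost buffers.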
#endif // GPU_CLOVER_DIRAC

  // Compute the force contribution from the solver solution fields.
  void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField *> &x,
                          std::vector<ColorSpinorField *> &p, std::vector<double> &coeff)
  {
#ifdef GPU_CLOVER_DIRAC
    if (x[0]->Precision() != force.Precision())
      errorQuda("Mixed precision not supported: %d %d\n", x[0]->Precision(), force.Precision());

    createCloverForceEvents();
    int dag = 1;

    for (unsigned int i = 0; i < x.size(); i++) {
      // ... (per-parity references inA, inB, inC, inD into x[i] and p[i] are set up here) ...
      for (int parity = 0; parity < 2; parity++) {
        exchangeGhost(static_cast<cudaColorSpinorField &>(inB), parity, dag);
        exchangeGhost(static_cast<cudaColorSpinorField &>(inD), parity, 1 - dag);

        if (x[0]->Precision() == QUDA_DOUBLE_PRECISION) {
          // ... (double-precision spinor accessors spinorA..spinorD are constructed here) ...
          if (U.Reconstruct() == QUDA_RECONSTRUCT_NO) {
            computeCloverForceCuda<double>(/* output and no-reconstruct gauge accessors */
                                           force, spinorA, spinorB, spinorC, spinorD,
                                           static_cast<cudaColorSpinorField &>(inB),
                                           static_cast<cudaColorSpinorField &>(inD),
                                           parity, inB.GhostFace(), coeff[i]);
          } else if (U.Reconstruct() == QUDA_RECONSTRUCT_12) {
            computeCloverForceCuda<double>(/* output and 12-reconstruct gauge accessors */
                                           force, spinorA, spinorB, spinorC, spinorD,
                                           static_cast<cudaColorSpinorField &>(inB),
                                           static_cast<cudaColorSpinorField &>(inD),
                                           parity, inB.GhostFace(), coeff[i]);
          } else {
            errorQuda("Unsupported reconstruction type");
          }
        } else {
#if 0 // FIXME - Spinor class expect float4 pointer not Complex pointer
          // ... (single-precision accessors would go here) ...
#endif
          errorQuda("Unsupported precision: %d\n", x[0]->Precision());
        }
      } // parity
    } // i

    destroyCloverForceEvents();
#else // GPU_CLOVER_DIRAC not defined
    errorQuda("Clover Dirac operator has not been built!");
#endif
  }