#ifdef GPU_CLOVER_DIRAC

  /**
     Create every CUDA event in a fixed-size array.
     @param event  Reference to an array of N events to be created.
     @param flags  Event-creation flags (default cudaEventDefault; callers
                   below pass cudaEventDisableTiming since the events are
                   used only for stream ordering, not timing).
     NOTE(review): the loop body was dropped by the extraction that produced
     this paste; reconstructed as the symmetric counterpart of
     destroyEventArray — confirm against the full source.
  */
  template <int N>
  void createEventArray(cudaEvent_t (&event)[N], unsigned int flags = cudaEventDefault)
  {
    // cudaEventCreate(cudaEvent_t*, unsigned int) is the CUDA C++ overload
    // forwarding to cudaEventCreateWithFlags.
    for (int i = 0; i < N; ++i) cudaEventCreate(&event[i], flags);
  }
28 void destroyEventArray(cudaEvent_t (&
event)[N])
30 for(
int i=0;
i<N; ++
i)
31 cudaEventDestroy(
event[
i]);
// Events bracketing the outer-product (clover-force) kernels; created in
// createCloverForceEvents() and destroyed in destroyCloverForceEvents().
38 static cudaEvent_t oprodStart;
39 static cudaEvent_t oprodEnd;
42 void createCloverForceEvents(){
43 cudaEventCreate(&
packEnd, cudaEventDisableTiming);
44 createEventArray(
gatherEnd, cudaEventDisableTiming);
45 createEventArray(
scatterEnd, cudaEventDisableTiming);
46 cudaEventCreate(&oprodStart, cudaEventDisableTiming);
47 cudaEventCreate(&oprodEnd, cudaEventDisableTiming);
51 void destroyCloverForceEvents(){
55 cudaEventDestroy(oprodStart);
56 cudaEventDestroy(oprodEnd);
// Selects which kernel variant a CloverForceArg / CloverForce instance
// drives: the bulk (interior) kernel or the per-dimension halo (exterior)
// kernel.
60 enum KernelType {OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL};
// Kernel argument struct for the clover outer-product kernels; passed by
// value at kernel launch.
// NOTE(review): the extraction dropped several lines of this struct —
// members visible only through the initializer list below (kernelType, inC,
// inD_shift, gauge, force, and presumably X, parity, coeff, length, inA,
// inB_shift) — confirm against the full source.
62 template<
typename Float,
typename Output,
typename Gauge,
typename InputA,
typename InputB,
typename InputC,
typename InputD>
63 struct CloverForceArg {
// Per-dimension offsets into the packed ghost (halo) buffer; units are set
// by the caller (see computeCloverForceCuda: GhostOffset / FieldOrder).
68 unsigned int ghostOffset[4];
// Site displacement of the shifted spinors (the driver below passes 1,
// i.e. nearest neighbour).
69 unsigned int displacement;
// Constructor: copies the plain parameters and binds the spinor / gauge /
// force accessors.  Initializer list is partially lost to extraction.
80 CloverForceArg(
const unsigned int parity,
81 const unsigned int dir,
82 const unsigned int *ghostOffset,
83 const unsigned int displacement,
93 displacement(displacement), kernelType(kernelType),
95 inC(inC), inD_shift(inD_shift),
96 gauge(gauge), force(force)
// Copy the local lattice extents from the meta gauge field.
98 for(
int i=0;
i<4; ++
i) this->X[
i] = meta.X()[
i];
// Copy the per-dimension ghost-buffer offsets.
99 for(
int i=0;
i<4; ++
i) this->ghostOffset[
i] = ghostOffset[
i];
// Index-decomposition helper specialised on IndexType (branches on EVEN_Y /
// EVEN_Z are visible below; presumably an EVEN_X case precedes them).
// NOTE(review): the signature line carrying the function name/return type
// (original line 112) and most of the branch bodies were lost in the
// extraction — incomplete fragment, kept verbatim.
111 template <IndexType
idxType>
113 const unsigned int cb_idx,
const unsigned int parity,
const int X[4])
// Convenience aliases for the local lattice extents ...
115 const int &LX =
X[0];
116 const int &LY =
X[1];
117 const int &LZ =
X[2];
// ... and the derived 3-volume and 2-plane sizes used by the divisions.
118 const int XYZ =
X[2]*
X[1]*
X[0];
119 const int XY =
X[1]*
X[0];
// Fragment of the coordinate decomposition: peel off the Y coordinate.
134 int aux2 = aux1 / LY;
135 y = aux1 - aux2 * LY;
141 }
else if (idxType ==
EVEN_Y ) {
147 }
else if (idxType ==
EVEN_Z ) {
// Compute the full 4-d coordinates x[0..3] of a boundary site from its
// checkerboard index cb_idx within a face of dimension `dir`, placed
// `displacement` sites in from the lattice edge.  The site parity fixes the
// even/odd bit of the fastest-varying (checkerboarded) coordinate.
// NOTE(review): the `if (dir == ...)` lines separating the four branches
// below were lost in the extraction; the grouping is inferred from which
// coordinate receives the (X[d] - displacement) boundary offset.
170 __device__
static void coordsFromIndex(
int x[4],
const unsigned int cb_idx,
const int X[4],
const unsigned int dir,
const int displacement,
const unsigned int parity)
// Halved extents for the two coordinates that may be checkerboarded.
172 int Xh[2] = {
X[0]/2,
X[1]/2};
// dir == 0 face: x[1] is the checkerboarded coordinate (hence Xh[1]);
// x[0] sits at the boundary, X[0] - displacement.
175 x[2] = cb_idx/
Xh[1] %
X[2];
176 x[3] = cb_idx/(
Xh[1]*
X[2]) %
X[3];
177 x[0] = cb_idx/(
Xh[1]*
X[2]*
X[3]);
178 x[0] += (
X[0] - displacement);
// Reconstruct x[1]: doubled index plus the parity-completing bit.
179 x[1] = 2*(cb_idx %
Xh[1]) + ((
x[0]+
x[2]+
x[3]+
parity)&1);
// dir == 1 face: x[0] checkerboarded (Xh[0]); x[1] at the boundary.
183 x[2] = cb_idx/
Xh[0] %
X[2];
184 x[3] = cb_idx/(
Xh[0]*
X[2]) %
X[3];
185 x[1] = cb_idx/(
Xh[0]*
X[2]*
X[3]);
186 x[1] += (
X[1] - displacement);
187 x[0] = 2*(cb_idx %
Xh[0]) + ((
x[1]+
x[2]+
x[3]+
parity)&1);
// dir == 2 face: x[0] checkerboarded; x[2] at the boundary.
191 x[1] = cb_idx/
Xh[0] %
X[1];
192 x[3] = cb_idx/(
Xh[0]*
X[1]) %
X[3];
193 x[2] = cb_idx/(
Xh[0]*
X[1]*
X[3]);
194 x[2] += (
X[2] - displacement);
195 x[0] = 2*(cb_idx %
Xh[0]) + ((
x[1]+
x[2]+
x[3]+
parity)&1);
// dir == 3 face: x[0] checkerboarded; x[3] at the boundary.
199 x[1] = cb_idx/
Xh[0] %
X[1];
200 x[2] = cb_idx/(
Xh[0]*
X[1]) %
X[2];
201 x[3] = cb_idx/(
Xh[0]*
X[1]*
X[2]);
202 x[3] += (
X[3] - displacement);
203 x[0] = 2*(cb_idx %
Xh[0]) + ((
x[1]+
x[2]+
x[3]+
parity)&1);
// Map a checkerboard site index plus a per-dimension shift to the
// checkerboard index of the neighbouring site.
// NOTE(review): the body that derives x[] from cb_idx, applies `shift`, and
// handles `partitioned` boundaries (original lines 212-223) was lost in the
// extraction; only the final linearisation survives.
210 __device__ __forceinline__
211 int neighborIndex(
const unsigned int cb_idx,
const int shift[4],
const bool partitioned[4],
const unsigned int parity,
const int X[4]){
// Linearise the full-lattice coordinate (t slowest, x fastest) and halve
// to return to checkerboard indexing.
224 return (((
x[3]*
X[2] +
x[2])*
X[1] +
x[1])*
X[0] +
x[0]) >> 1;
// Bulk (interior) clover-force kernel: for each interior site, loads the
// unshifted spinors A and C, the neighbour spinors B and D, projects and
// reconstructs the latter along each dimension, and accumulates
// coeff * U * (outer product) into the force field.
// NOTE(review): the extraction dropped the thread-index computation (idx),
// the per-dimension loop (dim), the neighbour-index computation (nbr_idx),
// the outer-product / U / temp loads and the force save — fragment kept
// verbatim.
229 template<
typename real,
typename Output,
typename Gauge,
typename InputA,
typename InputB,
typename InputC,
typename InputD>
230 __global__
void interiorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD>
arg) {
// Full (3-colour, 4-spin) spinor registers for this site.
234 ColorSpinor<real,3,4> A, B_shift, C, D_shift;
// Load the two unshifted spinors at this site's checkerboard index.
238 arg.inA.load(static_cast<Complex*>(A.data),
idx);
239 arg.inC.load(static_cast<Complex*>(C.data),
idx);
// Neighbour offset, set per dimension inside the (missing) dim loop.
243 int shift[4] = {0,0,0,0};
// Load the forward-neighbour spinors.
248 arg.inB_shift.load(static_cast<Complex*>(B_shift.data), nbr_idx);
249 arg.inD_shift.load(static_cast<Complex*>(D_shift.data), nbr_idx);
// Apply the spin projector for this dimension (sign +1 / -1) and
// reconstruct the full four-spinor from the projected half.
251 B_shift = (B_shift.project(
dim,1)).reconstruct(
dim,1);
254 D_shift = (D_shift.project(
dim,-1)).reconstruct(
dim,-1);
// Accumulate into the previously loaded force (temp): temp + coeff*U*op.
259 result = temp + U*result*
arg.coeff;
// Halo (exterior) clover-force kernel for dimension `dim` (template
// parameter, dispatched 0..3 by CloverForce::apply).  Each thread handles
// one face site: it reads the interior spinors at the adjacent bulk site
// and the projected neighbour spinors from the ghost buffer, then does a
// read-modify-write of the force field.
// NOTE(review): the extraction dropped the coordsFromIndex call producing
// x[], the outer-product computation producing `result`, and the
// grid-stride increment of cb_idx at the loop bottom — fragment kept
// verbatim.
269 template<
int dim,
typename real,
typename Output,
typename Gauge,
typename InputA,
typename InputB,
typename InputC,
typename InputD>
270 __global__
void exteriorOprodKernel(CloverForceArg<real, Output, Gauge, InputA, InputB, InputC, InputD>
arg) {
// Flat global thread index over the face volume.
272 int cb_idx = blockIdx.x*
blockDim.x + threadIdx.x;
274 ColorSpinor<real,3,4> A, B_shift, C, D_shift;
// Half-spinor staging buffer for the projected ghost data.
275 ColorSpinor<real,3,2> projected_tmp;
// Loop until the face volume (arg.length) is exhausted.
279 while (cb_idx<
arg.length){
// Checkerboard index of the interior site adjacent to this face site,
// linearised from the coordinates x[] (computed in a missing line above).
281 const unsigned int bulk_cb_idx = ((((
x[3]*
arg.X[2] +
x[2])*
arg.X[1] +
x[1])*
arg.X[0] +
x[0]) >> 1);
// Unshifted spinors live at the bulk site.
282 arg.inA.load(static_cast<Complex*>(A.data), bulk_cb_idx);
283 arg.inC.load(static_cast<Complex*>(C.data), bulk_cb_idx);
// Index into the packed ghost buffer for this dimension.
285 const unsigned int ghost_idx =
arg.ghostOffset[
dim] + cb_idx;
// Neighbour spinors arrive pre-projected (half spinors) in the ghost
// buffer; reconstruct the full four-spinor with the matching sign.
287 arg.inB_shift.loadGhost(static_cast<Complex*>(projected_tmp.data), ghost_idx,
dim);
288 B_shift = projected_tmp.reconstruct(
dim, 1);
291 arg.inD_shift.loadGhost(static_cast<Complex*>(projected_tmp.data), ghost_idx,
dim);
292 D_shift = projected_tmp.reconstruct(
dim,-1);
// Read-modify-write of the force: load existing force and link, then
// accumulate coeff * U * (outer product) and store back.
295 arg.force.load(reinterpret_cast<real*>(temp.
data), bulk_cb_idx,
dim,
arg.parity);
296 arg.gauge.load(reinterpret_cast<real*>(U.
data), bulk_cb_idx,
dim,
arg.parity);
297 result = temp + U*result*
arg.coeff;
298 arg.force.save(reinterpret_cast<real*>(result.
data), bulk_cb_idx,
dim,
arg.parity);
// Autotuned launcher (Tunable subclass) wrapping the interior/exterior
// outer-product kernels.  Several method headers and closing braces were
// lost in the extraction; code kept verbatim.
305 template<
typename Float,
typename Output,
typename Gauge,
typename InputA,
typename InputB,
typename InputC,
typename InputD>
306 class CloverForce :
public Tunable {
// Kernel arguments (mutated between launches: kernelType, length, dir).
309 CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> &
arg;
// Field supplying volume/extent strings for the tune key.
310 const GaugeField &meta;
// Tunable interface: these kernels use no shared memory.
313 unsigned int sharedBytesPerThread()
const {
return 0; }
314 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
// One thread per site of the current launch (bulk or face volume).
316 unsigned int minThreads()
const {
return arg.length; }
// Grid size is derived from minThreads, not tuned.
317 bool tuneGridDim()
const {
return false; }
320 CloverForce(CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD> &
arg,
322 :
arg(
arg), meta(meta), location(location) {
// Aux string distinguishes tune-cache entries by precision and stride.
323 writeAuxString(
"prec=%lu,stride=%d",
sizeof(Float),
arg.inA.Stride());
329 virtual ~CloverForce() {}
// Launch the kernel selected by arg.kernelType; the exterior kernel is
// template-dispatched on arg.dir (0..3).  `tp` is the tuned launch
// configuration (its tuneLaunch call was lost in extraction).
331 void apply(
const cudaStream_t &
stream){
336 if(
arg.kernelType == OPROD_INTERIOR_KERNEL){
337 interiorOprodKernel<<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
338 }
else if(
arg.kernelType == OPROD_EXTERIOR_KERNEL) {
339 if (
arg.dir == 0) exteriorOprodKernel<0><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
340 else if (
arg.dir == 1) exteriorOprodKernel<1><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
341 else if (
arg.dir == 2) exteriorOprodKernel<2><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
342 else if (
arg.dir == 3) exteriorOprodKernel<3><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
// Fallback errors: unknown kernel type / CPU location unsupported.
344 errorQuda(
"Kernel type not supported\n");
347 errorQuda(
"No CPU support for staggered outer-product calculation\n");
// Presumably preTune/postTune: back up and restore the force field so
// autotuning launches don't corrupt the accumulated result — the
// enclosing method declarations were lost; confirm against full source.
352 this->arg.force.save();
355 this->arg.force.load();
// Flop-count model: interior visits 4 dimensions per site, exterior one.
358 long long flops()
const {
359 if (
arg.kernelType == OPROD_INTERIOR_KERNEL) {
360 return ((
long long)
arg.length)*4*(24 + 144 + 234);
362 return ((
long long)
arg.length)*(144 + 234);
// Byte-traffic model: interior reads A,C once plus per-dimension
// neighbour spinors, force (read+write) and links; exterior reads
// half spinors (hence the /2) from the ghost buffer.
365 long long bytes()
const {
366 if (
arg.kernelType == OPROD_INTERIOR_KERNEL) {
367 return ((
long long)
arg.length)*(
arg.inA.Bytes() +
arg.inC.Bytes() + 4*(
arg.inB_shift.Bytes() +
arg.inD_shift.Bytes() + 2*
arg.force.Bytes() +
arg.gauge.Bytes()));
369 return ((
long long)
arg.length)*(
arg.inA.Bytes() +
arg.inB_shift.Bytes()/2 +
arg.inC.Bytes() +
arg.inD_shift.Bytes()/2 + 2*
arg.force.Bytes() +
arg.gauge.Bytes());
// Tune key: volume string + kernel name + aux string extended with the
// kernel variant (interior / exterior + dir) so each variant tunes
// independently.
373 TuneKey tuneKey()
const {
376 if (
arg.kernelType == OPROD_INTERIOR_KERNEL) {
377 strcat(new_aux,
",interior");
379 strcat(new_aux,
",exterior");
380 if (
arg.dir==0)
strcat(new_aux,
",dir=0");
381 else if (
arg.dir==1)
strcat(new_aux,
",dir=1");
382 else if (
arg.dir==2)
strcat(new_aux,
",dir=2");
383 else if (
arg.dir==3)
strcat(new_aux,
",dir=3");
385 return TuneKey(meta.VolString(),
"CloverForce", new_aux);
// Halo exchange for spinor `a`: gather the forward faces on the device,
// run the host communications per dimension, then scatter the received
// data back into the ghost regions.  The `2*i` argument selects the
// forward direction of dimension i.
// NOTE(review): the pack call and the event/stream synchronisation lines
// between the three loops were lost in the extraction.
389 void exchangeGhost(cudaColorSpinorField &
a,
int parity,
int dag) {
// Gather each dimension's face into the communication buffers.
402 for(
int i=3;
i>=0;
i--){
405 a.gather(1, dag, 2*
i);
// Kick off the host-side communications.
411 for (
int i=3;
i>=0;
i--) {
413 a.commsStart(1, 2*
i, dag);
// Wait for each communication, then scatter the result to the device.
417 for (
int i=3;
i>=0;
i--) {
419 a.commsWait(1, 2*
i, dag);
420 a.scatter(1, dag, 2*
i);
// Flip the double-buffer index for the next exchange.
427 a.bufferIndex = (1 -
a.bufferIndex);
// Host driver for one precision/order instantiation: validates that the two
// source fields share ghost-buffer layout, builds the kernel arguments,
// runs the interior kernel over the full checkerboard volume, then runs the
// exterior kernel over each partitioned face.
// NOTE(review): the CloverForce construction / apply() calls and the
// communication ordering lines were lost in the extraction.
431 template<
typename Float,
typename Output,
typename Gauge,
typename InputA,
typename InputB,
typename InputC,
typename InputD>
432 void computeCloverForceCuda(Output force, Gauge gauge, GaugeField&
out,
433 InputA& inA, InputB& inB, InputC &inC, InputD &inD,
434 cudaColorSpinorField& src1, cudaColorSpinorField& src2,
435 const unsigned int parity,
const int faceVolumeCB[4],
const double coeff)
439 unsigned int ghostOffset[4] = {0,0,0,0};
// Both sources must use identical ghost offsets since the kernels index
// one shared offset table.
440 for (
int dir=0; dir<4; ++dir) {
441 if (src1.GhostOffset(dir) != src2.GhostOffset(dir))
442 errorQuda(
"Mismatched ghost offset[%d] %d != %d", dir, src1.GhostOffset(dir), src2.GhostOffset(dir));
// Convert the byte/element offset into field-order units for loadGhost.
443 ghostOffset[dir] = (src1.GhostOffset(dir,1))/src1.FieldOrder();
// Arguments start configured for the interior pass (dir=0 placeholder,
// displacement=1 i.e. nearest neighbour).
447 CloverForceArg<Float,Output,Gauge,InputA,InputB,InputC,InputD>
arg(
parity, 0, ghostOffset, 1, OPROD_INTERIOR_KERNEL,
coeff, inA, inB, inC, inD, gauge, force,
out);
// Interior pass over the full checkerboard volume.
450 arg.kernelType = OPROD_INTERIOR_KERNEL;
451 arg.length = src1.VolumeCB();
// Exterior pass: one launch per dimension, sized by its face volume.
454 for (
int i=3;
i>=0;
i--) {
457 arg.kernelType = OPROD_EXTERIOR_KERNEL;
459 arg.length = faceVolumeCB[
i];
460 arg.displacement = 1;
// Public entry point computeCloverForce(force, U, x, p, coeff) — heavily
// truncated by the extraction (the signature is fused with the preceding
// #endif on the next line).  Visible behaviour: create the events, loop
// over the solution/vector field pairs, exchange ghosts for the two spinor
// halves with opposite dagger flags, dispatch computeCloverForceCuda per
// reconstruction type, error out on unsupported reconstruction/precision,
// and destroy the events.  Confirm all details against the full source.
466 #endif // GPU_CLOVER_FORCE 470 std::vector<ColorSpinorField*> &
x,
471 std::vector<ColorSpinorField*> &
p,
472 std::vector<double> &
coeff)
475 #ifdef GPU_CLOVER_DIRAC 482 createCloverForceEvents();
// One contribution per solution field.
486 for (
unsigned int i=0;
i<
x.size();
i++) {
// The two spinor halves are exchanged with opposite dagger flags.
503 exchangeGhost(static_cast<cudaColorSpinorField&>(inB),
parity, dag);
508 exchangeGhost(static_cast<cudaColorSpinorField&>(inD),
parity, 1-dag);
// Per-reconstruction dispatch (surrounding case labels lost).
513 force, spinorA, spinorB, spinorC, spinorD,
515 static_cast<cudaColorSpinorField&>(inD),
520 force, spinorA, spinorB, spinorC, spinorD,
522 static_cast<cudaColorSpinorField&>(inD),
// NOTE(review): "recontruction" typo in the message below — fix upstream
// (message text deliberately left byte-identical here).
525 errorQuda(
"Unsupported recontruction type");
527 #if 0 // FIXME - Spinor class expect float4 pointer not Complex pointer 531 errorQuda(
"Unsupported precision: %d\n",
x[0]->Precision());
536 destroyCloverForceEvents();
538 #else // GPU_CLOVER_DIRAC not defined 539 errorQuda(
"Clover Dirac operator has not been built!");
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
cudaEvent_t scatterEnd[Nstream]
void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< double > &coeff)
Compute the force contribution from the solver solution fields.
void setPackComms(const int *commDim)
std::complex< double > Complex
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
char * strcat(char *__s1, const char *__s2)
__device__ __host__ Matrix< complex< Float >, Nc > outerProdSpinTrace(const ColorSpinor< Float, Nc, Ns > &a, const ColorSpinor< Float, Nc, Ns > &b)
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param ¶m)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
static __inline__ size_t p
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
cudaEvent_t gatherEnd[Nstream]
static unsigned int unsigned int shift
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void setKernelPackT(bool pack)
const int * GhostFace() const
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
QudaReconstructType Reconstruct() const
QudaGaugeFieldOrder Order() const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
const void int size_t unsigned int flags
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const