#ifdef GPU_STAGGERED_OPROD
  template<int N>
  void createEventArray(cudaEvent_t (&event)[N], unsigned int flags=cudaEventDefault)
  {
    for(int i=0; i<N; ++i)
      cudaEventCreate(&event[i], flags);
  }

  template<int N>
  void destroyEventArray(cudaEvent_t (&event)[N])
  {
    for(int i=0; i<N; ++i)
      cudaEventDestroy(event[i]);
  }
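  // Hedged usage sketch of the two helpers above: the array extent N is deduced from the
  // reference-to-array parameter, so no explicit count needs to be passed. The function and
  // array names below are illustrative only and do not appear in the QUDA sources.
  inline void eventArrayDemo()
  {
    cudaEvent_t demoEvents[4];                              // e.g. one event per dimension
    createEventArray(demoEvents, cudaEventDisableTiming);   // N = 4 is deduced automatically
    destroyEventArray(demoEvents);                          // paired tear-down of the same array
  }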
  static cudaEvent_t packEnd;
  static cudaEvent_t gatherEnd[4];
  static cudaEvent_t scatterEnd[4];
  static cudaEvent_t oprodStart;
  static cudaEvent_t oprodEnd;

  void createStaggeredOprodEvents()
  {
    cudaEventCreate(&packEnd, cudaEventDisableTiming);
    createEventArray(gatherEnd, cudaEventDisableTiming);
    createEventArray(scatterEnd, cudaEventDisableTiming);
    cudaEventCreate(&oprodStart, cudaEventDisableTiming);
    cudaEventCreate(&oprodEnd, cudaEventDisableTiming);
  }

  void destroyStaggeredOprodEvents()
  {
    destroyEventArray(gatherEnd);
    destroyEventArray(scatterEnd);
    cudaEventDestroy(packEnd);
    cudaEventDestroy(oprodStart);
    cudaEventDestroy(oprodEnd);
  }
  enum KernelType {OPROD_INTERIOR_KERNEL, OPROD_EXTERIOR_KERNEL};
  template<typename Complex, typename Output, typename InputA, typename InputB>
  struct StaggeredOprodArg {
    unsigned int length;
    int X[4];
    unsigned int parity;
    unsigned int dir;
    unsigned int ghostOffset;
    unsigned int displacement;
    KernelType kernelType;
    bool partitioned[4];
    InputA inA;
    InputB inB;
    Output outA;
    Output outB;
    cudaGaugeField& outFieldA;
    cudaGaugeField& outFieldB;
    typename RealTypeId<Complex>::Type coeff[2];

    StaggeredOprodArg(const unsigned int length,
        const int X[4],
        const unsigned int parity,
        const unsigned int dir,
        const unsigned int ghostOffset,
        const unsigned int displacement,
        const KernelType& kernelType,
        const double coeff[2],
        InputA& inA,
        InputB& inB,
        Output& outA,
        Output& outB,
        cudaGaugeField& outFieldA,
        cudaGaugeField& outFieldB) : length(length), parity(parity), ghostOffset(ghostOffset),
      displacement(displacement), kernelType(kernelType), inA(inA), inB(inB), outA(outA), outB(outB),
      outFieldA(outFieldA), outFieldB(outFieldB)
    {
      this->dir = dir;
      this->coeff[0] = coeff[0];
      this->coeff[1] = coeff[1];
      for(int i=0; i<4; ++i) this->X[i] = X[i];
      for(int i=0; i<4; ++i) this->partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };
  template <IndexType idxType>
  static __device__ __forceinline__ void coordsFromIndex(int& idx, int c[4],
      const unsigned int cb_idx, const unsigned int parity, const int X[4])
  {
    const int &LX = X[0];
    const int &LY = X[1];
    const int &LZ = X[2];
    const int XYZ = X[2]*X[1]*X[0];
    const int XY = X[1]*X[0];

    idx = 2*cb_idx;
    int x, y, z, t;

    if (idxType == EVEN_X) {         // X dimension is even: use a single division chain
      int aux1 = idx / LX;
      x = idx - aux1 * LX;
      int aux2 = aux1 / LY;
      y = aux1 - aux2 * LY;
      t = aux2 / LZ;
      z = aux2 - t * LZ;
      aux1 = (parity + t + z + y) & 1;
      x += aux1;
      idx += aux1;
    } else if (idxType == EVEN_Y) {  // Y dimension is even
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      idx += (parity + t + z) & 1;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else if (idxType == EVEN_Z) {  // Z dimension is even
      t = idx / XYZ;
      idx += (parity + t) & 1;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    } else {                         // fall back to the T-even case
      idx += parity;
      t = idx / XYZ;
      z = (idx / XY) % LZ;
      y = (idx / LX) % LY;
      x = idx % LX;
    }

    c[0] = x;
    c[1] = y;
    c[2] = z;
    c[3] = t;
  }
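  // Hedged worked example of the checkerboard decoding above (an illustrative helper, not part
  // of QUDA): for an 8^4 lattice and odd parity, cb_idx = 37 gives idx = 74; the division chain
  // yields x = 2, y = 1, z = 1, t = 0 and a parity shift of 1, i.e. the full-lattice site
  // (3,1,1,0) with lexicographic index 75 = 2*37 + 1, whose coordinate sum is odd as required.
  inline int checkerboardDecodeDemo()
  {
    const int X[4] = {8, 8, 8, 8};
    const unsigned int parity = 1, cb_idx = 37;
    int idx = 2*cb_idx;              // 74
    int aux1 = idx / X[0];           // 9
    int x = idx - aux1*X[0];         // 2
    int aux2 = aux1 / X[1];          // 1
    int y = aux1 - aux2*X[1];        // 1
    int t = aux2 / X[2];             // 0
    int z = aux2 - t*X[2];           // 1
    x += (parity + t + z + y) & 1;   // shift = 1, so x becomes 3
    return (x + y + z + t) & 1;      // returns 1: the site lies on the odd sublattice
  }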
  // Get the coordinates for the exterior kernels
  template<int Nspin>
  __device__ void coordsFromIndex(int x[4], const unsigned int cb_idx, const int X[4],
      const unsigned int dir, const int displacement, const unsigned int parity)
  {
    unsigned int Xh[2] = {X[0]/2, X[1]/2};
    if(Nspin == 1){
      if(dir == 0){
        x[2] = cb_idx/Xh[1] % X[2];
        x[3] = cb_idx/(Xh[1]*X[2]) % X[3];
        x[0] = cb_idx/(Xh[1]*X[2]*X[3]);
        x[0] += (X[0] - displacement);
        x[1] = 2*(cb_idx % Xh[1]) + ((x[0]+x[2]+x[3]+parity)&1);
      }else if(dir == 1){
        x[2] = cb_idx/Xh[0] % X[2];
        x[3] = cb_idx/(Xh[0]*X[2]) % X[3];
        x[1] = cb_idx/(Xh[0]*X[2]*X[3]);
        x[1] += (X[1] - displacement);
        x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
      }else if(dir == 2){
        x[1] = cb_idx/Xh[0] % X[1];
        x[3] = cb_idx/(Xh[0]*X[1]) % X[3];
        x[2] = cb_idx/(Xh[0]*X[1]*X[3]);
        x[2] += (X[2] - displacement);
        x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
      }else if(dir == 3){
        x[1] = cb_idx/Xh[0] % X[1];
        x[2] = cb_idx/(Xh[0]*X[1]) % X[2];
        x[3] = cb_idx/(Xh[0]*X[1]*X[2]);
        x[3] += (X[3] - displacement);
        x[0] = 2*(cb_idx % Xh[0]) + ((x[1]+x[2]+x[3]+parity)&1);
      }
    }else if(Nspin == 3){
      // Nspin == 3 branch not shown in this excerpt
    }
    return;
  }
  template<int Nspin, int Nface>
  __device__ int ghostIndexFromCoords(const int x[4], const int X[4], const unsigned int dir, const int shift){
    return 0;
  }

  template<>
  __device__ int ghostIndexFromCoords<1,3>(const int x[4], const int X[4], const unsigned int dir, const int shift)
  {
    int ghost_idx;
    if(shift > 0){
      if((x[dir] + shift) >= X[dir]){   // forward face: the site has run past the local extent
        switch(dir){
          case 0: ghost_idx = (3*3 + (x[0]-X[0]+shift))*(X[3]*X[2]*X[1])/2 + ((x[3]*X[2] + x[2])*X[1] + x[1])/2; break;
          case 1: ghost_idx = (3*3 + (x[1]-X[1]+shift))*(X[3]*X[2]*X[0])/2 + (x[3]*X[2]*X[0] + x[2]*X[0] + x[0])/2; break;
          case 2: ghost_idx = (3*3 + (x[2]-X[2]+shift))*(X[3]*X[1]*X[0])/2 + (x[3]*X[1]*X[0] + x[1]*X[0] + x[0])/2; break;
          case 3: ghost_idx = (3*3 + (x[3]-X[3]+shift))*(X[2]*X[1]*X[0])/2 + (x[2]*X[1]*X[0] + x[1]*X[0] + x[0])/2; break;
        }
      }
    }else{ // shift < 0
      if(static_cast<int>(x[dir]) + shift < 0){   // backward face
        switch(dir){
          case 0: ghost_idx = (3 + shift)*(X[3]*X[2]*X[1])/2 + ((x[3]*X[2] + x[2])*X[1] + x[1])/2; break;
          case 1: ghost_idx = (3 + shift)*(X[3]*X[2]*X[0])/2 + ((x[3]*X[2] + x[2])*X[0] + x[0])/2; break;
          case 2: ghost_idx = (3 + shift)*(X[3]*X[1]*X[0])/2 + ((x[3]*X[1] + x[1])*X[0] + x[0])/2; break;
          case 3: ghost_idx = (3 + shift)*(X[2]*X[1]*X[0])/2 + ((x[2]*X[1] + x[1])*X[0] + x[0])/2; break;
        }
      }
    }
    return ghost_idx;
  }
  __device__ __forceinline__
  int neighborIndex(const unsigned int& cb_idx, const int shift[4],
      const bool partitioned[4], const unsigned int& parity, const int X[4])
  {
    int full_idx;
    int x[4];
    coordsFromIndex<EVEN_X>(full_idx, x, cb_idx, parity, X);

    // a shift that leaves the local volume in a partitioned direction is handled by the exterior kernel
    for(int dim=0; dim<4; ++dim){
      if(partitioned[dim])
        if( (x[dim]+shift[dim])<0 || (x[dim]+shift[dim])>=X[dim]) return -1;
    }

    for(int dim=0; dim<4; ++dim){
      x[dim] = shift[dim] ? (x[dim]+shift[dim] + X[dim]) % X[dim] : x[dim];
    }
    return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1;
  }
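  // Illustrative compile-time check of the periodic wrap-around used in neighborIndex
  // (assumes C++11 static_assert is available; the numbers are arbitrary examples):
  // (x + shift + X) % X keeps a shifted coordinate on the lattice for either sign of the shift.
  static_assert((6 + 3 + 8) % 8 == 1, "x=6 shifted by +3 wraps to 1 when X=8");
  static_assert((1 - 3 + 8) % 8 == 6, "x=1 shifted by -3 wraps to 6 when X=8");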
  template<typename Complex, typename Output, typename InputA, typename InputB>
  __global__ void interiorOprodKernel(StaggeredOprodArg<Complex, Output, InputA, InputB> arg)
  {
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    typedef typename RealTypeId<Complex>::Type real;
    Complex x[3];            // local source spinor
    Complex y[3];            // one-hop neighbor
    Complex z[3];            // three-hop neighbor
    Matrix<Complex,3> result;
    Matrix<Complex,3> tempA, tempB;

    while(idx < arg.length){
      arg.inA.load(x, idx);
      for(int dir=0; dir<4; ++dir){
        int shift[4] = {0,0,0,0};
        shift[dir] = 1;
        const int first_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
        if(first_nbr_idx >= 0){
          arg.inB.load(y, first_nbr_idx);
          outerProd(y, x, &result);   // one-hop outer product
          arg.outA.load(reinterpret_cast<real*>(tempA.data), idx, dir, arg.parity);
          result = tempA + result*arg.coeff[0];
          arg.outA.save(reinterpret_cast<real*>(result.data), idx, dir, arg.parity);

          shift[dir] = 3;
          const int third_nbr_idx = neighborIndex(idx, shift, arg.partitioned, arg.parity, arg.X);
          if(third_nbr_idx >= 0){
            arg.inB.load(z, third_nbr_idx);
            outerProd(z, x, &result); // three-hop outer product
            arg.outB.load(reinterpret_cast<real*>(tempB.data), idx, dir, arg.parity);
            result = tempB + result*arg.coeff[1];
            arg.outB.save(reinterpret_cast<real*>(result.data), idx, dir, arg.parity);
          }
        }
      } // dir
      idx += gridSize;
    }
    return;
  }
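  // Hedged sketch (plain C++, not the kernel code above) of the accumulation the interior and
  // exterior kernels perform: the outer product of color vectors a and b gives a 3x3 matrix
  // m(i,j) = a_i * conj(b_j), which is scaled by the coefficient and added to the matrix already
  // stored in the field. The conjugation convention here is an assumption for illustration, and
  // <complex> is presumed to be available through the surrounding headers.
  inline void accumulateOprodDemo(const std::complex<double> a[3], const std::complex<double> b[3],
      double coeff, std::complex<double> out[3][3])
  {
    for(int i=0; i<3; ++i)
      for(int j=0; j<3; ++j)
        out[i][j] += coeff * a[i] * std::conj(b[j]);  // out += coeff * (a x b^dagger)
  }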
  template<typename Complex, typename Output, typename InputA, typename InputB>
  __global__ void exteriorOprodKernel(StaggeredOprodArg<Complex, Output, InputA, InputB> arg)
  {
    unsigned int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;

    typedef typename RealTypeId<Complex>::Type real;
    Complex a[3];               // bulk spinor
    Complex b[3];               // ghost-zone spinor
    Matrix<Complex,3> result;
    Matrix<Complex,3> inmatrix; // the value already stored in the output field

    Output& out = (arg.displacement == 1) ? arg.outA : arg.outB;
    real coeff = (arg.displacement == 1) ? arg.coeff[0] : arg.coeff[1];

    int x[4];
    while(cb_idx < arg.length){
      coordsFromIndex<1>(x, cb_idx, arg.X, arg.dir, arg.displacement, arg.parity);
      const unsigned int bulk_cb_idx = ((((x[3]*arg.X[2] + x[2])*arg.X[1] + x[1])*arg.X[0] + x[0]) >> 1);

      out.load(reinterpret_cast<real*>(inmatrix.data), bulk_cb_idx, arg.dir, arg.parity);
      arg.inA.load(a, bulk_cb_idx);

      const unsigned int ghost_idx = arg.ghostOffset + ghostIndexFromCoords<1,3>(x, arg.X, arg.dir, arg.displacement);
      arg.inB.load(b, ghost_idx);

      outerProd(b, a, &result);
      result = inmatrix + result*coeff;
      out.save(reinterpret_cast<real*>(result.data), bulk_cb_idx, arg.dir, arg.parity);

      cb_idx += gridSize;
    }
    return;
  }
  template<typename Complex, typename Output, typename InputA, typename InputB>
  class StaggeredOprodField : public Tunable {

    private:
      StaggeredOprodArg<Complex,Output,InputA,InputB> arg;
      const GaugeField &meta;
      QudaFieldLocation location;

      unsigned int sharedBytesPerThread() const { return 0; }
      unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

      unsigned int minThreads() const { return arg.outA.volumeCB; }
      bool tunedGridDim() const { return false; }

    public:
      StaggeredOprodField(const StaggeredOprodArg<Complex,Output,InputA,InputB> &arg,
          const GaugeField &meta, QudaFieldLocation location)
        : arg(arg), meta(meta), location(location) {
        writeAuxString("threads=%d,prec=%lu,stride=%d", arg.length, sizeof(Complex)/2, arg.inA.Stride());
      }

      virtual ~StaggeredOprodField() {}

      void set(const StaggeredOprodArg<Complex,Output,InputA,InputB> &arg, QudaFieldLocation location){
        // refresh the members that change between the interior and exterior launches
        this->arg.dir = arg.dir;
        this->arg.length = arg.length;
        this->arg.ghostOffset = arg.ghostOffset;
        this->arg.kernelType = arg.kernelType;
        this->location = location;
      }

      void apply(const cudaStream_t &stream){
        if(location == QUDA_CUDA_FIELD_LOCATION){
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          if(arg.kernelType == OPROD_INTERIOR_KERNEL){
            interiorOprodKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          }else if(arg.kernelType == OPROD_EXTERIOR_KERNEL){
            exteriorOprodKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
          }else{
            errorQuda("Kernel type not supported\n");
          }
        }else{ // run the CPU code
          errorQuda("No CPU support for staggered outer-product calculation\n");
        }
      }

      void preTune(){
        this->arg.outFieldA.backup();
        this->arg.outFieldB.backup();
      }

      void postTune(){
        this->arg.outFieldA.restore();
        this->arg.outFieldB.restore();
      }

      long long flops() const { return 0; } // not yet counted
      long long bytes() const { return 0; } // not yet counted

      TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
  }; // StaggeredOprodField
  template<typename Complex, typename Output, typename InputA, typename InputB>
  void computeStaggeredOprodCuda(Output outA, Output outB, cudaGaugeField& outFieldA, cudaGaugeField& outFieldB,
      InputA& inA, InputB& inB, cudaColorSpinorField& src, FaceBuffer& faceBuffer,
      const unsigned int parity, const int faceVolumeCB[4],
      const unsigned int ghostOffset[4], const double coeff[2])
  {
    cudaEventRecord(oprodStart, streams[Nstream-1]);

    // src is a single-parity field, so double X[0] to recover the full lattice extent
    const int dim[4] = {src.X(0)*2, src.X(1), src.X(2), src.X(3)};

    // create the arguments for the interior kernel
    StaggeredOprodArg<Complex,Output,InputA,InputB> arg(outA.volumeCB, dim, parity, 0, 0, 1,
        OPROD_INTERIOR_KERNEL, coeff, inA, inB, outA, outB, outFieldA, outFieldB);

    // check whether any dimension is partitioned and therefore needs a halo exchange
    bool pack = false;
    for(int i=3; i>=0; i--){
      if(commDimPartitioned(i)){
        pack = true;
        break;
      }
    } // i=3,..,0

    // packing of the faces is done in streams[Nstream-1]
    faceBuffer.pack(src, 1-parity, 0, streams);
    cudaEventRecord(packEnd, streams[Nstream-1]);

    for(int i=3; i>=0; i--){
      if(commDimPartitioned(i)){
        cudaEvent_t &event = (i!=3) ? packEnd : oprodStart;
        cudaStreamWaitEvent(streams[2*i], event, 0);   // stream 2*i waits until packing is done

        // initiate the device-to-host transfer of this face of the source spinor
        faceBuffer.gather(src, false, 2*i);
        cudaEventRecord(gatherEnd[i], streams[2*i]);   // record the end of the gather
      }
    } // i=3,..,0
    // completion flags for the pipeline below; index 4 is a sentinel that is always "complete"
    int gatherCompleted[5];
    int commsCompleted[5];
    int oprodCompleted[4];

    for(int i=0; i<4; ++i){
      gatherCompleted[i] = commsCompleted[i] = oprodCompleted[i] = 0;
    }
    gatherCompleted[4] = commsCompleted[4] = 1;

    // total number of completion events to wait for:
    // three stages (gather, comms, exterior kernel) per partitioned dimension
    int commDimTotal = 0;
    for(int i=0; i<4; ++i){
      commDimTotal += commDimPartitioned(i);
    }
    commDimTotal *= 3;

    // for each partitioned direction, record the previous partitioned direction (4 if none),
    // so the stages are retired in a fixed order (see previousDirDemo() after this function)
    int previousDir[4];
    for(int i=3; i>=0; i--){
      if(commDimPartitioned(i)){
        int prev = 4;
        for(int j=3; j>i; j--){
          if(commDimPartitioned(j)) prev = j;
        }
        previousDir[i] = prev;
      }
    }
    // launch the exterior kernels as the face data arrives
    arg.kernelType = OPROD_EXTERIOR_KERNEL;
    unsigned int completeSum = 0;
    while(completeSum < commDimTotal){
      for(int i=3; i>=0; i--){
        if(!commDimPartitioned(i)) continue;

        // check whether the gather for this direction has finished
        if(!gatherCompleted[i] && gatherCompleted[previousDir[i]]){
          cudaError_t event_test = cudaEventQuery(gatherEnd[i]);
          if(event_test == cudaSuccess){
            gatherCompleted[i] = 1;
            completeSum++;
            faceBuffer.commsStart(2*i);   // begin the halo exchange for this direction
          }
        }

        // check whether the halo exchange for this direction has finished
        if(!commsCompleted[i] && commsCompleted[previousDir[i]] && gatherCompleted[i]){
          int comms_test = faceBuffer.commsQuery(2*i);
          if(comms_test){
            commsCompleted[i] = 1;
            completeSum++;
            faceBuffer.scatter(src, false, 2*i);   // scatter the received face into the ghost zone
          }
        }

        // once the face data is resident, enqueue the exterior kernels for this direction
        if(!oprodCompleted[i] && commsCompleted[i]){
          cudaEventRecord(scatterEnd[i], streams[2*i]);
          cudaStreamWaitEvent(streams[Nstream-1], scatterEnd[i], 0);

          arg.dir = i;
          arg.ghostOffset = ghostOffset[i];
          const unsigned int volume = arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3];
          arg.inB.setStride(3*volume/(2*arg.X[arg.dir]));   // ghost-zone stride for this face

          { // one-hop term
            arg.length = faceVolumeCB[i];
            arg.displacement = 1;
            dim3 blockDim(128, 1, 1);
            const int gridSize = (arg.length + (blockDim.x-1))/blockDim.x;
            dim3 gridDim(gridSize, 1, 1);
            exteriorOprodKernel<<<gridDim, blockDim, 0, streams[Nstream-1]>>>(arg);
          }
          { // three-hop term
            arg.displacement = 3;
            arg.length = arg.displacement*faceVolumeCB[i];
            dim3 blockDim(128, 1, 1);
            const int gridSize = (arg.length + (blockDim.x-1))/blockDim.x;
            dim3 gridDim(gridSize, 1, 1);
            exteriorOprodKernel<<<gridDim, blockDim, 0, streams[Nstream-1]>>>(arg);
          }
          arg.inB.setStride(arg.inA.Stride());   // restore the interior stride

          oprodCompleted[i] = 1;
          completeSum++;
        }
      } // i=3,..,0
    } // completeSum < commDimTotal
  } // computeStaggeredOprodCuda
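  // Standalone illustration (not part of QUDA) of the previousDir bookkeeping used above.
  // With only the Y and T dimensions partitioned, previousDir[3] = 4 (the sentinel whose
  // completion flags are pre-set to 1) and previousDir[1] = 3, so dimension 1 is only retired
  // after dimension 3 has progressed, giving a fixed completion order.
  inline int previousDirDemo()
  {
    const bool partitioned[4] = {false, true, false, true};  // hypothetical partitioning
    int previousDir[4] = {4, 4, 4, 4};
    for(int i=3; i>=0; i--){
      if(partitioned[i]){
        int prev = 4;
        for(int j=3; j>i; j--)
          if(partitioned[j]) prev = j;
        previousDir[i] = prev;
      }
    }
    return previousDir[1];  // returns 3 for this partitioning
  }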
#endif // GPU_STAGGERED_OPROD
  void computeStaggeredOprod(cudaGaugeField& outA, cudaGaugeField& outB,
      cudaColorSpinorField& inA, cudaColorSpinorField& inB, FaceBuffer& faceBuffer,
      const unsigned int parity, const double coeff[2])
  {
#ifdef GPU_STAGGERED_OPROD
    // offset of each direction's ghost zone within the source spinor
    unsigned int ghostOffset[4] = {0,0,0,0};
    const unsigned int Npad = inB.Ncolor()*inB.Nspin()*2/inB.FieldOrder();
    for(int dir=0; dir<4; ++dir){
      ghostOffset[dir] = Npad*(inB.GhostOffset(dir) + inB.Stride());
    }

#if (__COMPUTE_CAPABILITY__ >= 200)
    if(inB.Precision() == QUDA_DOUBLE_PRECISION){
      // double-precision spinor and gauge-field accessors (construction elided in this excerpt)
      computeStaggeredOprodCuda<double2>(/* output and input accessors, */
          spinorA, spinorB, inB, faceBuffer, parity, inB.GhostFace(), ghostOffset, coeff);
    }else if(inB.Precision() == QUDA_SINGLE_PRECISION){
      // single-precision spinor and gauge-field accessors (construction elided in this excerpt)
      computeStaggeredOprodCuda<float2>(/* output and input accessors, */
          spinorA, spinorB, inB, faceBuffer, parity, inB.GhostFace(), ghostOffset, coeff);
    }else{
      errorQuda("Unsupported precision: %d\n", inB.Precision());
    }
#else
    errorQuda("Staggered Outer Product not supported on pre-Fermi architecture");
#endif

#else // GPU_STAGGERED_OPROD not defined
    errorQuda("Staggered Outer Product has not been built!");
#endif
    return;
  } // computeStaggeredOprod