9 #define PRESERVE_SPINOR_NORM 11 #ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis 12 #define kP (1.0/sqrt(2.0)) 13 #define kU (1.0/sqrt(2.0)) 14 #else // More numerically accurate not to preserve the norm between basis 23 using namespace colorspinor;
29 for(
int i=0; i<4; i++){
30 if(R[i] > nFace) nFace = R[i];
35 int gatherCompleted[2] = {0,0};
36 int commsCompleted[2] = {0,0};
39 for(
int dir=0; dir<2; dir++) cudaEventCreate(&gatherEnd[dir], cudaEventDisableTiming);
41 for(
int dim=3; dim<=0; dim--){
44 spinor->
packExtended(nFace, R, parity, dagger, dim, stream_p);
46 for(
int dir=1; dir<=0; dir--){
47 spinor->
gather(nFace, dagger, 2*dim + dir);
53 while(completeSum < 2){
54 if(!gatherCompleted[dir]){
55 if(cudaSuccess == cudaEventQuery(gatherEnd[dir])){
58 gatherCompleted[dir--] = 1;
62 gatherCompleted[0] = gatherCompleted[1] = 0;
66 while(completeSum < 4){
67 if(!commsCompleted[dir]){
68 if(spinor->
commsQuery(nFace, 2*dim+dir, dagger)){
71 commsCompleted[dir--] = 1;
75 commsCompleted[0] = commsCompleted[1] = 0;
79 for(
int dir=0; dir<2; dir++) cudaEventDestroy(gatherEnd[dir]);
86 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
92 for (
int s=0;
s<Ns;
s++) {
93 for (
int c=0; c<Nc; c++) {
101 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
106 int s1[4] = {1, 2, 3, 0};
107 int s2[4] = {3, 0, 1, 2};
108 RegTypeOut K1[4] = {
static_cast<RegTypeOut
>(
kP), static_cast<RegTypeOut>(-
kP),
109 static_cast<RegTypeOut
>(-
kP), static_cast<RegTypeOut>(-
kP)};
110 RegTypeOut K2[4] = {
static_cast<RegTypeOut
>(
kP), static_cast<RegTypeOut>(-
kP),
111 static_cast<RegTypeOut
>(
kP), static_cast<RegTypeOut>(
kP)};
112 for (
int s=0;
s<Ns;
s++) {
113 for (
int c=0; c<Nc; c++) {
114 out(
s,c).real(K1[
s]*
in(s1[
s],c).real() + K2[s]*
in(s2[s],c).real());
115 out(s,c).imag(K1[s]*
in(s1[s],c).imag() + K2[s]*
in(s2[s],c).imag());
122 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
127 int s1[4] = {1, 2, 3, 0};
128 int s2[4] = {3, 0, 1, 2};
129 RegTypeOut K1[4] = {
static_cast<RegTypeOut
>(-
kU), static_cast<RegTypeOut>(
kU),
130 static_cast<RegTypeOut
>(
kU), static_cast<RegTypeOut>(
kU)};
131 RegTypeOut K2[4] = {
static_cast<RegTypeOut
>(-
kU), static_cast<RegTypeOut>(
kU),
132 static_cast<RegTypeOut
>(-
kU), static_cast<RegTypeOut>(-
kU)};
133 for (
int s=0;
s<Ns;
s++) {
134 for (
int c=0; c<Nc; c++) {
135 out(
s,c).real(K1[
s]*
in(s1[
s],c).real() + K2[s]*
in(s2[s],c).real());
136 out(s,c).imag(K1[s]*
in(s1[s],c).imag() + K2[s]*
in(s2[s],c).imag());
142 template<
typename OutOrder,
typename InOrder,
typename Basis>
152 CopySpinorExArg(
const OutOrder &out,
const InOrder &in,
const Basis& basis,
const int *E,
const int *X,
const int parity)
153 : out(out), in(in), basis(basis), parity(parity)
156 for(
int d=0; d<4; d++){
159 this->length *= X[d];
165 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder,
typename Basis,
bool extend>
170 for(
int d=0; d<4; d++) R[d] = (arg.
E[d] - arg.
X[d]) >> 1;
172 int za = X/(arg.
X[0]/2);
173 int x0h = X - za*(arg.
X[0]/2);
174 int zb = za/arg.
X[1];
175 x[1] = za - zb*arg.
X[1];
176 x[3] = zb / arg.
X[2];
177 x[2] = zb - x[3]*arg.
X[2];
178 x[0] = 2*x0h + ((x[1] + x[2] + x[3] + arg.
parity) & 1);
181 int Y = ((((x[3]+R[3])*arg.
E[2] + (x[2]+R[2]))*arg.
E[1] + (x[1]+R[1]))*arg.
E[0]+(x[0]+R[0])) >> 1;
191 in = arg.
in(X, parity);
195 in = arg.
in(Y, parity);
202 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder,
typename Basis,
bool extend>
205 int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
207 while(cb_idx < arg.
length){
208 copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(
arg,cb_idx);
209 cb_idx += gridDim.x*blockDim.x;
216 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder,
typename Basis,
bool extend>
219 for(
int cb_idx=0; cb_idx<arg.
length; cb_idx++){
220 copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(
arg, cb_idx);
227 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder,
typename Basis,
bool extend>
243 : arg(arg), meta(meta), location(location) {
244 writeAuxString(
"out_stride=%d,in_stride=%d",arg.
out.stride,arg.
in.stride);
252 copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(
arg);
254 copyInteriorKernel<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>
261 long long flops()
const {
return 0; }
263 return arg.
length*2*Nc*Ns*(
sizeof(FloatIn) +
sizeof(FloatOut));
270 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder,
typename Basis>
271 void copySpinorEx(OutOrder outOrder,
const InOrder inOrder,
const Basis basis,
const int *
E,
285 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder>
287 const int*
E,
const int*
X,
const int parity,
const bool extend,
290 if(inBasis == outBasis){
292 copySpinorEx<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder, PreserveBasis<FloatOut,FloatIn,Ns,Nc> >
293 (outOrder, inOrder, basis,
E,
X,
parity, extend, meta, location);
295 if(Ns != 4)
errorQuda(
"Can only change basis with Nspin = 4, not Nspin = %d", Ns);
297 copySpinorEx<FloatOut, FloatIn, 4, Nc, OutOrder, InOrder, NonRelBasis<FloatOut,FloatIn,4,Nc> >
298 (outOrder, inOrder, basis,
E,
X,
parity, extend, meta, location);
300 if(Ns != 4)
errorQuda(
"Can only change basis with Nspin = 4, not Nspin = %d", Ns);
302 copySpinorEx<FloatOut, FloatIn, 4, Nc, OutOrder, InOrder, RelBasis<FloatOut,FloatIn,4,Nc> >
303 (outOrder, inOrder, basis,
E,
X,
parity, extend, meta, location);
312 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename InOrder>
319 ColorSpinor outOrder(out, 1, Out, outNorm);
320 copySpinorEx<FloatOut,FloatIn,Ns,Nc>
328 template<
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
331 float* outNorm,
float *inNorm){
337 for (
int d=0; d<4; d++) {
342 for (
int d=0; d<4; d++) {
347 X[0] *= 2; E[0] *= 2;
351 ColorSpinor inOrder(in, 1, In, inNorm);
352 extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out, in.
GammaBasis(),
E,
X,
parity, extend, location, Out, outNorm);
359 template<
int Ns,
typename dstFloat,
typename srcFloat>
362 float *dstNorm,
float *srcNorm) {
386 errorQuda(
"Copying to full fields with lexicographical ordering is not currently supported");
392 errorQuda(
"QDPJIT field ordering not supported for full site fields");
396 srcFloat *srcEven = Src ? Src : (srcFloat*)src.
V();
397 srcFloat* srcOdd = (srcFloat*)((
char*)srcEven + src.
Bytes()/2);
398 float *srcNormEven = srcNorm ? srcNorm : (
float*)src.
Norm();
399 float *srcNormOdd = (
float*)((
char*)srcNormEven + src.
NormBytes()/2);
401 std::swap<srcFloat*>(srcEven, srcOdd);
402 std::swap<float*>(srcNormEven, srcNormOdd);
406 dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.
V();
407 dstFloat *dstOdd = (dstFloat*)((
char*)dstEven + dst.
Bytes()/2);
408 float *dstNormEven = dstNorm ? dstNorm : (
float*)dst.
Norm();
409 float *dstNormOdd = (
float*)((
char*)dstNormEven + dst.
NormBytes()/2);
411 std::swap<dstFloat*>(dstEven, dstOdd);
412 std::swap<float*>(dstNormEven, dstNormOdd);
416 extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
417 (dst, src, 0, location, dstEven, srcEven, dstNormEven, srcNormEven);
418 extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
419 (dst, src, 1, location, dstOdd, srcOdd, dstNormOdd, srcNormOdd);
421 extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
422 (dst, src,
parity, location, Dst, Src, dstNorm, srcNorm);
427 template<
typename dstFloat,
typename srcFloat>
430 float *dstNorm=0,
float *srcNorm=0)
433 errorQuda(
"source and destination spins must match");
435 if(dst.
Nspin() == 4){
436 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 437 copyExtendedColorSpinor<4>(dst, src,
parity, location, Dst, Src, dstNorm, srcNorm);
439 errorQuda(
"Extended copy has not been built for Nspin=%d fields",dst.Nspin());
441 }
else if(dst.
Nspin() == 1){
442 #ifdef GPU_STAGGERED_DIRAC 443 copyExtendedColorSpinor<1>(dst, src,
parity, location, Dst, Src, dstNorm, srcNorm);
445 errorQuda(
"Extended copy has not been built for Nspin=%d fields", dst.Nspin());
456 void *dstNorm,
void *srcNorm){
464 CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<short*>(Src), 0, static_cast<float*>(srcNorm));
474 CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<short*>(Src), 0, static_cast<float*>(srcNorm));
480 CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<double*>(Src), static_cast<float*>(dstNorm), 0);
482 CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<float*>(Src), static_cast<float*>(dstNorm), 0);
484 CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<short*>(Src), static_cast<float*>(dstNorm), static_cast<float*>(srcNorm));
CopySpinorEx(CopySpinorExArg< OutOrder, InOrder, Basis > &arg, const ColorSpinorField &meta, QudaFieldLocation location)
void commsStart(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Initiate halo communication.
mapper< FloatOut >::type RegTypeOut
__device__ __host__ void operator()(ColorSpinor< RegTypeOut, Nc, Ns > &out, const ColorSpinor< RegTypeIn, Nc, Ns > &in)
QudaVerbosity getVerbosity()
void gather(int nFace, int dagger, int dir, cudaStream_t *stream_p=NULL)
void CopyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, const int parity, const QudaFieldLocation location, dstFloat *Dst, srcFloat *Src, float *dstNorm=0, float *srcNorm=0)
QudaGammaBasis GammaBasis() const
void copySpinorEx(OutOrder outOrder, const InOrder inOrder, const Basis basis, const int *E, const int *X, const int parity, const bool extend, const ColorSpinorField &meta, QudaFieldLocation location)
const char * VolString() const
void scatterExtended(int nFace, int parity, int dagger, int dir)
mapper< FloatIn >::type RegTypeIn
int commsQuery(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Non-blocking query if the halo communication has completed.
bool advanceSharedBytes(TuneParam ¶m) const
#define qudaDeviceSynchronize()
void apply(const cudaStream_t &stream)
mapper< FloatOut >::type RegTypeOut
unsigned int minThreads() const
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void packExtended(const int nFace, const int R[], const int parity, const int dagger, const int dim, cudaStream_t *stream_p, const bool zeroCopyPack=false)
QudaFieldLocation location
void exchangeExtendedGhost(cudaColorSpinorField *spinor, int R[], int parity, cudaStream_t *stream_p)
void extendedCopyColorSpinor(InOrder &inOrder, ColorSpinorField &out, QudaGammaBasis inBasis, const int *E, const int *X, const int parity, const bool extend, QudaFieldLocation location, FloatOut *Out, float *outNorm)
mapper< FloatIn >::type RegTypeIn
CopySpinorExArg< OutOrder, InOrder, Basis > arg
const ColorSpinorField & meta
enum QudaFieldLocation_s QudaFieldLocation
static int commDim[QUDA_MAX_DIM]
cpuColorSpinorField * out
void copyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, const int parity, void *Dst, void *Src, void *dstNorm, void *srcNorm)
unsigned int sharedBytesPerThread() const
enum QudaGammaBasis_s QudaGammaBasis
__device__ __host__ void operator()(ColorSpinor< RegTypeOut, Nc, Ns > &out, const ColorSpinor< RegTypeIn, Nc, Ns > &in)
CopySpinorExArg(const OutOrder &out, const InOrder &in, const Basis &basis, const int *E, const int *X, const int parity)
QudaSiteOrder SiteOrder() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
mapper< FloatOut >::type RegTypeOut
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
__global__ void copyInteriorKernel(CopySpinorExArg< OutOrder, InOrder, Basis > arg)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
__device__ __host__ void copyInterior(CopySpinorExArg< OutOrder, InOrder, Basis > &arg, int X)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
__device__ __host__ void operator()(ColorSpinor< RegTypeOut, Nc, Ns > &out, const ColorSpinor< RegTypeIn, Nc, Ns > &in)
mapper< FloatIn >::type RegTypeIn
QudaFieldOrder FieldOrder() const
cpuColorSpinorField * spinor
cudaEvent_t gatherEnd[Nstream]