// NOTE(review): extraction artifact — the original file's line numbers are fused
// into the text below, and the #else branch body and the closing #endif of the
// PRESERVE_SPINOR_NORM conditional have been lost. kP and kU are the
// normalization factors (1/sqrt(2) when the norm is preserved) used by the
// gamma-basis rotation functors that follow.
16 #define PRESERVE_SPINOR_NORM 18 #ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis 19 #define kP (1.0/sqrt(2.0)) 20 #define kU (1.0/sqrt(2.0)) 21 #else // More numerically accurate not to preserve the norm between basis 28 using namespace colorspinor;
// NOTE(review): fragment of the CopyColorSpinorArg argument-struct constructor.
// The struct header and the tail of the member-initializer list are missing
// (the list below ends with a dangling comma), so this cannot be safely
// reconstructed from what is visible.
30 template <
typename Out,
typename In>
39 :
out(
// Captures the output and input accessors, plus the checkerboard volume and
// site-subset/parity count taken from the input field metadata (in_).
out),
in(
in), volumeCB(in_.VolumeCB()), nParity(in_.SiteSubset()),
45 template <
int Ns,
int Nc>
47 template <
typename FloatOut,
typename FloatIn>
// Straight component-wise copy of an Ns x Nc spinor with FloatIn -> FloatOut
// precision conversion; no gamma-basis rotation is applied. Presumably the
// "PreserveBasis" functor referenced by the aux strings below — TODO confirm,
// as the enclosing struct header was lost in extraction (the function's
// closing brace is also missing below).
48 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
49 for (
int s=0;
s<Ns;
s++)
for (
int c=0;
c<Nc;
c++)
// out and in are indexed spin-major: element (s, c) lives at s*Nc + c.
out[
s*Nc+
c] =
in[
s*Nc+
c];
54 template <
int Ns,
int Nc>
56 template <
typename FloatOut,
typename FloatIn>
// Gamma-basis rotation: for each spin s and color c this computes
//   out[s] = K1[s] * in[s1[s]] + K2[s] * in[s2[s]]
// with coefficients K1 = {kP,-kP,-kP,-kP}, K2 = {kP,-kP,kP,kP} and spin
// permutations s1 = {1,2,3,0}, s2 = {3,0,1,2}. NOTE(review): likely the
// relativistic -> non-relativistic basis change (cf. the "NonRelBasis" aux
// string below) — confirm against the enclosing struct name, which was lost
// in extraction along with the closing braces of the loops and function.
// The s1/s2/K1/K2 tables are hard-coded for Ns == 4.
57 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
58 int s1[4] = {1, 2, 3, 0};
59 int s2[4] = {3, 0, 1, 2};
60 FloatOut K1[4] = {
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(-
kP),
static_cast<FloatOut
>(-
kP), static_cast<FloatOut>(-
kP)};
61 FloatOut K2[4] = {
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(-
kP),
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP)};
62 for (
int s=0;
s<Ns;
s++) {
63 for (
int c=0;
c<Nc;
c++) {
// The input components are promoted to complex<FloatOut> before the
// scaled sum so the arithmetic happens in output precision.
64 out[
s*Nc+
c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+
c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+
c]);
71 template <
int Ns,
int Nc>
73 template <
typename FloatOut,
typename FloatIn>
// Gamma-basis rotation: out[s] = K1[s]*in[s1[s]] + K2[s]*in[s2[s]] with the
// same spin permutations as the kP variant above (s1 = {1,2,3,0},
// s2 = {3,0,1,2}) but coefficients K1 = {-kU,kU,kU,kU}, K2 = {-kU,kU,-kU,-kU}.
// NOTE(review): presumably the inverse (non-relativistic -> relativistic,
// cf. the "RelBasis" aux string below) of the preceding rotation — confirm;
// the struct header and closing braces were lost in extraction. Tables are
// hard-coded for Ns == 4.
74 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
75 int s1[4] = {1, 2, 3, 0};
76 int s2[4] = {3, 0, 1, 2};
77 FloatOut K1[4] = {
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(
kU),
static_cast<FloatOut
>(
kU), static_cast<FloatOut>(
kU)};
78 FloatOut K2[4] = {
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(
kU),
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(-
kU)};
79 for (
int s=0;
s<Ns;
s++) {
80 for (
int c=0;
c<Nc;
c++) {
// Promote inputs to output precision before the rotation arithmetic.
81 out[
s*Nc+
c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+
c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+
c]);
88 template <
int Ns,
int Nc>
90 template <
typename FloatOut,
typename FloatIn>
// Gamma-basis rotation mixing upper/lower spin pairs: s1 = {0,1,0,1},
// s2 = {2,3,2,3}, K1 = {-kP,-kP,kP,kP}, K2 = {kP,kP,kP,kP}, i.e.
// out[s] = K1[s]*in[s1[s]] + K2[s]*in[s2[s]] per color. NOTE(review): likely
// the chiral -> non-relativistic change (cf. the "ChiralToNonRelBasis" aux
// string below) — confirm; struct header and closing braces were lost in
// extraction. Tables are hard-coded for Ns == 4.
91 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
92 int s1[4] = {0, 1, 0, 1};
93 int s2[4] = {2, 3, 2, 3};
94 FloatOut K1[4] = {
static_cast<FloatOut
>(-
kP), static_cast<FloatOut>(-
kP),
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP)};
95 FloatOut K2[4] = {
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP),
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP)};
96 for (
int s=0;
s<Ns;
s++) {
97 for (
int c=0;
c<Nc;
c++) {
// Promote inputs to output precision before the rotation arithmetic.
98 out[
s*Nc+
c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+
c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+
c]);
105 template <
int Ns,
int Nc>
107 template <
typename FloatOut,
typename FloatIn>
// Gamma-basis rotation mixing upper/lower spin pairs: s1 = {0,1,0,1},
// s2 = {2,3,2,3}, K1 = {-kU,-kU,kU,kU}, K2 = {kU,kU,kU,kU}. NOTE(review):
// presumably the non-relativistic -> chiral change (cf. the
// "NonRelToChiralBasis" aux string below) — confirm; struct header and the
// closing braces were lost in extraction. Tables are hard-coded for Ns == 4.
108 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
109 int s1[4] = {0, 1, 0, 1};
110 int s2[4] = {2, 3, 2, 3};
111 FloatOut K1[4] = {
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(-
kU),
static_cast<FloatOut
>(
kU), static_cast<FloatOut>(
kU)};
112 FloatOut K2[4] = {
static_cast<FloatOut
>(
kU),static_cast<FloatOut>(
kU),
static_cast<FloatOut
>(
kU), static_cast<FloatOut>(
kU)};
113 for (
int s=0;
s<Ns;
s++) {
114 for (
int c=0;
c<Nc;
c++) {
// Promote inputs to output precision before the rotation arithmetic.
115 out[
s*Nc+
c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+
c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+
c]);
122 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename Arg,
typename Basis>
// Host-side copy: serially iterates over every checkerboarded site and applies
// the basis functor to that site's spinor. NOTE(review): the surrounding
// function signature and the statements that load `in` from the input accessor
// and save `out` to the output accessor were lost in extraction — only the
// loop header and the basis application survive.
128 for (
int x=0;
x<
arg.volumeCB;
x++) {
131 basis(
out.data,
in.data);
138 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename Arg,
typename Basis>
// Device kernel: one thread per checkerboarded site. Computes the flat global
// thread index and guards against the grid tail overrunning arg.volumeCB
// before applying the basis functor. NOTE(review): the __global__ signature
// and the accessor load/save statements were lost in extraction.
143 int x = blockIdx.x *
blockDim.x + threadIdx.x;
144 if (
x >=
arg.volumeCB)
return;
149 basis(
out.data,
in.data);
153 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename Arg>
// Fragment of the tunable copy class for generic Ns (class header, constructor
// and apply() bodies were lost in extraction). For Ns != 4 no basis rotation
// exists, so differing gamma bases are a hard error; the aux string encodes
// the field strides for the autotuner's kernel cache key.
170 if (
out.GammaBasis()!=
in.GammaBasis())
errorQuda(
"Cannot change gamma basis for nSpin=%d\n", Ns);
171 writeAuxString(
"out_stride=%d,in_stride=%d",
arg.out.stride,
arg.in.stride);
// Kernel launch site (launch configuration arguments lost in extraction).
180 copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
// Copy performs no floating-point math worth counting; bytes moved is the sum
// of the two fields' storage sizes.
186 long long flops()
const {
return 0; }
187 long long bytes()
const {
return arg.in.Bytes() +
arg.out.Bytes(); }
190 template <
typename FloatOut,
typename FloatIn,
int Nc,
typename Arg>
// Fragment of the Ns = 4 specialization of the tunable copy class (class
// header, branch conditions and apply() body largely lost in extraction).
// Spin is fixed at 4 since only 4-spinor fields support basis rotation.
192 static constexpr
int Ns = 4;
// Aux-string selection: one tag per supported basis conversion so each
// combination autotunes independently. The conditions distinguishing the
// NonRel/Rel/ChiralToNonRel/NonRelToChiral cases were lost in extraction.
210 if (
out.GammaBasis()==
in.GammaBasis()) {
211 writeAuxString(
"out_stride=%d,in_stride=%d,PreserveBasis",
arg.out.stride,
arg.in.stride);
213 writeAuxString(
"out_stride=%d,in_stride=%d,NonRelBasis",
arg.out.stride,
arg.in.stride);
215 writeAuxString(
"out_stride=%d,in_stride=%d,RelBasis",
arg.out.stride,
arg.in.stride);
217 writeAuxString(
"out_stride=%d,in_stride=%d,ChiralToNonRelBasis",
arg.out.stride,
arg.in.stride);
219 writeAuxString(
"out_stride=%d,in_stride=%d,NonRelToChiralBasis",
arg.out.stride,
arg.in.stride);
// Any basis combination not matched above is unsupported.
221 errorQuda(
"Basis change from %d to %d not supported",
in.GammaBasis(),
out.GammaBasis());
228 if (
out.GammaBasis()==
in.GammaBasis()) {
// Kernel launch sites, one per basis functor; the template's Basis argument
// and launch configurations were lost in extraction.
241 if (
out.GammaBasis()==
in.GammaBasis()) {
242 copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
245 copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
248 copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
251 copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
254 copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
// No FLOPs counted for a copy; traffic is the sum of both fields' bytes.
261 long long flops()
const {
return 0; }
262 long long bytes()
const {
return arg.in.Bytes() +
arg.out.Bytes(); }
267 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename Out,
typename In>
// Innermost dispatch level: both field-order accessors are now concrete, so an
// argument struct is built from them and (per the signatures elsewhere in the
// file) handed to the tunable CopyColorSpinor class. NOTE(review): the
// function signature and the statements after the Arg construction were lost
// in extraction.
272 Arg
arg(outOrder, inOrder,
out,
in);
279 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename InOrder>
// Dispatch on the OUTPUT field's order: instantiates the matching output
// accessor and recurses into the two-order genericCopyColorSpinor overload.
// NOTE(review): heavily truncated — most branch conditions, accessor
// instantiations and the #else/#endif lines of the interface guards were lost
// in extraction; original line numbers are fused into the text.
282 FloatOut *Out,
float *outNorm) {
// `override` permits constructing an accessor with an externally supplied
// data pointer in place of the field's own allocation.
284 const bool override =
true;
285 if (
out.isNative()) {
288 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
289 (outOrder, inOrder,
out,
in, location);
// Branch using a float-precision accessor over the caller-provided Out
// buffer; note it recurses with <float,...,4,Nc> rather than FloatOut/Ns.
293 ColorSpinor outOrder(
out, 1, (
float*)Out, outNorm,
nullptr,
override);
294 genericCopyColorSpinor<float,FloatIn,4,Nc>
295 (outOrder, inOrder,
out,
in, location);
298 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
299 (outOrder, inOrder,
out,
in, location);
302 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
303 (outOrder, inOrder,
out,
in, location);
// TIFR-order output is only available when the interface was compiled in.
306 #ifdef BUILD_TIFR_INTERFACE 308 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
309 (outOrder, inOrder,
out,
in, location);
311 errorQuda(
"TIFR interface has not been built\n");
// Likewise for QDPJIT-order output.
316 #ifdef BUILD_QDPJIT_INTERFACE 318 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
319 (outOrder, inOrder,
out,
in, location);
321 errorQuda(
"QDPJIT interface has not been built\n");
// Fallthrough: unrecognized output field order.
324 errorQuda(
"Order %d not defined (Ns=%d, Nc=%d)",
out.FieldOrder(), Ns, Nc);
330 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
// Dispatch on the INPUT field's order, mirroring the output-order dispatch
// above: instantiates the matching input accessor and recurses with it.
// NOTE(review): heavily truncated — branch conditions, most accessor
// instantiations and the #else/#endif guard lines were lost in extraction.
333 float *outNorm,
float *inNorm) {
// `override` permits constructing an accessor with an externally supplied
// data pointer in place of the field's own allocation.
335 const bool override =
true;
339 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
// Branch using a float-precision accessor over the caller-provided In
// buffer; recurses with <FloatOut,float,4,Nc> rather than FloatIn/Ns.
343 ColorSpinor inOrder(
in, 1, (
float*)In, inNorm,
nullptr,
override);
344 genericCopyColorSpinor<FloatOut,float,4,Nc>(inOrder,
out,
in, location, Out, outNorm);
347 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
350 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
// TIFR-order input only available when the interface was compiled in.
353 #ifdef BUILD_TIFR_INTERFACE 355 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
357 errorQuda(
"TIFR interface has not been built\n");
// Likewise for QDPJIT-order input.
362 #ifdef BUILD_QDPJIT_INTERFACE 364 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
366 errorQuda(
"QDPJIT interface has not been built\n");
// Fallthrough: unrecognized input field order.
369 errorQuda(
"Order %d not defined (Ns=%d, Nc=%d)",
in.FieldOrder(), Ns, Nc);
375 template <
int Ns,
int Nc,
typename dstFloat,
typename srcFloat>
// Precondition checks before delegating to the order-dispatch layer.
// NOTE(review): the function signature head and the guard conditions in front
// of each errorQuda were lost in extraction; only the error calls and the
// final dispatch survive.
378 float *dstNorm,
float *srcNorm) {
// Source and destination must have the same spacetime dimensionality.
381 errorQuda(
"Number of dimensions %d %d don't match", dst.
Ndim(),
src.Ndim());
// Two unsupported full-field configurations are rejected outright.
399 errorQuda(
"Copying to full fields with lexicographical ordering is not currently supported");
403 errorQuda(
"QDPJIT field ordering not supported for full site fields");
406 genericCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>(dst,
src, location, Dst, Src, dstNorm, srcNorm);
410 template <
int Nc,
typename dstFloat,
typename srcFloat>
// Runtime spin dispatch: routes to the compile-time-Ns overload for
// Nspin = 4, 2, or 1, each guarded by the corresponding build flag so only
// compiled-in Dirac operators are supported. NOTE(review): the function
// signature head, the spin-equality guard condition and the #else/#endif
// lines of each build guard were lost in extraction.
413 float *dstNorm=0,
float *srcNorm=0) {
// Reject copies between fields of differing Nspin.
416 errorQuda(
"source and destination spins must match");
418 if (dst.
Nspin() == 4) {
419 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 420 copyGenericColorSpinor<4,Nc>(dst,
src, location, Dst, Src, dstNorm, srcNorm);
422 errorQuda(
"%s has not been built for Nspin=%d fields", __func__,
src.Nspin());
424 }
else if (dst.
Nspin() == 2) {
425 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_STAGGERED_DIRAC) 426 copyGenericColorSpinor<2,Nc>(dst,
src, location, Dst, Src, dstNorm, srcNorm);
428 errorQuda(
"%s has not been built for Nspin=%d fields", __func__,
src.Nspin());
430 }
else if (dst.
Nspin() == 1) {
431 #ifdef GPU_STAGGERED_DIRAC 432 copyGenericColorSpinor<1,Nc>(dst,
src, location, Dst, Src, dstNorm, srcNorm);
434 errorQuda(
"%s has not been built for Nspin=%d fields", __func__,
src.Nspin());
void CopyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, dstFloat *Dst, srcFloat *Src, float *dstNorm=0, float *srcNorm=0)
void apply(const cudaStream_t &stream)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
QudaVerbosity getVerbosity()
void genericCopyColorSpinor(Out &outOrder, const In &inOrder, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
const QudaFieldLocation location
__global__ void copyColorSpinorKernel(Arg arg, Basis basis)
const char * VolString() const
__host__ __device__ void copy(T1 &a, const T2 &b)
void copyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, void *Dst=0, void *Src=0, void *dstNorm=0, void *srcNorm=0)
bool advanceSharedBytes(TuneParam ¶m) const
const ColorSpinorField & in
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
const ColorSpinorField & out
unsigned int minThreads() const
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
const QudaFieldLocation location
CopyColorSpinorArg(const Out &out, const In &in, const ColorSpinorField &out_, const ColorSpinorField &in_)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
CopyColorSpinor(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
unsigned int minThreads() const
virtual ~CopyColorSpinor()
void copyColorSpinor(Arg &arg, const Basis &basis)
unsigned int sharedBytesPerThread() const
unsigned int sharedBytesPerThread() const
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
QudaSiteOrder SiteOrder() const
const ColorSpinorField & meta
bool advanceSharedBytes(TuneParam ¶m) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
Accessor routine for ColorSpinorFields in native field order.
void apply(const cudaStream_t &stream)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaFieldOrder FieldOrder() const
virtual ~CopyColorSpinor()
CopyColorSpinor(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)