// Normalization constants for the gamma-basis rotations below.  With
// PRESERVE_SPINOR_NORM defined, both the forward (kP) and backward (kU)
// coefficients are 1/sqrt(2), so the spinor norm is preserved under a
// change of basis.  NOTE(review): this line is extraction residue — the
// original file's line numbers (16, 18, ...) are fused into the text and
// the #else branch (original lines 22-27) plus the #endif are missing
// from this view; confirm against the original source before editing.
16 #define PRESERVE_SPINOR_NORM 18 #ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis 19 #define kP (1.0/sqrt(2.0)) 20 #define kU (1.0/sqrt(2.0)) 21 #else // More numerically accurate not to preserve the norm between basis 28 using namespace colorspinor;
// NOTE(review): fragments of a kernel-argument struct (presumably
// CopyColorSpinorArg, per the member index at the bottom of this chunk).
// The struct declaration line and most of its body are elided from this
// extract — the pieces below are not contiguous code.
30 template <
typename FloatOut,
typename FloatIn,
int nSpin_,
int nColor_,
typename Out,
typename In>
// Compile-time spin/color counts lifted from the template parameters so
// device code can size per-site arrays statically.
34 static constexpr
int nSpin = nSpin_;
35 static constexpr
int nColor = nColor_;
// Constructor initializer fragment: caches the checkerboard volume and
// the parity count from the input field.  NOTE(review): nParity is
// initialized from in_.SiteSubset() — presumably the enum value doubles
// as the number of parities (1 = parity subset, 2 = full); confirm.
43 : out(out), in(in), volumeCB(in_.VolumeCB()), nParity(in_.SiteSubset()),
// Identity basis change: copies the input spinor to the output element by
// element with no spin rotation; only the precision conversion
// FloatIn -> FloatOut happens (implicitly, via complex assignment).
// NOTE(review): the enclosing struct's header is missing from this view —
// the aux strings later in this chunk suggest it is "PreserveBasis";
// confirm against the original file.
49 template <
typename Arg>
51 static constexpr
int Ns = Arg::nSpin;
53 template <
typename FloatOut,
typename FloatIn>
54 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
// Straight spin x color copy, row-major in spin (index s*Nc+c).
55 for (
int s=0;
s<Ns;
s++)
for (
int c=0; c<Nc; c++)
out[
s*Nc+c] =
in[
s*Nc+c];
// Spin-basis rotation: out[s] = K1[s]*in[s1[s]] + K2[s]*in[s2[s]] for each
// color, with coefficients +/-kP = +/-1/sqrt(2).  The source-spin
// permutations s1={1,2,3,0}, s2={3,0,1,2} mix each output spin from two
// input spins.  NOTE(review): the struct header is elided; ordering of the
// aux strings ("NonRelBasis" is second) suggests this is the
// relativistic -> non-relativistic conversion — confirm against the
// original file.  The closing braces of the two loops and of operator()
// are also missing from this extract.
60 template <
typename Arg>
62 static constexpr
int Ns = Arg::nSpin;
64 template <
typename FloatOut,
typename FloatIn>
65 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
66 int s1[4] = {1, 2, 3, 0};
67 int s2[4] = {3, 0, 1, 2};
// Per-spin signs: K1 = {+kP, -kP, -kP, -kP}, K2 = {+kP, -kP, +kP, +kP}.
// kP is a double expression (1.0/sqrt(2.0)); the casts keep the arithmetic
// in the output precision FloatOut.
68 FloatOut K1[4] = {
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(-
kP),
static_cast<FloatOut
>(-
kP), static_cast<FloatOut>(-
kP)};
69 FloatOut K2[4] = {
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(-
kP),
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP)};
70 for (
int s=0;
s<Ns;
s++) {
71 for (
int c=0; c<Nc; c++) {
// Two-term linear combination of the permuted input spins, promoted to
// complex<FloatOut> before the scalar multiply.
72 out[
s*Nc+c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+c]);
// Inverse spin-basis rotation: same structure as the previous functor
// (out[s] = K1[s]*in[s1[s]] + K2[s]*in[s2[s]], s1={1,2,3,0}, s2={3,0,1,2})
// but using the backward coefficients +/-kU with a different sign pattern.
// NOTE(review): struct header elided; the aux-string ordering ("RelBasis"
// is third) suggests this is the non-relativistic -> relativistic
// conversion — confirm against the original file.  Loop/function closing
// braces are missing from this extract.
79 template <
typename Arg>
81 static constexpr
int Ns = Arg::nSpin;
83 template <
typename FloatOut,
typename FloatIn>
84 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
85 int s1[4] = {1, 2, 3, 0};
86 int s2[4] = {3, 0, 1, 2};
// Per-spin signs: K1 = {-kU, +kU, +kU, +kU}, K2 = {-kU, +kU, -kU, -kU}.
87 FloatOut K1[4] = {
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(
kU),
static_cast<FloatOut
>(
kU), static_cast<FloatOut>(
kU)};
88 FloatOut K2[4] = {
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(
kU),
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(-
kU)};
89 for (
int s=0;
s<Ns;
s++) {
90 for (
int c=0; c<Nc; c++) {
91 out[
s*Nc+c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+c]);
// Spin-basis rotation pairing upper/lower spin components: s1={0,1,0,1},
// s2={2,3,2,3}, so out[s] mixes in[s mod 2] and in[s mod 2 + 2] with
// +/-kP coefficients.  NOTE(review): struct header elided; the aux-string
// ordering ("ChiralToNonRelBasis" is fourth) suggests this is the
// chiral -> non-relativistic conversion — confirm against the original
// file.  Loop/function closing braces are missing from this extract.
98 template <
typename Arg>
100 static constexpr
int Ns = Arg::nSpin;
102 template <
typename FloatOut,
typename FloatIn>
103 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
104 int s1[4] = {0, 1, 0, 1};
105 int s2[4] = {2, 3, 2, 3};
// Per-spin signs: K1 = {-kP, -kP, +kP, +kP}, K2 = {+kP, +kP, +kP, +kP}.
106 FloatOut K1[4] = {
static_cast<FloatOut
>(-
kP), static_cast<FloatOut>(-
kP),
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP)};
107 FloatOut K2[4] = {
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP),
static_cast<FloatOut
>(
kP), static_cast<FloatOut>(
kP)};
108 for (
int s=0;
s<Ns;
s++) {
109 for (
int c=0; c<Nc; c++) {
110 out[
s*Nc+c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+c]);
// Inverse of the previous rotation: same upper/lower spin pairing
// (s1={0,1,0,1}, s2={2,3,2,3}) with the backward +/-kU coefficients.
// NOTE(review): struct header elided; the aux-string ordering
// ("NonRelToChiralBasis" is fifth) suggests this is the
// non-relativistic -> chiral conversion — confirm against the original
// file.  Loop/function closing braces are missing from this extract.
117 template <
typename Arg>
119 static constexpr
int Ns = Arg::nSpin;
121 template <
typename FloatOut,
typename FloatIn>
122 __device__ __host__
inline void operator()(complex<FloatOut>
out[Ns*Nc],
const complex<FloatIn>
in[Ns*Nc])
const {
123 int s1[4] = {0, 1, 0, 1};
124 int s2[4] = {2, 3, 2, 3};
// Per-spin signs: K1 = {-kU, -kU, +kU, +kU}, K2 = {+kU, +kU, +kU, +kU}.
125 FloatOut K1[4] = {
static_cast<FloatOut
>(-
kU), static_cast<FloatOut>(-
kU),
static_cast<FloatOut
>(
kU), static_cast<FloatOut>(
kU)};
126 FloatOut K2[4] = {
static_cast<FloatOut
>(
kU),static_cast<FloatOut>(
kU),
static_cast<FloatOut
>(
kU), static_cast<FloatOut>(
kU)};
127 for (
int s=0;
s<Ns;
s++) {
128 for (
int c=0; c<Nc; c++) {
129 out[
s*Nc+c] = K1[
s]*
static_cast<complex<FloatOut>
>(
in[s1[
s]*Nc+c]) + K2[
s]*
static_cast<complex<FloatOut>
>(
in[s2[
s]*Nc+c]);
// Fragments of the CPU and GPU copy drivers (original lines 139-158; the
// lines in between — including the per-site load, basis application, and
// presumably the GPU bounds guard x < arg.volumeCB — are elided here).
// CPU path: serial loop over checkerboarded sites.
139 for (
int x=0; x<arg.
volumeCB; x++) {
// Store the rotated site; (parity+arg.outParity)&1 remaps the input
// parity index onto the output field's parity ordering.
143 arg.out(x, (
parity+arg.outParity)&1) =
out;
// GPU kernel path: 1D site index from the x grid dimension, parity from
// the y dimension (launch presumably uses blockDim.y*gridDim.y == nParity
// — confirm against the elided launch code).
151 int x = blockIdx.x * blockDim.x + threadIdx.x;
153 int parity = blockIdx.y * blockDim.y + threadIdx.y;
158 arg.out(x, (parity+arg.outParity)&1) =
out;
// Fragments of the autotuned launcher class(es) (TunableVectorY derived):
// constructors record the fields and build the tuning aux string from the
// in/out strides; the five basis names in the aux strings correspond to
// the five functors above.  Large parts of the class bodies (apply(),
// tuning overrides) are elided from this extract.
161 template <
int Ns,
typename Arg>
// Constructor initializer: tune over nParity in the Y dimension.
177 :
TunableVectorY(arg.nParity), arg(arg), meta(in), location(location) {
179 writeAuxString(
"out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
// Pure data movement: no floating-point work counted; bytes are the sum
// of the input and output field footprints.
193 long long flops()
const {
return 0; }
194 long long bytes()
const {
return arg.in.Bytes() + arg.out.Bytes(); }
// Second launcher variant, hard-wired to four spins (basis rotations are
// only defined for Ns = 4).
197 template <
typename Arg>
199 static constexpr
int Ns = 4;
215 :
TunableVectorY(arg.nParity), arg(arg), out(out), in(in), location(location) {
// One aux string per basis conversion so each variant tunes separately.
218 writeAuxString(
"out_stride=%d,in_stride=%d,PreserveBasis", arg.out.stride, arg.in.stride);
220 writeAuxString(
"out_stride=%d,in_stride=%d,NonRelBasis", arg.out.stride, arg.in.stride);
222 writeAuxString(
"out_stride=%d,in_stride=%d,RelBasis", arg.out.stride, arg.in.stride);
224 writeAuxString(
"out_stride=%d,in_stride=%d,ChiralToNonRelBasis", arg.out.stride, arg.in.stride);
226 writeAuxString(
"out_stride=%d,in_stride=%d,NonRelToChiralBasis", arg.out.stride, arg.in.stride);
263 long long flops()
const {
return 0; }
264 long long bytes()
const {
return arg.in.Bytes() + arg.out.Bytes(); }
// Fragments of the output-field-order dispatch layer: instantiates a
// ColorSpinor accessor for the destination's field order, then recurses
// into the order-resolved genericCopyColorSpinor.  The if/else chain over
// FieldOrder() values and the accessor type aliases are elided from this
// extract, so the branches below appear without their conditions.
269 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename Out,
typename In>
279 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename InOrder>
282 FloatOut *Out,
float *outNorm) {
// NOTE(review): 'override' is used here as a plain identifier (legal —
// it is only a contextual keyword), passed to the accessor constructor;
// presumably it forces use of the caller-supplied pointers.
283 const bool override =
true;
286 ColorSpinor outOrder(out, 1, Out, outNorm,
nullptr,
override);
287 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
288 (outOrder, inOrder,
out,
in, location);
// One branch is instantiated with 4 spins explicitly rather than Ns —
// presumably a field order only defined for four-spinor fields; confirm
// against the elided condition.
292 ColorSpinor outOrder(out, 1, Out, outNorm,
nullptr,
override);
293 genericCopyColorSpinor<FloatOut,FloatIn,4,Nc>
294 (outOrder, inOrder,
out,
in, location);
297 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
298 (outOrder, inOrder,
out,
in, location);
301 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
302 (outOrder, inOrder,
out,
in, location);
// TIFR and QDPJIT destination orders are only available when the
// corresponding interface was compiled in; otherwise fail loudly.
305 #ifdef BUILD_TIFR_INTERFACE 307 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
308 (outOrder, inOrder,
out,
in, location);
310 errorQuda(
"TIFR interface has not been built\n");
315 #ifdef BUILD_QDPJIT_INTERFACE 317 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
318 (outOrder, inOrder,
out,
in, location);
320 errorQuda(
"QDPJIT interface has not been built\n");
// Mirror of the previous layer for the *source* field order: builds the
// input accessor for each supported order and forwards to the
// output-order dispatch, threading through the raw Out/outNorm pointers.
// The if/else conditions over in.FieldOrder() are elided from this
// extract.
329 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
332 float *outNorm,
float *inNorm) {
// Same contextual-keyword-as-identifier pattern as the output layer.
333 const bool override =
true;
336 ColorSpinor inOrder(in, 1, In, inNorm,
nullptr,
override);
337 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
// Four-spin-only source order (instantiated with 4, not Ns) — presumably
// matches the corresponding destination-side branch; confirm against the
// elided condition.
341 ColorSpinor inOrder(in, 1, In, inNorm,
nullptr,
override);
342 genericCopyColorSpinor<FloatOut,FloatIn,4,Nc>(inOrder,
out,
in, location, Out, outNorm);
345 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
348 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
// TIFR/QDPJIT source orders require their interfaces to be built.
351 #ifdef BUILD_TIFR_INTERFACE 353 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
355 errorQuda(
"TIFR interface has not been built\n");
360 #ifdef BUILD_QDPJIT_INTERFACE 362 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out,
in, location, Out, outNorm);
364 errorQuda(
"QDPJIT interface has not been built\n");
// Entry point resolved on (Ns, Nc) and the two precisions: validates the
// requested orderings (the checks' conditions at original lines 396-400
// are elided here) and hands off to the precision/order dispatch chain.
373 template <
int Ns,
int Nc,
typename dstFloat,
typename srcFloat>
376 float *dstNorm,
float *srcNorm) {
// Unsupported combinations abort via errorQuda rather than silently
// producing a wrongly-ordered field.
397 errorQuda(
"Copying to full fields with lexicographical ordering is not currently supported");
401 errorQuda(
"QDPJIT field ordering not supported for full site fields");
404 genericCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
// Runtime spin-count dispatch: checks that source and destination spins
// match (the comparison itself, original line ~413, is elided here), then
// routes Nspin = 4 / 2 / 1 to the compile-time-Ns instantiation.  Each
// branch is guarded by the dslash types that were compiled in, so an
// unbuilt spin count fails loudly instead of linking dead code.
408 template <
int Nc,
typename dstFloat,
typename srcFloat>
// Norm pointers default to null for fields without a separate norm array.
411 float *dstNorm=0,
float *srcNorm=0) {
414 errorQuda(
"source and destination spins must match");
416 if (dst.
Nspin() == 4) {
417 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_COVDEV) || defined(GPU_CONTRACT) 418 copyGenericColorSpinor<4,Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
420 errorQuda(
"%s has not been built for Nspin=%d fields", __func__, src.Nspin());
422 }
else if (dst.
Nspin() == 2) {
423 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_STAGGERED_DIRAC) 424 copyGenericColorSpinor<2,Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
426 errorQuda(
"%s has not been built for Nspin=%d fields", __func__, src.Nspin());
428 }
else if (dst.
Nspin() == 1) {
429 #ifdef GPU_STAGGERED_DIRAC 430 copyGenericColorSpinor<1,Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
432 errorQuda(
"%s has not been built for Nspin=%d fields", __func__, src.Nspin());
unsigned int minThreads() const
const ColorSpinorField & meta
unsigned int sharedBytesPerThread() const
void CopyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, dstFloat *Dst, srcFloat *Src, float *dstNorm=0, float *srcNorm=0)
typename mapper< FloatIn >::type realIn
CopyColorSpinorArg(const Out &out, const In &in, const ColorSpinorField &out_, const ColorSpinorField &in_)
QudaVerbosity getVerbosity()
complex< Float > data[size]
const ColorSpinorField & out
typename mapper< FloatOut >::type realOut
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
void genericCopyColorSpinor(Out &outOrder, const In &inOrder, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
QudaGammaBasis GammaBasis() const
const char * VolString() const
__host__ __device__ void copy(T1 &a, const T2 &b)
void copyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, void *Dst=0, void *Src=0, void *dstNorm=0, void *srcNorm=0)
const QudaFieldLocation location
unsigned int minThreads() const
bool advanceSharedBytes(TuneParam ¶m) const
virtual ~CopyColorSpinor()
__global__ void copyColorSpinorKernel(Arg arg, Basis basis)
unsigned int sharedBytesPerThread() const
CopyColorSpinor(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
const QudaFieldLocation location
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void apply(const cudaStream_t &stream)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
void copyColorSpinor(Arg &arg, const Basis &basis)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
QudaSiteOrder SiteOrder() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
bool advanceSharedBytes(TuneParam ¶m) const
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
virtual ~CopyColorSpinor()
Accessor routine for ColorSpinorFields in native field order.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void apply(const cudaStream_t &stream)
QudaFieldOrder FieldOrder() const
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
CopyColorSpinor(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
const ColorSpinorField & in
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const