18 using namespace colorspinor;
/**
 * Host-side element-by-element copy between two field accessors.
 *
 * For every site x in [0, volume), every spin index s in [0, Ns) and every
 * color index c in [0, Nc), assigns
 *   outOrder(0, x, s, c) = inOrder(0, x, s, c).
 * Any precision conversion or layout change is whatever the accessors'
 * operator() and the assignment between their element types perform; this
 * function only drives the iteration.
 *
 * @tparam FloatOut  Not referenced in the body; kept for call-site compatibility.
 * @tparam FloatIn   Not referenced in the body; kept for call-site compatibility.
 * @tparam Ns        Compile-time trip count of the spin loop.
 * @tparam Nc        Compile-time trip count of the color loop.
 * @param outOrder   Destination accessor; operator()(int,int,int,int) must
 *                   yield something assignable.
 * @param inOrder    Source accessor; operator() must be callable on a const object.
 * @param volume     Number of sites to copy. A value <= 0 copies nothing.
 */
template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder>
void packSpinor(OutOrder &outOrder, const InOrder &inOrder, int volume) {
  for (int x = 0; x < volume; x++) {
    for (int s = 0; s < Ns; s++) {
      for (int c = 0; c < Nc; c++) {
        // The first accessor argument is always 0 here.
        // NOTE(review): presumably a parity index, with the caller handing in
        // one checkerboard at a time -- confirm against the accessor definition.
        outOrder(0, x, s, c) = inOrder(0, x, s, c);
      }
    }
  }
}
/**
 * Device kernel performing the same element-by-element accessor copy as the
 * host loop, with one thread handling one site.
 *
 * Launch layout: only the .x components of blockIdx/blockDim/threadIdx are
 * used, so a 1D grid of 1D blocks is expected. There is no striding loop:
 * gridDim.x * blockDim.x must be >= volume for every site to be copied.
 * Threads whose site index is >= volume return immediately, so the grid may
 * overshoot. The body declares no shared memory and contains no barriers.
 *
 * @tparam FloatOut  Not referenced in the body; kept for call-site compatibility.
 * @tparam FloatIn   Not referenced in the body; kept for call-site compatibility.
 * @tparam Ns        Compile-time trip count of the spin loop.
 * @tparam Nc        Compile-time trip count of the color loop.
 * @param outOrder   Destination accessor, passed by value to the kernel.
 * @param inOrder    Source accessor, passed by value to the kernel.
 * @param volume     Number of sites to copy.
 */
template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder>
__global__ void packSpinorKernel(OutOrder outOrder, const InOrder inOrder, int volume) {
  // Flat global thread index == site index.
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  if (x >= volume) return;  // tail guard: grid size need not divide volume

  for (int s = 0; s < Ns; s++) {
    for (int c = 0; c < Nc; c++) {
      // The first accessor argument is always 0 here.
      // NOTE(review): presumably a parity index -- confirm against the accessor definition.
      outOrder(0, x, s, c) = inOrder(0, x, s, c);
    }
  }
}
45 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder>
62 : out(out), in(in), meta(meta), location(location) { }
67 packSpinor<FloatOut, FloatIn, Ns, Nc>(
out,
in, meta.
VolumeCB());
70 packSpinorKernel<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder>
78 long long flops()
const {
return 0; }
79 long long bytes()
const {
return in.Bytes() + out.Bytes(); }
83 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename OutOrder,
typename InOrder>
91 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc,
typename InOrder>
97 ColorSpinor outOrder(out, 1, Out);
98 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(outOrder, inOrder,
out, location);
101 ColorSpinor outOrder(out, 1, Out);
102 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(outOrder, inOrder,
out, location);
110 template <
typename FloatOut,
typename FloatIn,
int Ns,
int Nc>
116 ColorSpinor inOrder(in, 1, In);
117 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out, location, Out);
120 ColorSpinor inOrder(in, 1, In);
121 genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder,
out, location, Out);
129 template <
int Ns,
int Nc,
typename dstFloat,
typename srcFloat>
152 errorQuda(
"Copying to full fields with lexicographical ordering is not currently supported");
158 errorQuda(
"QDPJIT field ordering not supported for full site fields");
162 srcFloat *srcEven = Src ? Src : (srcFloat*)src.
V();
163 srcFloat *srcOdd = (srcFloat*)((
char*)srcEven + src.
Bytes()/2);
165 std::swap<srcFloat*>(srcEven, srcOdd);
169 dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.
V();
170 dstFloat *dstOdd = (dstFloat*)((
char*)dstEven + dst.
Bytes()/2);
172 std::swap<dstFloat*>(dstEven, dstOdd);
175 genericCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>(dst, src, location, dstEven, srcEven);
176 genericCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>(dst, src, location, dstOdd, srcOdd);
178 genericCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>(dst, src, location, Dst, Src);
183 template <
int Nc,
typename dstFloat,
typename srcFloat>
189 errorQuda(
"source and destination spins must match");
191 if (dst.
Nspin() == 4) {
192 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) 193 copyGenericColorSpinor<4,Nc>(dst, src, location, Dst, Src);
195 errorQuda(
"%s has not been built for Nspin=%d fields", __func__, src.Nspin());
197 }
else if (dst.
Nspin() == 2) {
198 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_STAGGERED_DIRAC) 199 copyGenericColorSpinor<2,Nc>(dst, src, location, Dst, Src);
201 errorQuda(
"%s has not been built for Nspin=%d fields", __func__, src.Nspin());
203 }
else if (dst.
Nspin() == 1) {
204 #ifdef GPU_STAGGERED_DIRAC 205 copyGenericColorSpinor<1,Nc>(dst, src, location, Dst, Src);
207 errorQuda(
"%s has not been built for Nspin=%d fields", __func__, src.Nspin());
216 #define INSTANTIATE_COLOR \ 217 switch (src.Ncolor()) { \ 218 case 1: CopyGenericColorSpinor<1>(dst, src, location, dst_ptr, src_ptr); break; \ 219 case 2: CopyGenericColorSpinor<2>(dst, src, location, dst_ptr, src_ptr); break; \ 220 case 4: CopyGenericColorSpinor<4>(dst, src, location, dst_ptr, src_ptr); break; \ 221 case 6: CopyGenericColorSpinor<6>(dst, src, location, dst_ptr, src_ptr); break; \ 222 case 9: CopyGenericColorSpinor<9>(dst, src, location, dst_ptr, src_ptr); break; \ 223 case 12: CopyGenericColorSpinor<12>(dst, src, location, dst_ptr, src_ptr); break; \ 224 case 16: CopyGenericColorSpinor<16>(dst, src, location, dst_ptr, src_ptr); break; \ 225 case 18: CopyGenericColorSpinor<18>(dst, src, location, dst_ptr, src_ptr); break; \ 226 case 24: CopyGenericColorSpinor<24>(dst, src, location, dst_ptr, src_ptr); break; \ 227 case 32: CopyGenericColorSpinor<32>(dst, src, location, dst_ptr, src_ptr); break; \ 228 case 36: CopyGenericColorSpinor<36>(dst, src, location, dst_ptr, src_ptr); break; \ 229 case 48: CopyGenericColorSpinor<48>(dst, src, location, dst_ptr, src_ptr); break; \ 230 case 72: CopyGenericColorSpinor<72>(dst, src, location, dst_ptr, src_ptr); break; \ 231 case 96: CopyGenericColorSpinor<96>(dst, src, location, dst_ptr, src_ptr); break; \ 232 case 256: CopyGenericColorSpinor<256>(dst, src, location, dst_ptr, src_ptr); break; \ 233 case 576: CopyGenericColorSpinor<576>(dst, src, location, dst_ptr, src_ptr); break; \ 234 case 768: CopyGenericColorSpinor<768>(dst, src, location, dst_ptr, src_ptr); break; \ 235 case 1024: CopyGenericColorSpinor<1024>(dst, src, location, dst_ptr, src_ptr); break; \ 236 default: errorQuda("Ncolors=%d not supported", src.Ncolor()); \ 239 #define INSTANTIATE_COLOR QudaFieldLocation location
const ColorSpinorField & meta
void CopyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, dstFloat *Dst, srcFloat *Src, float *dstNorm=0, float *srcNorm=0)
unsigned int sharedBytesPerThread() const
const char * AuxString() const
QudaVerbosity getVerbosity()
void genericCopyColorSpinor(Out &outOrder, const In &inOrder, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
const char * VolString() const
__host__ __device__ void copy(T1 &a, const T2 &b)
void copyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, void *Dst=0, void *Src=0, void *dstNorm=0, void *srcNorm=0)
void apply(const cudaStream_t &stream)
void packSpinor(OutOrder &outOrder, const InOrder &inOrder, int volume)
unsigned int minThreads() const
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__global__ void packSpinorKernel(OutOrder outOrder, const InOrder inOrder, int volume)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
QudaSiteOrder SiteOrder() const
CopySpinor(OutOrder &out, const InOrder &in, const ColorSpinorField &meta, QudaFieldLocation location)
bool advanceSharedBytes(TuneParam &param) const
unsigned int sharedBytesPerBlock(const TuneParam &param) const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaFieldOrder FieldOrder() const