7 #define checkSpinorLength(a, b) \ 9 if (a.Length() != b.Length()) \ 10 errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \ 11 if (a.Stride() != b.Stride()) \ 12 errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \ 13 if (a.GammaBasis() != b.GammaBasis()) \ 14 errorQuda("gamma basis does not match: %d %d", a.GammaBasis(), b.GammaBasis()); \ 31 template <
typename FloatN,
int N,
typename Output,
typename Input>
33 unsigned int i = blockIdx.x*(
blockDim.x) + threadIdx.x;
34 unsigned int parity = blockIdx.y;
45 template <
typename FloatN,
int N,
typename Output,
typename Input>
94 long long flops()
const {
return 0; }
96 const int Ninternal = (
sizeof(FloatN)/
sizeof(((FloatN*)0)->x))*N;
97 size_t bytes = (
X.Precision() +
Y.Precision())*Ninternal;
106 if (&
src == &dst)
return;
109 errorQuda(
"Spinor fields do not have matching subsets dst=%d src=%d\n",
src.SiteSubset(), dst.
SiteSubset());
129 int partitions = (
src.IsComposite() ?
src.CompositeDim() : 1) * (
src.SiteSubset());
139 if (
src.Nspin() == 4){
142 CopyCuda<float4, 6, Spinor<float4, double2, 6, 1>,
Spinor<float4, float4, 6, 0, 0> >
143 copy(dst_spinor, src_tex,
src.Volume(), partitions);
145 }
else if (
src.Nspin() == 2) {
147 errorQuda(
"Non-zero stride not supported");
150 CopyCuda<float2, 1, Spinor<float2, double2, 1, 1>,
Spinor<float2, float2, 1, 0, 0> >
151 copy(dst_spinor, src_tex,
src.Length()/2, partitions);
153 }
else if (
src.Nspin() == 1) {
156 CopyCuda<float2, 3, Spinor<float2, double2, 3, 1>,
Spinor<float2, float2, 3, 0, 0> >
157 copy(dst_spinor, src_tex,
src.Volume(), partitions);
163 if (
src.Nspin() == 4){
166 CopyCuda<float4, 6, Spinor<float4, float4, 6, 1>,
Spinor<float4, double2, 6, 0, 0> >
167 copy(dst_spinor, src_tex,
src.Volume(), partitions);
169 }
else if (
src.Nspin() == 2) {
171 errorQuda(
"Non-zero stride not supported");
174 CopyCuda<float2, 1, Spinor<float2, float2, 1, 1>,
Spinor<float2, double2, 1, 0, 0> >
175 copy(dst_spinor, src_tex,
src.Length()/2, partitions);
177 }
else if (
src.Nspin() == 1) {
180 CopyCuda<float2, 3, Spinor<float2, float2, 3, 1>,
Spinor<float2, double2, 3, 0, 0> >
181 copy(dst_spinor, src_tex,
src.Volume(), partitions);
188 if (
src.Nspin() == 4){
191 CopyCuda<float4, 6, Spinor<float4, float4, 6, 1>,
Spinor<float4, short4, 6, 0, 0> >
192 copy(dst_spinor, src_tex,
src.Volume(), partitions);
194 }
else if (
src.Nspin() == 1) {
197 CopyCuda<float2, 3, Spinor<float2, float2, 3, 1>,
Spinor<float2, short2, 3, 0, 0> >
198 copy(dst_spinor, src_tex,
src.Volume(), partitions);
205 if (
src.Nspin() == 4){
208 CopyCuda<float4, 6, Spinor<float4, short4, 6, 1>,
Spinor<float4, float4, 6, 0, 0> >
209 copy(dst_spinor, src_tex,
src.Volume(), partitions);
211 }
else if (
src.Nspin() == 1) {
214 CopyCuda<float2, 3, Spinor<float2, short2, 3, 1>,
Spinor<float2, float2, 3, 0, 0> >
215 copy(dst_spinor, src_tex,
src.Volume(), partitions);
222 if (
src.Nspin() == 4){
225 CopyCuda<double2, 12, Spinor<double2, double2, 12, 1>,
Spinor<double2, short4, 12, 0, 0> >
226 copy(dst_spinor, src_tex,
src.Volume(), partitions);
228 }
else if (
src.Nspin() == 1) {
231 CopyCuda<double2, 3, Spinor<double2, double2, 3, 1>,
Spinor<double2, short2, 3, 0, 0> >
232 copy(dst_spinor, src_tex,
src.Volume(), partitions);
239 if (
src.Nspin() == 4){
242 CopyCuda<double2, 12, Spinor<double2, short4, 12, 1>,
Spinor<double2, double2, 12, 0, 0> >
243 copy(dst_spinor, src_tex,
src.Volume(), partitions);
245 }
else if (
src.Nspin() == 1) {
248 CopyCuda<double2, 3, Spinor<double2, short2, 3, 1>,
Spinor<double2, double2, 3, 0, 0> >
249 copy(dst_spinor, src_tex,
src.Volume(), partitions);
267 static_cast<const cudaColorSpinorField&>(
src));
#define qudaMemcpy(dst, src, count, kind)
void defaultTuneParam(TuneParam ¶m) const
const char * AuxString() const
QudaVerbosity getVerbosity()
void initTuneParam(TuneParam ¶m) const
static struct quda::blas::copy_ns::@5 blasStrings
cudaColorSpinorField * tmp
virtual bool advanceSharedBytes(TuneParam ¶m) const
char * strcpy(char *__dst, const char *__src)
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
char * strcat(char *__s1, const char *__s2)
size_t RealLength() const
cudaStream_t * getStream()
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Provides precision abstractions and defines the register precision given the storage precision using ...
#define checkSpinorLength(a, b)
QudaFieldLocation Location() const
void apply(const cudaStream_t &stream)
__global__ void copyKernel(Output Y, Input X, int length)
virtual void initTuneParam(TuneParam ¶m) const
unsigned int sharedBytesPerThread() const
virtual bool advanceBlockDim(TuneParam ¶m) const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
void copy(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
CopyCuda(Output &Y, Input &X, int length, int nParity)
virtual void defaultTuneParam(TuneParam ¶m) const