QUDA  0.9.0
copy_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
4 #include <register_traits.h>
5 
6 // For kernels with precision conversion built in
7 #define checkSpinorLength(a, b) \
8  { \
9  if (a.Length() != b.Length()) \
10  errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \
11  if (a.Stride() != b.Stride()) \
12  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
13  if (a.GammaBasis() != b.GammaBasis()) \
14  errorQuda("gamma basis does not match: %d %d", a.GammaBasis(), b.GammaBasis()); \
15  }
16 
17 namespace quda {
18 
19  namespace blas {
20  cudaStream_t* getStream();
21 
22  namespace copy_ns {
23 
24 #include <texture.h>
25 
26  static struct {
27  const char *vol_str;
28  const char *aux_str;
29  } blasStrings;
30 
31  template <typename FloatN, int N, typename Output, typename Input>
32  __global__ void copyKernel(Output Y, Input X, int length) {
33  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
34  unsigned int parity = blockIdx.y;
35  unsigned int gridSize = gridDim.x*blockDim.x;
36 
37  while (i < length) {
38  FloatN x[N];
39  X.load(x, i, parity);
40  Y.save(x, i, parity);
41  i += gridSize;
42  }
43  }
44 
45  template <typename FloatN, int N, typename Output, typename Input>
46  class CopyCuda : public Tunable {
47 
48  private:
49  Input &X;
50  Output &Y;
51  const int length;
52  const int nParity;
53 
54  unsigned int sharedBytesPerThread() const { return 0; }
55  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
56 
57  virtual bool advanceSharedBytes(TuneParam &param) const
58  {
59  TuneParam next(param);
60  advanceBlockDim(next); // to get next blockDim
61  int nthreads = next.block.x * next.block.y * next.block.z;
62  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
64  return false;
65  }
66 
67  public:
68  CopyCuda(Output &Y, Input &X, int length, int nParity)
69  : X(X), Y(Y), length(length/nParity), nParity(nParity) { }
70  virtual ~CopyCuda() { ; }
71 
72  inline TuneKey tuneKey() const {
73  return TuneKey(blasStrings.vol_str, "copyKernel", blasStrings.aux_str);
74  }
75 
76  inline void apply(const cudaStream_t &stream) {
77  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
78  copyKernel<FloatN, N><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(Y, X, length);
79  }
80 
81  void preTune() { ; } // no need to save state for copy kernels
82  void postTune() { ; } // no need to restore state for copy kernels
83 
84  void initTuneParam(TuneParam &param) const {
86  param.grid.y = nParity;
87  }
88 
91  param.grid.y = nParity;
92  }
93 
94  long long flops() const { return 0; }
95  long long bytes() const {
96  const int Ninternal = (sizeof(FloatN)/sizeof(((FloatN*)0)->x))*N;
97  size_t bytes = (X.Precision() + Y.Precision())*Ninternal;
98  if (X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
99  if (Y.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
100  return bytes*length*nParity;
101  }
102  int tuningIter() const { return 3; }
103  };
104 
106  if (&src == &dst) return; // aliasing fields
107 
108  if (src.SiteSubset() != dst.SiteSubset())
109  errorQuda("Spinor fields do not have matching subsets dst=%d src=%d\n", src.SiteSubset(), dst.SiteSubset());
110 
111  checkSpinorLength(dst, src);
112 
113  blasStrings.vol_str = src.VolString();
114  char tmp[256];
115  strcpy(tmp, "dst=");
116  strcat(tmp, dst.AuxString());
117  strcat(tmp, ",src=");
118  strcat(tmp, src.AuxString());
119  blasStrings.aux_str = tmp;
120 
121  if (dst.Nspin() != src.Nspin())
122  errorQuda("Spins (%d,%d) do not match", dst.Nspin(), src.Nspin());
123 
124  // For a given dst precision, there are two non-trivial possibilities for the
125  // src precision.
126 
127  blas::bytes += (unsigned long long)src.RealLength()*(src.Precision() + dst.Precision());
128 
129  int partitions = (src.IsComposite() ? src.CompositeDim() : 1) * (src.SiteSubset());
130 
131  if (dst.Precision() == src.Precision()) {
132  if (src.Bytes() != dst.Bytes()) errorQuda("Precisions match, but bytes do not");
133  qudaMemcpy(dst.V(), src.V(), dst.Bytes(), cudaMemcpyDeviceToDevice);
134  if (dst.Precision() == QUDA_HALF_PRECISION) {
135  qudaMemcpy(dst.Norm(), src.Norm(), dst.NormBytes(), cudaMemcpyDeviceToDevice);
136  blas::bytes += 2*(unsigned long long)dst.RealLength()*sizeof(float);
137  }
138  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
139  if (src.Nspin() == 4){
141  Spinor<float4, double2, 6, 1> dst_spinor(dst);
143  copy(dst_spinor, src_tex, src.Volume(), partitions);
144  copy.apply(*blas::getStream());
145  } else if (src.Nspin() == 2) {
146  if (src.Length() != src.RealLength() || dst.Length() != dst.RealLength())
147  errorQuda("Non-zero stride not supported"); // we need to know how many colors to set "M" (requires JIT)
149  Spinor<float2, double2, 1, 1> dst_spinor(dst);
151  copy(dst_spinor, src_tex, src.Length()/2, partitions);
152  copy.apply(*blas::getStream());
153  } else if (src.Nspin() == 1) {
155  Spinor<float2, double2, 3, 1> dst_spinor(dst);
157  copy(dst_spinor, src_tex, src.Volume(), partitions);
158  copy.apply(*blas::getStream());
159  } else {
160  errorQuda("Nspin(%d) is not supported", src.Nspin());
161  }
162  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
163  if (src.Nspin() == 4){
165  Spinor<float4, float4, 6, 1> dst_spinor(dst);
167  copy(dst_spinor, src_tex, src.Volume(), partitions);
168  copy.apply(*blas::getStream());
169  } else if (src.Nspin() == 2) {
170  if (src.Length() != src.RealLength() || dst.Length() != dst.RealLength())
171  errorQuda("Non-zero stride not supported"); // we need to know how many colors to set "M" (requires JIT)
173  Spinor<float2, float2, 1, 1> dst_spinor(dst);
175  copy(dst_spinor, src_tex, src.Length()/2, partitions);
176  copy.apply(*blas::getStream());
177  } else if (src.Nspin() == 1) {
179  Spinor<float2, float2, 3, 1> dst_spinor(dst);
181  copy(dst_spinor, src_tex, src.Volume(), partitions);
182  copy.apply(*blas::getStream());
183  } else {
184  errorQuda("Nspin(%d) is not supported", src.Nspin());
185  }
186  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
187  blas::bytes += (unsigned long long)src.Volume()*sizeof(float);
188  if (src.Nspin() == 4){
190  Spinor<float4, float4, 6, 1> dst_spinor(dst);
192  copy(dst_spinor, src_tex, src.Volume(), partitions);
193  copy.apply(*blas::getStream());
194  } else if (src.Nspin() == 1) {
196  Spinor<float2, float2, 3, 1> dst_spinor(dst);
198  copy(dst_spinor, src_tex, src.Volume(), partitions);
199  copy.apply(*blas::getStream());
200  } else {
201  errorQuda("Nspin(%d) is not supported", src.Nspin());
202  }
203  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
204  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float);
205  if (src.Nspin() == 4){
207  Spinor<float4, short4, 6, 1> dst_spinor(dst);
209  copy(dst_spinor, src_tex, src.Volume(), partitions);
210  copy.apply(*blas::getStream());
211  } else if (src.Nspin() == 1) {
213  Spinor<float2, short2, 3, 1> dst_spinor(dst);
215  copy(dst_spinor, src_tex, src.Volume(), partitions);
216  copy.apply(*blas::getStream());
217  } else {
218  errorQuda("Nspin(%d) is not supported", src.Nspin());
219  }
220  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
221  blas::bytes += (unsigned long long)src.Volume()*sizeof(float);
222  if (src.Nspin() == 4){
224  Spinor<double2, double2, 12, 1> dst_spinor(dst);
226  copy(dst_spinor, src_tex, src.Volume(), partitions);
227  copy.apply(*blas::getStream());
228  } else if (src.Nspin() == 1) {
230  Spinor<double2, double2, 3, 1> dst_spinor(dst);
232  copy(dst_spinor, src_tex, src.Volume(), partitions);
233  copy.apply(*blas::getStream());
234  } else {
235  errorQuda("Nspin(%d) is not supported", src.Nspin());
236  }
237  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
238  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float);
239  if (src.Nspin() == 4){
241  Spinor<double2, short4, 12, 1> dst_spinor(dst);
243  copy(dst_spinor, src_tex, src.Volume(), partitions);
244  copy.apply(*blas::getStream());
245  } else if (src.Nspin() == 1) {
247  Spinor<double2, short2, 3, 1> dst_spinor(dst);
249  copy(dst_spinor, src_tex, src.Volume(), partitions);
250  copy.apply(*blas::getStream());
251  } else {
252  errorQuda("Nspin(%d) is not supported", src.Nspin());
253  }
254  } else {
255  errorQuda("Invalid precision combination dst=%d and src=%d", dst.Precision(), src.Precision());
256  }
257 
258  checkCudaError();
259  }
260 
261  } // namespace copy_nw
262 
264  if (dst.Location() == QUDA_CUDA_FIELD_LOCATION &&
265  src.Location() == QUDA_CUDA_FIELD_LOCATION) {
266  copy_ns::copy(static_cast<cudaColorSpinorField&>(dst),
267  static_cast<const cudaColorSpinorField&>(src));
268  } else {
269  dst = src;
270  }
271  }
272 
273  } // namespace blas
274 } // namespace quda
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
long long bytes() const
Definition: copy_quda.cu:95
dim3 dim3 blockDim
void defaultTuneParam(TuneParam &param) const
Definition: copy_quda.cu:89
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * src
void initTuneParam(TuneParam &param) const
Definition: copy_quda.cu:84
#define errorQuda(...)
Definition: util_quda.h:90
static struct quda::blas::copy_ns::@5 blasStrings
const char * aux_str
Definition: copy_quda.cu:28
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
virtual bool advanceSharedBytes(TuneParam &param) const
Definition: copy_quda.cu:57
char * strcpy(char *__dst, const char *__src)
unsigned int sharedBytesPerBlock(const TuneParam &param) const
Definition: copy_quda.cu:55
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
Definition: copy_quda.cu:263
char * strcat(char *__s1, const char *__s2)
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaStream_t * getStream()
Definition: blas_quda.cu:75
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Provides precision abstractions and defines the register precision given the storage precision using ...
const char * vol_str
Definition: copy_quda.cu:27
#define checkSpinorLength(a, b)
Definition: copy_quda.cu:7
QudaFieldLocation Location() const
void apply(const cudaStream_t &stream)
Definition: copy_quda.cu:76
__global__ void copyKernel(Output Y, Input X, int length)
Definition: copy_quda.cu:32
void size_t length
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
unsigned int sharedBytesPerThread() const
Definition: copy_quda.cu:54
#define checkCudaError()
Definition: util_quda.h:129
virtual bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:102
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
void copy(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:105
QudaParity parity
Definition: covdev_test.cpp:53
CopyCuda(Output &Y, Input &X, int length, int nParity)
Definition: copy_quda.cu:68
unsigned long long bytes
Definition: blas_quda.cu:43
long long flops() const
Definition: copy_quda.cu:94
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254