QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
copy_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
4 #include <register_traits.h>
5 
6 // For kernels with precision conversion built in
7 #define checkSpinorLength(a, b) \
8  { \
9  if (a.Length() != b.Length()) \
10  errorQuda("lengths do not match: %lu %lu", a.Length(), b.Length()); \
11  if (a.Stride() != b.Stride()) \
12  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
13  if (a.GammaBasis() != b.GammaBasis()) \
14  errorQuda("gamma basis does not match: %d %d", a.GammaBasis(), b.GammaBasis()); \
15  }
16 
17 namespace quda {
18 
19  namespace blas {
20  cudaStream_t* getStream();
21 
22  namespace copy_ns {
23 
24 #include <texture.h>
25 
26  static struct {
27  const char *vol_str;
28  const char *aux_str;
29  } blasStrings;
30 
31  template <typename FloatN, int N, typename Output, typename Input>
32  __global__ void copyKernel(Output Y, Input X, int length) {
33  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
34  unsigned int parity = blockIdx.y;
35  unsigned int gridSize = gridDim.x*blockDim.x;
36 
37  while (i < length) {
38  FloatN x[N];
39  X.load(x, i, parity);
40  Y.save(x, i, parity);
41  i += gridSize;
42  }
43  }
44 
45  template <typename FloatN, int N, typename Output, typename Input>
46  class CopyCuda : public Tunable {
47 
48  private:
49  Input &X;
50  Output &Y;
51  const int length;
52  const int nParity;
53 
54  unsigned int sharedBytesPerThread() const { return 0; }
55  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
56 
57  virtual bool advanceSharedBytes(TuneParam &param) const
58  {
59  TuneParam next(param);
60  advanceBlockDim(next); // to get next blockDim
61  int nthreads = next.block.x * next.block.y * next.block.z;
62  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
63  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
64  return false;
65  }
66 
67  public:
68  CopyCuda(Output &Y, Input &X, int length, int nParity)
69  : X(X), Y(Y), length(length/nParity), nParity(nParity) { }
70  virtual ~CopyCuda() { ; }
71 
72  inline TuneKey tuneKey() const {
73  return TuneKey(blasStrings.vol_str, "copyKernel", blasStrings.aux_str);
74  }
75 
76  inline void apply(const cudaStream_t &stream) {
77  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
78  copyKernel<FloatN, N><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(Y, X, length);
79  }
80 
81  void preTune() { ; } // no need to save state for copy kernels
82  void postTune() { ; } // no need to restore state for copy kernels
83 
84  void initTuneParam(TuneParam &param) const {
86  param.grid.y = nParity;
87  }
88 
91  param.grid.y = nParity;
92  }
93 
94  long long flops() const { return 0; }
95  long long bytes() const {
96  const int Ninternal = (sizeof(FloatN)/sizeof(((FloatN*)0)->x))*N;
97  size_t bytes = (X.Precision() + Y.Precision())*Ninternal;
98  if (X.Precision() == QUDA_HALF_PRECISION || X.Precision() == QUDA_QUARTER_PRECISION) bytes += sizeof(float);
99  if (Y.Precision() == QUDA_HALF_PRECISION || Y.Precision() == QUDA_QUARTER_PRECISION) bytes += sizeof(float);
100  return bytes*length*nParity;
101  }
102  int tuningIter() const { return 3; }
103  };
104 
106  if (&src == &dst) return; // aliasing fields
107 
108  if (src.SiteSubset() != dst.SiteSubset())
109  errorQuda("Spinor fields do not have matching subsets dst=%d src=%d\n", src.SiteSubset(), dst.SiteSubset());
110 
111  checkSpinorLength(dst, src);
112 
113  blasStrings.vol_str = src.VolString();
114  char tmp[256];
115  strcpy(tmp, "dst=");
116  strcat(tmp, dst.AuxString());
117  strcat(tmp, ",src=");
118  strcat(tmp, src.AuxString());
119  blasStrings.aux_str = tmp;
120 
121  if (dst.Nspin() != src.Nspin())
122  errorQuda("Spins (%d,%d) do not match", dst.Nspin(), src.Nspin());
123 
124  // For a given dst precision, there are two non-trivial possibilities for the
125  // src precision.
126 
127  blas::bytes += (unsigned long long)src.RealLength()*(src.Precision() + dst.Precision());
128 
129  int partitions = (src.IsComposite() ? src.CompositeDim() : 1) * (src.SiteSubset());
130 
131  if (dst.Precision() == src.Precision()) {
132  if (src.Bytes() != dst.Bytes()) errorQuda("Precisions match, but bytes do not");
133  qudaMemcpyAsync(dst.V(), src.V(), dst.Bytes(), cudaMemcpyDeviceToDevice, *blas::getStream());
135  qudaMemcpyAsync(dst.Norm(), src.Norm(), dst.NormBytes(), cudaMemcpyDeviceToDevice, *blas::getStream());
136  blas::bytes += 2*(unsigned long long)dst.RealLength()*sizeof(float);
137  }
138  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
139  if (src.Nspin() == 4){
140  Spinor<float4, float4, 6, 0> src_tex(src);
141  Spinor<float4, double2, 6, 1> dst_spinor(dst);
143  partitions);
144  copy.apply(*blas::getStream());
145  } else if (src.Nspin() == 2) {
146  if (src.Length() != src.RealLength() || dst.Length() != dst.RealLength())
147  errorQuda("Non-zero stride not supported"); // we need to know how many colors to set "M" (requires JIT)
148  Spinor<float2, float2, 1, 0> src_tex(src);
149  Spinor<float2, double2, 1, 1> dst_spinor(dst);
151  partitions);
152  copy.apply(*blas::getStream());
153  } else if (src.Nspin() == 1) {
154  Spinor<float2, float2, 3, 0> src_tex(src);
155  Spinor<float2, double2, 3, 1> dst_spinor(dst);
157  partitions);
158  copy.apply(*blas::getStream());
159  } else {
160  errorQuda("Nspin(%d) is not supported", src.Nspin());
161  }
162  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
163  if (src.Nspin() == 4){
164  Spinor<float4, double2, 6, 0> src_tex(src);
165  Spinor<float4, float4, 6, 1> dst_spinor(dst);
167  partitions);
168  copy.apply(*blas::getStream());
169  } else if (src.Nspin() == 2) {
170  if (src.Length() != src.RealLength() || dst.Length() != dst.RealLength())
171  errorQuda("Non-zero stride not supported"); // we need to know how many colors to set "M" (requires JIT)
172  Spinor<float2, double2, 1, 0> src_tex(src);
173  Spinor<float2, float2, 1, 1> dst_spinor(dst);
175  partitions);
176  copy.apply(*blas::getStream());
177  } else if (src.Nspin() == 1) {
178  Spinor<float2, double2, 3, 0> src_tex(src);
179  Spinor<float2, float2, 3, 1> dst_spinor(dst);
181  partitions);
182  copy.apply(*blas::getStream());
183  } else {
184  errorQuda("Nspin(%d) is not supported", src.Nspin());
185  }
186  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
187  blas::bytes += (unsigned long long)src.Volume()*sizeof(float);
188  if (src.Nspin() == 4){
189  Spinor<float4, short4, 6, 0> src_tex(src);
190  Spinor<float4, float4, 6, 1> dst_spinor(dst);
192  partitions);
193  copy.apply(*blas::getStream());
194  } else if (src.Nspin() == 1) {
195  Spinor<float2, short2, 3, 0> src_tex(src);
196  Spinor<float2, float2, 3, 1> dst_spinor(dst);
198  partitions);
199  copy.apply(*blas::getStream());
200  } else {
201  errorQuda("Nspin(%d) is not supported", src.Nspin());
202  }
203  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
204  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float);
205  if (src.Nspin() == 4){
206  Spinor<float4, float4, 6, 0> src_tex(src);
207  Spinor<float4, short4, 6, 1> dst_spinor(dst);
209  partitions);
210  copy.apply(*blas::getStream());
211  } else if (src.Nspin() == 1) {
212  Spinor<float2, float2, 3, 0> src_tex(src);
213  Spinor<float2, short2, 3, 1> dst_spinor(dst);
215  partitions);
216  copy.apply(*blas::getStream());
217  } else {
218  errorQuda("Nspin(%d) is not supported", src.Nspin());
219  }
220  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
221  blas::bytes += (unsigned long long)src.Volume()*sizeof(float);
222  if (src.Nspin() == 4){
223  Spinor<double2, short4, 12, 0> src_tex(src);
224  Spinor<double2, double2, 12, 1> dst_spinor(dst);
226  partitions);
227  copy.apply(*blas::getStream());
228  } else if (src.Nspin() == 1) {
229  Spinor<double2, short2, 3, 0> src_tex(src);
230  Spinor<double2, double2, 3, 1> dst_spinor(dst);
232  partitions);
233  copy.apply(*blas::getStream());
234  } else {
235  errorQuda("Nspin(%d) is not supported", src.Nspin());
236  }
237  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
238  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float);
239  if (src.Nspin() == 4){
240  Spinor<double2, double2, 12, 0> src_tex(src);
241  Spinor<double2, short4, 12, 1> dst_spinor(dst);
243  partitions);
244  copy.apply(*blas::getStream());
245  } else if (src.Nspin() == 1) {
246  Spinor<double2, double2, 3, 0> src_tex(src);
247  Spinor<double2, short2, 3, 1> dst_spinor(dst);
249  partitions);
250  copy.apply(*blas::getStream());
251  } else {
252  errorQuda("Nspin(%d) is not supported", src.Nspin());
253  }
254 
255 
256  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_QUARTER_PRECISION) {
257  blas::bytes += (unsigned long long)src.Volume()*sizeof(float)*2;
258  if (src.Nspin() == 4){
259  Spinor<float4, char4, 6, 0> src_tex(src);
260  Spinor<float4, short4, 6, 1> dst_spinor(dst);
261  CopyCuda<float4, 6, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
262  copy.apply(*blas::getStream());
263  } else if (src.Nspin() == 1) {
264  Spinor<float2, char2, 3, 0> src_tex(src);
265  Spinor<float2, short2, 3, 1> dst_spinor(dst);
266  CopyCuda<float2, 3, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
267  copy.apply(*blas::getStream());
268  } else {
269  errorQuda("Nspin(%d) is not supported", src.Nspin());
270  }
271  } else if (dst.Precision() == QUDA_QUARTER_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
272  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float)*2;
273  if (src.Nspin() == 4){
274  Spinor<float4, short4, 6, 0> src_tex(src);
275  Spinor<float4, char4, 6, 1> dst_spinor(dst);
276  CopyCuda<float4, 6, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
277  copy.apply(*blas::getStream());
278  } else if (src.Nspin() == 1) {
279  Spinor<float2, short2, 3, 0> src_tex(src);
280  Spinor<float2, char2, 3, 1> dst_spinor(dst);
281  CopyCuda<float2, 3, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
282  copy.apply(*blas::getStream());
283  } else {
284  errorQuda("Nspin(%d) is not supported", src.Nspin());
285  }
286  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_QUARTER_PRECISION) {
287  blas::bytes += (unsigned long long)src.Volume()*sizeof(float);
288  if (src.Nspin() == 4){
289  Spinor<float4, char4, 6, 0> src_tex(src);
290  Spinor<float4, float4, 6, 1> dst_spinor(dst);
291  CopyCuda<float4, 6, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
292  copy.apply(*blas::getStream());
293  } else if (src.Nspin() == 1) {
294  Spinor<float2, char2, 3, 0> src_tex(src);
295  Spinor<float2, float2, 3, 1> dst_spinor(dst);
296  CopyCuda<float2, 3, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
297  copy.apply(*blas::getStream());
298  } else {
299  errorQuda("Nspin(%d) is not supported", src.Nspin());
300  }
301  } else if (dst.Precision() == QUDA_QUARTER_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
302  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float);
303  if (src.Nspin() == 4){
304  Spinor<float4, float4, 6, 0> src_tex(src);
305  Spinor<float4, char4, 6, 1> dst_spinor(dst);
306  CopyCuda<float4, 6, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
307  copy.apply(*blas::getStream());
308  } else if (src.Nspin() == 1) {
309  Spinor<float2, float2, 3, 0> src_tex(src);
310  Spinor<float2, char2, 3, 1> dst_spinor(dst);
311  CopyCuda<float2, 3, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
312  copy.apply(*blas::getStream());
313  } else {
314  errorQuda("Nspin(%d) is not supported", src.Nspin());
315  }
316  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_QUARTER_PRECISION) {
317  blas::bytes += (unsigned long long)src.Volume()*sizeof(float);
318  if (src.Nspin() == 4){
319  Spinor<double2, char4, 12, 0> src_tex(src);
320  Spinor<double2, double2, 12, 1> dst_spinor(dst);
321  CopyCuda<double2, 12, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
322  copy.apply(*blas::getStream());
323  } else if (src.Nspin() == 1) {
324  Spinor<double2, char2, 3, 0> src_tex(src);
325  Spinor<double2, double2, 3, 1> dst_spinor(dst);
326  CopyCuda<double2, 3, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
327  copy.apply(*blas::getStream());
328  } else {
329  errorQuda("Nspin(%d) is not supported", src.Nspin());
330  }
331  } else if (dst.Precision() == QUDA_QUARTER_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
332  blas::bytes += (unsigned long long)dst.Volume()*sizeof(float);
333  if (src.Nspin() == 4){
334  Spinor<double2, double2, 12, 0> src_tex(src);
335  Spinor<double2, char4, 12, 1> dst_spinor(dst);
336  CopyCuda<double2, 12, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
337  copy.apply(*blas::getStream());
338  } else if (src.Nspin() == 1) {
339  Spinor<double2, double2, 3, 0> src_tex(src);
340  Spinor<double2, char2, 3, 1> dst_spinor(dst);
341  CopyCuda<double2, 3, decltype(dst_spinor), decltype(src_tex)> copy(dst_spinor, src_tex, src.Volume(), partitions);
342  copy.apply(*blas::getStream());
343  } else {
344  errorQuda("Nspin(%d) is not supported", src.Nspin());
345  }
346  } else {
347  errorQuda("Invalid precision combination dst=%d and src=%d", dst.Precision(), src.Precision());
348  }
349 
350  checkCudaError();
351  }
352 
353  } // namespace copy_nw
354 
355  void copy(ColorSpinorField &dst, const ColorSpinorField &src) {
356  if (dst.Location() == QUDA_CUDA_FIELD_LOCATION &&
358  copy_ns::copy(static_cast<cudaColorSpinorField&>(dst),
359  static_cast<const cudaColorSpinorField&>(src));
360  } else {
361  dst = src;
362  }
363  }
364 
365  } // namespace blas
366 } // namespace quda
long long bytes() const
Definition: copy_quda.cu:95
void defaultTuneParam(TuneParam &param) const
Definition: copy_quda.cu:89
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
void initTuneParam(TuneParam &param) const
Definition: copy_quda.cu:84
#define errorQuda(...)
Definition: util_quda.h:121
const char * aux_str
Definition: copy_quda.cu:28
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
virtual bool advanceSharedBytes(TuneParam &param) const
Definition: copy_quda.cu:57
unsigned int sharedBytesPerBlock(const TuneParam &param) const
Definition: copy_quda.cu:55
const char * VolString() const
int length[]
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaStream_t * getStream()
Definition: blas_quda.cu:494
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
Provides precision abstractions and defines the register precision given the storage precision using ...
int X[4]
Definition: covdev_test.cpp:70
const char * vol_str
Definition: copy_quda.cu:27
#define checkSpinorLength(a, b)
Definition: copy_quda.cu:7
QudaFieldLocation Location() const
void apply(const cudaStream_t &stream)
Definition: copy_quda.cu:76
__global__ void copyKernel(Output Y, Input X, int length)
Definition: copy_quda.cu:32
#define qudaMemcpyAsync(dst, src, count, kind, stream)
Definition: quda_cuda_api.h:38
static struct quda::blas::copy_ns::@4 blasStrings
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:304
unsigned int sharedBytesPerThread() const
Definition: copy_quda.cu:54
#define checkCudaError()
Definition: util_quda.h:161
virtual bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:124
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
void copy(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:105
QudaParity parity
Definition: covdev_test.cpp:54
CopyCuda(Output &Y, Input &X, int length, int nParity)
Definition: copy_quda.cu:68
unsigned long long bytes
Definition: blas_quda.cu:23
long long flops() const
Definition: copy_quda.cu:94
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:329