QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
copy_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
4 
5 // For kernels with precision conversion built in
6 #define checkSpinorLength(a, b) \
7  { \
8  if (a.Length() != b.Length()) \
9  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
10  if (a.Stride() != b.Stride()) \
11  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
12  }
13 
14 namespace quda {
15 
16  cudaStream_t* getBlasStream();
17 
18  namespace copy {
19 
20 #include <texture.h>
21 
22  static struct {
23  const char *vol_str;
24  const char *aux_str;
25  } blasStrings;
26 
27  template <typename FloatN, int N, typename Output, typename Input>
28  __global__ void copyKernel(Output Y, Input X, int length) {
29  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
30  unsigned int gridSize = gridDim.x*blockDim.x;
31 
32  while (i < length) {
33  FloatN x[N];
34  X.load(x, i);
35  Y.save(x, i);
36  i += gridSize;
37  }
38  }
39 
40  template <typename FloatN, int N, typename Output, typename Input>
41  class CopyCuda : public Tunable {
42 
43  private:
44  Input &X;
45  Output &Y;
46  const int length;
47 
48  unsigned int sharedBytesPerThread() const { return 0; }
49  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
50 
51  virtual bool advanceSharedBytes(TuneParam &param) const
52  {
53  TuneParam next(param);
54  advanceBlockDim(next); // to get next blockDim
55  int nthreads = next.block.x * next.block.y * next.block.z;
56  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
57  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
58  return false;
59  }
60 
61  public:
62  CopyCuda(Output &Y, Input &X, int length) : X(X), Y(Y), length(length) { }
63  virtual ~CopyCuda() { ; }
64 
65  inline TuneKey tuneKey() const {
66  return TuneKey(blasStrings.vol_str, "copyKernel", blasStrings.aux_str);
67  }
68 
69  inline void apply(const cudaStream_t &stream) {
70  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
71  copyKernel<FloatN, N><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(Y, X, length);
72  }
73 
74  void preTune() { ; } // no need to save state for copy kernels
75  void postTune() { ; } // no need to restore state for copy kernels
76 
77  long long flops() const { return 0; }
78  long long bytes() const {
79  const int Ninternal = (sizeof(FloatN)/sizeof(((FloatN*)0)->x))*N;
80  size_t bytes = (X.Precision() + Y.Precision())*Ninternal;
81  if (X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
82  if (Y.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
83  return bytes*length;
84  }
85  int tuningIter() const { return 3; }
86  };
87 
89  if (&src == &dst) return; // aliasing fields
90  if (src.Nspin() != 1 && src.Nspin() != 4) errorQuda("nSpin(%d) not supported\n", src.Nspin());
91 
93  if (src.SiteSubset() != dst.SiteSubset())
94  errorQuda("Spinor fields do not have matching subsets dst=%d src=%d\n",
95  dst.SiteSubset(), src.SiteSubset());
96  copy::copyCuda(dst.Even(), src.Even());
97  copy::copyCuda(dst.Odd(), src.Odd());
98  return;
99  }
100 
101  checkSpinorLength(dst, src);
102 
103  blasStrings.vol_str = src.VolString();
104  char tmp[256];
105  strcpy(tmp, "dst=");
106  strcat(tmp, dst.AuxString());
107  strcat(tmp, ",src=");
108  strcat(tmp, src.AuxString());
109  blasStrings.aux_str = tmp;
110 
111  // For a given dst precision, there are two non-trivial possibilities for the
112  // src precision.
113 
114  // FIXME: use traits to encapsulate register type for shorts -
115  // will reduce template type parameters from 3 to 2
116 
117  blas_bytes += (unsigned long long)src.RealLength()*(src.Precision() + dst.Precision());
118 
119  if (dst.Precision() == src.Precision()) {
120  if (src.Bytes() != dst.Bytes()) errorQuda("Precisions match, but bytes do not");
121  cudaMemcpy(dst.V(), src.V(), dst.Bytes(), cudaMemcpyDeviceToDevice);
122  if (dst.Precision() == QUDA_HALF_PRECISION) {
123  cudaMemcpy(dst.Norm(), src.Norm(), dst.NormBytes(), cudaMemcpyDeviceToDevice);
124  blas_bytes += 2*(unsigned long long)dst.RealLength()*sizeof(float);
125  }
126  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
127  if (src.Nspin() == 4){
132  copy(dst_spinor, src_tex, src.Volume());
133  copy.apply(*getBlasStream());
134  } else { //src.Nspin() == 1
139  copy(dst_spinor, src_tex, src.Volume());
140  copy.apply(*getBlasStream());
141  }
142  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
143  if (src.Nspin() == 4){
145  Spinor<float4, float4, float4, 6, 1> dst_spinor(dst);
148  copy(dst_spinor, src_tex, src.Volume());
149  copy.apply(*getBlasStream());
150  } else { //src.Nspin() ==1
152  Spinor<float2, float2, float2, 3, 1> dst_spinor(dst);
155  copy(dst_spinor, src_tex, src.Volume());
156  copy.apply(*getBlasStream());
157 }
158  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
159  blas_bytes += (unsigned long long)src.Volume()*sizeof(float);
160  if (src.Nspin() == 4){
162  Spinor<float4, float4, float4, 6, 1> dst_spinor(dst);
165  copy(dst_spinor, src_tex, src.Volume());
166  copy.apply(*getBlasStream());
167  } else { //nSpin== 1;
169  Spinor<float2, float2, float2, 3, 1> dst_spinor(dst);
172  copy(dst_spinor, src_tex, src.Volume());
173  copy.apply(*getBlasStream());
174  }
175  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
176  blas_bytes += (unsigned long long)dst.Volume()*sizeof(float);
177  if (src.Nspin() == 4){
179  Spinor<float4, float4, short4, 6, 1> dst_spinor(dst);
182  copy(dst_spinor, src_tex, src.Volume());
183  copy.apply(*getBlasStream());
184  } else { //nSpin == 1
186  Spinor<float2, float2, short2, 3, 1> dst_spinor(dst);
189  copy(dst_spinor, src_tex, src.Volume());
190  copy.apply(*getBlasStream());
191 }
192  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
193  blas_bytes += (unsigned long long)src.Volume()*sizeof(float);
194  if (src.Nspin() == 4){
199  copy(dst_spinor, src_tex, src.Volume());
200  copy.apply(*getBlasStream());
201  } else { //nSpin == 1
206  copy(dst_spinor, src_tex, src.Volume());
207  copy.apply(*getBlasStream());
208  }
209  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
210  blas_bytes += (unsigned long long)dst.Volume()*sizeof(float);
211  if (src.Nspin() == 4){
216  copy(dst_spinor, src_tex, src.Volume());
217  copy.apply(*getBlasStream());
218  } else { //nSpin == 1
223  copy(dst_spinor, src_tex, src.Volume());
224  copy.apply(*getBlasStream());
225 }
226  } else {
227  errorQuda("Invalid precision combination dst=%d and src=%d", dst.Precision(), src.Precision());
228  }
229 
230  checkCudaError();
231  }
232 
233  } // namespace copy
234 
236  copy::copyCuda(dst, src);
237  }
238 
239 } // namespace quda
CopyCuda(Output &Y, Input &X, int length)
Definition: copy_quda.cu:62
const char * aux_str
Definition: copy_quda.cu:24
const char * vol_str
Definition: copy_quda.cu:23
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
unsigned long long blas_bytes
Definition: blas_quda.cu:38
cudaStream_t * stream
virtual ~CopyCuda()
Definition: copy_quda.cu:63
__host__ __device__ void copy(T1 &a, const T2 &b)
int length[]
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaColorSpinorField & Odd() const
cudaColorSpinorField * tmp
const char * AuxString() const
virtual bool advanceBlockDim(TuneParam &param) const
Definition: tune_quda.h:74
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:235
const char * VolString() const
int x[4]
#define checkSpinorLength(a, b)
Definition: copy_quda.cu:6
cudaStream_t * getBlasStream()
Definition: blas_quda.cu:64
long long flops() const
Definition: copy_quda.cu:77
void copyCuda(cudaColorSpinorField &dst, const cudaColorSpinorField &src)
Definition: copy_quda.cu:88
long long bytes() const
Definition: copy_quda.cu:78
QudaPrecision Precision() const
void apply(const cudaStream_t &stream)
Definition: copy_quda.cu:69
TuneKey tuneKey() const
Definition: copy_quda.cu:65
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
int tuningIter() const
Definition: copy_quda.cu:85
QudaSiteSubset SiteSubset() const
__global__ void copyKernel(Output Y, Input X, int length)
Definition: copy_quda.cu:28
cudaColorSpinorField & Even() const