QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
copy_quda.cu
Go to the documentation of this file.
1 #include <blas_quda.h>
2 #include <tune_quda.h>
3 #include <float_vector.h>
4 
5 // For kernels with precision conversion built in
6 #define checkSpinorLength(a, b) \
7  { \
8  if (a.Length() != b.Length()) \
9  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
10  if (a.Stride() != b.Stride()) \
11  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
12  }
13 
14 namespace quda {
15 
18  cudaStream_t* getBlasStream();
19 
20  namespace copy {
21 
22 #include <texture.h>
23 
24  static struct {
26  int stride;
27  } blasConstants;
28 
29  template <typename FloatN, int N, typename Output, typename Input>
30  __global__ void copyKernel(Output Y, Input X, int length) {
31  unsigned int i = blockIdx.x*(blockDim.x) + threadIdx.x;
32  unsigned int gridSize = gridDim.x*blockDim.x;
33 
34  while (i < length) {
35  FloatN x[N];
36  X.load(x, i);
37  Y.save(x, i);
38  i += gridSize;
39  }
40  }
41 
42  template <typename FloatN, int N, typename Output, typename Input>
43  class CopyCuda : public Tunable {
44 
45  private:
46  Input &X;
47  Output &Y;
48  const int length;
49 
50  int sharedBytesPerThread() const { return 0; }
51  int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
52 
53  virtual bool advanceSharedBytes(TuneParam &param) const
54  {
55  TuneParam next(param);
56  advanceBlockDim(next); // to get next blockDim
57  int nthreads = next.block.x * next.block.y * next.block.z;
58  param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
59  sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
60  return false;
61  }
62 
63  public:
64  CopyCuda(Output &Y, Input &X, int length) : X(X), Y(Y), length(length) { ; }
65  virtual ~CopyCuda() { ; }
66 
67  TuneKey tuneKey() const {
68  std::stringstream vol, aux;
69  vol << blasConstants.x[0] << "x";
70  vol << blasConstants.x[1] << "x";
71  vol << blasConstants.x[2] << "x";
72  vol << blasConstants.x[3];
73  aux << "stride=" << blasConstants.stride << ",out_prec=" << Y.Precision() << ",in_prec=" << X.Precision();
74  return TuneKey(vol.str(), "copyKernel", aux.str());
75  }
76 
77  void apply(const cudaStream_t &stream) {
79  copyKernel<FloatN, N><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(Y, X, length);
80  }
81 
82  void preTune() { ; } // no need to save state for copy kernels
83  void postTune() { ; } // no need to restore state for copy kernels
84 
85  long long flops() const { return 0; }
86  long long bytes() const {
87  const int Ninternal = (sizeof(FloatN)/sizeof(((FloatN*)0)->x))*N;
88  size_t bytes = (X.Precision() + Y.Precision())*Ninternal;
89  if (X.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
90  if (Y.Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
91  return bytes*length;
92  }
93  };
94 
96  if (&src == &dst) return; // aliasing fields
97  if (src.Nspin() != 1 && src.Nspin() != 4) errorQuda("nSpin(%d) not supported\n", src.Nspin());
98 
100  if (src.SiteSubset() != dst.SiteSubset())
101  errorQuda("Spinor fields do not have matching subsets dst=%d src=%d\n",
102  dst.SiteSubset(), src.SiteSubset());
103  copy::copyCuda(dst.Even(), src.Even());
104  copy::copyCuda(dst.Odd(), src.Odd());
105  return;
106  }
107 
108  checkSpinorLength(dst, src);
109 
110  for (int d=0; d<QUDA_MAX_DIM; d++) blasConstants.x[d] = src.X()[d];
111  blasConstants.stride = src.Stride();
112 
113  // For a given dst precision, there are two non-trivial possibilities for the
114  // src precision.
115 
116  blas_bytes += (unsigned long long)src.RealLength()*(src.Precision() + dst.Precision());
117 
118  if (dst.Precision() == src.Precision()) {
119  if (src.Bytes() != dst.Bytes()) errorQuda("Precisions match, but bytes do not");
120  cudaMemcpy(dst.V(), src.V(), dst.Bytes(), cudaMemcpyDeviceToDevice);
121  if (dst.Precision() == QUDA_HALF_PRECISION) {
122  cudaMemcpy(dst.Norm(), src.Norm(), dst.NormBytes(), cudaMemcpyDeviceToDevice);
123  blas_bytes += 2*(unsigned long long)dst.RealLength()*sizeof(float);
124  }
125  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
126  if (src.Nspin() == 4){
131  copy(dst_spinor, src_tex, src.Volume());
132  copy.apply(*getBlasStream());
133  } else { //src.Nspin() == 1
138  copy(dst_spinor, src_tex, src.Volume());
139  copy.apply(*getBlasStream());
140  }
141  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
142  if (src.Nspin() == 4){
144  Spinor<float4, float4, float4, 6, 1> dst_spinor(dst);
147  copy(dst_spinor, src_tex, src.Volume());
148  copy.apply(*getBlasStream());
149  } else { //src.Nspin() ==1
151  Spinor<float2, float2, float2, 3, 1> dst_spinor(dst);
154  copy(dst_spinor, src_tex, src.Volume());
155  copy.apply(*getBlasStream());
156  }
157  } else if (dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
158  blas_bytes += (unsigned long long)src.Volume()*sizeof(float);
159  if (src.Nspin() == 4){
161  Spinor<float4, float4, float4, 6, 1> dst_spinor(dst);
164  copy(dst_spinor, src_tex, src.Volume());
165  copy.apply(*getBlasStream());
166  } else { //nSpin== 1;
168  Spinor<float2, float2, float2, 3, 1> dst_spinor(dst);
171  copy(dst_spinor, src_tex, src.Volume());
172  copy.apply(*getBlasStream());
173  }
174  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION) {
175  blas_bytes += (unsigned long long)dst.Volume()*sizeof(float);
176  if (src.Nspin() == 4){
178  Spinor<float4, float4, short4, 6, 1> dst_spinor(dst);
181  copy(dst_spinor, src_tex, src.Volume());
182  copy.apply(*getBlasStream());
183  } else { //nSpin == 1
185  Spinor<float2, float2, short2, 3, 1> dst_spinor(dst);
188  copy(dst_spinor, src_tex, src.Volume());
189  copy.apply(*getBlasStream());
190 }
191  } else if (dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_HALF_PRECISION) {
192  blas_bytes += (unsigned long long)src.Volume()*sizeof(float);
193  if (src.Nspin() == 4){
198  copy(dst_spinor, src_tex, src.Volume());
199  copy.apply(*getBlasStream());
200  } else { //nSpin == 1
205  copy(dst_spinor, src_tex, src.Volume());
206  copy.apply(*getBlasStream());
207  }
208  } else if (dst.Precision() == QUDA_HALF_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION) {
209  blas_bytes += (unsigned long long)dst.Volume()*sizeof(float);
210  if (src.Nspin() == 4){
215  copy(dst_spinor, src_tex, src.Volume());
216  copy.apply(*getBlasStream());
217  } else { //nSpin == 1
222  copy(dst_spinor, src_tex, src.Volume());
223  copy.apply(*getBlasStream());
224 }
225  } else {
226  errorQuda("Invalid precision combination dst=%d and src=%d", dst.Precision(), src.Precision());
227  }
228 
229  checkCudaError();
230  }
231 
232  } // namespace copy
233 
235  copy::copyCuda(dst, src);
236  }
237 
238 } // namespace quda