QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
contract.cu
Go to the documentation of this file.
1 //
2 // double2 contractCuda(float2 *x, float2 *y, float2 *result) {}
3 //
4 
5 namespace quda
6 {
7 #include <gamma5.h> // g5 kernel
8 
13  template <typename sFloat>
14  class Gamma5Cuda : public Tunable {
15 
16  private:
17  cudaColorSpinorField *out; //Output spinor
18  const cudaColorSpinorField *in; //Input spinor
19 
20  unsigned int sharedBytesPerThread() const { return 0; }
21  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
22  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
23  unsigned int minThreads() const { return in->X(0) * in->X(1) * in->X(2) * in->X(3); }
24 
25  char *saveOut, *saveOutNorm;
26 
27  public:
29  out(out), in(in) { bindSpinorTex<sFloat>(in, out); strcpy(aux,"gamma5");}
30 
31  virtual ~Gamma5Cuda() { unbindSpinorTex<sFloat>(in, out); }
32 
33  TuneKey tuneKey() const
34  {
35  return TuneKey(in->VolString(), typeid(*this).name());
36  }
37 
38  void apply(const cudaStream_t &stream)
39  {
40  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
41  gamma5Kernel<<<tp.grid, tp.block, tp.shared_bytes>>> ((sFloat*)out->V(), (float*)out->Norm(), (sFloat*)in->V(), (float*)in->Norm(), dslashParam, in->Stride());
42  }
43 
44  void preTune()
45  {
46  saveOut = new char[out->Bytes()];
47  cudaMemcpy(saveOut, out->V(), out->Bytes(), cudaMemcpyDeviceToHost);
48 
49  if (typeid(sFloat) == typeid(short4))
50  {
51  saveOutNorm = new char[out->NormBytes()];
52  cudaMemcpy(saveOutNorm, out->Norm(), out->NormBytes(), cudaMemcpyDeviceToHost);
53  }
54  }
55 
56  void postTune()
57  {
58  cudaMemcpy(out->V(), saveOut, out->Bytes(), cudaMemcpyHostToDevice);
59  delete[] saveOut;
60 
61  if (typeid(sFloat) == typeid(short4))
62  {
63  cudaMemcpy(out->Norm(), saveOutNorm, out->NormBytes(), cudaMemcpyHostToDevice);
64  delete[] saveOutNorm;
65  }
66  }
67 
68  std::string paramString(const TuneParam &param) const
69  {
70  std::stringstream ps;
71  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
72  ps << "shared=" << param.shared_bytes;
73  return ps.str();
74  }
75 
76  long long flops() const { return 12ll * in->VolumeCB(); }
77  long long bytes() const { return in->Bytes() + in->NormBytes() + out->Bytes() + out->NormBytes(); }
78  };
79 
86  {
87  dslashParam.threads = in->Volume();
88 
89  Tunable *gamma5 = 0;
90 
91  if (in->Precision() == QUDA_DOUBLE_PRECISION)
92  {
93 #if (__COMPUTE_CAPABILITY__ >= 130)
94  gamma5 = new Gamma5Cuda<double2>(out, in);
95 #else
96  errorQuda("Double precision not supported on this GPU");
97 #endif
98  } else if (in->Precision() == QUDA_SINGLE_PRECISION) {
99  gamma5 = new Gamma5Cuda<float4>(out, in);
100  } else if (in->Precision() == QUDA_HALF_PRECISION) {
101  errorQuda("Half precision not supported for gamma5 kernel yet"); // Support for half precision is very straightforward,
102  } // but I doubt is useful
103 
104  gamma5->apply(streams[Nstream-1]);
105  checkCudaError();
106 
107  delete gamma5;
108  }
109 
110 #include "contract_core.h"
111 #include "contract_core_plus.h"
112 #include "contract_core_minus.h"
113 
114 #ifndef _TWIST_QUDA_CONTRACT
115 #error "Contraction core undefined"
116 #endif
117 
118 #ifndef _TWIST_QUDA_CONTRACT_PLUS
119 #error "Contraction core (plus) undefined"
120 #endif
121 
122 #ifndef _TWIST_QUDA_CONTRACT_MINUS
123 #error "Contraction core (minus) undefined"
124 #endif
125 
126 #define checkSpinor(a, b) \
127  { \
128  if (a.Precision() != b.Precision()) \
129  errorQuda("precisions do not match: %d %d", a.Precision(), b.Precision()); \
130  if (a.Length() != b.Length()) \
131  errorQuda("lengths do not match: %d %d", a.Length(), b.Length()); \
132  if (a.Stride() != b.Stride()) \
133  errorQuda("strides do not match: %d %d", a.Stride(), b.Stride()); \
134  }
135 
141  template <typename Float2, typename rFloat>
142  class ContractCuda : public Tunable {
143 
144  private:
145  const cudaColorSpinorField x; // Spinor to be contracted
146  const cudaColorSpinorField y; // Spinor to be contracted
147  const QudaParity parity; // Parity of the field, actual kernels act on parity spinors
148  const QudaContractType contract_type; // Type of contraction, to be detailed later
149 
169  void *result; // The output array with the result of the contraction
170 
171  const int nTSlice; // Time-slice in case of time-dilution
172 
173  char aux[16][TuneKey::aux_n]; // For tuning purposes
174 
175  unsigned int sharedBytesPerThread() const { return 16*sizeof(rFloat); }
176  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
177  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
178  unsigned int minThreads() const { return x.X(0) * x.X(1) * x.X(2) * x.X(3); }
179 
180  char *saveOut, *saveOutNorm;
181 
182  void fillAux(QudaContractType contract_type, const char *contract_str) { strcpy(aux[contract_type], contract_str); }
183 
184  public:
185  ContractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaParity parity, const QudaContractType contract_type) :
186  x(x), y(y), result(result), parity(parity), contract_type(contract_type), nTSlice(-1) {
187  fillAux(QUDA_CONTRACT, "type=plain");
188  fillAux(QUDA_CONTRACT_PLUS, "type=plain-plus");
189  fillAux(QUDA_CONTRACT_MINUS, "type=plain-minus");
190  fillAux(QUDA_CONTRACT_GAMMA5, "type=gamma5");
191  fillAux(QUDA_CONTRACT_GAMMA5_PLUS, "type=gamma5-plus");
192  fillAux(QUDA_CONTRACT_GAMMA5_MINUS, "type=gamma5-minus");
193  fillAux(QUDA_CONTRACT_TSLICE, "type=tslice");
194  fillAux(QUDA_CONTRACT_TSLICE_PLUS, "type=tslice-plus");
195  fillAux(QUDA_CONTRACT_TSLICE_MINUS, "type=tslice-minus");
196 
197  bindSpinorTex<Float2>(&x, &y);
198  }
199 
200  ContractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaParity parity, const QudaContractType contract_type, const int tSlice) :
201  x(x), y(y), result(result), parity(parity), contract_type(contract_type), nTSlice(tSlice) {
202  fillAux(QUDA_CONTRACT, "type=plain");
203  fillAux(QUDA_CONTRACT_PLUS, "type=plain-plus");
204  fillAux(QUDA_CONTRACT_MINUS, "type=plain-minus");
205  fillAux(QUDA_CONTRACT_GAMMA5, "type=gamma5");
206  fillAux(QUDA_CONTRACT_GAMMA5_PLUS, "type=gamma5-plus");
207  fillAux(QUDA_CONTRACT_GAMMA5_MINUS, "type=gamma5-minus");
208  fillAux(QUDA_CONTRACT_TSLICE, "type=tslice");
209  fillAux(QUDA_CONTRACT_TSLICE_PLUS, "type=tslice-plus");
210  fillAux(QUDA_CONTRACT_TSLICE_MINUS, "type=tslice-minus");
211 
212  bindSpinorTex<Float2>(&x, &y);
213  }
214 
215  virtual ~ContractCuda() { unbindSpinorTex<Float2>(&x, &y); } // if (tSlice != NULL) { cudaFreeHost(tSlice); } }
216 
217  QudaContractType ContractType() const { return contract_type; }
218 
219  TuneKey tuneKey() const
220  {
221  return TuneKey(x.VolString(), typeid(*this).name(), aux[contract_type]);
222  }
223 
224  void apply(const cudaStream_t &stream)
225  {
226  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
227  switch (contract_type)
228  {
229  default:
230  case QUDA_CONTRACT_GAMMA5: // Calculates the volume contraction (x^+ g5)_\mu y_\nu and stores it in result
231  contractGamma5Kernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
232  break;
233 
234  case QUDA_CONTRACT_GAMMA5_PLUS: // Calculates the volume contraction (x^+ g5)_\mu y_\nu and adds it to result
235  contractGamma5PlusKernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
236  break;
237 
238  case QUDA_CONTRACT_GAMMA5_MINUS: // Calculates the volume contraction (x^+ g5)_\mu y_\nu and substracts it from result
239  contractGamma5MinusKernel<<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
240  break;
241 
242  case QUDA_CONTRACT: // Calculates the volume contraction x^+_\mu y_\nu and stores it in result
243  contractKernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
244  break;
245 
246  case QUDA_CONTRACT_PLUS: // Calculates the volume contraction x^+_\mu y_\nu and adds it to result
247  contractPlusKernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
248  break;
249 
250  case QUDA_CONTRACT_MINUS: // Calculates the volume contraction x^+_\mu y_\nu and substracts it from result
251  contractMinusKernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
252  break;
253 
254  case QUDA_CONTRACT_TSLICE: // Calculates the time-slice contraction x^+_\mu y_\nu and stores it in result
255  contractTsliceKernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), nTSlice, parity, dslashParam);
256  break;
257 
258  case QUDA_CONTRACT_TSLICE_PLUS: // Calculates the time-slice contraction x^+_\mu y_\nu and adds it to result
259  contractTslicePlusKernel <<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), nTSlice, parity, dslashParam);
260  break;
261 
262  case QUDA_CONTRACT_TSLICE_MINUS: // Calculates the time-slice contraction x^+_\mu y_\nu and substracts it from result
263  contractTsliceMinusKernel<<<tp.grid, tp.block, tp.shared_bytes>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), nTSlice, parity, dslashParam);
264  break;
265  }
266  }
267 
268  void preTune() {}
269 
270  void postTune() {}
271 
272  std::string paramString(const TuneParam &param) const
273  {
274  std::stringstream ps;
275  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
276  ps << "shared=" << param.shared_bytes;
277  return ps.str();
278  }
279 
280  long long flops() const { return 120ll * x.VolumeCB(); }
281  long long bytes() const { return x.Bytes() + x.NormBytes() + y.Bytes() + y.NormBytes(); }
282  };
283 
290  void contractCuda (const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity)
291  {
292  if ((contract_type == QUDA_CONTRACT_TSLICE) || (contract_type == QUDA_CONTRACT_TSLICE_PLUS) || (contract_type == QUDA_CONTRACT_TSLICE_MINUS)) {
293  errorQuda("No time-slice specified for contraction\n");
294  return;
295  }
296 
297  dslashParam.threads = x.Volume();
298 
299  Tunable *contract = 0;
300 
301  if (x.Precision() == QUDA_DOUBLE_PRECISION)
302  {
303 #if (__COMPUTE_CAPABILITY__ >= 130)
304  contract = new ContractCuda<double2,double2>(x, y, result, parity, contract_type);
305 #else
306  errorQuda("Double precision not supported on this GPU");
307 #endif
308  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
309  contract = new ContractCuda<float4,float2>(x, y, result, parity, contract_type);
310  } else if (x.Precision() == QUDA_HALF_PRECISION) {
311  errorQuda("Half precision not supported for gamma5 kernel yet");
312  }
313 
314  contract->apply(streams[Nstream-1]);
315  checkCudaError();
316 
317  delete contract;
318  }
319 
325  void contractCuda (const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const int nTSlice, const QudaParity parity)
326  {
327  if ((contract_type != QUDA_CONTRACT_TSLICE) || (contract_type != QUDA_CONTRACT_TSLICE_PLUS) || (contract_type != QUDA_CONTRACT_TSLICE_MINUS)) {
328  errorQuda("No time-slice input allowed for volume contractions\n");
329  return;
330  }
331 
332  dslashParam.threads = x.X(0)*x.X(1)*x.X(2);
333 
334  Tunable *contract = 0;
335 
336  if (x.Precision() == QUDA_DOUBLE_PRECISION)
337  {
338 #if (__COMPUTE_CAPABILITY__ >= 130)
339  contract = new ContractCuda<double2,double2>(x, y, result, parity, contract_type, nTSlice);
340 #else
341  errorQuda("Double precision not supported on this GPU");
342 #endif
343  } else if (x.Precision() == QUDA_SINGLE_PRECISION) {
344  contract = new ContractCuda<float4,float2>(x, y, result, parity, contract_type, nTSlice);
345  } else if (x.Precision() == QUDA_HALF_PRECISION) {
346  errorQuda("Half precision not supported for gamma5 kernel yet");
347  }
348 
349  contract->apply(streams[Nstream-1]);
350  checkCudaError();
351 
352  delete contract;
353  }
354 }
355 
int y[4]
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
__global__ void contractTsliceKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
const int * X() const
#define errorQuda(...)
Definition: util_quda.h:73
long long bytes() const
Definition: contract.cu:77
long long flops() const
Definition: contract.cu:76
cudaStream_t * streams
__global__ void contractGamma5MinusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
const int Nstream
TuneKey tuneKey() const
Definition: contract.cu:33
__global__ void contractTslicePlusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
__global__ void contractMinusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
__global__ void contractGamma5PlusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
TuneKey tuneKey() const
Definition: contract.cu:219
std::string paramString(const TuneParam &param) const
Definition: contract.cu:68
QudaGaugeParam param
Definition: pack_test.cpp:17
ContractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaParity parity, const QudaContractType contract_type, const int tSlice)
Definition: contract.cu:200
void postTune()
Definition: contract.cu:56
cpuColorSpinorField * in
__global__ void contractKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
void apply(const cudaStream_t &stream)
Definition: contract.cu:38
const char * VolString() const
void preTune()
Definition: contract.cu:44
enum QudaParity_s QudaParity
QudaContractType ContractType() const
Definition: contract.cu:217
int x[4]
long long flops() const
Definition: contract.cu:280
__global__ void gamma5Kernel(float4 *out, float *outNorm, float4 *in, float *inNorm, DslashParam param, int myStride)
Definition: gamma5.h:297
virtual ~Gamma5Cuda()
Definition: contract.cu:31
void apply(const cudaStream_t &stream)
Definition: contract.cu:224
Gamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in)
Definition: contract.cu:28
cpuColorSpinorField * out
void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity)
Definition: contract.cu:290
QudaPrecision Precision() const
std::string paramString(const TuneParam &param) const
Definition: contract.cu:272
static const int aux_n
Definition: tune_key.h:12
enum QudaContractType_s QudaContractType
__global__ void contractPlusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
virtual ~ContractCuda()
Definition: contract.cu:215
__global__ void contractGamma5Kernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
#define checkCudaError()
Definition: util_quda.h:110
QudaTune getTuning()
Definition: util_quda.cpp:32
const QudaParity parity
Definition: dslash_test.cpp:29
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136
void gamma5Cuda(cudaColorSpinorField *out, const cudaColorSpinorField *in)
Definition: contract.cu:85
ContractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaParity parity, const QudaContractType contract_type)
Definition: contract.cu:185
long long bytes() const
Definition: contract.cu:281
virtual void apply(const cudaStream_t &stream)=0
__global__ void contractTsliceMinusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)