extended_color_spinor_utilities.cu

#include <cstdlib>
#include <cstdio>
#include <string>

#include <color_spinor_field.h>
#include <color_spinor_field_order.h>
#include <tune_quda.h>

#define PRESERVE_SPINOR_NORM

#ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis
#define kP (1.0/sqrt(2.0))
#define kU (1.0/sqrt(2.0))
#else // More numerically accurate not to preserve the norm between bases
#define kP (0.5)
#define kU (1.0)
#endif


namespace quda {

  using namespace colorspinor;

  void exchangeExtendedGhost(cudaColorSpinorField* spinor, int R[], int parity, cudaStream_t *stream_p)
  {
#ifdef MULTI_GPU
    int nFace = 0;
    for(int i=0; i<4; i++){
      if(R[i] > nFace) nFace = R[i];
    }

    int dagger = 0;

    int gatherCompleted[2] = {0,0};
    int commsCompleted[2] = {0,0};

    cudaEvent_t gatherEnd[2];
    for(int dir=0; dir<2; dir++) cudaEventCreate(&gatherEnd[dir], cudaEventDisableTiming);

    for(int dim=3; dim>=0; dim--){
      if(!commDim(dim)) continue;

      spinor->packExtended(nFace, R, parity, dagger, dim, stream_p); // pack the faces of dimension dim
      qudaDeviceSynchronize(); // Need this since packing is performed in stream[Nstream-1]
      for(int dir=1; dir>=0; dir--){
        spinor->gather(nFace, dagger, 2*dim + dir);
        qudaEventRecord(gatherEnd[dir], streams[2*dim+dir]); // gatherEnd[1], gatherEnd[0]
      }

      int completeSum = 0;
      int dir = 1;
      while(completeSum < 2){
        if(!gatherCompleted[dir]){
          if(cudaSuccess == cudaEventQuery(gatherEnd[dir])){
            spinor->commsStart(nFace, 2*dim+dir, dagger);
            completeSum++;
            gatherCompleted[dir--] = 1;
          }
        }
      }
      gatherCompleted[0] = gatherCompleted[1] = 0;

      // Query whether the communications have completed
      dir = 1;
      while(completeSum < 4){
        if(!commsCompleted[dir]){
          if(spinor->commsQuery(nFace, 2*dim+dir, dagger)){
            spinor->scatterExtended(nFace, parity, dagger, 2*dim+dir);
            completeSum++;
            commsCompleted[dir--] = 1;
          }
        }
      }
      commsCompleted[0] = commsCompleted[1] = 0;
      qudaDeviceSynchronize(); // Wait for scatters to complete before the next iteration
    } // loop over dim

    for(int dir=0; dir<2; dir++) cudaEventDestroy(gatherEnd[dir]);
#endif
    return;
  }
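
  // Usage sketch (an editorial illustration, not from the original file): to
  // exchange a border of depth two in every partitioned dimension for the
  // even-parity sites of a hypothetical extended field "spinor", using the
  // same global "streams" array the function records its gather events on:
  //
  //   int R[4] = {2, 2, 2, 2};
  //   exchangeExtendedGhost(&spinor, R, /*parity=*/0, streams);
  //
  // Dimensions for which commDim(dim) is zero are skipped entirely.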

  /** Straight copy with no basis change */
  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
  class PreserveBasis {
    typedef typename mapper<FloatIn>::type RegTypeIn;
    typedef typename mapper<FloatOut>::type RegTypeOut;
  public:
    __device__ __host__ inline void operator()(ColorSpinor<RegTypeOut,Nc,Ns> &out, const ColorSpinor<RegTypeIn,Nc,Ns> &in) {
      for (int s=0; s<Ns; s++) {
        for (int c=0; c<Nc; c++) {
          out(s,c) = in(s,c);
        }
      }
    }
  };

  /** Transform from the relativistic into the non-relativistic basis */
  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
  struct NonRelBasis {
    typedef typename mapper<FloatIn>::type RegTypeIn;
    typedef typename mapper<FloatOut>::type RegTypeOut;
    __device__ __host__ inline void operator()(ColorSpinor<RegTypeOut,Nc,Ns> &out, const ColorSpinor<RegTypeIn,Nc,Ns> &in) {
      int s1[4] = {1, 2, 3, 0};
      int s2[4] = {3, 0, 1, 2};
      RegTypeOut K1[4] = {static_cast<RegTypeOut>(kP), static_cast<RegTypeOut>(-kP),
                          static_cast<RegTypeOut>(-kP), static_cast<RegTypeOut>(-kP)};
      RegTypeOut K2[4] = {static_cast<RegTypeOut>(kP), static_cast<RegTypeOut>(-kP),
                          static_cast<RegTypeOut>(kP), static_cast<RegTypeOut>(kP)};
      for (int s=0; s<Ns; s++) {
        for (int c=0; c<Nc; c++) {
          out(s,c).real(K1[s]*in(s1[s],c).real() + K2[s]*in(s2[s],c).real());
          out(s,c).imag(K1[s]*in(s1[s],c).imag() + K2[s]*in(s2[s],c).imag());
        }
      }
    }
  };

  /** Transform from the non-relativistic into the relativistic basis */
  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
  struct RelBasis {
    typedef typename mapper<FloatIn>::type RegTypeIn;
    typedef typename mapper<FloatOut>::type RegTypeOut;
    __device__ __host__ inline void operator()(ColorSpinor<RegTypeOut,Nc,Ns> &out, const ColorSpinor<RegTypeIn,Nc,Ns> &in) {
      int s1[4] = {1, 2, 3, 0};
      int s2[4] = {3, 0, 1, 2};
      RegTypeOut K1[4] = {static_cast<RegTypeOut>(-kU), static_cast<RegTypeOut>(kU),
                          static_cast<RegTypeOut>(kU),  static_cast<RegTypeOut>(kU)};
      RegTypeOut K2[4] = {static_cast<RegTypeOut>(-kU), static_cast<RegTypeOut>(kU),
                          static_cast<RegTypeOut>(-kU), static_cast<RegTypeOut>(-kU)};
      for (int s=0; s<Ns; s++) {
        for (int c=0; c<Nc; c++) {
          out(s,c).real(K1[s]*in(s1[s],c).real() + K2[s]*in(s2[s],c).real());
          out(s,c).imag(K1[s]*in(s1[s],c).imag() + K2[s]*in(s2[s],c).imag());
        }
      }
    }
  };
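
  // Editorial note on the two rotations above: both apply the permuted map
  //
  //   out(s,c) = K1[s] * in(s1[s],c) + K2[s] * in(s2[s],c),
  //
  // mixing each output spin s with exactly two input spins, s1[s] and s2[s].
  // NonRelBasis and RelBasis are designed as mutual inverses; note that the
  // coefficient products satisfy 2*kP*kU = 1 for either choice of kP and kU
  // at the top of the file. With PRESERVE_SPINOR_NORM defined,
  // kP = kU = 1/sqrt(2) makes each rotation unitary, so the field norm is
  // preserved across the basis change.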

  template<typename OutOrder, typename InOrder, typename Basis>
  struct CopySpinorExArg {
    OutOrder out;
    const InOrder in;
    Basis basis;
    int E[4];
    int X[4];
    int length;
    int parity;

    CopySpinorExArg(const OutOrder &out, const InOrder &in, const Basis& basis, const int *E, const int *X, const int parity)
      : out(out), in(in), basis(basis), parity(parity)
    {
      this->length = 1;
      for(int d=0; d<4; d++){
        this->E[d] = E[d];
        this->X[d] = X[d];
        this->length *= X[d]; // smaller volume
      }
    }
  };


  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
  __device__ __host__ void copyInterior(CopySpinorExArg<OutOrder,InOrder,Basis>& arg, int X)
  {
    int x[4];
    int R[4];
    for(int d=0; d<4; d++) R[d] = (arg.E[d] - arg.X[d]) >> 1;

    int za = X/(arg.X[0]/2);
    int x0h = X - za*(arg.X[0]/2);
    int zb = za/arg.X[1];
    x[1] = za - zb*arg.X[1];
    x[3] = zb / arg.X[2];
    x[2] = zb - x[3]*arg.X[2];
    x[0] = 2*x0h + ((x[1] + x[2] + x[3] + arg.parity) & 1);

    // Y is the cb spatial index into the extended spinor field
    int Y = ((((x[3]+R[3])*arg.E[2] + (x[2]+R[2]))*arg.E[1] + (x[1]+R[1]))*arg.E[0]+(x[0]+R[0])) >> 1;

    typedef typename mapper<FloatIn>::type RegTypeIn;
    typedef typename mapper<FloatOut>::type RegTypeOut;

    ColorSpinor<RegTypeIn,Nc,Ns> in;
    ColorSpinor<RegTypeOut,Nc,Ns> out;
    int parity = 0;

    if(extend){
      in = arg.in(X, parity);
      arg.basis(out, in);
      arg.out(Y, parity) = out;
    }else{
      in = arg.in(Y, parity);
      arg.basis(out, in);
      arg.out(X, parity) = out;
    }
  }
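
  // Editorial worked example of the indexing above (hypothetical sizes): for
  // an interior lattice arg.X = {4,4,4,4} inside an extended lattice
  // arg.E = {8,8,8,8}, the border is R[d] = (8-4)/2 = 2 in each dimension.
  // The interior checkerboard index X = 0 with arg.parity = 0 decomposes to
  // x = (0,0,0,0), which maps to the extended checkerboard index
  //
  //   Y = ((((0+2)*8 + (0+2))*8 + (0+2))*8 + (0+2)) >> 1 = 1170 >> 1 = 585,
  //
  // i.e. the same site shifted by the border width in every dimension and
  // re-linearized in the extended volume.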


  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
  __global__ void copyInteriorKernel(CopySpinorExArg<OutOrder,InOrder,Basis> arg)
  {
    int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;

    while(cb_idx < arg.length){
      copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(arg, cb_idx);
      cb_idx += gridDim.x*blockDim.x;
    }
  }
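
  // Editorial note: the kernel above uses a grid-stride loop, so any launch
  // geometry is correct; the gridDim.x*blockDim.x threads sweep all
  // arg.length interior sites regardless of the block and grid sizes the
  // autotuner selects.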

  /*
     Host function
  */
  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
  void copyInterior(CopySpinorExArg<OutOrder,InOrder,Basis>& arg)
  {
    for(int cb_idx=0; cb_idx<arg.length; cb_idx++){
      copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(arg, cb_idx);
    }
  }


  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
  class CopySpinorEx : Tunable {

    CopySpinorExArg<OutOrder,InOrder,Basis> arg;
    const ColorSpinorField &meta;
    QudaFieldLocation location;

  private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
    bool advanceSharedBytes(TuneParam &param) const { return false; } // Don't tune shared mem
    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.length; }

  public:
    CopySpinorEx(CopySpinorExArg<OutOrder,InOrder,Basis> &arg, const ColorSpinorField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
    }
    virtual ~CopySpinorEx() {}

    void apply(const cudaStream_t &stream){
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

      if(location == QUDA_CPU_FIELD_LOCATION){
        copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(arg);
      }else if(location == QUDA_CUDA_FIELD_LOCATION){
        copyInteriorKernel<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>
          <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 0; }
    long long bytes() const {
      return arg.length*2*Nc*Ns*(sizeof(FloatIn) + sizeof(FloatOut));
    }

  }; // CopySpinorEx
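
  // Editorial note: flops() is zero because this is pure data movement, and
  // bytes() counts the 2*Nc*Ns real components (real and imaginary parts per
  // color-spin) of each of the arg.length sites, read once at the input
  // precision and written once at the output precision; the autotuner uses
  // this figure to report effective bandwidth.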


  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis>
  void copySpinorEx(OutOrder outOrder, const InOrder inOrder, const Basis basis, const int *E,
                    const int *X, const int parity, const bool extend, const ColorSpinorField &meta, QudaFieldLocation location)
  {
    CopySpinorExArg<OutOrder,InOrder,Basis> arg(outOrder, inOrder, basis, E, X, parity);
    if(extend){
      CopySpinorEx<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,true> copier(arg, meta, location);
      copier.apply(0);
    }else{
      CopySpinorEx<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,false> copier(arg, meta, location);
      copier.apply(0);
    }
    if(location == QUDA_CUDA_FIELD_LOCATION) checkCudaError();
  }

  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder>
  void copySpinorEx(OutOrder outOrder, InOrder inOrder, const QudaGammaBasis outBasis, const QudaGammaBasis inBasis,
                    const int* E, const int* X, const int parity, const bool extend,
                    const ColorSpinorField &meta, QudaFieldLocation location)
  {
    if(inBasis == outBasis){
      PreserveBasis<FloatOut,FloatIn,Ns,Nc> basis;
      copySpinorEx<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder, PreserveBasis<FloatOut,FloatIn,Ns,Nc> >
        (outOrder, inOrder, basis, E, X, parity, extend, meta, location);
    }else if(outBasis == QUDA_UKQCD_GAMMA_BASIS && inBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS){
      if(Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
      NonRelBasis<FloatOut,FloatIn,4,Nc> basis;
      copySpinorEx<FloatOut, FloatIn, 4, Nc, OutOrder, InOrder, NonRelBasis<FloatOut,FloatIn,4,Nc> >
        (outOrder, inOrder, basis, E, X, parity, extend, meta, location);
    }else if(inBasis == QUDA_UKQCD_GAMMA_BASIS && outBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS){
      if(Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
      RelBasis<FloatOut,FloatIn,4,Nc> basis;
      copySpinorEx<FloatOut, FloatIn, 4, Nc, OutOrder, InOrder, RelBasis<FloatOut,FloatIn,4,Nc> >
        (outOrder, inOrder, basis, E, X, parity, extend, meta, location);
    }else{
      errorQuda("Basis change not supported");
    }
  }


  // Need to rewrite the following two functions...
  // Decide on the output order
  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename InOrder>
  void extendedCopyColorSpinor(InOrder &inOrder, ColorSpinorField &out,
                               QudaGammaBasis inBasis, const int *E, const int *X, const int parity, const bool extend,
                               QudaFieldLocation location, FloatOut *Out, float *outNorm){

    if (out.isNative()) {
      typedef typename colorspinor_mapper<FloatOut,Ns,Nc>::type ColorSpinor;
      ColorSpinor outOrder(out, 1, Out, outNorm);
      copySpinorEx<FloatOut,FloatIn,Ns,Nc>
        (outOrder, inOrder, out.GammaBasis(), inBasis, E, X, parity, extend, out, location);
    } else {
      errorQuda("Order not defined");
    }

  }

  template<typename FloatOut, typename FloatIn, int Ns, int Nc>
  void extendedCopyColorSpinor(ColorSpinorField &out, const ColorSpinorField &in,
                               const int parity, const QudaFieldLocation location, FloatOut *Out, FloatIn *In,
                               float *outNorm, float *inNorm){

    int E[4];
    int X[4];
    const bool extend = (out.Volume() >= in.Volume());
    if (extend) {
      for (int d=0; d<4; d++) {
        E[d] = out.X()[d];
        X[d] = in.X()[d];
      }
    } else {
      for (int d=0; d<4; d++) {
        E[d] = in.X()[d];
        X[d] = out.X()[d];
      }
    }
    X[0] *= 2; E[0] *= 2; // Since we consider only a single parity at a time

    if (in.isNative()) {
      typedef typename colorspinor_mapper<FloatIn,Ns,Nc>::type ColorSpinor;
      ColorSpinor inOrder(in, 1, In, inNorm);
      extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in.GammaBasis(), E, X, parity, extend, location, Out, outNorm);
    } else {
      errorQuda("Order not defined");
    }

  }
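
  // Editorial worked example (hypothetical sizes): copying a 4^4 single-parity
  // field into a version padded by a depth-2 border gives in.X() = {2,4,4,4}
  // and out.X() = {4,8,8,8} (X[0] is stored halved for a single parity). After
  // the doubling above, X = {4,4,4,4} and E = {8,8,8,8}, matching the border
  // width copyInterior recovers as R[d] = (E[d] - X[d])/2 = 2.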

  template<int Ns, typename dstFloat, typename srcFloat>
  void copyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src,
                               const int parity, const QudaFieldLocation location, dstFloat *Dst, srcFloat *Src,
                               float *dstNorm, float *srcNorm) {

    if(dst.Ndim() != src.Ndim())
      errorQuda("Number of dimensions %d %d don't match", dst.Ndim(), src.Ndim());

    if(!(dst.SiteOrder() == src.SiteOrder() ||
         (dst.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER &&
          src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) ||
         (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER &&
          src.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER) ) ){

      errorQuda("Subset orders %d %d don't match", dst.SiteOrder(), src.SiteOrder());
    }

    if(dst.SiteSubset() != src.SiteSubset())
      errorQuda("Subset types do not match %d %d", dst.SiteSubset(), src.SiteSubset());

    if(dst.Ncolor() != 3 || src.Ncolor() != 3) errorQuda("Nc != 3 not yet supported");

    const int Nc = 3;

    // We currently only support parity-ordered fields; even-odd or odd-even
    if(dst.SiteOrder() == QUDA_LEXICOGRAPHIC_SITE_ORDER){
      errorQuda("Copying to full fields with lexicographical ordering is not currently supported");
    }

    if(dst.SiteSubset() == QUDA_FULL_SITE_SUBSET){
      if(src.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER ||
         dst.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER){
        errorQuda("QDPJIT field ordering not supported for full site fields");
      }

      // set for the source subset ordering
      srcFloat *srcEven = Src ? Src : (srcFloat*)src.V();
      srcFloat *srcOdd = (srcFloat*)((char*)srcEven + src.Bytes()/2);
      float *srcNormEven = srcNorm ? srcNorm : (float*)src.Norm();
      float *srcNormOdd = (float*)((char*)srcNormEven + src.NormBytes()/2);
      if(src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER){
        std::swap<srcFloat*>(srcEven, srcOdd);
        std::swap<float*>(srcNormEven, srcNormOdd);
      }

      // set for the destination subset ordering
      dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.V();
      dstFloat *dstOdd = (dstFloat*)((char*)dstEven + dst.Bytes()/2);
      float *dstNormEven = dstNorm ? dstNorm : (float*)dst.Norm();
      float *dstNormOdd = (float*)((char*)dstNormEven + dst.NormBytes()/2);
      if(dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER){
        std::swap<dstFloat*>(dstEven, dstOdd);
        std::swap<float*>(dstNormEven, dstNormOdd);
      }

      // should also be able to select either the even or the odd parity at this point
      extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
        (dst, src, 0, location, dstEven, srcEven, dstNormEven, srcNormEven);
      extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
        (dst, src, 1, location, dstOdd, srcOdd, dstNormOdd, srcNormOdd);
    }else{
      extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
        (dst, src, parity, location, Dst, Src, dstNorm, srcNorm);
    } // N.B. Need to update this to account for differences in parity
  }


  template<typename dstFloat, typename srcFloat>
  void CopyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src,
                               const int parity, const QudaFieldLocation location, dstFloat *Dst, srcFloat *Src,
                               float *dstNorm=0, float *srcNorm=0)
  {
    if(dst.Nspin() != src.Nspin())
      errorQuda("source and destination spins must match");

    if(dst.Nspin() == 4){
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
      copyExtendedColorSpinor<4>(dst, src, parity, location, Dst, Src, dstNorm, srcNorm);
#else
      errorQuda("Extended copy has not been built for Nspin=%d fields", dst.Nspin());
#endif
    }else if(dst.Nspin() == 1){
#ifdef GPU_STAGGERED_DIRAC
      copyExtendedColorSpinor<1>(dst, src, parity, location, Dst, Src, dstNorm, srcNorm);
#else
      errorQuda("Extended copy has not been built for Nspin=%d fields", dst.Nspin());
#endif
    }else{
      errorQuda("Nspin=%d unsupported", dst.Nspin());
    }
  }


  // There's probably no need to have the additional Dst and Src arguments here!
  void copyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src,
                               QudaFieldLocation location, const int parity, void *Dst, void *Src,
                               void *dstNorm, void *srcNorm){

    if(dst.Precision() == QUDA_DOUBLE_PRECISION){
      if(src.Precision() == QUDA_DOUBLE_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<double*>(Src));
      }else if(src.Precision() == QUDA_SINGLE_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<float*>(Src));
      }else if(src.Precision() == QUDA_HALF_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<short*>(Src), 0, static_cast<float*>(srcNorm));
      } else {
        errorQuda("Unsupported Precision %d", src.Precision());
      }
    } else if (dst.Precision() == QUDA_SINGLE_PRECISION){
      if(src.Precision() == QUDA_DOUBLE_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<double*>(Src));
      }else if(src.Precision() == QUDA_SINGLE_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<float*>(Src));
      }else if(src.Precision() == QUDA_HALF_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<short*>(Src), 0, static_cast<float*>(srcNorm));
      }else{
        errorQuda("Unsupported Precision %d", src.Precision());
      }
    } else if (dst.Precision() == QUDA_HALF_PRECISION){
      if(src.Precision() == QUDA_DOUBLE_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<double*>(Src), static_cast<float*>(dstNorm), 0);
      }else if(src.Precision() == QUDA_SINGLE_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<float*>(Src), static_cast<float*>(dstNorm), 0);
      }else if(src.Precision() == QUDA_HALF_PRECISION){
        CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<short*>(Src), static_cast<float*>(dstNorm), static_cast<float*>(srcNorm));
      }else{
        errorQuda("Unsupported Precision %d", src.Precision());
      }
    }else{
      errorQuda("Unsupported Precision %d", dst.Precision());
    }
  }
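
  // Usage sketch (an editorial illustration, not from the original file): copy
  // the interior of a device field "src" into the matching region of a larger
  // device field "dst", letting the accessors take their data pointers from
  // the fields themselves by passing null pointers (both fields hypothetical,
  // and the null-pointer defaulting is an assumption about the accessors):
  //
  //   copyExtendedColorSpinor(dst, src, QUDA_CUDA_FIELD_LOCATION,
  //                           /*parity=*/0, nullptr, nullptr, nullptr, nullptr);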

} // quda