QUDA  0.9.0
copy_color_spinor.cuh
Go to the documentation of this file.
1 /*
2  Spinor reordering and copying routines. These are implemented to
3  run on both CPU and GPU. Here we are templating on the following:
4  - input precision
5  - output precision
6  - number of colors
7  - number of spins
8  - field ordering
9 */
10 
11 #include <color_spinor_field.h>
13 #include <tune_quda.h>
14 #include <algorithm> // for std::swap
15 
16 #define PRESERVE_SPINOR_NORM
17 
18 #ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis
19 #define kP (1.0/sqrt(2.0))
20 #define kU (1.0/sqrt(2.0))
21 #else // More numerically accurate not to preserve the norm between basis
22 #define kP (0.5)
23 #define kU (1.0)
24 #endif
25 
26 namespace quda {
27 
28  using namespace colorspinor;
29 
30  template <typename Out, typename In>
32  Out out;
33  const In in;
34  const int volumeCB;
35  const int nParity;
36  const int outParity;
37  const int inParity;
38  CopyColorSpinorArg(const Out &out, const In &in, const ColorSpinorField &out_, const ColorSpinorField &in_)
39  : out(out), in(in), volumeCB(in_.VolumeCB()), nParity(in_.SiteSubset()),
40  outParity(out_.SiteOrder()==QUDA_ODD_EVEN_SITE_ORDER ? 1 : 0),
41  inParity(in_.SiteOrder()==QUDA_ODD_EVEN_SITE_ORDER ? 1 : 0) { }
42  };
43 
45  template <int Ns, int Nc>
46  struct PreserveBasis {
47  template <typename FloatOut, typename FloatIn>
48  __device__ __host__ inline void operator()(complex<FloatOut> out[Ns*Nc], const complex<FloatIn> in[Ns*Nc]) const {
49  for (int s=0; s<Ns; s++) for (int c=0; c<Nc; c++) out[s*Nc+c] = in[s*Nc+c];
50  }
51  };
52 
54  template <int Ns, int Nc>
55  struct NonRelBasis {
56  template <typename FloatOut, typename FloatIn>
57  __device__ __host__ inline void operator()(complex<FloatOut> out[Ns*Nc], const complex<FloatIn> in[Ns*Nc]) const {
58  int s1[4] = {1, 2, 3, 0};
59  int s2[4] = {3, 0, 1, 2};
60  FloatOut K1[4] = {static_cast<FloatOut>(kP), static_cast<FloatOut>(-kP), static_cast<FloatOut>(-kP), static_cast<FloatOut>(-kP)};
61  FloatOut K2[4] = {static_cast<FloatOut>(kP), static_cast<FloatOut>(-kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP)};
62  for (int s=0; s<Ns; s++) {
63  for (int c=0; c<Nc; c++) {
64  out[s*Nc+c] = K1[s]*static_cast<complex<FloatOut> >(in[s1[s]*Nc+c]) + K2[s]*static_cast<complex<FloatOut> >(in[s2[s]*Nc+c]);
65  }
66  }
67  }
68  };
69 
71  template <int Ns, int Nc>
72  struct RelBasis {
73  template <typename FloatOut, typename FloatIn>
74  __device__ __host__ inline void operator()(complex<FloatOut> out[Ns*Nc], const complex<FloatIn> in[Ns*Nc]) const {
75  int s1[4] = {1, 2, 3, 0};
76  int s2[4] = {3, 0, 1, 2};
77  FloatOut K1[4] = {static_cast<FloatOut>(-kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU)};
78  FloatOut K2[4] = {static_cast<FloatOut>(-kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(-kU), static_cast<FloatOut>(-kU)};
79  for (int s=0; s<Ns; s++) {
80  for (int c=0; c<Nc; c++) {
81  out[s*Nc+c] = K1[s]*static_cast<complex<FloatOut> >(in[s1[s]*Nc+c]) + K2[s]*static_cast<complex<FloatOut> >(in[s2[s]*Nc+c]);
82  }
83  }
84  }
85  };
86 
88  template <int Ns, int Nc>
90  template <typename FloatOut, typename FloatIn>
91  __device__ __host__ inline void operator()(complex<FloatOut> out[Ns*Nc], const complex<FloatIn> in[Ns*Nc]) const {
92  int s1[4] = {0, 1, 0, 1};
93  int s2[4] = {2, 3, 2, 3};
94  FloatOut K1[4] = {static_cast<FloatOut>(-kP), static_cast<FloatOut>(-kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP)};
95  FloatOut K2[4] = {static_cast<FloatOut>(kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP), static_cast<FloatOut>(kP)};
96  for (int s=0; s<Ns; s++) {
97  for (int c=0; c<Nc; c++) {
98  out[s*Nc+c] = K1[s]*static_cast<complex<FloatOut> >(in[s1[s]*Nc+c]) + K2[s]*static_cast<complex<FloatOut> >(in[s2[s]*Nc+c]);
99  }
100  }
101  }
102  };
103 
105  template <int Ns, int Nc>
107  template <typename FloatOut, typename FloatIn>
108  __device__ __host__ inline void operator()(complex<FloatOut> out[Ns*Nc], const complex<FloatIn> in[Ns*Nc]) const {
109  int s1[4] = {0, 1, 0, 1};
110  int s2[4] = {2, 3, 2, 3};
111  FloatOut K1[4] = {static_cast<FloatOut>(-kU), static_cast<FloatOut>(-kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU)};
112  FloatOut K2[4] = {static_cast<FloatOut>(kU),static_cast<FloatOut>(kU), static_cast<FloatOut>(kU), static_cast<FloatOut>(kU)};
113  for (int s=0; s<Ns; s++) {
114  for (int c=0; c<Nc; c++) {
115  out[s*Nc+c] = K1[s]*static_cast<complex<FloatOut> >(in[s1[s]*Nc+c]) + K2[s]*static_cast<complex<FloatOut> >(in[s2[s]*Nc+c]);
116  }
117  }
118  }
119  };
120 
122  template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename Arg, typename Basis>
123  void copyColorSpinor(Arg &arg, const Basis &basis) {
124  typedef typename mapper<FloatIn>::type RegTypeIn;
125  typedef typename mapper<FloatOut>::type RegTypeOut;
126 
127  for (int parity = 0; parity<arg.nParity; parity++) {
128  for (int x=0; x<arg.volumeCB; x++) {
129  ColorSpinor<RegTypeIn, Nc, Ns> in = arg.in(x, (parity+arg.inParity)&1);
131  basis(out.data, in.data);
132  arg.out(x, (parity+arg.outParity)&1) = out;
133  }
134  }
135  }
136 
138  template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename Arg, typename Basis>
139  __global__ void copyColorSpinorKernel(Arg arg, Basis basis) {
140  typedef typename mapper<FloatIn>::type RegTypeIn;
141  typedef typename mapper<FloatOut>::type RegTypeOut;
142 
143  int x = blockIdx.x * blockDim.x + threadIdx.x;
144  if (x >= arg.volumeCB) return;
145  int parity = blockIdx.y * blockDim.y + threadIdx.y;
146 
147  ColorSpinor<RegTypeIn, Nc, Ns> in = arg.in(x, (parity+arg.inParity)&1);
149  basis(out.data, in.data);
150  arg.out(x, (parity+arg.outParity)&1) = out;
151  }
152 
153  template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename Arg>
155  Arg &arg;
158 
159  private:
160  unsigned int sharedBytesPerThread() const { return 0; }
161  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
162  bool advanceSharedBytes(TuneParam &param) const { return false; } // Don't tune shared mem
163  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
164  unsigned int minThreads() const { return meta.VolumeCB(); }
165 
166  public:
168  QudaFieldLocation location)
169  : TunableVectorY(arg.nParity), arg(arg), meta(in), location(location) {
170  if (out.GammaBasis()!=in.GammaBasis()) errorQuda("Cannot change gamma basis for nSpin=%d\n", Ns);
171  writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
172  }
173  virtual ~CopyColorSpinor() { ; }
174 
175  void apply(const cudaStream_t &stream) {
176  if (location == QUDA_CPU_FIELD_LOCATION) {
177  copyColorSpinor<FloatOut, FloatIn, Ns, Nc>(arg, PreserveBasis<Ns,Nc>());
178  } else {
179  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
180  copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
181  <<<tp.grid, tp.block, tp.shared_bytes, stream>>> (arg, PreserveBasis<Ns,Nc>());
182  }
183  }
184 
185  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
186  long long flops() const { return 0; }
187  long long bytes() const { return arg.in.Bytes() + arg.out.Bytes(); }
188  };
189 
190  template <typename FloatOut, typename FloatIn, int Nc, typename Arg>
191  class CopyColorSpinor<FloatOut,FloatIn,4,Nc,Arg> : TunableVectorY {
192  static constexpr int Ns = 4;
193  Arg &arg;
197 
198  private:
199  unsigned int sharedBytesPerThread() const { return 0; }
200  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
201  bool advanceSharedBytes(TuneParam &param) const { return false; } // Don't tune shared mem
202  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
203  unsigned int minThreads() const { return in.VolumeCB(); }
204 
205  public:
207  QudaFieldLocation location)
208  : TunableVectorY(arg.nParity), arg(arg), out(out), in(in), location(location) {
209 
210  if (out.GammaBasis()==in.GammaBasis()) {
211  writeAuxString("out_stride=%d,in_stride=%d,PreserveBasis", arg.out.stride, arg.in.stride);
212  } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
213  writeAuxString("out_stride=%d,in_stride=%d,NonRelBasis", arg.out.stride, arg.in.stride);
214  } else if (in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
215  writeAuxString("out_stride=%d,in_stride=%d,RelBasis", arg.out.stride, arg.in.stride);
216  } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) {
217  writeAuxString("out_stride=%d,in_stride=%d,ChiralToNonRelBasis", arg.out.stride, arg.in.stride);
218  } else if (in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) {
219  writeAuxString("out_stride=%d,in_stride=%d,NonRelToChiralBasis", arg.out.stride, arg.in.stride);
220  } else {
221  errorQuda("Basis change from %d to %d not supported", in.GammaBasis(), out.GammaBasis());
222  }
223  }
224  virtual ~CopyColorSpinor() { ; }
225 
226  void apply(const cudaStream_t &stream) {
227  if (location == QUDA_CPU_FIELD_LOCATION) {
228  if (out.GammaBasis()==in.GammaBasis()) {
229  copyColorSpinor<FloatOut, FloatIn, Ns, Nc>(arg, PreserveBasis<Ns,Nc>());
230  } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
231  copyColorSpinor<FloatOut, FloatIn, Ns, Nc>(arg, NonRelBasis<Ns,Nc>());
232  } else if (in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
233  copyColorSpinor<FloatOut, FloatIn, Ns, Nc>(arg, RelBasis<Ns,Nc>());
234  } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) {
235  copyColorSpinor<FloatOut, FloatIn, Ns, Nc>(arg, ChiralToNonRelBasis<Ns,Nc>());
236  } else if (in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) {
237  copyColorSpinor<FloatOut, FloatIn, Ns, Nc>(arg, NonRelToChiralBasis<Ns,Nc>());
238  }
239  } else {
240  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
241  if (out.GammaBasis()==in.GammaBasis()) {
242  copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
243  <<<tp.grid, tp.block, tp.shared_bytes, stream>>> (arg, PreserveBasis<Ns,Nc>());
244  } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
245  copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
246  <<<tp.grid, tp.block, tp.shared_bytes, stream>>> (arg, NonRelBasis<Ns,Nc>());
247  } else if (in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
248  copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
249  <<<tp.grid, tp.block, tp.shared_bytes, stream>>> (arg, RelBasis<Ns,Nc>());
250  } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) {
251  copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
253  } else if (in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) {
254  copyColorSpinorKernel<FloatOut, FloatIn, Ns, Nc>
256  }
257  }
258  }
259 
260  TuneKey tuneKey() const { return TuneKey(in.VolString(), typeid(*this).name(), aux); }
261  long long flops() const { return 0; }
262  long long bytes() const { return arg.in.Bytes() + arg.out.Bytes(); }
263  };
264 
265 
267  template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename Out, typename In>
268  void genericCopyColorSpinor(Out &outOrder, const In &inOrder, const ColorSpinorField &out,
269  const ColorSpinorField &in, QudaFieldLocation location) {
270 
271  typedef CopyColorSpinorArg<Out,In> Arg;
272  Arg arg(outOrder, inOrder, out, in);
274  copy.apply(0);
275 
276  }
277 
279  template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename InOrder>
280  void genericCopyColorSpinor(InOrder &inOrder, ColorSpinorField &out,
281  const ColorSpinorField &in, QudaFieldLocation location,
282  FloatOut *Out, float *outNorm) {
283 
284  const bool override = true;
285  if (out.isNative()) {
287  ColorSpinor outOrder(out, 1, Out, outNorm, nullptr, override);
288  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
289  (outOrder, inOrder, out, in, location);
290  } else if (out.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && Ns == 4) {
291  // this is needed for single-precision mg for changing basis in the transfer
293  ColorSpinor outOrder(out, 1, (float*)Out, outNorm, nullptr, override);
294  genericCopyColorSpinor<float,FloatIn,4,Nc>
295  (outOrder, inOrder, out, in, location);
296  } else if (out.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
298  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
299  (outOrder, inOrder, out, in, location);
300  } else if (out.FieldOrder() == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
302  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
303  (outOrder, inOrder, out, in, location);
304  } else if (out.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
305 
306 #ifdef BUILD_TIFR_INTERFACE
308  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
309  (outOrder, inOrder, out, in, location);
310 #else
311  errorQuda("TIFR interface has not been built\n");
312 #endif
313 
314  } else if (out.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER) {
315 
316 #ifdef BUILD_QDPJIT_INTERFACE
317  QDPJITDiracOrder<FloatOut, Ns, Nc> outOrder(out, 1, Out);
318  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>
319  (outOrder, inOrder, out, in, location);
320 #else
321  errorQuda("QDPJIT interface has not been built\n");
322 #endif
323  } else {
324  errorQuda("Order %d not defined (Ns=%d, Nc=%d)", out.FieldOrder(), Ns, Nc);
325  }
326 
327  }
328 
330  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
332  QudaFieldLocation location, FloatOut *Out, FloatIn *In,
333  float *outNorm, float *inNorm) {
334 
335  const bool override = true;
336  if (in.isNative()) {
338  ColorSpinor inOrder(in, 1, In, inNorm, nullptr, override);
339  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in, location, Out, outNorm);
340  } else if (in.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER && Ns == 4) {
341  // this is needed for single-precision mg for changing basis in the transfer
343  ColorSpinor inOrder(in, 1, (float*)In, inNorm, nullptr, override);
344  genericCopyColorSpinor<FloatOut,float,4,Nc>(inOrder, out, in, location, Out, outNorm);
345  } else if (in.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
347  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in, location, Out, outNorm);
348  } else if (in.FieldOrder() == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
350  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in, location, Out, outNorm);
351  } else if (in.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
352 
353 #ifdef BUILD_TIFR_INTERFACE
355  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in, location, Out, outNorm);
356 #else
357  errorQuda("TIFR interface has not been built\n");
358 #endif
359 
360  } else if (in.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER) {
361 
362 #ifdef BUILD_QDPJIT_INTERFACE
363  QDPJITDiracOrder<FloatIn, Ns, Nc> inOrder(in, 1, In);
364  genericCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in, location, Out, outNorm);
365 #else
366  errorQuda("QDPJIT interface has not been built\n");
367 #endif
368  } else {
369  errorQuda("Order %d not defined (Ns=%d, Nc=%d)", in.FieldOrder(), Ns, Nc);
370  }
371 
372  }
373 
374 
375  template <int Ns, int Nc, typename dstFloat, typename srcFloat>
377  QudaFieldLocation location, dstFloat *Dst, srcFloat *Src,
378  float *dstNorm, float *srcNorm) {
379 
380  if (dst.Ndim() != src.Ndim())
381  errorQuda("Number of dimensions %d %d don't match", dst.Ndim(), src.Ndim());
382 
383  if (dst.Volume() != src.Volume())
384  errorQuda("Volumes %d %d don't match", dst.Volume(), src.Volume());
385 
386  if (!( dst.SiteOrder() == src.SiteOrder() ||
387  (dst.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER &&
388  src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) ||
389  (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER &&
390  src.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER) ) ) {
391  errorQuda("Subset orders %d %d don't match", dst.SiteOrder(), src.SiteOrder());
392  }
393 
394  if (dst.SiteSubset() != src.SiteSubset())
395  errorQuda("Subset types do not match %d %d", dst.SiteSubset(), src.SiteSubset());
396 
397  // We currently only support parity-ordered fields; even-odd or odd-even
399  errorQuda("Copying to full fields with lexicographical ordering is not currently supported");
400  }
401 
402  if (dst.SiteSubset() == QUDA_FULL_SITE_SUBSET && (src.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER || dst.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER)) {
403  errorQuda("QDPJIT field ordering not supported for full site fields");
404  }
405 
406  genericCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
407 
408  }
409 
410  template <int Nc, typename dstFloat, typename srcFloat>
412  QudaFieldLocation location, dstFloat *Dst, srcFloat *Src,
413  float *dstNorm=0, float *srcNorm=0) {
414 
415  if (dst.Nspin() != src.Nspin())
416  errorQuda("source and destination spins must match");
417 
418  if (dst.Nspin() == 4) {
419 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
420  copyGenericColorSpinor<4,Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
421 #else
422  errorQuda("%s has not been built for Nspin=%d fields", __func__, src.Nspin());
423 #endif
424  } else if (dst.Nspin() == 2) {
425 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_STAGGERED_DIRAC)
426  copyGenericColorSpinor<2,Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
427 #else
428  errorQuda("%s has not been built for Nspin=%d fields", __func__, src.Nspin());
429 #endif
430  } else if (dst.Nspin() == 1) {
431 #ifdef GPU_STAGGERED_DIRAC
432  copyGenericColorSpinor<1,Nc>(dst, src, location, Dst, Src, dstNorm, srcNorm);
433 #else
434  errorQuda("%s has not been built for Nspin=%d fields", __func__, src.Nspin());
435 #endif
436  } else {
437  errorQuda("Nspin=%d unsupported", dst.Nspin());
438  }
439 
440  }
441 
442 } // namespace quda
dim3 dim3 blockDim
void CopyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, dstFloat *Dst, srcFloat *Src, float *dstNorm=0, float *srcNorm=0)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
#define kP
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * src
#define errorQuda(...)
Definition: util_quda.h:90
void genericCopyColorSpinor(Out &outOrder, const In &inOrder, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
cudaStream_t * stream
__global__ void copyColorSpinorKernel(Arg arg, Basis basis)
const char * VolString() const
__host__ __device__ void copy(T1 &a, const T2 &b)
void copyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, void *Dst=0, void *Src=0, void *dstNorm=0, void *srcNorm=0)
bool advanceSharedBytes(TuneParam &param) const
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
QudaGaugeParam param
Definition: pack_test.cpp:17
unsigned int minThreads() const
cpuColorSpinorField * in
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
unsigned int sharedBytesPerBlock(const TuneParam &param) const
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
const QudaFieldLocation location
CopyColorSpinorArg(const Out &out, const In &in, const ColorSpinorField &out_, const ColorSpinorField &in_)
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
CopyColorSpinor(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
void copyColorSpinor(Arg &arg, const Basis &basis)
unsigned int sharedBytesPerThread() const
unsigned int sharedBytesPerBlock(const TuneParam &param) const
QudaSiteOrder SiteOrder() const
const ColorSpinorField & meta
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
__device__ __host__ void operator()(complex< FloatOut > out[Ns *Nc], const complex< FloatIn > in[Ns *Nc]) const
Accessor routine for ColorSpinorFields in native field order.
void apply(const cudaStream_t &stream)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
#define kU
QudaParity parity
Definition: covdev_test.cpp:53
QudaFieldOrder FieldOrder() const
CopyColorSpinor(Arg &arg, const ColorSpinorField &out, const ColorSpinorField &in, QudaFieldLocation location)