QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
extended_color_spinor_utilities.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 
5 #include <color_spinor_field.h>
7 #include <face_quda.h>
8 #include <tune_quda.h>
9 
10 #define PRESERVE_SPINOR_NORM
11 
12 #ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis
13 #define kP (1.0/sqrt(2.0))
14 #define kU (1.0/sqrt(2.0))
15 #else // More numerically accurate not to preserve the norm between basis
16 #define kP (0.5)
17 #define kU (1.0)
18 #endif
19 
20 
21 
22 namespace quda {
23 
24  void exchangeExtendedGhost(cudaColorSpinorField* spinor, int R[], int parity, cudaStream_t *stream_p)
25  {
26 #ifdef MULTI_GPU
27  int nFace = 0;
28  for(int i=0; i<4; i++){
29  if(R[i] > nFace) nFace = R[i];
30  }
31 
32  int dagger = 0;
33 
34  int gatherCompleted[2] = {0,0};
35  int commsCompleted[2] = {0,0};
36 
37  cudaEvent_t gatherEnd[2];
38  for(int dir=0; dir<2; dir++) cudaEventCreate(&gatherEnd[dir], cudaEventDisableTiming);
39 
40  for(int dim=3; dim<=0; dim--){
41  if(!commDim(dim)) continue;
42 
43  spinor->packExtended(nFace, R, parity, dagger, dim, stream_p); // packing in the dim dimension complete
44  cudaDeviceSynchronize(); // Need this since packing is performed in stream[Nstream-1]
45  for(int dir=1; dir<=0; dir--){
46  spinor->gather(nFace, dagger, 2*dim + dir);
47  cudaEventRecord(gatherEnd[dir], streams[2*dim+dir]); // gatherEnd[1], gatherEnd[0]
48  }
49 
50  int completeSum = 0;
51  int dir = 1;
52  while(completeSum < 2){
53  if(!gatherCompleted[dir]){
54  if(cudaSuccess == cudaEventQuery(gatherEnd[dir])){
55  spinor->commsStart(nFace, 2*dim+dir, dagger);
56  completeSum++;
57  gatherCompleted[dir--] = 1;
58  }
59  }
60  }
61  gatherCompleted[0] = gatherCompleted[1] = 0;
62 
63  // Query if comms has completed
64  dir = 1;
65  while(completeSum < 4){
66  if(!commsCompleted[dir]){
67  if(spinor->commsQuery(nFace, 2*dim+dir, dagger)){
68  spinor->scatterExtended(nFace, parity, dagger, 2*dim+dir);
69  completeSum++;
70  commsCompleted[dir--] = 1;
71  }
72  }
73  }
74  commsCompleted[0] = commsCompleted[1] = 0;
75  cudaDeviceSynchronize(); // Wait for scatters to complete before next iteration
76  } // loop over dim
77 
78  for(int dir=0; dir<2; dir++) cudaEventDestroy(gatherEnd[dir]);
79 #endif
80  return;
81  }
82 
83 
84 
86  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
87  class PreserveBasis {
88  typedef typename mapper<FloatIn>::type RegTypeIn;
89  typedef typename mapper<FloatOut>::type RegTypeOut;
90  public:
91  __device__ __host__ inline void operator()(RegTypeOut out[Ns*Nc*2], const RegTypeIn in[Ns*Nc*2]) {
92  for (int s=0; s<Ns; s++) {
93  for (int c=0; c<Nc; c++) {
94  for (int z=0; z<2; z++) {
95  out[(s*Nc+c)*2+z] = in[(s*Nc+c)*2+z];
96  }
97  }
98  }
99  }
100  };
101 
103  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
104  struct NonRelBasis {
107  __device__ __host__ inline void operator()(RegTypeOut out[Ns*Nc*2], const RegTypeIn in[Ns*Nc*2]) {
108  int s1[4] = {1, 2, 3, 0};
109  int s2[4] = {3, 0, 1, 2};
110  RegTypeOut K1[4] = {kP, -kP, -kP, -kP};
111  RegTypeOut K2[4] = {kP, -kP, kP, kP};
112  for (int s=0; s<Ns; s++) {
113  for (int c=0; c<Nc; c++) {
114  for (int z=0; z<2; z++) {
115  out[(s*Nc+c)*2+z] = K1[s]*in[(s1[s]*Nc+c)*2+z] + K2[s]*in[(s2[s]*Nc+c)*2+z];
116  }
117  }
118  }
119  }
120  };
121 
122 
124  template <typename FloatOut, typename FloatIn, int Ns, int Nc>
125  struct RelBasis {
128  __device__ __host__ inline void operator()(RegTypeOut out[Ns*Nc*2], const RegTypeIn in[Ns*Nc*2]) {
129  int s1[4] = {1, 2, 3, 0};
130  int s2[4] = {3, 0, 1, 2};
131  RegTypeOut K1[4] = {-kU, kU, kU, kU};
132  RegTypeOut K2[4] = {-kU, kU, -kU, -kU};
133  for (int s=0; s<Ns; s++) {
134  for (int c=0; c<Nc; c++) {
135  for (int z=0; z<2; z++) {
136  out[(s*Nc+c)*2+z] = K1[s]*in[(s1[s]*Nc+c)*2+z] + K2[s]*in[(s2[s]*Nc+c)*2+z];
137  }
138  }
139  }
140  }
141  };
142 
143 
144 
145 
146  template<typename OutOrder, typename InOrder, typename Basis>
148  OutOrder out;
149  const InOrder in;
150  Basis basis;
153  int length;
154  int parity;
155 
156  CopySpinorExArg(const OutOrder &out, const InOrder &in, const Basis& basis, const int *E, const int *X, const int parity)
157  : out(out), in(in), basis(basis), parity(parity)
158  {
159  this->length = 1;
160  for(int d=0; d<4; d++){
161  this->E[d] = E[d];
162  this->X[d] = X[d];
163  this->length *= X[d]; // smaller volume
164  }
165  }
166  };
167 
168 
169  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
171  {
172  int x[4];
173  int R[4];
174  for(int d=0; d<4; d++) R[d] = (arg.E[d] - arg.X[d]) >> 1;
175 
176  int za = X/(arg.X[0]/2);
177  int x0h = X - za*(arg.X[0]/2);
178  int zb = za/arg.X[1];
179  x[1] = za - zb*arg.X[1];
180  x[3] = zb / arg.X[2];
181  x[2] = zb - x[3]*arg.X[2];
182  x[0] = 2*x0h + ((x[1] + x[2] + x[3] + arg.parity) & 1);
183 
184  // Y is the cb spatial index into the extended gauge field
185  int Y = ((((x[3]+R[3])*arg.E[2] + (x[2]+R[2]))*arg.E[1] + (x[1]+R[1]))*arg.E[0]+(x[0]+R[0])) >> 1;
186 
187  typedef typename mapper<FloatIn>::type RegTypeIn;
188  typedef typename mapper<FloatOut>::type RegTypeOut;
189 
190  RegTypeIn in[Ns*Nc*2];
191  RegTypeOut out[Ns*Nc*2];
192 
193  if(extend){
194  arg.in.load(in, X);
195  arg.basis(out, in);
196  arg.out.save(out, Y);
197  }else{
198  arg.in.load(in, Y);
199  arg.basis(out,in);
200  arg.out.save(out, Y);
201  }
202  }
203 
204 
205  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
207  {
208  int cb_idx = blockIdx.x*blockDim.x + threadIdx.x;
209 
210  while(cb_idx < arg.length){
211  copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(arg,cb_idx);
212  cb_idx += gridDim.x*blockDim.x;
213  }
214  }
215 
216  /*
217  Host function
218  */
219  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
221  {
222  for(int cb_idx=0; cb_idx<arg.length; cb_idx++){
223  copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(arg, cb_idx);
224  }
225  }
226 
227 
228 
229 
230  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis, bool extend>
232 
234  const ColorSpinorField &meta;
235  QudaFieldLocation location;
236 
237  private:
238  unsigned int sharedBytesPerThread() const { return 0; }
239  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
240  bool advanceSharedBytes(TuneParam &param) const { return false; } // Don't tune shared mem
241  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
242  unsigned int minThreads() const { return arg.length; }
243 
244  public:
246  : arg(arg), meta(meta), location(location) {
247  writeAuxString("out_stride=%d,in_stride=%d",arg.out.stride,arg.in.stride);
248  }
249  virtual ~CopySpinorEx() {}
250 
251  void apply(const cudaStream_t &stream){
252  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
253 
254  if(location == QUDA_CPU_FIELD_LOCATION){
255  copyInterior<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>(arg);
256  }else if(location == QUDA_CUDA_FIELD_LOCATION){
257  copyInteriorKernel<FloatOut,FloatIn,Ns,Nc,OutOrder,InOrder,Basis,extend>
258  <<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
259  }
260  }
261 
262  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
263 
264  std::string paramString(const TuneParam &param) const { // Don't bother printing the grid dim
265  std::stringstream ps;
266  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << ")";
267  ps << "shared=" << param.shared_bytes;
268  return ps.str();
269  }
270 
271  long long flops() const { return 0; }
272  long long bytes() const {
273  return arg.length*2*Nc*Ns*(sizeof(FloatIn) + sizeof(FloatOut));
274  }
275 
276  }; // CopySpinorEx
277 
278 
279 
280  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis>
281  void copySpinorEx(OutOrder outOrder, const InOrder inOrder, const Basis basis, const int *E,
282  const int *X, const int parity, const bool extend, const ColorSpinorField &meta, QudaFieldLocation location)
283  {
284  CopySpinorExArg<OutOrder,InOrder,Basis> arg(outOrder, inOrder, basis, E, X, parity);
285  if(extend){
287  copier.apply(0);
288  }else{
290  copier.apply(0);
291  }
292  if(location == QUDA_CUDA_FIELD_LOCATION) checkCudaError();
293  }
294 
295  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder>
296  void copySpinorEx(OutOrder outOrder, InOrder inOrder, const QudaGammaBasis outBasis, const QudaGammaBasis inBasis,
297  const int* E, const int* X, const int parity, const bool extend,
299  {
300  if(inBasis == outBasis){
302  copySpinorEx<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder, PreserveBasis<FloatOut,FloatIn,Ns,Nc> >
303  (outOrder, inOrder, basis, E, X, parity, extend, meta, location);
304  }else if(outBasis == QUDA_UKQCD_GAMMA_BASIS && inBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS){
305  if(Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
307  copySpinorEx<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder, NonRelBasis<FloatOut,FloatIn,Ns,Nc> >
308  (outOrder, inOrder, basis, E, X, parity, extend, meta, location);
309  }else if(inBasis == QUDA_UKQCD_GAMMA_BASIS && outBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS){
310  if(Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
312  copySpinorEx<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder, RelBasis<FloatOut,FloatIn,Ns,Nc> >
313  (outOrder, inOrder, basis, E, X, parity, extend, meta, location);
314  }else{
315  errorQuda("Basis change not supported");
316  }
317  }
318 
319 
320  // Need to rewrite the following two functions...
321  // Decide on the output order
322  template<typename FloatOut, typename FloatIn, int Ns, int Nc, typename InOrder>
324  QudaGammaBasis inBasis, const int *E, const int *X, const int parity, const bool extend,
325  QudaFieldLocation location, FloatOut *Out, float *outNorm){
326 
327  if(out.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER){
328  FloatNOrder<FloatOut, Ns, Nc, 4> outOrder(out, Out, outNorm);
329  copySpinorEx<FloatOut,FloatIn,Ns,Nc>
330  (outOrder, inOrder, out.GammaBasis(), inBasis, E, X, parity, extend, out, location);
331  }else if(out.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER){
332  FloatNOrder<FloatOut, Ns, Nc, 2> outOrder(out, Out, outNorm);
333  copySpinorEx<FloatOut,FloatIn,Ns,Nc>
334  (outOrder, inOrder, out.GammaBasis(), inBasis, E, X, parity, extend, out, location);
335 #if 0
337  SpaceSpinorColorOrder<FloatOut, Ns, Nc> outOrder(out, Out);
338  copySpinorEx<FloatOut,FloatIn,Ns,Nc>
339  (outOrder, inOrder, out.GammaBasis(), inBasis, E, X, parity, extend, out, location);
341  SpaceColorSpinorOrder<FloatOut, Ns, Nc> outOrder(out, Out);
342  copySpinorEx<FloatOut,FloatIn,Ns,Nc>
343  (outOrder, inOrder, out.GammaBasis(), inBasis, E, X, parity, extend, out, location);
344  } else if (out.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER){
345 #ifdef BUILD_QDPJIT_INTERFACE
346  QDPJITDiracOrder<FloatOut, Ns, Nc> outOrder(out, Out);
347  copySpinorEx<FloatOut,FloatIn,Ns,Nc>
348  (outOrder, inOrder, out.GammaBasis(), inBasis, E, X, parity, extend, out, location);
349 #else
350  errorQuda("QDPJIT interface has not been built\n");
351 #endif
352 #endif
353  }else{
354  errorQuda("Order not defined");
355  }
356  }
357 
358  template<typename FloatOut, typename FloatIn, int Ns, int Nc>
360  const int parity, const QudaFieldLocation location, FloatOut *Out, FloatIn *In,
361  float* outNorm, float *inNorm){
362 
363 
364  int E[4];
365  int X[4];
366  const bool extend = (out.Volume() >= in.Volume());
367  if(extend){
368  for(int d=0; d<4; d++){
369  E[d] = out.X()[d];
370  X[d] = in.X()[d];
371  }
372  }else{
373  for(int d=0; d<4; d++){
374  E[d] = in.X()[d];
375  X[d] = out.X()[d];
376  }
377  }
378  X[0] *= 2; E[0] *= 2; // Since we consider only a single parity at a time
379 
380 
382  FloatNOrder<FloatIn,Ns,Nc,4> inOrder(in, In, inNorm);
383  extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in.GammaBasis(), E, X, parity, extend, location, Out, outNorm);
384  }else if(in.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER){
385  FloatNOrder<FloatIn,Ns,Nc,2> inOrder(in, In, inNorm);
386  extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in.GammaBasis(), E, X, parity, extend, location, Out, outNorm);
387 #if 0
389  SpaceSpinorColorOrder<FloatIn,Ns,Nc> inOrder(in, In);
390  extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in.GammaBasis(), E, X, parity, extend, location, Out, outNorm);
392  SpaceColorSpinorOrder<FloatIn,Ns,Nc> inOrder(in, In);
393  extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in.GammaBasis(), E, X, parity, extend, location, Out, outNorm);
394  }else if (in.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER){
395 #ifdef BUILD_QDPJIT_INTERFACE
396  QDPJITDiracOrder<FloatIn,Ns,Nc> inOrder(in, In);
397  extendedCopyColorSpinor<FloatOut,FloatIn,Ns,Nc>(inOrder, out, in.GammaBasis(), E, X, parity, extend,location, Out, outNorm);
398 #else
399  errorQuda("QDPJIT interface has not been built\n");
400 #endif
401 #endif
402  }else{
403  errorQuda("Order not defined");
404  }
405  }
406 
407 
408 
409 
410 
411 
412  template<int Ns, typename dstFloat, typename srcFloat>
414  const int parity, const QudaFieldLocation location, dstFloat *Dst, srcFloat *Src,
415  float *dstNorm, float *srcNorm) {
416 
417 
418  if(dst.Ndim() != src.Ndim())
419  errorQuda("Number of dimensions %d %d don't match", dst.Ndim(), src.Ndim());
420 
421  if(!(dst.SiteOrder() == src.SiteOrder() ||
425  src.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER) ) ){
426 
427  errorQuda("Subset orders %d %d don't match", dst.SiteOrder(), src.SiteOrder());
428  }
429 
430  if(dst.SiteSubset() != src.SiteSubset())
431  errorQuda("Subset types do not match %d %d", dst.SiteSubset(), src.SiteSubset());
432 
433  if(dst.Ncolor() != 3 || src.Ncolor() != 3) errorQuda("Nc != 3 not yet supported");
434 
435  const int Nc = 3;
436 
437  // We currently only support parity-ordered fields; even-odd or odd-even
439  errorQuda("Copying to full fields with lexicographical ordering is not currently supported");
440  }
441 
442  if(dst.SiteSubset() == QUDA_FULL_SITE_SUBSET){
443  if(src.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER ||
445  errorQuda("QDPJIT field ordering not supported for full site fields");
446  }
447 
448  // set for the source subset ordering
449  srcFloat *srcEven = Src ? Src : (srcFloat*)src.V();
450  srcFloat* srcOdd = (srcFloat*)((char*)srcEven + src.Bytes()/2);
451  float *srcNormEven = srcNorm ? srcNorm : (float*)src.Norm();
452  float *srcNormOdd = (float*)((char*)srcNormEven + src.NormBytes()/2);
453  if(src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER){
454  std::swap<srcFloat*>(srcEven, srcOdd);
455  std::swap<float*>(srcNormEven, srcNormOdd);
456  }
457 
458  // set for the destination subset ordering
459  dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.V();
460  dstFloat *dstOdd = (dstFloat*)((char*)dstEven + dst.Bytes()/2);
461  float *dstNormEven = dstNorm ? dstNorm : (float*)dst.Norm();
462  float *dstNormOdd = (float*)((char*)dstNormEven + dst.NormBytes()/2);
463  if(dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER){
464  std::swap<dstFloat*>(dstEven, dstOdd);
465  std::swap<float*>(dstNormEven, dstNormOdd);
466  }
467 
468  // should be able to apply to select either even or odd parity at this point as well.
469  extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
470  (dst, src, 0, location, dstEven, srcEven, dstNormEven, srcNormEven);
471  extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
472  (dst, src, 1, location, dstOdd, srcOdd, dstNormOdd, srcNormOdd);
473  }else{
474  extendedCopyColorSpinor<dstFloat, srcFloat, Ns, Nc>
475  (dst, src, parity, location, Dst, Src, dstNorm, srcNorm);
476  } // N.B. Need to update this to account for differences in parity
477  }
478 
479 
480  template<typename dstFloat, typename srcFloat>
482  const int parity, const QudaFieldLocation location, dstFloat *Dst, srcFloat *Src,
483  float *dstNorm=0, float *srcNorm=0)
484  {
485  if(dst.Nspin() != src.Nspin())
486  errorQuda("source and destination spins must match");
487 
488  if(dst.Nspin() == 4){
489 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
490  copyExtendedColorSpinor<4>(dst, src, parity, location, Dst, Src, dstNorm, srcNorm);
491 #else
492  errorQuda("Extended copy has not been built for Nspin=%d fields",dst.Nspin());
493 #endif
494  }else if(dst.Nspin() == 1){
495 #ifdef GPU_STAGGERED_DIRAC
496  copyExtendedColorSpinor<1>(dst, src, parity, location, Dst, Src, dstNorm, srcNorm);
497 #else
498  errorQuda("Extended copy has not been built for Nspin=%d fields", dst.Nspin());
499 #endif
500  }else{
501  errorQuda("Nspin=%d unsupported", dst.Nspin());
502  }
503  }
504 
505 
506  // There's probably no need to have the additional Dst and Src arguments here!
508  QudaFieldLocation location, const int parity, void *Dst, void *Src,
509  void *dstNorm, void *srcNorm){
510 
511  if(dst.Precision() == QUDA_DOUBLE_PRECISION){
512  if(src.Precision() == QUDA_DOUBLE_PRECISION){
513  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<double*>(Src));
514  }else if(src.Precision() == QUDA_SINGLE_PRECISION){
515  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<float*>(Src));
516  }else if(src.Precision() == QUDA_HALF_PRECISION){
517  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<double*>(Dst), static_cast<short*>(Src), 0, static_cast<float*>(srcNorm));
518  } else {
519  errorQuda("Unsupported Precision %d", src.Precision());
520  }
521  } else if (dst.Precision() == QUDA_SINGLE_PRECISION){
522  if(src.Precision() == QUDA_DOUBLE_PRECISION){
523  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<double*>(Src));
524  }else if(src.Precision() == QUDA_SINGLE_PRECISION){
525  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<float*>(Src));
526  }else if(src.Precision() == QUDA_HALF_PRECISION){
527  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<float*>(Dst), static_cast<short*>(Src), 0, static_cast<float*>(srcNorm));
528  }else{
529  errorQuda("Unsupported Precision %d", src.Precision());
530  }
531  } else if (dst.Precision() == QUDA_HALF_PRECISION){
532  if(src.Precision() == QUDA_DOUBLE_PRECISION){
533  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<double*>(Src), static_cast<float*>(dstNorm), 0);
534  }else if(src.Precision() == QUDA_SINGLE_PRECISION){
535  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<float*>(Src), static_cast<float*>(dstNorm), 0);
536  }else if(src.Precision() == QUDA_HALF_PRECISION){
537  CopyExtendedColorSpinor(dst, src, parity, location, static_cast<short*>(Dst), static_cast<short*>(Src), static_cast<float*>(dstNorm), static_cast<float*>(srcNorm));
538  }else{
539  errorQuda("Unsupported Precision %d", src.Precision());
540  }
541  }else{
542  errorQuda("Unsupported Precision %d", dst.Precision());
543  }
544  }
545 
546 } // quda
int commDim(int)
CopySpinorEx(CopySpinorExArg< OutOrder, InOrder, Basis > &arg, const ColorSpinorField &meta, QudaFieldLocation location)
__device__ __host__ void operator()(RegTypeOut out[Ns *Nc *2], const RegTypeIn in[Ns *Nc *2])
mapper< FloatOut >::type RegTypeOut
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
mapper< FloatOut >::type RegTypeOut
const int * X() const
void gather(int nFace, int dagger, int dir, cudaStream_t *stream_p=NULL)
#define errorQuda(...)
Definition: util_quda.h:73
mapper< FloatIn >::type RegTypeIn
int commsQuery(int nFace, int dir, int dagger=0)
cudaStream_t * streams
void CopyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, const int parity, const QudaFieldLocation location, dstFloat *Dst, srcFloat *Src, float *dstNorm=0, float *srcNorm=0)
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
void copySpinorEx(OutOrder outOrder, const InOrder inOrder, const Basis basis, const int *E, const int *X, const int parity, const bool extend, const ColorSpinorField &meta, QudaFieldLocation location)
std::string paramString(const TuneParam &param) const
void scatterExtended(int nFace, int parity, int dagger, int dir)
cpuColorSpinorField * spinor
Definition: dslash_test.cpp:40
QudaDagType dagger
Definition: test_util.cpp:1558
QudaGaugeParam param
Definition: pack_test.cpp:17
int E[4]
__device__ __host__ void operator()(RegTypeOut out[Ns *Nc *2], const RegTypeIn in[Ns *Nc *2])
void writeAuxString(const char *format,...)
Definition: tune_quda.h:138
const QudaFieldLocation location
Definition: pack_test.cpp:46
void apply(const cudaStream_t &stream)
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
void packExtended(const int nFace, const int R[], const int parity, const int dagger, const int dim, cudaStream_t *stream_p, const bool zeroCopyPack=false)
void exchangeExtendedGhost(cudaColorSpinorField *spinor, int R[], int parity, cudaStream_t *stream_p)
void extendedCopyColorSpinor(InOrder &inOrder, ColorSpinorField &out, QudaGammaBasis inBasis, const int *E, const int *X, const int parity, const bool extend, QudaFieldLocation location, FloatOut *Out, float *outNorm)
const char * VolString() const
cudaEvent_t gatherEnd[Nstream]
Definition: dslash_quda.cu:101
QudaSiteOrder SiteOrder() const
int x[4]
QudaFieldOrder FieldOrder() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
void copyExtendedColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, const int parity, void *Dst, void *Src, void *dstNorm, void *srcNorm)
mapper< FloatIn >::type RegTypeIn
enum QudaGammaBasis_s QudaGammaBasis
QudaPrecision Precision() const
QudaGammaBasis GammaBasis() const
CopySpinorExArg(const OutOrder &out, const InOrder &in, const Basis &basis, const int *E, const int *X, const int parity)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
__global__ void copyInteriorKernel(CopySpinorExArg< OutOrder, InOrder, Basis > arg)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:110
void commsStart(int nFace, int dir, int dagger=0)
__device__ __host__ void copyInterior(CopySpinorExArg< OutOrder, InOrder, Basis > &arg, int X)
QudaTune getTuning()
Definition: util_quda.cpp:32
VOLATILE spinorFloat * s
QudaSiteSubset SiteSubset() const
const QudaParity parity
Definition: dslash_test.cpp:29
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136
__device__ __host__ void operator()(RegTypeOut out[Ns *Nc *2], const RegTypeIn in[Ns *Nc *2])