QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
pack_spinor.h
Go to the documentation of this file.
1 /*
2  MAC:
3 
4  On my mark, unleash template hell
5 
6  Here we are templating on the following
7  - input precision
8  - output precision
9  - number of colors
10  - number of spins
11  - short vector lengh (float, float2, float4 etc.)
12 
13  This is still quite a mess. Options to reduce to the amount of code
14  bloat here include:
15 
16  1. Using functors to define arbitrary ordering
17  2. Use class inheritance to the same effect
18  3. Abuse the C preprocessor to define arbitrary mappings
19  4. Something else
20 
21  Solution is to use class inheritance to defined different mappings,
22  and combine it with templating on the return type. Initial attempt
23  at this is in the cpuColorSpinorField, but this will eventually roll
24  out here too.
25 
26 */
27 
28 
29 /*
30 
31  Packing routines
32 
33 */
34 
35 #define PRESERVE_SPINOR_NORM
36 
37 #ifdef PRESERVE_SPINOR_NORM // Preserve the norm regardless of basis
38 #define kP (1.0/sqrt(2.0))
39 #define kU (1.0/sqrt(2.0))
40 #else // More numerically accurate not to preserve the norm between basis
41 #define kP (0.5)
42 #define kU (1.0)
43 #endif
44 
45 template <typename Float, int Ns, int Nc, int N>
46 struct FloatNOrder {
48  int volume;
49  int stride;
51  : field(field), volume(volume), stride(stride) { ; }
52  virtual ~FloatNOrder() { ; }
53 
54  __device__ __host__ inline void load(Float v[Ns*Nc*2], int x, int volume) const {
55  for (int s=0; s<Ns; s++) {
56  for (int c=0; c<Nc; c++) {
57  for (int z=0; z<2; z++) {
58  int internal_idx = (s*Nc + c)*2 + z;
59  int pad_idx = internal_idx / N;
60  v[(s*Nc+c)*2+z] = field[(pad_idx * stride + x)*N + internal_idx % N];
61  }
62  }
63  }
64  }
65 
66  __device__ __host__ inline void save(const Float v[Ns*Nc*2], int x, int volume) {
67  for (int s=0; s<Ns; s++) {
68  for (int c=0; c<Nc; c++) {
69  for (int z=0; z<2; z++) {
70  int internal_idx = (s*Nc + c)*2 + z;
71  int pad_idx = internal_idx / N;
72  field[(pad_idx * stride + x)*N + internal_idx % N] = v[(s*Nc+c)*2+z];
73  }
74  }
75  }
76  }
77 
78  size_t Bytes() const { return volume * Nc * Ns * 2 * sizeof(Float); }
79 };
80 
82 template<> __device__ inline void FloatNOrder<float, 4, 3, 4>::load(float v[24], int x, int volume) const {
83  //read4<4,3>(v, field, stride, x);
84 #pragma unroll
85  for (int i=0; i<4*3*2; i+=4) {
86  float4 tmp = ((float4*)field)[i/4 * stride + x];
87  v[i] = tmp.x; v[i+1] = tmp.y; v[i+2] = tmp.z; v[i+3] = tmp.w;
88  }
89 }
90 
92 template<> __device__ inline void FloatNOrder<float, 4, 3, 4>::save(const float v[24], int x, int volume) {
93 #pragma unroll
94  for (int i=0; i<4*3*2; i+=4) {
95  float4 tmp = make_float4(v[i], v[i+1], v[i+2], v[i+3]);
96  ((float4*)field)[i/4 * stride + x] = tmp;
97  }
98 }
99 
100 template <typename Float, int Ns, int Nc>
103  int volume;
104  int stride;
106  : field(field), volume(volume), stride(stride)
107  { if (volume != stride) errorQuda("Stride must equal volume for this field order"); }
108  virtual ~SpaceColorSpinorOrder() { ; }
109 
110  __device__ __host__ inline void load(Float v[Ns*Nc*2], int x, int volume) const {
111  for (int s=0; s<Ns; s++) {
112  for (int c=0; c<Nc; c++) {
113  for (int z=0; z<2; z++) {
114  v[(s*Nc+c)*2+z] = field[((x*Nc + c)*Ns + s)*2 + z];
115  }
116  }
117  }
118  }
119 
120  __device__ __host__ inline void save(const Float v[Ns*Nc*2], int x, int volume) {
121  for (int s=0; s<Ns; s++) {
122  for (int c=0; c<Nc; c++) {
123  for (int z=0; z<2; z++) {
124  field[((x*Nc + c)*Ns + s)*2 + z] = v[(s*Nc+c)*2+z];
125  }
126  }
127  }
128  }
129 
130  size_t Bytes() const { return volume * Nc * Ns * 2 * sizeof(Float); }
131 };
132 
133 template <typename Float, int Ns, int Nc>
134  __device__ inline void load_shared(Float v[Ns*Nc*2], Float *field, int x, int volume) {
135  const int tid = threadIdx.x;
136  const int vec_length = Ns*Nc*2;
137 
138  // the length of the block on the last grid site might not extend to all threads
139  const int block_dim = (blockIdx.x == gridDim.x-1) ?
140  volume - (gridDim.x-1)*blockDim.x : blockDim.x;
141 
142  extern __shared__ Float s_data[];
143 
144  int x0 = x-tid;
145  int i=tid;
146  while (i<vec_length*block_dim) {
147  int space_idx = i / vec_length;
148  int internal_idx = i - space_idx*vec_length;
149  int sh_idx = internal_idx*(blockDim.x+1) + space_idx;
150  s_data[sh_idx] = field[x0*vec_length + i];
151  i += block_dim;
152  }
153 
154  __syncthreads();
155 
156 #pragma unroll
157  for (int s=0; s<Ns; s++)
158 #pragma unroll
159  for (int c=0; c<Nc; c++)
160 #pragma unroll
161  for (int z=0; z<2; z++) { // block+1 to avoid bank conflicts
162  int sh_idx = ((c*Ns+s)*2+z)*(blockDim.x+1) + tid;
163  v[(s*Nc + c)*2 + z] = s_data[sh_idx];
164  }
165 
166 }
167 
168 template <typename Float, int Ns, int Nc>
169  __device__ inline void save_shared(Float *field, const Float v[Ns*Nc*2], int x, int volume) {
170  const int tid = threadIdx.x;
171  const int vec_length = Ns*Nc*2;
172 
173  // the length of the block on the last grid site might not extend to all threads
174  const int block_dim = (blockIdx.x == gridDim.x-1) ?
175  volume - (gridDim.x-1)*blockDim.x : blockDim.x;
176 
177  extern __shared__ Float s_data[];
178 
179 #pragma unroll
180  for (int s=0; s<Ns; s++)
181 #pragma unroll
182  for (int c=0; c<Nc; c++)
183 #pragma unroll
184  for (int z=0; z<2; z++) { // block+1 to avoid bank conflicts
185  int sh_idx = ((c*Ns+s)*2+z)*(blockDim.x+1) + tid;
186  s_data[sh_idx] = v[(s*Nc + c)*2 + z];
187  }
188 
189  __syncthreads();
190 
191  int x0 = x-tid;
192  int i=tid;
193  while (i<vec_length*block_dim) {
194  int space_idx = i / vec_length;
195  int internal_idx = i - space_idx*vec_length;
196  int sh_idx = internal_idx*(blockDim.x+1) + space_idx;
197  field[x0*vec_length + i] = s_data[sh_idx];
198  i += block_dim;
199  }
200 
201 }
202 
204 template<> __host__ __device__ inline void SpaceColorSpinorOrder<float, 4, 3>::load(float v[24], int x, int volume) const {
205 #ifdef __CUDA_ARCH__
206  load_shared<float, 4, 3>(v, field, x, volume);
207 #else
208  const int Ns=4;
209  const int Nc=3;
210  for (int s=0; s<Ns; s++) {
211  for (int c=0; c<Nc; c++) {
212  for (int z=0; z<2; z++) {
213  v[(s*Nc+c)*2+z] = field[((x*Nc + c)*Ns + s)*2 + z];
214  }
215  }
216  }
217 #endif
218 }
219 
221 template<> __host__ __device__ inline void SpaceColorSpinorOrder<float, 4, 3>::save(const float v[24], int x, int volume) {
222 #ifdef __CUDA_ARCH__
223  save_shared<float, 4, 3>(field, v, x, volume);
224 #else
225  const int Ns=4;
226  const int Nc=3;
227  for (int s=0; s<Ns; s++) {
228  for (int c=0; c<Nc; c++) {
229  for (int z=0; z<2; z++) {
230  field[((x*Nc + c)*Ns + s)*2 + z] = v[(s*Nc+c)*2+z];
231  }
232  }
233  }
234 #endif
235 }
236 
237 template <typename Float, int Ns, int Nc>
240  int volume;
241  int stride;
243  : field(field), volume(volume), stride(stride)
244  { if (volume != stride) errorQuda("Stride must equal volume for this field order"); }
245  virtual ~SpaceSpinorColorOrder() { ; }
246 
247  __device__ __host__ inline void load(Float v[Ns*Nc*2], int x, int volume) const {
248  for (int s=0; s<Ns; s++) {
249  for (int c=0; c<Nc; c++) {
250  for (int z=0; z<2; z++) {
251  v[(s*Nc+c)*2+z] = field[((x*Ns + s)*Nc + c)*2 + z];
252  }
253  }
254  }
255  }
256 
257  __device__ __host__ inline void save(const Float v[Ns*Nc*2], int x, int volume) {
258  for (int s=0; s<Ns; s++) {
259  for (int c=0; c<Nc; c++) {
260  for (int z=0; z<2; z++) {
261  field[((x*Ns + s)*Nc + c)*2 + z] = v[(s*Nc+c)*2+z];
262  }
263  }
264  }
265  }
266 
267  size_t Bytes() const { return volume * Nc * Ns * 2 * sizeof(Float); }
268 };
269 
271 template <typename FloatOut, typename FloatIn, int Ns, int Nc>
273  public:
274  __device__ __host__ inline void operator()(FloatOut out[Ns*Nc*2], const FloatIn in[Ns*Nc*2]) {
275  for (int s=0; s<Ns; s++) {
276  for (int c=0; c<Nc; c++) {
277  for (int z=0; z<2; z++) {
278  out[(s*Nc+c)*2+z] = in[(s*Nc+c)*2+z];
279  }
280  }
281  }
282  }
283 };
284 
286 template <typename FloatOut, typename FloatIn, int Ns, int Nc>
287 struct NonRelBasis {
288  __device__ __host__ inline void operator()(FloatOut out[Ns*Nc*2], const FloatIn in[Ns*Nc*2]) {
289  int s1[4] = {1, 2, 3, 0};
290  int s2[4] = {3, 0, 1, 2};
291  FloatOut K1[4] = {kP, -kP, -kP, -kP};
292  FloatOut K2[4] = {kP, -kP, kP, kP};
293  for (int s=0; s<Ns; s++) {
294  for (int c=0; c<Nc; c++) {
295  for (int z=0; z<2; z++) {
296  out[(s*Nc+c)*2+z] = K1[s]*in[(s1[s]*Nc+c)*2+z] + K2[s]*in[(s2[s]*Nc+c)*2+z];
297  }
298  }
299  }
300  }
301 };
302 
304 template <typename FloatOut, typename FloatIn, int Ns, int Nc>
305 struct RelBasis {
306  __device__ __host__ inline void operator()(FloatOut out[Ns*Nc*2], const FloatIn in[Ns*Nc*2]) {
307  int s1[4] = {1, 2, 3, 0};
308  int s2[4] = {3, 0, 1, 2};
309  FloatOut K1[4] = {-kU, kU, kU, kU};
310  FloatOut K2[4] = {-kU, kU, -kU, -kU};
311  for (int s=0; s<Ns; s++) {
312  for (int c=0; c<Nc; c++) {
313  for (int z=0; z<2; z++) {
314  out[(s*Nc+c)*2+z] = K1[s]*in[(s1[s]*Nc+c)*2+z] + K2[s]*in[(s2[s]*Nc+c)*2+z];
315  }
316  }
317  }
318  }
319 };
320 
322 template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis>
323 void packSpinor(OutOrder &outOrder, const InOrder &inOrder, Basis basis, int volume) {
324  for (int x=0; x<volume; x++) {
325  FloatIn in[Ns*Nc*2];
326  FloatOut out[Ns*Nc*2];
327  inOrder.load(in, x, volume);
328  basis(out, in);
329  outOrder.save(out, x, volume);
330  }
331 }
332 
334 template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis>
335 __global__ void packSpinorKernel(OutOrder outOrder, const InOrder inOrder, Basis basis, int volume) {
336  int x = blockIdx.x * blockDim.x + threadIdx.x;
337 
338  FloatIn in[Ns*Nc*2];
339  FloatOut out[Ns*Nc*2];
340  inOrder.load(in, x, volume);
341 
342  if (x >= volume) return;
343  basis(out, in);
344  outOrder.save(out, x, volume);
345 }
346 
347 template <typename FloatOut, typename FloatIn, int Ns, int Nc, typename OutOrder, typename InOrder, typename Basis>
348 class PackSpinor : Tunable {
349  const InOrder &in;
350  OutOrder &out;
351  Basis &basis;
352  int volume;
353 
354  private:
355  int sharedBytesPerThread() const {
356  size_t regSize = sizeof(FloatOut) > sizeof(FloatIn) ? sizeof(FloatOut) : sizeof(FloatIn);
357  return Ns*Nc*2*regSize;
358  }
359 
360  // the minimum shared memory per block is (block+1) because we pad to avoid bank conflicts
361  int sharedBytesPerBlock(const TuneParam &param) const { return (param.block.x+1)*sharedBytesPerThread(); }
362  bool advanceSharedBytes(TuneParam &param) const { return false; } // Don't tune shared mem
363  bool advanceGridDim(TuneParam &param) const { return false; } // Don't tune the grid dimensions.
364  bool advanceBlockDim(TuneParam &param) const {
365  bool advance = Tunable::advanceBlockDim(param);
366  if (advance) param.grid = dim3( (volume+param.block.x-1) / param.block.x, 1, 1);
367  param.shared_bytes = sharedBytesPerThread() * (param.block.x+1); // FIXME: use sharedBytesPerBlock
368  return advance;
369  }
370 
371  public:
372  PackSpinor(OutOrder &out, const InOrder &in, Basis &basis, int volume)
373  : out(out), in(in), basis(basis), volume(volume) { ; }
374  virtual ~PackSpinor() { ; }
375 
376  void apply(const cudaStream_t &stream) {
377  TuneParam tp = tuneLaunch(*this, QUDA_TUNE_YES, QUDA_DEBUG_VERBOSE);
378  packSpinorKernel<FloatOut, FloatIn, Ns, Nc, OutOrder, InOrder, Basis>
379  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>
380  (out, in, basis, volume);
381  }
382 
383  TuneKey tuneKey() const {
384  std::stringstream vol, aux;
385  vol << in.volume;
386  aux << "out_stride=" << out.stride << ",in_stride=" << in.stride;
387  return TuneKey(vol.str(), typeid(*this).name(), aux.str());
388  }
389 
390  std::string paramString(const TuneParam &param) const { // Don't bother printing the grid dim.
391  std::stringstream ps;
392  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
393  ps << "shared=" << param.shared_bytes;
394  return ps.str();
395  }
396 
397  virtual void initTuneParam(TuneParam &param) const {
398  Tunable::initTuneParam(param);
399  param.grid = dim3( (volume+param.block.x-1) / param.block.x, 1, 1);
400  }
401 
403  virtual void defaultTuneParam(TuneParam &param) const {
404  Tunable::defaultTuneParam(param);
405  param.grid = dim3( (volume+param.block.x-1) / param.block.x, 1, 1);
406  }
407 
408  long long flops() const { return 0; }
409  long long bytes() const { return in.Bytes() + out.Bytes(); }
410 };
411 
412 
414 template <int Ns, int Nc, typename OutOrder, typename InOrder, typename FloatOut, typename FloatIn>
415 void packParitySpinor(FloatOut *dst, FloatIn *src, OutOrder &outOrder, const InOrder &inOrder, int Vh, int pad,
417  if (dstBasis==srcBasis) {
419  if (location == QUDA_CPU_FIELD_LOCATION) {
420  packSpinor<FloatOut, FloatIn, Ns, Nc>(outOrder, inOrder, basis, Vh);
421  } else {
423  pack.apply(0);
424  }
425  } else if (dstBasis == QUDA_UKQCD_GAMMA_BASIS && srcBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
426  if (Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
428  if (location == QUDA_CPU_FIELD_LOCATION) {
429  packSpinor<FloatOut, FloatIn, Ns, Nc>(outOrder, inOrder, basis, Vh);
430  } else {
432  pack.apply(0);
433  }
434  } else if (srcBasis == QUDA_UKQCD_GAMMA_BASIS && dstBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
435  if (Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
437  if (location == QUDA_CPU_FIELD_LOCATION) {
438  packSpinor<FloatOut, FloatIn, Ns, Nc>(outOrder, inOrder, basis, Vh);
439  } else {
441  pack.apply(0);
442  }
443  } else {
444  errorQuda("Basis change not supported");
445  }
446 }
447 
448 
449 template <int Nc, int Ns, int N, typename dstFloat, typename srcFloat>
450 void packSpinor(dstFloat *Dst, srcFloat *Src, ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location) {
451 
452  if (dst.Ndim() != src.Ndim()) {
453  errorQuda("Number of dimensions %d %d don't match", dst.Ndim(), src.Ndim());
454  }
455 
456  if (dst.Volume() != src.Volume()) {
457  errorQuda("Volumes %d %d don't match", dst.Volume(), src.Volume());
458  }
459 
460  if (!( dst.SiteOrder() == src.SiteOrder() ||
461  (dst.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER &&
462  src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) ||
463  (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER &&
464  src.SiteOrder() == QUDA_EVEN_ODD_SITE_ORDER) ) ) {
465  errorQuda("Subset orders %d %d don't match", dst.SiteOrder(), src.SiteOrder());
466  }
467 
468  if (dst.SiteSubset() != src.SiteSubset()) {
469  errorQuda("Subset types do not match %d %d", dst.SiteSubset(), src.SiteSubset());
470  }
471 
472  int V = dst.Volume();
473  QudaSiteSubset subset = dst.SiteSubset();
474  QudaSiteOrder siteOrder = dst.SiteOrder();
475  int dstLength = dst.Bytes() / dst.Precision(); // cannot use total_length since ALIGNMENT_ADJUST
476  int srcLength = src.Bytes() / src.Precision(); // changes position of odd field
477  QudaGammaBasis dstBasis = dst.GammaBasis();
478  QudaGammaBasis srcBasis = src.GammaBasis();
479  QudaFieldOrder dstOrder = dst.FieldOrder();
480  QudaFieldOrder srcOrder = src.FieldOrder();
481 
482  // We currently only support parity-ordered fields; even-odd or odd-even
483  if (siteOrder == QUDA_LEXICOGRAPHIC_SITE_ORDER) {
484  errorQuda("Copying to full fields with lexicographical ordering is not currently supported");
485  }
486 
487  if (subset == QUDA_FULL_SITE_SUBSET) {
488  // check what src parity ordering is
489  unsigned int evenOff, oddOff;
490  if (siteOrder == QUDA_EVEN_ODD_SITE_ORDER) {
491  evenOff = 0;
492  oddOff = srcLength/2;
493  } else {
494  oddOff = 0;
495  evenOff = srcLength/2;
496  }
497 
498  int Vh = V/2;
499  if ((dstOrder == QUDA_FLOAT4_FIELD_ORDER || dstOrder == QUDA_FLOAT2_FIELD_ORDER) &&
501  if (src.Pad() != 0) errorQuda("Non-zero pad not supported with fieldOrder %d\n", srcOrder);
502  if (srcOrder == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
503  {
504  SpaceSpinorColorOrder<srcFloat, Ns, Nc> inOrder(Src+evenOff, Vh, Vh);
505  FloatNOrder<dstFloat, Ns, Nc, N> outOrder(Dst, Vh, Vh+dst.Pad());
506  packParitySpinor<Ns,Nc>(Dst, Src+evenOff, outOrder, inOrder, Vh, dst.Pad(), dstBasis, srcBasis, location);
507  }
508  {
509  SpaceSpinorColorOrder<srcFloat, Ns, Nc> inOrder(Src+oddOff, Vh, Vh);
510  FloatNOrder<dstFloat, Ns, Nc, N> outOrder(Dst + dstLength/2, Vh, Vh+dst.Pad());
511  packParitySpinor<Ns,Nc>(Dst + dstLength/2, Src+oddOff, outOrder, inOrder, Vh, dst.Pad(), dstBasis, srcBasis, location);
512  }
513  } else if (srcOrder == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
514  {
515  SpaceColorSpinorOrder<srcFloat, Ns, Nc> inOrder(Src+evenOff, Vh, Vh);
516  FloatNOrder<dstFloat, Ns, Nc, N> outOrder(Dst, Vh, Vh+dst.Pad());
517  packParitySpinor<Ns,Nc>(Dst, Src+evenOff, outOrder, inOrder, Vh, dst.Pad(), dstBasis, srcBasis, location);
518  }
519  {
520  SpaceColorSpinorOrder<srcFloat, Ns, Nc> inOrder(Src+oddOff, Vh, Vh);
521  FloatNOrder<dstFloat, Ns, Nc, N> outOrder(Dst + dstLength/2, Vh, Vh+dst.Pad());
522  packParitySpinor<Ns,Nc>(Dst + dstLength/2, Src+oddOff, outOrder, inOrder, Vh, dst.Pad(), dstBasis, srcBasis, location);
523  }
524  }
525  } else if ((srcOrder == QUDA_FLOAT4_FIELD_ORDER || srcOrder == QUDA_FLOAT2_FIELD_ORDER) &&
527  if (dst.Pad() != 0) errorQuda("Non-zero pad not supported with fieldOrder %d\n", dstOrder);
528  if (dstOrder == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
529  {
530  FloatNOrder<srcFloat, Ns, Nc, N> inOrder(Src+evenOff, Vh, Vh+src.Pad());
531  SpaceSpinorColorOrder<dstFloat, Ns, Nc> outOrder(Dst, Vh, Vh);
532  packParitySpinor<Ns,Nc>(Dst, Src+evenOff, outOrder, inOrder, Vh, src.Pad(), dstBasis, srcBasis, location);
533  }
534  {
535  FloatNOrder<srcFloat, Ns, Nc, N> inOrder(Src+oddOff, Vh, Vh+src.Pad());
536  SpaceSpinorColorOrder<dstFloat, Ns, Nc> outOrder(Dst + dstLength/2, Vh, Vh);
537  packParitySpinor<Ns,Nc>(Dst + dstLength/2, Src+oddOff, outOrder, inOrder, Vh, src.Pad(), dstBasis, srcBasis, location);
538  }
539  } else if (dstOrder == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
540  {
541  FloatNOrder<srcFloat, Ns, Nc, N> inOrder(Src+evenOff, Vh, Vh+src.Pad());
542  SpaceColorSpinorOrder<dstFloat, Ns, Nc> outOrder(Dst, Vh, Vh);
543  packParitySpinor<Ns,Nc>(Dst, Src+evenOff, outOrder, inOrder, Vh, src.Pad(), dstBasis, srcBasis, location);
544  }
545  {
546  FloatNOrder<srcFloat, Ns, Nc, N> inOrder(Src+oddOff, Vh, Vh+src.Pad());
547  SpaceColorSpinorOrder<dstFloat, Ns, Nc> outOrder(Dst + dstLength/2, Vh, Vh);
548  packParitySpinor<Ns,Nc>(Dst + dstLength/2, Src+oddOff, outOrder, inOrder, Vh, src.Pad(), dstBasis, srcBasis, location);
549  }
550  }
551  } else {
552  errorQuda("Field order conversion from %d to %d not supported", srcOrder, dstOrder);
553  }
554 
555  } else { // parity field
556 
557  if ((dstOrder == QUDA_FLOAT4_FIELD_ORDER || dstOrder == QUDA_FLOAT2_FIELD_ORDER) &&
559  if (src.Pad() != 0) errorQuda("Non-zero pad not supported with fieldOrder %d\n", srcOrder);
560  if (srcOrder == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
561  SpaceSpinorColorOrder<srcFloat, Ns, Nc> inOrder(Src, V, V);
562  FloatNOrder<dstFloat, Ns, Nc, N> outOrder(Dst, V, V+dst.Pad());
563  packParitySpinor<Ns,Nc>(Dst, Src, outOrder, inOrder, V, dst.Pad(), dstBasis, srcBasis, location);
564  } else if (srcOrder == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
565  SpaceColorSpinorOrder<srcFloat, Ns, Nc> inOrder(Src, V, V);
566  FloatNOrder<dstFloat, Ns, Nc, N> outOrder(Dst, V, V+dst.Pad());
567  packParitySpinor<Ns,Nc>(Dst, Src, outOrder, inOrder, V, dst.Pad(), dstBasis, srcBasis, location);
568  }
569  } else if ((srcOrder == QUDA_FLOAT4_FIELD_ORDER || srcOrder == QUDA_FLOAT2_FIELD_ORDER) &&
571  if (dst.Pad() != 0) errorQuda("Non-zero pad not supported with fieldOrder %d\n", dstOrder);
572  if (dstOrder == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
573  FloatNOrder<srcFloat, Ns, Nc, N> inOrder(Src, V, V+src.Pad());
574  SpaceSpinorColorOrder<dstFloat, Ns, Nc> outOrder(Dst, V, V);
575  packParitySpinor<Ns,Nc>(Dst, Src, outOrder, inOrder, V, src.Pad(), dstBasis, srcBasis, location);
576  } else if (dstOrder == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
577  FloatNOrder<srcFloat, Ns, Nc, N> inOrder(Src, V, V+src.Pad());
578  SpaceColorSpinorOrder<dstFloat, Ns, Nc> outOrder(Dst, V, V);
579  packParitySpinor<Ns,Nc>(Dst, Src, outOrder, inOrder, V, src.Pad(), dstBasis, srcBasis, location);
580  }
581  } else {
582  errorQuda("Field order conversion from %d to %d not supported", srcOrder, dstOrder);
583  }
584 
585  } // parity or full
586 
587 }
588 
589  /*template <int Nc, int Ns, int N, typename Float, typename FloatN>
590 void unpackSpinor(Float *dst, FloatN *src, int V, int pad, const int x[], int dstLength,
591  int srcLength, QudaSiteSubset dstSubset, QudaSiteOrder siteOrder,
592  QudaGammaBasis dstBasis, QudaGammaBasis srcBasis, QudaFieldOrder dstOrder,
593  QudaFieldLocation location) {
594 
595  if (dstSubset == QUDA_FULL_SITE_SUBSET) {
596  if (siteOrder == QUDA_LEXICOGRAPHIC_SITE_ORDER) {
597  errorQuda("Copying to full fields with lexicographical ordering is not currently supported");
598  } else {
599  // We are copying a parity ordered field
600 
601  // check what src parity ordering is
602  unsigned int evenOff, oddOff;
603  if (siteOrder == QUDA_EVEN_ODD_SITE_ORDER) {
604  evenOff = 0;
605  oddOff = srcLength/2;
606  } else {
607  oddOff = 0;
608  evenOff = srcLength/2;
609  }
610 
611  int Vh = V/2;
612  if (dstOrder == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
613  {
614  FloatNOrder<FloatN, Ns, Nc, N> inOrder(src+evenOff, Vh, Vh+pad);
615  SpaceSpinorColorOrder<Float, Ns, Nc> outOrder(dst, Vh, Vh);
616  packParitySpinor<Ns,Nc>(dst, src+evenOff, outOrder, inOrder, Vh, pad, dstBasis, srcBasis, location);
617  }
618  {
619  FloatNOrder<FloatN, Ns, Nc, N> inOrder(src+oddOff, Vh, Vh+pad);
620  SpaceSpinorColorOrder<Float, Ns, Nc> outOrder(dst + dstLength/2, Vh, Vh);
621  packParitySpinor<Ns,Nc>(dst + dstLength/2, src+oddOff, outOrder, inOrder, Vh, pad, dstBasis, srcBasis, location);
622  }
623  } else if (dstOrder == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
624  {
625  FloatNOrder<FloatN, Ns, Nc, N> inOrder(src+evenOff, Vh, Vh+pad);
626  SpaceColorSpinorOrder<Float, Ns, Nc> outOrder(dst, Vh, Vh);
627  packParitySpinor<Ns,Nc>(dst, src+evenOff, outOrder, inOrder, Vh, pad, dstBasis, srcBasis, location);
628  }
629  {
630  FloatNOrder<FloatN, Ns, Nc, N> inOrder(src+oddOff, Vh, Vh+pad);
631  SpaceColorSpinorOrder<Float, Ns, Nc> outOrder(dst + dstLength/2, Vh, Vh);
632  packParitySpinor<Ns,Nc>(dst + dstLength/2, src+oddOff, outOrder, inOrder, Vh, pad, dstBasis, srcBasis, location);
633  }
634  } else {
635  errorQuda("Destination field order not supported");
636  }
637  }
638  } else {
639  // dst is defined on a single parity only
640 
641  }
642 
643  }*/
644 
645 
646 
647 /*
648 template <int Nc, int Ns, int N, typename Float, typename FloatN>
649 void packFullSpinor(FloatN *dst, Float *src, int V, int pad, const int x[], int dstLength,
650  QudaGammaBasis dstBasis, QudaGammaBasis srcBasis) {
651 
652  int Vh = V/2;
653  if (dstBasis==srcBasis) {
654  for (int i=0; i<V/2; i++) {
655 
656  int boundaryCrossings = i/(x[0]/2) + i/(x[1]*(x[0]/2)) + i/(x[2]*x[1]*(x[0]/2));
657 
658  { // even sites
659  int k = 2*i + boundaryCrossings%2;
660  packSpinorField<Nc,Ns,N>(dst+N*i, src+2*Nc*Ns*k, Vh+pad);
661  }
662 
663  { // odd sites
664  int k = 2*i + (boundaryCrossings+1)%2;
665  packSpinorField<Nc,Ns,N>(dst+dstLength/2+N*i, src+ 2*Nc*Ns*k, Vh+pad);
666  }
667  }
668  } else if (dstBasis == QUDA_UKQCD_GAMMA_BASIS && srcBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
669  if (Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
670  for (int i=0; i<V/2; i++) {
671 
672  int boundaryCrossings = i/(x[0]/2) + i/(x[1]*(x[0]/2)) + i/(x[2]*x[1]*(x[0]/2));
673 
674  { // even sites
675  int k = 2*i + boundaryCrossings%2;
676  packNonRelSpinorField<Nc,N>(dst+N*i, src+2*Nc*Ns*k, Vh+pad);
677  }
678 
679  { // odd sites
680  int k = 2*i + (boundaryCrossings+1)%2;
681  packNonRelSpinorField<Nc,N>(dst+dstLength/2+N*i, src+2*Nc*Ns*k, Vh+pad);
682  }
683  }
684  } else {
685  errorQuda("Basis change not supported");
686  }
687 }
688 
689 
690 template <int Nc, int Ns, int N, typename Float, typename FloatN>
691 void unpackFullSpinor(Float *dst, FloatN *src, int V, int pad, const int x[],
692  int srcLength, QudaGammaBasis dstBasis, QudaGammaBasis srcBasis) {
693 
694  int Vh = V/2;
695  if (dstBasis==srcBasis) {
696  for (int i=0; i<V/2; i++) {
697 
698  int boundaryCrossings = i/(x[0]/2) + i/(x[1]*(x[0]/2)) + i/(x[2]*x[1]*(x[0]/2));
699 
700  { // even sites
701  int k = 2*i + boundaryCrossings%2;
702  unpackSpinorField<Nc,Ns,N>(dst+2*Ns*Nc*k, src+N*i, Vh+pad);
703  }
704 
705  { // odd sites
706  int k = 2*i + (boundaryCrossings+1)%2;
707  unpackSpinorField<Nc,Ns,N>(dst+2*Ns*Nc*k, src+srcLength/2+N*i, Vh+pad);
708  }
709  }
710  } else if (srcBasis == QUDA_UKQCD_GAMMA_BASIS && dstBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
711  if (Ns != 4) errorQuda("Can only change basis with Nspin = 4, not Nspin = %d", Ns);
712  for (int i=0; i<V/2; i++) {
713 
714  int boundaryCrossings = i/(x[0]/2) + i/(x[1]*(x[0]/2)) + i/(x[2]*x[1]*(x[0]/2));
715 
716  { // even sites
717  int k = 2*i + boundaryCrossings%2;
718  unpackNonRelSpinorField<Nc,N>(dst+2*Ns*Nc*k, src+N*i, Vh+pad);
719  }
720 
721  { // odd sites
722  int k = 2*i + (boundaryCrossings+1)%2;
723  unpackNonRelSpinorField<Nc,N>(dst+2*Ns*Nc*k, src+srcLength/2+N*i, Vh+pad);
724  }
725  }
726  } else {
727  errorQuda("Basis change not supported");
728  }
729 }
730 
731 template <int Nc, int Ns, int N, typename Float, typename FloatN>
732 void unpackQLAFullSpinor(Float *dst, FloatN *src, int V, int pad, const int x[],
733  int srcLength, QudaGammaBasis dstBasis, QudaGammaBasis srcBasis) {
734 
735  int Vh = V/2;
736  if (dstBasis==srcBasis) {
737  for (int i=0; i<V/2; i++) {
738 
739  int boundaryCrossings = i/(x[0]/2) + i/(x[1]*(x[0]/2)) + i/(x[2]*x[1]*(x[0]/2));
740 
741  { // even sites
742  int k = 2*i + boundaryCrossings%2;
743  unpackQLASpinorField<Nc,Ns,N>(dst+2*Nc*Ns*k, src+N*i, Vh+pad);
744  }
745 
746  { // odd sites
747  int k = 2*i + (boundaryCrossings+1)%2;
748  unpackQLASpinorField<Nc,Ns,N>(dst+2*Nc*Ns*k, src+srcLength/2+N*i, Vh+pad);
749  }
750  }
751  } else if (srcBasis == QUDA_UKQCD_GAMMA_BASIS && dstBasis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) {
752  for (int i=0; i<V/2; i++) {
753 
754  int boundaryCrossings = i/(x[0]/2) + i/(x[1]*(x[0]/2)) + i/(x[2]*x[1]*(x[0]/2));
755 
756  { // even sites
757  int k = 2*i + boundaryCrossings%2;
758  unpackNonRelQLASpinorField<Nc,N>(dst+2*Ns*Nc*k, src+N*i, Vh+pad);
759  }
760 
761  { // odd sites
762  int k = 2*i + (boundaryCrossings+1)%2;
763  unpackNonRelQLASpinorField<Nc,N>(dst+2*Ns*Nc*k, src+srcLength/2+N*i, Vh+pad);
764  }
765  }
766  } else {
767  errorQuda("Basis change not supported");
768  }
769  }*/