QUDA  0.9.0
dslash_pack.cu
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <iostream>
5 
6 #include <color_spinor_field.h>
7 #include <clover_field.h> // Do we need this now?
8 
9 // these control the Wilson-type actions
10 #ifdef GPU_WILSON_DIRAC
11 //#define DIRECT_ACCESS_WILSON_PACK_SPINOR
12 #endif // GPU_WILSON_DIRAC
13 
14 
15 #include <quda_internal.h>
16 #include <dslash_quda.h>
17 #include <sys/time.h>
18 #include <blas_quda.h>
19 
20 #include <inline_ptx.h>
21 #include <uint_to_char.h>
22 
23 namespace quda {
24 
25  namespace pack {
26 
27 #include <dslash_constants.h>
28 #include <dslash_textures.h>
29 
30  } // end namespace pack
31 
32  using namespace pack;
33 
#ifdef MULTI_GPU
  // Per-dimension flags: non-zero means that dimension's ghost zone must be
  // packed for communication (set once per dslash invocation).
  static int commDim[QUDA_MAX_DIM]; // Whether to do comms or not

  /**
     Record which of the four space-time dimensions require ghost-zone
     packing; any dimensions beyond the fourth are always disabled.
     @param comm_dim Array of at least 4 flags (non-zero = communicate)
  */
  void setPackComms(const int *comm_dim) {
    for (int i=0; i<4; i++) commDim[i] = comm_dim[i];
    for (int i=4; i<QUDA_MAX_DIM; i++) commDim[i] = 0;
  }
#else
  // Single-GPU build: there are no ghost zones to pack, so this is a no-op.
  void setPackComms(const int *comm_dim) { ; }
#endif
43 
44 #define STRIPED
45 #ifdef STRIPED
46 #else
47 #define SWIZZLE
48 #endif
49 
50 #include <dslash_index.cuh>
51 
52  // routines for packing the ghost zones (multi-GPU only)
53 
54 
55 #ifdef MULTI_GPU
56 
  /**
     Kernel argument struct for the face-packing kernels.  Collects the
     input spinor, the eight ghost-face output buffers (two ends x four
     dimensions), and the thread-to-dimension mapping.
  */
  template <typename FloatN>
  struct PackParam {

    FloatN *out[2*4];     // ghost buffers: index 2*dim+face_num (0 = back, 1 = forward)
    float *outNorm[2*4];  // matching norm buffers for half precision

    FloatN *in;           // input spinor field (device pointer)
    float *inNorm;        // input norm field (half precision only)

    int_fastdiv threads; // total number of threads

    // offsets which determine thread mapping to dimension
    int threadDimMapLower[4]; // lowest thread which maps to dim
    int threadDimMapUpper[4]; // greatest thread + 1 which maps to dim

    int parity;           // checkerboard parity of the input field
#ifdef USE_TEXTURE_OBJECTS
    cudaTextureObject_t inTex;     // texture object bound to the input spinor
    cudaTextureObject_t inTexNorm; // texture object bound to the input norm
#endif

    int dim;       // dimension to pack, or negative to pack all partitioned dims
    int face_num;  // 0 = back face only, 1 = forward face only, 2 = both faces

    DslashConstant dc;  // pre-computed lattice geometry constants

    int sp_stride;      // spinor field stride

    int_fastdiv swizzle;   // block-index swizzle factor (SWIZZLE mapping)
    int sites_per_block;   // sites handled per block (STRIPED mapping)
  };
88 
89  template<typename FloatN>
90  std::ostream& operator<<(std::ostream& output, const PackParam<FloatN>& param) {
91  output << "threads = " << param.threads << std::endl;
92  output << "threadDimMapLower = {" << param.threadDimMapLower[0] << "," <<
93  param.threadDimMapLower[1] << "," << param.threadDimMapLower[2] << "," << param.threadDimMapLower[3] << "}" << std::endl;
94  output << "threadDimMapUpper = {" << param.threadDimMapUpper[0] << "," <<
95  param.threadDimMapUpper[1] << "," << param.threadDimMapUpper[2] << "," << param.threadDimMapUpper[3] << "}" << std::endl;
96  output << "parity = " << param.parity << std::endl;
97  output << "dim = " << param.dim << std::endl;
98  output << "face_num = " << param.face_num << std::endl;
99  output << "X = {" << param.dc.X[0] << ","<< param.dc.X[1] << "," << param.dc.X[2] << "," << param.dc.X[3] << "," << param.dc.X[4] << "}" << std::endl;
100  output << "ghostFace = {" << param.dc.ghostFace[0] << ","<< param.dc.ghostFace[1] << ","
101  << param.dc.ghostFace[2] << "," << param.dc.ghostFace[3] << "}" << std::endl;
102  output << "sp_stride = " << param.sp_stride << std::endl;
103  return output;
104  }
105 
106  // Extend the PackParam class to PackExtendedParam
  // Extend the PackParam class to PackExtendedParam: adds the border
  // widths needed when packing faces of an extended (halo-padded) field.
  template<typename Float>
  struct PackExtendedParam : public PackParam<Float>
  {
    PackExtendedParam(){}
    /** Construct from a base PackParam; R must be filled in separately. */
    PackExtendedParam(const PackParam<Float>& base) : PackParam<Float>(base) {}
    int R[QUDA_MAX_DIM]; // boundary dimensions
  };
114 
115 #if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)
116 
117  // double precision
118 #if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
119 #define READ_SPINOR READ_SPINOR_DOUBLE
120 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
121 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
122 #define SPINORTEX in
123 #else
124 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
125 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
126 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
127 #ifdef USE_TEXTURE_OBJECTS
128 #define SPINORTEX param.inTex
129 #else
130 #define SPINORTEX spinorTexDouble
131 #endif
132 #endif
133 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
134 #define SPINOR_DOUBLE
  /**
     Spin-project lattice site @p idx of the input spinor and write the
     resulting half spinor into the @p dim face buffer (double precision).
     The load/project/store code is generated by the included core header
     under the READ_SPINOR/WRITE_HALF_SPINOR macros defined above.
  */
  template <int dim, int dagger, int face_num>
  static inline __device__ void packFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
                                                   const float *inNorm, const int &idx,
                                                   const int &face_idx, const int &face_volume,
                                                   PackParam<double2> &param)
  {
    if (dagger) {
      // NOTE(review): the dagger branch is empty in this rendering — the
      // dagger-variant core include appears elided; confirm against the
      // repository source before relying on dagger packing here.
    } else {
#include "wilson_pack_face_core.h"
    }
  }
147 
  /**
     Inverse of packFaceWilsonCore: reconstruct and accumulate a received
     half spinor back into the local field (double precision).  The body is
     generated by the included core header under the macros defined above.
  */
  template <int dim, int dagger, int face_num>
  static inline __device__ void unpackFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
                                                     const float *inNorm, const int &idx,
                                                     const int &face_idx, const int &face_volume,
                                                     PackParam<double2> &param)
  {
    if (dagger) {
      // NOTE(review): dagger branch empty in this rendering — likely an
      // elided include; confirm against the repository source.
    } else {
#include "wilson_pack_face_core.h"
    }
  }
160 
161 #undef READ_SPINOR
162 #undef READ_SPINOR
163 #undef READ_SPINOR_UP
164 #undef READ_SPINOR_DOWN
165 #undef SPINORTEX
166 #undef WRITE_HALF_SPINOR
167 #undef SPINOR_DOUBLE
168 
169 
170  // single precision
171 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
172 #define READ_SPINOR READ_SPINOR_SINGLE
173 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
174 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
175 #define SPINORTEX in
176 #else
177 #define READ_SPINOR READ_SPINOR_SINGLE_TEX
178 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
179 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
180 #ifdef USE_TEXTURE_OBJECTS
181 #define SPINORTEX param.inTex
182 #else
183 #define SPINORTEX spinorTexSingle
184 #endif
185 #endif
186 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
  /** Single-precision overload of packFaceWilsonCore; see the double2
      variant for the contract.  Body generated by the core include. */
  template <int dim, int dagger, int face_num>
  static inline __device__ void packFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm,
                                                   const int &idx, const int &face_idx,
                                                   const int &face_volume,
                                                   const PackParam<float4> &param)
  {
    if (dagger) {
      // NOTE(review): dagger branch empty in this rendering — likely an
      // elided include; confirm against the repository source.
    } else {
#include "wilson_pack_face_core.h"
    }
  }
199 
  /** Single-precision overload of unpackFaceWilsonCore; see the double2
      variant for the contract.  Body generated by the core include. */
  template <int dim, int dagger, int face_num>
  static inline __device__ void unpackFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm,
                                                     const int &idx, const int &face_idx,
                                                     const int &face_volume,
                                                     const PackParam<float4> &param)
  {
    if (dagger) {
      // NOTE(review): dagger branch empty in this rendering — likely an
      // elided include; confirm against the repository source.
    } else {
#include "wilson_pack_face_core.h"
    }
  }
212 #undef READ_SPINOR
213 #undef READ_SPINOR_UP
214 #undef READ_SPINOR_DOWN
215 #undef SPINORTEX
216 #undef WRITE_HALF_SPINOR
217 
218 
219  // half precision
220 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
221 #define READ_SPINOR READ_SPINOR_HALF
222 #define READ_SPINOR_UP READ_SPINOR_HALF_UP
223 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
224 #define SPINORTEX in
225 #else
226 #define READ_SPINOR READ_SPINOR_HALF_TEX
227 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
228 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
229 #ifdef USE_TEXTURE_OBJECTS
230 #define SPINORTEX param.inTex
231 #else
232 #define SPINORTEX spinorTexHalf
233 #endif
234 #endif
235 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
  /** Half-precision overload of packFaceWilsonCore; outNorm receives the
      per-site norms.  Body generated by the core include. */
  template <int dim, int dagger, int face_num>
  static inline __device__ void packFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm,
                                                   const int &idx, const int &face_idx,
                                                   const int &face_volume,
                                                   const PackParam<short4> &param)
  {
    if (dagger) {
      // NOTE(review): dagger branch empty in this rendering — likely an
      // elided include; confirm against the repository source.
    } else {
#include "wilson_pack_face_core.h"
    }
  }
248 
  /** Half-precision overload of unpackFaceWilsonCore.  Body generated by
      the core include. */
  template <int dim, int dagger, int face_num>
  static inline __device__ void unpackFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm,
                                                     const int &idx, const int &face_idx,
                                                     const int &face_volume,
                                                     const PackParam<short4> &param)
  {
    if (dagger) {
      // NOTE(review): dagger branch empty in this rendering — likely an
      // elided include; confirm against the repository source.
    } else {
#include "wilson_pack_face_core.h"
    }
  }
261 #undef READ_SPINOR
262 #undef READ_SPINOR_UP
263 #undef READ_SPINOR_DOWN
264 #undef SPINORTEX
265 #undef WRITE_HALF_SPINOR
266 
  /**
     Kernel that packs the Wilson ghost zones (nFace = 1) for every
     partitioned dimension.  Each thread handles one face site: it maps its
     global index to a (dimension, end, face-site) triple via
     dimFromFaceIndex, converts the face index to a bulk-volume index, then
     spin-projects that site into the matching ghost buffer param.out[2*dim
     + face_num].  Launch geometry follows either the STRIPED or the
     swizzled grid-stride mapping selected at compile time.
  */
  template <int dagger, typename FloatN>
  __global__ void packFaceWilsonKernel(PackParam<FloatN> param)
  {
    const int nFace = 1; // 1 face for Wilson

#ifdef STRIPED
    // each block walks a contiguous strip of sites_per_block face sites
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    // swizzled block index with a grid-stride loop
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[0]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,0,nFace,0>(face_idx,param);
          packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                         param.inNorm,idx, face_idx, param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,0,nFace,1>(face_idx,param);
          packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                         param.inNorm,idx, face_idx, param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[1]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,1,nFace,0>(face_idx,param);
          packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,1,nFace,1>(face_idx,param);
          packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[2]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,2,nFace,0>(face_idx,param);
          packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,2,nFace,1>(face_idx,param);
          packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[2], param);
        }
      } else {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[3]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,3,nFace,0>(face_idx,param);
          packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,3,nFace,1>(face_idx,param);
          packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
354 
355 
  /**
     Face-packing kernel for extended (halo-padded) Wilson fields.  Differs
     from packFaceWilsonKernel in two ways: nFace is a template parameter,
     and param.face_num selects which end(s) to pack — 0 = back only,
     1 = forward only, 2 = both ends (in which case the face index encodes
     the end, as in the non-extended kernel).  Site indices come from
     indexFromFaceIndexExtended, which accounts for the border region.
  */
  template <int dagger, typename FloatN, int nFace>
  __global__ void packFaceExtendedWilsonKernel(PackParam<FloatN> param)
  {

#ifdef STRIPED
    // each block walks a contiguous strip of sites_per_block face sites
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    // swizzled block index with a grid-stride loop
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        // if param.face_num==2 pack both the start and the end, otherwise pack the region of the lattice
        // specified by param.face_num
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[0]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<0,nFace,0>(face_idx,param);
          packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                         param.inNorm,idx, face_idx, param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndexExtended<0,nFace,1>(face_idx,param);
          packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                         param.inNorm,idx, face_idx, param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[1]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<1,nFace,0>(face_idx,param);
          packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndexExtended<1,nFace,1>(face_idx,param);
          packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[2]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<2,nFace,0>(face_idx,param);
          packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndexExtended<2,nFace,1>(face_idx,param);
          packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[2], param);
        }
      } else {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[3]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[3];

        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<3,nFace,0>(face_idx,param);
          packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndexExtended<3,nFace,1>(face_idx,param);
          packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                          param.inNorm,idx, face_idx, param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
442 
443 
  /**
     Inverse of packFaceExtendedWilsonKernel: scatters received ghost-zone
     data back into an extended Wilson field.  Thread mapping, face_num
     semantics (0 = back, 1 = forward, 2 = both), and extended-index
     computation are identical to the packing kernel; only the core
     routine differs (unpackFaceWilsonCore instead of packFaceWilsonCore).
  */
  template <int dagger, typename FloatN, int nFace>
  __global__ void unpackFaceExtendedWilsonKernel(PackParam<FloatN> param)
  {

#ifdef STRIPED
    // each block walks a contiguous strip of sites_per_block face sites
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    // swizzled block index with a grid-stride loop
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        // if param.face_num==2 pack both the start and the end, otherwise pack the region of the lattice
        // specified by param.face_num
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[0]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[0];

        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<0,nFace,0>(face_idx,param);
          unpackFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                           param.inNorm,idx, face_idx, param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndexExtended<0,nFace,1>(face_idx,param);
          unpackFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                           param.inNorm,idx, face_idx, param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[1]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[1];

        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<1,nFace,0>(face_idx,param);
          unpackFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                            param.inNorm,idx, face_idx, param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndexExtended<1,nFace,1>(face_idx,param);
          unpackFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                            param.inNorm,idx, face_idx, param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[2]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[2];

        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<2,nFace,0>(face_idx,param);
          unpackFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                            param.inNorm,idx, face_idx, param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndexExtended<2,nFace,1>(face_idx,param);
          unpackFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                            param.inNorm,idx, face_idx, param.dc.ghostFace[2], param);
        }
      } else {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[3]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[3];

        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtended<3,nFace,0>(face_idx,param);
          unpackFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                            param.inNorm,idx, face_idx, param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndexExtended<3,nFace,1>(face_idx,param);
          unpackFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                            param.inNorm,idx, face_idx, param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
533 
534 #endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC
535 
536 
537 #if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)
538 
539 
#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC
541 
542 
543 #if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)
544 
545  // double precision
546 
#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC
548 
549 
550 #if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)
551 
552  // double precision
553 #if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
554 #define READ_SPINOR READ_SPINOR_DOUBLE
555 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
556 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
557 #define SPINORTEX in
558 #else
559 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
560 #define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
561 #define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
562 #ifdef USE_TEXTURE_OBJECTS
563 #define SPINORTEX param.inTex
564 #else
565 #define SPINORTEX spinorTexDouble
566 #endif
567 #endif
568 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
569 #define SPINOR_DOUBLE
  /**
     Twisted-mass variant of packFaceWilsonCore (double precision): applies
     the twist with coefficients a and b during the spin projection before
     writing the half spinor to the face buffer.
  */
  template <int dim, int dagger, int face_num>
  static inline __device__ void packTwistedFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
                                                          const float *inNorm, double a, double b, const int &idx,
                                                          const int &face_idx, const int &face_volume,
                                                          PackParam<double2> &param)
  {
    if (dagger) {
      // NOTE(review): both branches are empty in this rendering — the core
      // includes appear elided; confirm against the repository source.
    } else {
    }
  }
582 #undef READ_SPINOR
583 #undef READ_SPINOR_UP
584 #undef READ_SPINOR_DOWN
585 #undef SPINORTEX
586 #undef WRITE_HALF_SPINOR
587 #undef SPINOR_DOUBLE
588 
589 
590  // single precision
591 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
592 #define READ_SPINOR READ_SPINOR_SINGLE
593 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
594 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
595 #define SPINORTEX in
596 #else
597 #define READ_SPINOR READ_SPINOR_SINGLE_TEX
598 #define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
599 #define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
600 #ifdef USE_TEXTURE_OBJECTS
601 #define SPINORTEX param.inTex
602 #else
603 #define SPINORTEX spinorTexSingle
604 #endif
605 #endif
606 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
  /** Single-precision overload of packTwistedFaceWilsonCore; see the
      double2 variant for the contract. */
  template <int dim, int dagger, int face_num>
  static inline __device__ void packTwistedFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm, float a, float b,
                                                          const int &idx, const int &face_idx,
                                                          const int &face_volume,
                                                          const PackParam<float4> &param)
  {
    if (dagger) {
      // NOTE(review): both branches are empty in this rendering — the core
      // includes appear elided; confirm against the repository source.
    } else {
    }
  }
619 #undef READ_SPINOR
620 #undef READ_SPINOR_UP
621 #undef READ_SPINOR_DOWN
622 #undef SPINORTEX
623 #undef WRITE_HALF_SPINOR
624 
625 
626  // half precision
627 #ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
628 #define READ_SPINOR READ_SPINOR_HALF
629 #define READ_SPINOR_UP READ_SPINOR_HALF_UP
630 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
631 #define SPINORTEX in
632 #else
633 #define READ_SPINOR READ_SPINOR_HALF_TEX
634 #define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
635 #define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
636 #ifdef USE_TEXTURE_OBJECTS
637 #define SPINORTEX param.inTex
638 #else
639 #define SPINORTEX spinorTexHalf
640 #endif
641 #endif
642 #define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
  /** Half-precision overload of packTwistedFaceWilsonCore; see the
      double2 variant for the contract. */
  template <int dim, int dagger, int face_num>
  static inline __device__ void packTwistedFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm, float a, float b,
                                                          const int &idx, const int &face_idx,
                                                          const int &face_volume,
                                                          const PackParam<short4> &param)
  {
    if (dagger) {
      // NOTE(review): both branches are empty in this rendering — the core
      // includes appear elided; confirm against the repository source.
    } else {
    }
  }
655 #undef READ_SPINOR
656 #undef READ_SPINOR_UP
657 #undef READ_SPINOR_DOWN
658 #undef SPINORTEX
659 #undef WRITE_HALF_SPINOR
660 
  /**
     Twisted-mass face-packing kernel (nFace = 1).  Identical thread and
     dimension mapping to packFaceWilsonKernel, but each site is packed via
     packTwistedFaceWilsonCore with twist coefficients a and b.
     @param a twist coefficient (passed through to the core routine)
     @param b twist coefficient (passed through to the core routine)
  */
  template <int dagger, typename FloatN, typename Float>
  __global__ void packTwistedFaceWilsonKernel(Float a, Float b, PackParam<FloatN> param)
  {
    const int nFace = 1; // 1 face for Wilson

#ifdef STRIPED
    // each block walks a contiguous strip of sites_per_block face sites
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    // swizzled block index with a grid-stride loop
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[0]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,0,nFace,0>(face_idx,param);
          packTwistedFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                                param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,0,nFace,1>(face_idx,param);
          packTwistedFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                                param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[1]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,1,nFace,0>(face_idx,param);
          packTwistedFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                                 param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,1,nFace,1>(face_idx,param);
          packTwistedFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                                 param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[2]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,2,nFace,0>(face_idx,param);
          packTwistedFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                                 param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,2,nFace,1>(face_idx,param);
          packTwistedFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                                 param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[2], param);
        }
      } else {
        const int face_num = (face_idx >= nFace*param.dc.ghostFace[3]) ? 1 : 0;
        face_idx -= face_num*nFace*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,3,nFace,0>(face_idx,param);
          packTwistedFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                                 param.inNorm, a, b,idx, face_idx, param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndex<4,QUDA_4D_PC,3,nFace,1>(face_idx,param);
          packTwistedFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                                 param.inNorm, a, b, idx, face_idx, param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
745 
#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC
747 
748  template <typename FloatN, typename Float>
749  class PackFace : public Tunable {
750 
751  protected:
752  void *faces[2*QUDA_MAX_DIM];
753  const cudaColorSpinorField *in;
754  const int dagger;
755  const int parity;
756  const int nFace;
757  const int dim;
758  const int face_num;
759  const MemoryLocation location;
760 
761  // compute how many threads we need in total for the face packing
762  unsigned int threads() const {
763  unsigned int threads = 0;
764  if(dim < 0){ // if dim is negative, pack all dimensions
765  for (int i=0; i<4; i++) {
766  if (!commDim[i]) continue;
767  if ( i==3 && !getKernelPackT() ) continue;
768  threads += 2*nFace*in->GhostFace()[i]; // 2 for forwards and backwards faces
769  }
770  }else{ // pack only in dim dimension
771  if( commDim[dim] && (dim!=3 || getKernelPackT() )){
772  threads = nFace*in->GhostFace()[dim];
773  if(face_num==2) threads *= 2; // sending data forwards and backwards
774  }
775  }
776  return threads;
777  }
778 
779  virtual int inputPerSite() const = 0;
780  virtual int outputPerSite() const = 0;
781 
    // prepare the param struct with kernel arguments
    // Fills every field of param from the input spinor, the ghost buffers,
    // and the tuning parameters.  dim/face_num default to "all dimensions,
    // both ends" (-1 / 2), matching the kernels' conventions.
    void prepareParam(PackParam<FloatN> &param, TuneParam &tp, int dim=-1, int face_num=2) {
      param.in = (FloatN*)in->V();
      param.inNorm = (float*)in->Norm();
      param.dim = dim;
      param.face_num = face_num;
      param.parity = parity;

#ifdef USE_TEXTURE_OBJECTS
      param.inTex = in->Tex();
      param.inTexNorm = in->TexNorm();
#endif

      param.threads = threads();
      param.sp_stride = in->Stride();

      // build the cumulative thread->dimension map: each partitioned
      // dimension owns the thread range [Lower[i], Upper[i])
      int prev = -1; // previous dimension that was partitioned
      for (int i=0; i<4; i++) {
        param.threadDimMapLower[i] = 0;
        param.threadDimMapUpper[i] = 0;
        if (!commDim[i]) continue;
        param.threadDimMapLower[i] = (prev>=0 ? param.threadDimMapUpper[prev] : 0);
        param.threadDimMapUpper[i] = param.threadDimMapLower[i] + 2*nFace*in->GhostFace()[i];

        param.out[2*i+0] = static_cast<FloatN*>(faces[2*i+0]);
        param.out[2*i+1] = static_cast<FloatN*>(faces[2*i+1]);

        // norm arrays live immediately after the half-precision spinor
        // data within each ghost buffer
        param.outNorm[2*i+0] = reinterpret_cast<float*>(static_cast<char*>(faces[2*i+0]) + nFace*outputPerSite()*in->GhostFace()[i]*QUDA_HALF_PRECISION);
        param.outNorm[2*i+1] = reinterpret_cast<float*>(static_cast<char*>(faces[2*i+1]) + nFace*outputPerSite()*in->GhostFace()[i]*QUDA_HALF_PRECISION);

        prev=i;
      }

      param.dc = in->getDslashConstant(); // get pre-computed constants

      param.swizzle = tp.aux.x;
      // ceiling division: distribute all sites over the tuned grid
      param.sites_per_block = (param.threads + tp.grid.x - 1) / tp.grid.x;
    }
820 
    // the packing kernels use no shared memory
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
823 
#ifdef STRIPED
    bool tuneGridDim() const { return true; } // If striping, always tune grid dimension
    // Upper bound on the tuned grid size.  With zero-copy (host) writing we
    // cap the grid at 3 blocks per communicated dimension; otherwise defer
    // to the base-class bound.
    unsigned int maxGridSize() const {
      if (location & Host) {
        // if zero-copy policy then set a maximum number of blocks to be
        // the 3 * number of dimensions we are communicating
        int nDimComms = 0;
        for (int d=0; d<in->Ndim(); d++) nDimComms += commDim[d];
        return 3*nDimComms;
      } else {
        return Tunable::maxGridSize();
      }
    } // use no more than a quarter of the GPU
    // Lower bound on the tuned grid size: at least one block per
    // communicated dimension when writing zero-copy.
    unsigned int minGridSize() const {
      if (location & Host) {
        // if zero-copy policy then set a minimum number of blocks to be
        // the 1 * number of dimensions we are communicating
        int nDimComms = 0;
        for (int d=0; d<in->Ndim(); d++) nDimComms += commDim[d];
        return nDimComms;
      } else {
        return Tunable::minGridSize();
      }
    }
#else
    bool tuneGridDim() const { return location & Host; } // only tune grid dimension if doing zero-copy writing
    // Fix: tuneGridDim is a member function and must be invoked — the
    // previous code named it without parentheses, which is ill-formed
    // (this branch is dead while STRIPED is defined, so it never compiled).
    unsigned int maxGridSize() const { return tuneGridDim() ? deviceProp.multiProcessorCount/4 : Tunable::maxGridSize(); } // use no more than a quarter of the GPU
#endif
852 
    bool tuneAuxDim() const { return true; } // Do tune the aux dimensions (swizzle factor).
    // Lower bound for the tuner's thread count: every ghost site needs a thread.
    unsigned int minThreads() const { return threads(); }
855 
    // Build the "aux" string used as part of the tuning-cache key.  It
    // encodes the field's aux string, which dimensions are being
    // communicated, whether the temporal direction is kernel-packed, the
    // face depth, and where the packed data is written.  The exact byte
    // sequence matters: changing it invalidates saved tune-cache entries.
    void fillAux() {
      strcpy(aux,"policy_kernel,");
      strcat(aux, in->AuxString());
      char comm[5];
      comm[0] = (commDim[0] ? '1' : '0');
      comm[1] = (commDim[1] ? '1' : '0');
      comm[2] = (commDim[2] ? '1' : '0');
      comm[3] = (commDim[3] ? '1' : '0');
      comm[4] = '\0'; strcat(aux,",comm=");
      strcat(aux,comm);
      if (getKernelPackT()) { strcat(aux,",kernelPackT"); }
      switch (nFace) {
      case 1: strcat(aux,",nFace=1,"); break;
      case 3: strcat(aux,",nFace=3,"); break;
      default: errorQuda("Number of faces not supported");
      }

      // label the locations we are packing to
      // location label is nonp2p-p2p
      switch ((int)location) {
      case Device|Remote: strcat(aux,"device-remote"); break;
      case Host|Remote: strcat(aux, "host-remote"); break;
      case Device: strcat(aux,"device-device"); break;
      case Host: strcat(aux, comm_peer2peer_enabled_global() ? "host-device" : "host-host"); break;
      default: errorQuda("Unknown pack target location %d\n", location);
      }

    }
885 
886  public:
    // Construct a face-packing tunable.
    //   faces_    : 2*QUDA_MAX_DIM destination buffers (lo/hi end per dimension)
    //   in        : source spinor field
    //   location  : where the packed halos are written (Device/Host/Remote mask)
    //   dim       : single dimension to pack, or -1 for all communicating dims
    //   face_num  : 0 = low face, 1 = high face, 2 = both ends
    PackFace(void *faces_[], const cudaColorSpinorField *in, MemoryLocation location,
             const int dagger, const int parity, const int nFace, const int dim=-1, const int face_num=2)
      : in(in), dagger(dagger),
        parity(parity), nFace(nFace), dim(dim), face_num(face_num), location(location)
    {
      memcpy(faces, faces_, 2*QUDA_MAX_DIM*sizeof(void*));
      fillAux();
#ifndef USE_TEXTURE_OBJECTS
      // legacy texture path: bind the source spinor to the global textures
      bindSpinorTex<FloatN>(in);
#endif
    }
898 
    // Release the legacy texture binding acquired in the constructor.
    virtual ~PackFace() {
#ifndef USE_TEXTURE_OBJECTS
      unbindSpinorTex<FloatN>(in);
#endif
    }
904 
    // With a zero-copy (Host) policy shared_bytes is fixed by initTuneParam
    // as an occupancy throttle, so do not let the tuner vary it.
    bool tuneSharedBytes() const { return location & Host ? false : Tunable::tuneSharedBytes(); }
906 
    // Advance the auxiliary tuning dimension (the block-index swizzle
    // factor).  Note: SWIZZLE is only defined when STRIPED is not (see the
    // #define block near the top of the file), so with the current STRIPED
    // build this compiles to the trivial "return false" branch.
    bool advanceAux(TuneParam &param) const
    {
#ifdef SWIZZLE
      if ( location & Remote ) { // only swizzling if we're doing remote writing
        if (param.aux.x < (int)maxGridSize()) {
          param.aux.x++;
          return true;
        } else {
          param.aux.x = 1; // wrap back to the initial swizzle factor
          return false;
        }
      } else {
        return false;
      }
#else
      return false;
#endif
    }
925 
926  void initTuneParam(TuneParam &param) const {
928  param.aux.x = 1; // swizzle factor
929  // if doing a zero-copy policy then ensure that each thread block
930  // runs exclusively on a given SM - this is to ensure quality of
931  // service for the packing kernel when running concurrently.
932  if (location & Host) param.shared_bytes = deviceProp.sharedMemPerBlock / 2 + 1;
933  }
934 
935  void defaultTuneParam(TuneParam &param) const {
937  param.aux.x = 1; // swizzle factor
938  }
939 
    // Flop count: the spin projection costs outputPerSite() per ghost site.
    long long flops() const { return outputPerSite()*this->threads(); }

    // Only a few tuning iterations are needed for this bandwidth-bound kernel.
    virtual int tuningIter() const { return 3; }

    virtual TuneKey tuneKey() const { return TuneKey(in->VolString(), typeid(*this).name(), aux); }

    // Derived classes launch their specific packing kernel here.
    virtual void apply(const cudaStream_t &stream) = 0;
947 
    // Bytes moved: read inputPerSite and write outputPerSite reals per ghost
    // site; at half precision each site also reads and writes a float norm.
    // sizeof(((FloatN*)0)->x) extracts the scalar size from the vector type
    // (unevaluated context, so the null pointer is never dereferenced).
    long long bytes() const {
      size_t faceBytes = (inputPerSite() + outputPerSite())*this->threads()*sizeof(((FloatN*)0)->x);
      if (sizeof(((FloatN*)0)->x) == QUDA_HALF_PRECISION)
        faceBytes += 2*this->threads()*sizeof(float); // 2 is from input and output
      return faceBytes;
    }
  };
955 
  // Tunable wrapper that packs Wilson-fermion ghost faces: reads the full
  // 24-real spinor and writes the 12-real spin-projected half spinor.
  template <typename FloatN, typename Float>
  class PackFaceWilson : public PackFace<FloatN, Float> {

  private:

    int inputPerSite() const { return 24; } // input is full spinor
    int outputPerSite() const { return 12; } // output is spin projected

  public:
    PackFaceWilson(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
                   const int dagger, const int parity)
      : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }
    virtual ~PackFaceWilson() { }

    // Tune (or look up) the launch geometry and launch the packing kernel.
    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_WILSON_DIRAC
      // static so the kernel argument outlives the async launch
      static PackParam<FloatN> param;
      this->prepareParam(param,tp);

      void *args[] = { &param };
      // select the dagger/non-dagger template instantiation at runtime
      void (*func)(PackParam<FloatN>) = this->dagger ? &(packFaceWilsonKernel<1,FloatN>) : &(packFaceWilsonKernel<0,FloatN>);
      qudaLaunchKernel( (const void*)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
#else
      errorQuda("Wilson face packing kernel is not built");
#endif
    }

  };
986 
987  void packFaceWilson(void *ghost_buf[], cudaColorSpinorField &in, MemoryLocation location,
988  const int dagger, const int parity, const cudaStream_t &stream) {
989 
990  switch(in.Precision()) {
992  {
993  PackFaceWilson<double2, double> pack(ghost_buf, &in, location, dagger, parity);
994  pack.apply(stream);
995  }
996  break;
998  {
999  PackFaceWilson<float4, float> pack(ghost_buf, &in, location, dagger, parity);
1000  pack.apply(stream);
1001  }
1002  break;
1003  case QUDA_HALF_PRECISION:
1004  {
1005  PackFaceWilson<short4, float> pack(ghost_buf, &in, location, dagger, parity);
1006  pack.apply(stream);
1007  }
1008  break;
1009  default:
1010  errorQuda("Precision %d not supported", in.Precision());
1011  }
1012  }
1013 
1014  template <typename FloatN, typename Float>
1015  class PackFaceTwisted : public PackFace<FloatN, Float> {
1016 
1017  private:
1018 
1019  int inputPerSite() const { return 24; } // input is full spinor
1020  int outputPerSite() const { return 12; } // output is spin projected
1021  Float a;
1022  Float b;
1023 
1024  public:
1025  PackFaceTwisted(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
1026  const int dagger, const int parity, Float a, Float b)
1027  : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1), a(a), b(b) { }
1028  virtual ~PackFaceTwisted() { }
1029 
1030  void apply(const cudaStream_t &stream) {
1031  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1032 
1033 #ifdef GPU_TWISTED_MASS_DIRAC
1034  static PackParam<FloatN> param;
1035  this->prepareParam(param,tp);
1036  void *args[] = { &a, &b, &param };
1037  void (*func)(Float,Float,PackParam<FloatN>) = this->dagger ? &(packTwistedFaceWilsonKernel<1,FloatN,Float>) : &(packTwistedFaceWilsonKernel<0,FloatN,Float>);
1038  cudaLaunchKernel( (const void*)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
1039 #else
1040  errorQuda("Twisted face packing kernel is not built");
1041 #endif
1042  }
1043 
1044  long long flops() const { return outputPerSite()*this->threads(); }
1045  };
1046 
1048  void packTwistedFaceWilson(void *ghost_buf[], cudaColorSpinorField &in, MemoryLocation location, const int dagger,
1049  const int parity, const double a, const double b, const cudaStream_t &stream) {
1050 
1051  switch(in.Precision()) {
1052  case QUDA_DOUBLE_PRECISION:
1053  {
1054  PackFaceTwisted<double2, double> pack(ghost_buf, &in, location, dagger, parity, a, b);
1055  pack.apply(stream);
1056  }
1057  break;
1058  case QUDA_SINGLE_PRECISION:
1059  {
1060  PackFaceTwisted<float4, float> pack(ghost_buf, &in, location, dagger, parity, (float)a, (float)b);
1061  pack.apply(stream);
1062  }
1063  break;
1064  case QUDA_HALF_PRECISION:
1065  {
1066  PackFaceTwisted<short4, float> pack(ghost_buf, &in, location, dagger, parity, (float)a, (float)b);
1067  pack.apply(stream);
1068  }
1069  break;
1070  default:
1071  errorQuda("Precision %d not supported", in.Precision());
1072  }
1073  }
1074 
1075 #ifdef GPU_STAGGERED_DIRAC
1076 
1077 #ifdef USE_TEXTURE_OBJECTS
1078 #define SPINORTEXDOUBLE param.inTex
1079 #define SPINORTEXSINGLE param.inTex
1080 #define SPINORTEXHALF param.inTex
1081 #define SPINORTEXHALFNORM param.inTexNorm
1082 #else
1083 #define SPINORTEXDOUBLE spinorTexDouble
1084 #define SPINORTEXSINGLE spinorTexSingle2
1085 #define SPINORTEXHALF spinorTexHalf2
1086 #define SPINORTEXHALFNORM spinorTexHalf2Norm
1087 #endif
1088 
1089  template <typename Float2>
1090  __device__ void packFaceStaggeredCore(Float2 *out, float *outNorm, const int out_idx,
1091  const int out_stride, const Float2 *in, const float *inNorm,
1092  const int in_idx, const int in_stride) {
1093  out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
1094  out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
1095  out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
1096  }
1097  template<>
1098  __device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
1099  const int out_stride, const short2 *in, const float *inNorm,
1100  const int in_idx, const int in_stride) {
1101  out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
1102  out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
1103  out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
1104  outNorm[out_idx] = inNorm[in_idx];
1105  }
1106 
#if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX)
  // Direct-access variants: read the source spinor through the raw pointer
  // using the stride carried in param.sp_stride.
  // NOTE(review): the generic template takes PackParam<double2> regardless
  // of Float2 — presumably only double2 instantiates this path; confirm
  // against the call sites before generalizing.
  template <typename Float2>
  __device__ void packFaceStaggeredCore(Float2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const Float2 *in, const float *inNorm,
                                        const int in_idx, const PackParam<double2> &param) {
    out[out_idx + 0*out_stride] = in[in_idx + 0*param.sp_stride];
    out[out_idx + 1*out_stride] = in[in_idx + 1*param.sp_stride];
    out[out_idx + 2*out_stride] = in[in_idx + 2*param.sp_stride];
  }
  // Half-precision direct-access variant: also copies the per-site norm.
  template<>
  __device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const short2 *in, const float *inNorm,
                                        const int in_idx, const PackParam<double2> &param) {
    out[out_idx + 0*out_stride] = in[in_idx + 0*param.sp_stride];
    out[out_idx + 1*out_stride] = in[in_idx + 1*param.sp_stride];
    out[out_idx + 2*out_stride] = in[in_idx + 2*param.sp_stride];
    outNorm[out_idx] = inNorm[in_idx];
  }


#else
  // Texture-fetch variants: read the source spinor through the bound
  // texture (SPINORTEX* macros) rather than the raw pointer.
  __device__ void packFaceStaggeredCore(double2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const double2 *in, const float *inNorm,
                                        const int in_idx, const PackParam<double2> &param) {
    out[out_idx + 0*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 0*param.sp_stride);
    out[out_idx + 1*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 1*param.sp_stride);
    out[out_idx + 2*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 2*param.sp_stride);
  }

  __device__ void packFaceStaggeredCore(float2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const float2 *in,
                                        const float *inNorm, const int in_idx,
                                        const PackParam<float2> &param) {
    out[out_idx + 0*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 0*param.sp_stride);
    out[out_idx + 1*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 1*param.sp_stride);
    out[out_idx + 2*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 2*param.sp_stride);
  }

  // this is rather dumb: undoing the texture load because cudaNormalizedReadMode is used
  // should really bind to an appropriate texture instead of reusing
  static inline __device__ short2 float22short2(float c, float2 a) {
    return make_short2((short)(a.x*c*MAX_SHORT), (short)(a.y*c*MAX_SHORT));
  }

  // Half precision: the texture returns normalized floats, so re-quantize
  // back to short2 and carry the norm separately.
  __device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const short2 *in,
                                        const float *inNorm, const int in_idx,
                                        const PackParam<short2> &param) {
    out[out_idx + 0*out_stride] = float22short2(1.0f,TEX1DFETCH(float2,SPINORTEXHALF,in_idx+0*param.sp_stride));
    out[out_idx + 1*out_stride] = float22short2(1.0f,TEX1DFETCH(float2,SPINORTEXHALF,in_idx+1*param.sp_stride));
    out[out_idx + 2*out_stride] = float22short2(1.0f,TEX1DFETCH(float2,SPINORTEXHALF,in_idx+2*param.sp_stride));
    outNorm[out_idx] = TEX1DFETCH(float, SPINORTEXHALFNORM, in_idx);
  }
#endif
1161 
1162 
  // Pack the staggered-fermion ghost faces.  One thread handles one ghost
  // site: it determines which dimension and which end of the lattice the
  // site belongs to, maps the face index back to a local-volume index, and
  // copies the 6-real color vector (plus norm at half precision) into the
  // corresponding ghost buffer param.out[2*dim + face_num].
  // Grid/block layout: 1-D; with STRIPED defined each block walks a
  // contiguous stripe of param.sites_per_block sites, otherwise a
  // grid-stride loop with a swizzled block index is used.
  template <typename FloatN, int nFace>
  __global__ void packFaceStaggeredKernel(PackParam<FloatN> param)
  {

#ifdef STRIPED
    // block b covers sites [b*sites_per_block, (b+1)*sites_per_block)
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      const int Ls = param.dc.X[4]; // fifth-dimension extent (1 for 4-d fields)

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor and write to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        const int face_num = (param.face_num==2) ? ((face_idx >= Ls*nFace*param.dc.ghostFace[0]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*Ls*nFace*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexStaggered<0,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[0], param.outNorm[0], face_idx,
                                Ls*nFace*param.dc.ghostFace[0], param.in, param.inNorm, idx, param);
        } else {
          const int idx = indexFromFaceIndexStaggered<0,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[1], param.outNorm[1], face_idx,
                                Ls*nFace*param.dc.ghostFace[0], param.in, param.inNorm, idx, param);
        }
      } else if (dim == 1) {
        const int face_num = (param.face_num==2) ? ((face_idx >= Ls*nFace*param.dc.ghostFace[1]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*Ls*nFace*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexStaggered<1,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[2], param.outNorm[2], face_idx,
                                Ls*nFace*param.dc.ghostFace[1], param.in, param.inNorm, idx, param);
        } else {
          const int idx = indexFromFaceIndexStaggered<1,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[3], param.outNorm[3], face_idx,
                                Ls*nFace*param.dc.ghostFace[1], param.in, param.inNorm, idx, param);
        }
      } else if (dim == 2) {
        const int face_num = (param.face_num==2) ? ((face_idx >= Ls*nFace*param.dc.ghostFace[2]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*Ls*nFace*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexStaggered<2,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[4], param.outNorm[4], face_idx,
                                Ls*nFace*param.dc.ghostFace[2], param.in, param.inNorm, idx, param);
        } else {
          const int idx = indexFromFaceIndexStaggered<2,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[5], param.outNorm[5], face_idx,
                                Ls*nFace*param.dc.ghostFace[2], param.in, param.inNorm, idx, param);
        }
      } else {
        const int face_num = (param.face_num==2) ? ((face_idx >= Ls*nFace*param.dc.ghostFace[3]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*Ls*nFace*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexStaggered<3,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[6], param.outNorm[6], face_idx,
                                Ls*nFace*param.dc.ghostFace[3], param.in, param.inNorm,idx, param);
        } else {
          const int idx = indexFromFaceIndexStaggered<3,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[7], param.outNorm[7], face_idx,
                                Ls*nFace*param.dc.ghostFace[3], param.in, param.inNorm, idx, param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
1248 
1249 
  // Pack the staggered ghost faces of an *extended* field (one with a
  // border region of width R, carried in PackExtendedParam).  Structure is
  // identical to packFaceStaggeredKernel except the local-volume index is
  // computed with the extended-lattice index helpers and there is no
  // fifth-dimension factor.
  template <typename FloatN, int nFace>
  __global__ void packFaceExtendedStaggeredKernel(PackExtendedParam<FloatN> param)
  {

#ifdef STRIPED
    // block b covers sites [b*sites_per_block, (b+1)*sites_per_block)
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read the full staggered spinor and write it to the face buffer
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
        // if param.face_num==2 pack both the start and the end, otherwise pack the region of the
        // lattice specified by param.face_num
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[0]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[0], param.outNorm[0], face_idx,
                                nFace*param.dc.ghostFace[0], param.in, param.inNorm, idx, param);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[1], param.outNorm[1], face_idx,
                                nFace*param.dc.ghostFace[0], param.in, param.inNorm, idx, param);
        }
      } else if (dim == 1) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[1]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[2], param.outNorm[2], face_idx,
                                nFace*param.dc.ghostFace[1], param.in, param.inNorm, idx, param);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[3], param.outNorm[3], face_idx,
                                nFace*param.dc.ghostFace[1], param.in, param.inNorm, idx, param);
        }
      } else if (dim == 2) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[2]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[4], param.outNorm[4], face_idx,
                                nFace*param.dc.ghostFace[2], param.in, param.inNorm, idx, param);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[5], param.outNorm[5], face_idx,
                                nFace*param.dc.ghostFace[2], param.in, param.inNorm, idx, param);
        }
      } else {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[3]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.out[6], param.outNorm[6], face_idx,
                                nFace*param.dc.ghostFace[3], param.in, param.inNorm,idx, param);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.out[7], param.outNorm[7], face_idx,
                                nFace*param.dc.ghostFace[3], param.in, param.inNorm, idx, param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
1335 
1336 
  // Unpack ghost faces back into an extended staggered field: the inverse
  // of packFaceExtendedStaggeredKernel.  Note the argument order in the
  // packFaceStaggeredCore calls is reversed relative to the pack kernel —
  // here the face buffer (param.out[...]) is the source and the local
  // field (param.in) is the destination.
  template <typename FloatN, int nFace>
  __global__ void unpackFaceExtendedStaggeredKernel(PackExtendedParam<FloatN> param)
  {

#ifdef STRIPED
    // block b covers sites [b*sites_per_block, (b+1)*sites_per_block)
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are unpacking
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute an index into the local volume from the index into the face
      // and copy the face data back into the local field
      if (dim == 0) {
        // face_num determines which end of the lattice we are unpacking: 0 = start, 1 = end
        // if param.face_num==2 unpack both the start and the end, otherwise unpack the region of the
        // lattice specified by param.face_num
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[0]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[0], param.outNorm[0], face_idx, nFace*param.dc.ghostFace[0]);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[1], param.outNorm[1], face_idx, nFace*param.dc.ghostFace[0]);
        }
      } else if (dim == 1) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[1]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[2], param.outNorm[2], face_idx, nFace*param.dc.ghostFace[1]);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[3], param.outNorm[3], face_idx, nFace*param.dc.ghostFace[1]);
        }
      } else if (dim == 2) {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[2]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[4], param.outNorm[4], face_idx, nFace*param.dc.ghostFace[2]);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[5], param.outNorm[5], face_idx, nFace*param.dc.ghostFace[2]);
        }
      } else {
        const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.dc.ghostFace[3]) ? 1 : 0) : param.face_num;
        if(param.face_num==2) face_idx -= face_num*nFace*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[6], param.outNorm[6], face_idx, nFace*param.dc.ghostFace[3]);
        } else {
          const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(face_idx,param);
          packFaceStaggeredCore(param.in, param.inNorm, idx,
                                param.sp_stride, param.out[7], param.outNorm[7], face_idx, nFace*param.dc.ghostFace[3]);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
1422 
1423 
#undef SPINORTEXDOUBLE
#undef SPINORTEXSINGLE
#undef SPINORTEXHALF
#undef SPINORTEXHALFNORM
1427 
1428 #endif // GPU_STAGGERED_DIRAC
1429 
1430 
  // Tunable wrapper that packs (or, for extended fields, unpacks) the
  // staggered ghost faces.  For staggered fermions there is no spin
  // projection, so input and output are both the full 6-real color vector.
  template <typename FloatN, typename Float>
  class PackFaceStaggered : public PackFace<FloatN, Float> {

  private:
    const int* R; // boundary dimensions for extended field (NULL => ordinary local field)
    const bool unpack; // if true (extended fields only), copy face buffers back into the field

    int inputPerSite() const { return 6; } // input is the full staggered color vector
    int outputPerSite() const { return 6; } // output is the full staggered color vector (no projection)


  public:
    PackFaceStaggered(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
                      const int nFace, const int dagger, const int parity,
                      const int dim, const int face_num, const int* R=NULL, const bool unpack=false)
      : PackFace<FloatN, Float>(faces, in, location, dagger, parity, nFace, dim, face_num), R(R), unpack(unpack) { }
    virtual ~PackFaceStaggered() { }

    // Tune (or look up) the launch geometry and launch the appropriate
    // pack/unpack kernel for local or extended fields.
    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_STAGGERED_DIRAC

      // static so the kernel argument outlives the async launch
      static PackParam<FloatN> param;
      this->prepareParam(param,tp,this->dim,this->face_num);
      if(!R){
        void *args[] = { &param };
        // local field supports only nFace==1 (naive) or nFace==3 (improved)
        // NOTE(review): raw cudaLaunchKernel here while PackFaceWilson uses
        // the qudaLaunchKernel wrapper — confirm whether the error-checked
        // wrapper should be used.
        void (*func)(PackParam<FloatN>) = PackFace<FloatN,Float>::nFace==1 ? &(packFaceStaggeredKernel<FloatN,1>) : &(packFaceStaggeredKernel<FloatN,3>);
        cudaLaunchKernel( (const void*)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
      }else{ // R!=NULL => this is an extended field
        PackExtendedParam<FloatN> extendedParam(param);
        if(!unpack){
          // forward the border widths to the kernel parameter struct
          for(int d=0; d<QUDA_MAX_DIM; ++d) extendedParam.R[d] = R[d];
          switch(PackFace<FloatN,Float>::nFace){
          case 1:
            packFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          case 2:
            packFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          case 3:
            packFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          case 4:
            packFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          default:
            errorQuda("Unsupported boundary width");
            break;
          }
        }else{ // extended field unpack
          switch(PackFace<FloatN,Float>::nFace){
          case 1:
            unpackFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          case 2:
            unpackFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          case 3:
            unpackFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          case 4:
            unpackFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;

          default:
            errorQuda("Unsupported boundary width");
            break;
          }
        }
      }
#else
      errorQuda("Staggered face packing kernel is not built");
#endif
    }

    long long flops() const { return 0; } // pure data movement, no floating-point work
  };
1516 
1517 
1518  void packFaceStaggered(void *ghost_buf[], cudaColorSpinorField &in, MemoryLocation location, int nFace,
1519  int dagger, int parity, const int dim, const int face_num, const cudaStream_t &stream) {
1520 
1521  switch(in.Precision()) {
1522  case QUDA_DOUBLE_PRECISION:
1523  {
1524  PackFaceStaggered<double2, double> pack(ghost_buf, &in, location, nFace, dagger, parity, dim, face_num);
1525  pack.apply(stream);
1526  }
1527  break;
1528  case QUDA_SINGLE_PRECISION:
1529  {
1530  PackFaceStaggered<float2, float> pack(ghost_buf, &in, location, nFace, dagger, parity, dim, face_num);
1531  pack.apply(stream);
1532  }
1533  break;
1534  case QUDA_HALF_PRECISION:
1535  {
1536  PackFaceStaggered<short2, float> pack(ghost_buf, &in, location, nFace, dagger, parity, dim, face_num);
1537  pack.apply(stream);
1538  }
1539  break;
1540  default:
1541  errorQuda("Precision %d not supported", in.Precision());
1542  }
1543  }
1544 
1545  void packFaceExtendedStaggered(void *buffer[], cudaColorSpinorField &field, MemoryLocation location, const int nFace, const int R[],
1546  int dagger, int parity, const int dim, const int face_num, const cudaStream_t &stream, bool unpack=false)
1547  {
1548  switch(field.Precision()){
1549  case QUDA_DOUBLE_PRECISION:
1550  {
1551  PackFaceStaggered<double2,double> pack(buffer, &field, location, nFace, dagger, parity, dim, face_num, R, unpack);
1552  pack.apply(stream);
1553  }
1554  break;
1555  case QUDA_SINGLE_PRECISION:
1556  {
1557  PackFaceStaggered<float2,float> pack(buffer, &field, location, nFace, dagger, parity, dim, face_num, R, unpack);
1558  pack.apply(stream);
1559  }
1560  break;
1561  case QUDA_HALF_PRECISION:
1562  {
1563  PackFaceStaggered<short2,float> pack(buffer, &field, location, nFace, dagger, parity, dim, face_num, R, unpack);
1564  pack.apply(stream);
1565  }
1566  break;
1567  default:
1568  errorQuda("Precision %d not supported", field.Precision());
1569  } // switch(field.Precision())
1570  }
1571 
1572 #ifdef GPU_DOMAIN_WALL_DIRAC
  // Pack the domain-wall (5-d preconditioned) ghost faces: read the full
  // spinor, spin-project with packFaceWilsonCore, and write the half spinor
  // into the ghost buffer param.out[2*dim + face_num].  The fifth dimension
  // Ls multiplies the 4-d face sizes throughout.
  template <int dagger, typename FloatN>
  __global__ void packFaceDWKernel(PackParam<FloatN> param)
  {
    const int nFace = 1; // 1 face for dwf

#ifdef STRIPED
    // block b covers sites [b*sites_per_block, (b+1)*sites_per_block)
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      const int Ls = param.dc.X[4]; // fifth-dimension extent

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
        // FIXME these param.dc.ghostFace constants do not incude the Ls dimension
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[0]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,0,nFace,0>(face_idx,param);
          packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                         param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,0,nFace,1>(face_idx,param);
          packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                         param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[1]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,1,nFace,0>(face_idx,param);
          packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,1,nFace,1>(face_idx,param);
          packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[2]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,2,nFace,0>(face_idx,param);
          packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,2,nFace,1>(face_idx,param);
          packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[2], param);
        }
      } else {
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[3]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,3,nFace,0>(face_idx,param);
          packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_5D_PC,3,nFace,1>(face_idx,param);
          packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x;
#endif
    } // while tid

  }
1660 
1661 
  // Face-packing kernel for the 4D-preconditioned domain-wall operator
  // (QUDA_4D_PC site ordering).  Each iteration packs one face site: the full
  // spinor is read from param.in, spin-projected by packFaceWilsonCore, and the
  // resulting half spinor is written to the ghost buffer param.out[2*dim+face_num].
  template <int dagger, typename FloatN>
  __global__ void packFaceDW4DKernel(PackParam<FloatN> param)
  {
    const int nFace = 1; // Wilson-type (incl. domain-wall) actions exchange a single face

#ifdef STRIPED
    // striped mapping: each block covers a contiguous stripe of
    // param.sites_per_block face sites, stepping by blockDim.x per iteration
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    // swizzled grid-stride mapping: block_idx reorders the block index
    // (swizzle factor is autotuned)
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      const int Ls = param.dc.X[4]; // extent of the fifth (Ls) dimension

      // determine which dimension we are packing; dimFromFaceIndex also
      // rescales face_idx to be relative to the start of that dimension's faces
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
        // FIXME these param.dc.ghostFace constants do not include the Ls dimension
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[0]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,0>(face_idx,param);
          packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                         param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,1>(face_idx,param);
          packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                         param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[1]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,0>(face_idx,param);
          packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,1>(face_idx,param);
          packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[2]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,0>(face_idx,param);
          packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,1>(face_idx,param);
          packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[2], param);
        }
      } else {
        const int face_num = (face_idx >= nFace*Ls*param.dc.ghostFace[3]) ? 1 : 0;
        face_idx -= face_num*nFace*Ls*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,0>(face_idx,param);
          packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,1>(face_idx,param);
          packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                          param.inNorm, idx, face_idx, Ls*param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x; // classic grid-stride advance
#endif
    } // while tid

  }
1749 
1750 #endif
1751 
1752  template <typename FloatN, typename Float>
1753  class PackFaceDW : public PackFace<FloatN, Float> {
1754 
1755  private:
1756 
1757  int inputPerSite() const { return 24; } // input is full spinor
1758  int outputPerSite() const { return 12; } // output is spin projected
1759 
1760  public:
1761  PackFaceDW(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
1762  const int dagger, const int parity)
1763  : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }
1764  virtual ~PackFaceDW() { }
1765 
1766  void apply(const cudaStream_t &stream) {
1767  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1768 
1769 #ifdef GPU_DOMAIN_WALL_DIRAC
1770  static PackParam<FloatN> param;
1771  this->prepareParam(param,tp);
1772  void *args[] = { &param };
1773  void (*func)(PackParam<FloatN>) = this->dagger ? &(packFaceDWKernel<1,FloatN>) : &(packFaceDWKernel<0,FloatN>);
1774  cudaLaunchKernel( (const void*)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
1775 #else
1776  errorQuda("DW face packing kernel is not built");
1777 #endif
1778  }
1779 
1780  long long flops() const { return outputPerSite()*this->threads(); }
1781  };
1782 
1783  template <typename FloatN, typename Float>
1784  class PackFaceDW4D : public PackFace<FloatN, Float> {
1785 
1786  private:
1787 
1788  int inputPerSite() const { return 24; } // input is full spinor
1789  int outputPerSite() const { return 12; } // output is spin projected
1790 
1791  public:
1792  PackFaceDW4D(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
1793  const int dagger, const int parity)
1794  : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }
1795  virtual ~PackFaceDW4D() { }
1796 
1797  void apply(const cudaStream_t &stream) {
1798  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1799 
1800 #ifdef GPU_DOMAIN_WALL_DIRAC
1801  static PackParam<FloatN> param;
1802  this->prepareParam(param,tp);
1803  void *args[] = { &param };
1804  void (*func)(PackParam<FloatN>) = this->dagger ? &(packFaceDW4DKernel<1,FloatN>) : &(packFaceDW4DKernel<0,FloatN>);
1805  cudaLaunchKernel( (const void*)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
1806 #else
1807  errorQuda("4D preconditioned DW face packing kernel is not built");
1808 #endif
1809  }
1810 
1811  long long flops() const { return outputPerSite()*this->threads(); }
1812  };
1813 
1814  void packFaceDW(void *ghost_buf[], cudaColorSpinorField &in, MemoryLocation location, const int dagger,
1815  const int parity, const cudaStream_t &stream) {
1816 
1817 
1818  if(in.DWFPCtype() == QUDA_4D_PC)
1819  {
1820  switch(in.Precision()) {
1821  case QUDA_DOUBLE_PRECISION:
1822  {
1823  PackFaceDW4D<double2, double> pack(ghost_buf, &in, location, dagger, parity);
1824  pack.apply(stream);
1825  }
1826  break;
1827  case QUDA_SINGLE_PRECISION:
1828  {
1829  PackFaceDW4D<float4, float> pack(ghost_buf, &in, location, dagger, parity);
1830  pack.apply(stream);
1831  }
1832  break;
1833  case QUDA_HALF_PRECISION:
1834  {
1835  PackFaceDW4D<short4, float> pack(ghost_buf, &in, location, dagger, parity);
1836  pack.apply(stream);
1837  }
1838  break;
1839  default:
1840  errorQuda("Precision %d not supported", in.Precision());
1841  }
1842  }
1843  else
1844  {
1845  switch(in.Precision()) {
1846  case QUDA_DOUBLE_PRECISION:
1847  {
1848  PackFaceDW<double2, double> pack(ghost_buf, &in, location, dagger, parity);
1849  pack.apply(stream);
1850  }
1851  break;
1852  case QUDA_SINGLE_PRECISION:
1853  {
1854  PackFaceDW<float4, float> pack(ghost_buf, &in, location, dagger, parity);
1855  pack.apply(stream);
1856  }
1857  break;
1858  case QUDA_HALF_PRECISION:
1859  {
1860  PackFaceDW<short4, float> pack(ghost_buf, &in, location, dagger, parity);
1861  pack.apply(stream);
1862  }
1863  break;
1864  default:
1865  errorQuda("Precision %d not supported", in.Precision());
1866  }
1867  }
1868  }
1869 
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
  // Face-packing kernel for the non-degenerate twisted-mass operator.
  // Structure mirrors packFaceDW4DKernel, but the fifth index is the flavor
  // index Nf rather than Ls.  Each iteration reads a full spinor, spin-projects
  // it via packFaceWilsonCore, and writes the half spinor to the ghost buffer
  // param.out[2*dim+face_num].
  template <int dagger, typename FloatN>
  __global__ void packFaceNdegTMKernel(PackParam<FloatN> param)
  {
    const int nFace = 1; // Wilson-type actions exchange a single face
    const int Nf = 2;    // two flavors in the non-degenerate doublet

#ifdef STRIPED
    // striped mapping: each block covers a contiguous stripe of
    // param.sites_per_block face sites, stepping by blockDim.x per iteration
    const int sites_per_block = param.sites_per_block;
    int local_tid = threadIdx.x;
    int tid = sites_per_block * blockIdx.x + local_tid;
#else
    // swizzled grid-stride mapping: block_idx reorders the block index
    // (swizzle factor is autotuned)
    int tid = block_idx(param.swizzle) * blockDim.x + threadIdx.x;
    constexpr int sites_per_block = 1;
    constexpr int local_tid = 0;
#endif

    while ( local_tid < sites_per_block && tid < param.threads ) {

      // determine which dimension we are packing; dimFromFaceIndex also
      // rescales face_idx to be relative to the start of that dimension's faces
      int face_idx;
      const int dim = dimFromFaceIndex(face_idx, tid, param);

      // compute where the output is located
      // compute an index into the local volume from the index into the face
      // read spinor, spin-project, and write half spinor to face
      if (dim == 0) {
        // face_num determines which end of the lattice we are packing:
        // 0 = beginning, 1 = end FIXME these param.dc.ghostFace constants
        // do not include the Nf dimension
        const int face_num = (face_idx >= nFace*Nf*param.dc.ghostFace[0]) ? 1 : 0;
        face_idx -= face_num*nFace*Nf*param.dc.ghostFace[0];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,0>(face_idx,param);
          packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                         param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[0], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,0,nFace,1>(face_idx,param);
          packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                         param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[0], param);
        }
      } else if (dim == 1) {
        const int face_num = (face_idx >= nFace*Nf*param.dc.ghostFace[1]) ? 1 : 0;
        face_idx -= face_num*nFace*Nf*param.dc.ghostFace[1];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,0>(face_idx,param);
          packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
                                          param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[1], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,1,nFace,1>(face_idx,param);
          packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
                                          param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[1], param);
        }
      } else if (dim == 2) {
        const int face_num = (face_idx >= nFace*Nf*param.dc.ghostFace[2]) ? 1 : 0;
        face_idx -= face_num*nFace*Nf*param.dc.ghostFace[2];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,0>(face_idx,param);
          packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
                                          param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[2], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,2,nFace,1>(face_idx,param);
          packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
                                          param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[2], param);
        }
      } else {
        const int face_num = (face_idx >= nFace*Nf*param.dc.ghostFace[3]) ? 1 : 0;
        face_idx -= face_num*nFace*Nf*param.dc.ghostFace[3];
        if (face_num == 0) {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,0>(face_idx,param);
          packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
                                          param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[3], param);
        } else {
          const int idx = indexFromFaceIndex<5,QUDA_4D_PC,3,nFace,1>(face_idx,param);
          packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
                                          param.inNorm, idx, face_idx, Nf*param.dc.ghostFace[3], param);
        }
      }

#ifdef STRIPED
      local_tid += blockDim.x;
      tid += blockDim.x;
#else
      tid += blockDim.x*gridDim.x; // classic grid-stride advance
#endif
    } // while tid

  }

#endif
1960 
1961  template <typename FloatN, typename Float>
1962  class PackFaceNdegTM : public PackFace<FloatN, Float> {
1963 
1964  private:
1965 
1966  int inputPerSite() const { return 24; } // input is full spinor
1967  int outputPerSite() const { return 12; } // output is spin projected
1968 
1969  public:
1970  PackFaceNdegTM(void *faces[], const cudaColorSpinorField *in, MemoryLocation location,
1971  const int dagger, const int parity)
1972  : PackFace<FloatN, Float>(faces, in, location, dagger, parity, 1) { }
1973  virtual ~PackFaceNdegTM() { }
1974 
1975  void apply(const cudaStream_t &stream) {
1976  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1977 
1978 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1979  static PackParam<FloatN> param;
1980  this->prepareParam(param,tp);
1981  void *args[] = { &param };
1982  void (*func)(PackParam<FloatN>) = this->dagger ? &(packFaceNdegTMKernel<1,FloatN>) : &(packFaceNdegTMKernel<0,FloatN>);
1983  cudaLaunchKernel( (const void*)func, tp.grid, tp.block, args, tp.shared_bytes, stream);
1984 #else
1985  errorQuda("Non-degenerate twisted mass face packing kernel is not built");
1986 #endif
1987  }
1988 
1989  long long flops() const { return outputPerSite()*this->threads(); }
1990  };
1991 
1992  void packFaceNdegTM(void *ghost_buf[], cudaColorSpinorField &in, MemoryLocation location, const int dagger,
1993  const int parity, const cudaStream_t &stream) {
1994 
1995  switch(in.Precision()) {
1996  case QUDA_DOUBLE_PRECISION:
1997  {
1998  PackFaceNdegTM<double2, double> pack(ghost_buf, &in, location, dagger, parity);
1999  pack.apply(stream);
2000  }
2001  break;
2002  case QUDA_SINGLE_PRECISION:
2003  {
2004  PackFaceNdegTM<float4, float> pack(ghost_buf, &in, location, dagger, parity);
2005  pack.apply(stream);
2006  }
2007  break;
2008  case QUDA_HALF_PRECISION:
2009  {
2010  PackFaceNdegTM<short4, float> pack(ghost_buf, &in, location, dagger, parity);
2011  pack.apply(stream);
2012  }
2013  break;
2014  default:
2015  errorQuda("Precision %d not supported", in.Precision());
2016  }
2017  }
2018 
2019  void packFace(void *ghost_buf[2*QUDA_MAX_DIM], cudaColorSpinorField &in,
2020  MemoryLocation location, const int nFace,
2021  const int dagger, const int parity,
2022  const int dim, const int face_num,
2023  const cudaStream_t &stream,
2024  const double a, const double b)
2025  {
2026  int nDimPack = 0;
2027  if(dim < 0){
2028  for (int d=0; d<4; d++) {
2029  if(!commDim[d]) continue;
2030  if (d != 3 || getKernelPackT() || a != 0.0 || b!= 0.0) nDimPack++;
2031  }
2032  }else{
2033  if(commDim[dim]){
2034  if(dim!=3 || getKernelPackT() || a!=0.0 || b != 0.0) nDimPack++;
2035  }
2036  }
2037  if (!nDimPack) return; // if zero then we have nothing to pack
2038 
2039  if (nFace != 1 && in.Nspin() != 1)
2040  errorQuda("Unsupported number of faces %d", nFace);
2041 
2042  // Need to update this logic for other multi-src dslash packing
2043  if (in.Nspin() == 1) {
2044  packFaceStaggered(ghost_buf, in, location, nFace, dagger, parity, dim, face_num, stream);
2045  } else if (a!=0.0 || b!=0.0) {
2046  // Need to update this logic for other multi-src dslash packing
2047  if(in.TwistFlavor() == QUDA_TWIST_SINGLET) {
2048  packTwistedFaceWilson(ghost_buf, in, location, dagger, parity, a, b, stream);
2049  } else {
2050  errorQuda("Cannot perform twisted packing for the spinor.");
2051  }
2052  } else if (in.Ndim() == 5) {
2053  if(in.TwistFlavor() == QUDA_TWIST_INVALID) {
2054  packFaceDW(ghost_buf, in, location, dagger, parity, stream);
2055  } else {
2056  packFaceNdegTM(ghost_buf, in, location, dagger, parity, stream);
2057  }
2058  } else {
2059  packFaceWilson(ghost_buf, in, location, dagger, parity, stream);
2060  }
2061  }
2062 
2063 
2064 
2065  void packFaceExtended(void* buffer[2*QUDA_MAX_DIM], cudaColorSpinorField &field,
2066  MemoryLocation location, const int nFace, const int R[],
2067  const int dagger, const int parity, const int dim, const int face_num,
2068  const cudaStream_t &stream, const bool unpack)
2069  {
2070  int nDimPack = 0;
2071  if(dim < 0){
2072  for(int d=0; d<4; d++){
2073  if(R[d]) nDimPack++;
2074  }
2075  }else{
2076  if(R[dim]) nDimPack++;
2077  }
2078 
2079  if(!nDimPack) return; // if zero then we have nothing to pack
2080  if(field.Nspin() == 1){
2081  packFaceExtendedStaggered(buffer, field, location, nFace, R, dagger, parity, dim, face_num, stream, unpack);
2082  }else{
2083  errorQuda("Extended quark field is not supported");
2084  }
2085 
2086  }
2087 
2088 #endif // MULTI_GPU
2089 
2090 }
dim3 dim3 blockDim
cudaDeviceProp deviceProp
bool getKernelPackT()
Definition: dslash_quda.cu:61
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
const void * func
#define errorQuda(...)
Definition: util_quda.h:90
void setPackComms(const int *commDim)
Definition: dslash_pack.cu:41
int comm_dim(int dim)
cudaStream_t * stream
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
char * strcpy(char *__dst, const char *__src)
static int R[4]
char * strcat(char *__s1, const char *__s2)
void packFaceExtended(void *ghost_buf[2 *QUDA_MAX_DIM], cudaColorSpinorField &field, MemoryLocation location, const int nFace, const int R[], const int dagger, const int parity, const int dim, const int face_num, const cudaStream_t &stream, const bool unpack=false)
int Ls
Definition: test_util.cpp:39
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
void packFace(void *ghost_buf[2 *QUDA_MAX_DIM], cudaColorSpinorField &in, MemoryLocation location, const int nFace, const int dagger, const int parity, const int dim, const int face_num, const cudaStream_t &stream, const double a=0.0, const double b=0.0)
Dslash face packing routine.
virtual unsigned int maxGridSize() const
Definition: tune_quda.h:96
__device__ int dimFromFaceIndex(int &face_idx, const int tid, const Param &param)
Determines which face a given thread is computing. Also rescale face_idx so that is relative to a giv...
const char * comm_dim_topology_string()
Return a string that defines the comm topology (for use as a tuneKey)
Definition: comm_mpi.cpp:346
cpuColorSpinorField * in
int commDim(int)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
int int int enum cudaChannelFormatKind f
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
Definition: texture.h:88
int X[4]
Definition: quda.h:29
void * memcpy(void *__dst, const void *__src, size_t __n)
dim3 dim3 void ** args
virtual bool tuneSharedBytes() const
Definition: tune_quda.h:76
virtual unsigned int minGridSize() const
Definition: tune_quda.h:97
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
int face_idx
const void * c
const int face_num
#define MAX_SHORT
Definition: quda_internal.h:29
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:230
static unsigned base
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
static __inline__ size_t size_t d
QudaParity parity
Definition: covdev_test.cpp:53
int comm_peer2peer_enabled_global()
cudaError_t qudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream)
Wrapper around cudaLaunchKernel.
#define a
__device__ int block_idx(const T &swizzle)
Swizzler for reordering the (x) thread block indices - use on conjunction with swizzle-factor autotun...
unsigned long long bytes
Definition: blas_quda.cu:43
#define TEX1DFETCH(type, tex, idx)
virtual void defaultTuneParam(TuneParam &param) const
Definition: tune_quda.h:254