// dslash_pack.cu (QUDA v0.7.0: A library for QCD on GPUs)
#include <cstdlib>
#include <cstdio>
#include <string>
#include <iostream>

#include <color_spinor_field.h>
#include <clover_field.h> // Do we need this now?

// these control the Wilson-type actions
#ifdef GPU_WILSON_DIRAC
//#define DIRECT_ACCESS_WILSON_PACK_SPINOR
#endif // GPU_WILSON_DIRAC


#include <quda_internal.h>
#include <dslash_quda.h>
#include <sys/time.h>
#include <blas_quda.h>

#include <inline_ptx.h>

namespace quda {

  namespace pack {

#include <dslash_constants.h>
#include <dslash_textures.h>

  } // end namespace pack

  using namespace pack;

#ifdef MULTI_GPU
  static int commDim[QUDA_MAX_DIM]; // Whether to do comms or not
  void setPackComms(const int *comm_dim) {
    for (int i=0; i<QUDA_MAX_DIM; i++) commDim[i] = comm_dim[i];
  }
#else
  void setPackComms(const int *comm_dim) { ; }
#endif
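
  // Illustrative call sequence (the real call site lives in the dslash driver
  // code; comm_dim_partitioned() is the QUDA communications-layer query):
  //
  //   int comm_dim[QUDA_MAX_DIM] = { 0 };
  //   for (int d = 0; d < 4; d++) comm_dim[d] = comm_dim_partitioned(d);
  //   setPackComms(comm_dim);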

#include <dslash_index.cuh>

  // routines for packing the ghost zones (multi-GPU only)


#ifdef MULTI_GPU

  template <typename FloatN>
  struct PackParam {

    FloatN *out[2*4];
    float *outNorm[2*4];

    FloatN *in;
    float *inNorm;

    int threads; // total number of threads

    // offsets which determine thread mapping to dimension
    int threadDimMapLower[4]; // lowest thread which maps to dim
    int threadDimMapUpper[4]; // greatest thread + 1 which maps to dim

    int parity;
#ifdef USE_TEXTURE_OBJECTS
    cudaTextureObject_t inTex;
    cudaTextureObject_t inTexNorm;
#endif

    int dim;
    int face_num;
    int X[QUDA_MAX_DIM]; // lattice dimensions
    int ghostFace[4];

    int sp_stride;
  };
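
  // Layout note: out[] and outNorm[] are indexed as out[2*dim + face_num], i.e.
  // paired backwards (face_num = 0) and forwards (face_num = 1) ghost buffers
  // for each of the four dimensions; e.g. out[5] is the forwards face of dim 2.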

  template<typename FloatN>
  std::ostream& operator<<(std::ostream& output, const PackParam<FloatN>& param) {
    output << "threads = " << param.threads << std::endl;
    output << "threadDimMapLower = {" << param.threadDimMapLower[0] << "," <<
      param.threadDimMapLower[1] << "," << param.threadDimMapLower[2] << "," << param.threadDimMapLower[3] << "}" << std::endl;
    output << "threadDimMapUpper = {" << param.threadDimMapUpper[0] << "," <<
      param.threadDimMapUpper[1] << "," << param.threadDimMapUpper[2] << "," << param.threadDimMapUpper[3] << "}" << std::endl;
    output << "parity = " << param.parity << std::endl;
    output << "dim = " << param.dim << std::endl;
    output << "face_num = " << param.face_num << std::endl;
    output << "X = {" << param.X[0] << "," << param.X[1] << "," << param.X[2] << "," << param.X[3] << "}" << std::endl;
    output << "ghostFace = {" << param.ghostFace[0] << "," << param.ghostFace[1] << ","
           << param.ghostFace[2] << "," << param.ghostFace[3] << "}" << std::endl;
    output << "sp_stride = " << param.sp_stride << std::endl;
    return output;
  }

  // Extend the PackParam class to PackExtendedParam
  template<typename Float>
  struct PackExtendedParam : public PackParam<Float>
  {
    PackExtendedParam(){}
    PackExtendedParam(const PackParam<Float>& base) : PackParam<Float>(base) {}
    int R[QUDA_MAX_DIM]; // boundary dimensions
  };
/*
  template <typename Param>
  __device__ inline int dimFromFaceIndex(int &face_idx, const Param &param) {
    if (face_idx < param.threadDimMapUpper[0]) {
      return 0;
    } else if (face_idx < param.threadDimMapUpper[1]) {
      face_idx -= param.threadDimMapLower[1];
      return 1;
    } else if (face_idx < param.threadDimMapUpper[2]) {
      face_idx -= param.threadDimMapLower[2];
      return 2;
    } else { // this is only called if we use T kernel packing
      face_idx -= param.threadDimMapLower[3];
      return 3;
    }
  }
*/
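
  // Worked example of the thread -> dimension mapping encoded by
  // threadDimMapLower/Upper (illustrative numbers): if all four dimensions are
  // partitioned and each contributes 2*nFace*ghostFace[i] = 512 threads, then
  //   threadDimMapLower = {0, 512, 1024, 1536}
  //   threadDimMapUpper = {512, 1024, 1536, 2048}
  // and a thread with face_idx = 1300 packs dimension 2, its index rebased to
  // 1300 - 1024 = 276 by dimFromFaceIndex().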
#if defined(GPU_WILSON_DIRAC) || defined(GPU_DOMAIN_WALL_DIRAC)

  // double precision
#if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
#define READ_SPINOR READ_SPINOR_DOUBLE
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_DOUBLE_TEX
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexDouble
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
#define SPINOR_DOUBLE
  template <int dim, int dagger, int face_num>
  static inline __device__ void packFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
                                                   const float *inNorm, const int &idx,
                                                   const int &face_idx, const int &face_volume,
                                                   PackParam<double2> &param)
  {
#if (__COMPUTE_CAPABILITY__ >= 130)
    if (dagger) {
#include "wilson_pack_face_dagger_core.h"
    } else {
#include "wilson_pack_face_core.h"
    }
#endif // (__COMPUTE_CAPABILITY__ >= 130)
  }

  template <int dim, int dagger, int face_num>
  static inline __device__ void unpackFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
                                                     const float *inNorm, const int &idx,
                                                     const int &face_idx, const int &face_volume,
                                                     PackParam<double2> &param)
  {
#if (__COMPUTE_CAPABILITY__ >= 130)
    if (dagger) {
#include "wilson_pack_face_dagger_core.h"
    } else {
#include "wilson_pack_face_core.h"
    }
#endif // (__COMPUTE_CAPABILITY__ >= 130)
  }

#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
#undef SPINOR_DOUBLE


  // single precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_SINGLE
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_SINGLE_TEX
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexSingle
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
  template <int dim, int dagger, int face_num>
  static inline __device__ void packFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm,
                                                   const int &idx, const int &face_idx,
                                                   const int &face_volume,
                                                   const PackParam<float4> &param)
  {
    if (dagger) {
#include "wilson_pack_face_dagger_core.h"
    } else {
#include "wilson_pack_face_core.h"
    }
  }

  template <int dim, int dagger, int face_num>
  static inline __device__ void unpackFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm,
                                                     const int &idx, const int &face_idx,
                                                     const int &face_volume,
                                                     const PackParam<float4> &param)
  {
    if (dagger) {
#include "wilson_pack_face_dagger_core.h"
    } else {
#include "wilson_pack_face_core.h"
    }
  }
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR


  // half precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_HALF
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_HALF_TEX
#define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexHalf
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
  template <int dim, int dagger, int face_num>
  static inline __device__ void packFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm,
                                                   const int &idx, const int &face_idx,
                                                   const int &face_volume,
                                                   const PackParam<short4> &param)
  {
    if (dagger) {
#include "wilson_pack_face_dagger_core.h"
    } else {
#include "wilson_pack_face_core.h"
    }
  }

  template <int dim, int dagger, int face_num>
  static inline __device__ void unpackFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm,
                                                     const int &idx, const int &face_idx,
                                                     const int &face_volume,
                                                     const PackParam<short4> &param)
  {
    if (dagger) {
#include "wilson_pack_face_dagger_core.h"
    } else {
#include "wilson_pack_face_core.h"
    }
  }
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR

  template <int dagger, typename FloatN>
  __global__ void packFaceWilsonKernel(PackParam<FloatN> param)
  {
    const int nFace = 1; // 1 face for Wilson

    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are packing
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the output is located
    // compute an index into the local volume from the index into the face
    // read spinor, spin-project, and write half spinor to face
    if (dim == 0) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      const int face_num = (face_idx >= nFace*param.ghostFace[0]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[0];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X);
        packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[0], param);
      } else {
        const int idx = indexFromFaceIndex<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X);
        packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[0], param);
      }
    } else if (dim == 1) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      const int face_num = (face_idx >= nFace*param.ghostFace[1]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[1];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X);
        packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[1], param);
      } else {
        const int idx = indexFromFaceIndex<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X);
        packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[1], param);
      }
    } else if (dim == 2) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      const int face_num = (face_idx >= nFace*param.ghostFace[2]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[2];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X);
        packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[2], param);
      } else {
        const int idx = indexFromFaceIndex<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X);
        packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[2], param);
      }
    } else {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      const int face_num = (face_idx >= nFace*param.ghostFace[3]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[3];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X);
        packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[3], param);
      } else {
        const int idx = indexFromFaceIndex<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X);
        packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[3], param);
      }
    }

  }


  template <int dagger, typename FloatN, int nFace>
  __global__ void packFaceExtendedWilsonKernel(PackParam<FloatN> param)
  {
    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are packing
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the output is located
    // compute an index into the local volume from the index into the face
    // read spinor, spin-project, and write half spinor to face
    if (dim == 0) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      // if param.face_num==2 pack both the start and the end, otherwise pack the region of the lattice
      // specified by param.face_num
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[0], param);
      } else {
        const int idx = indexFromFaceIndexExtended<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[0], param);
      }
    } else if (dim == 1) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        packFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[1], param);
      } else {
        const int idx = indexFromFaceIndexExtended<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        packFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[1], param);
      }
    } else if (dim == 2) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        packFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[2], param);
      } else {
        const int idx = indexFromFaceIndexExtended<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        packFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[2], param);
      }
    } else {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];

      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        packFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[3], param);
      } else {
        const int idx = indexFromFaceIndexExtended<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        packFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
                                       param.inNorm, idx, face_idx, param.ghostFace[3], param);
      }
    }

  }


  template <int dagger, typename FloatN, int nFace>
  __global__ void unpackFaceExtendedWilsonKernel(PackParam<FloatN> param)
  {
    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are packing
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the output is located
    // compute an index into the local volume from the index into the face
    // read spinor, spin-project, and write half spinor to face
    if (dim == 0) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      // if param.face_num==2 pack both the start and the end, otherwise pack the region of the lattice
      // specified by param.face_num
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];

      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        unpackFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[0], param);
      } else {
        const int idx = indexFromFaceIndexExtended<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        unpackFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[0], param);
      }
    } else if (dim == 1) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];

      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        unpackFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[1], param);
      } else {
        const int idx = indexFromFaceIndexExtended<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        unpackFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[1], param);
      }
    } else if (dim == 2) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];

      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        unpackFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[2], param);
      } else {
        const int idx = indexFromFaceIndexExtended<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        unpackFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[2], param);
      }
    } else {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];

      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtended<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        unpackFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[3], param);
      } else {
        const int idx = indexFromFaceIndexExtended<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        unpackFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
                                         param.inNorm, idx, face_idx, param.ghostFace[3], param);
      }
    }

  }

#endif // GPU_WILSON_DIRAC || GPU_DOMAIN_WALL_DIRAC


#if defined(GPU_WILSON_DIRAC) || defined(GPU_TWISTED_MASS_DIRAC)

  // double precision
#if (defined DIRECT_ACCESS_WILSON_PACK_SPINOR) || (defined FERMI_NO_DBLE_TEX)
#define READ_SPINOR READ_SPINOR_DOUBLE
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_DOUBLE_TEX
#define READ_SPINOR_UP READ_SPINOR_DOUBLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_DOUBLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexDouble
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_DOUBLE2
#define SPINOR_DOUBLE
  template <int dim, int dagger, int face_num>
  static inline __device__ void packTwistedFaceWilsonCore(double2 *out, float *outNorm, const double2 *in,
                                                          const float *inNorm, double a, double b, const int &idx,
                                                          const int &face_idx, const int &face_volume,
                                                          PackParam<double2> &param)
  {
#if (__COMPUTE_CAPABILITY__ >= 130)
    if (dagger) {
#include "wilson_pack_twisted_face_dagger_core.h"
    } else {
#include "wilson_pack_twisted_face_core.h"
    }
#endif // (__COMPUTE_CAPABILITY__ >= 130)
  }
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR
#undef SPINOR_DOUBLE


  // single precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_SINGLE
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_SINGLE_TEX
#define READ_SPINOR_UP READ_SPINOR_SINGLE_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_SINGLE_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexSingle
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_FLOAT4
  template <int dim, int dagger, int face_num>
  static inline __device__ void packTwistedFaceWilsonCore(float4 *out, float *outNorm, const float4 *in, const float *inNorm, float a, float b,
                                                          const int &idx, const int &face_idx,
                                                          const int &face_volume,
                                                          const PackParam<float4> &param)
  {
    if (dagger) {
#include "wilson_pack_twisted_face_dagger_core.h"
    } else {
#include "wilson_pack_twisted_face_core.h"
    }
  }
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR


  // half precision
#ifdef DIRECT_ACCESS_WILSON_PACK_SPINOR
#define READ_SPINOR READ_SPINOR_HALF
#define READ_SPINOR_UP READ_SPINOR_HALF_UP
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN
#define SPINORTEX in
#else
#define READ_SPINOR READ_SPINOR_HALF_TEX
#define READ_SPINOR_UP READ_SPINOR_HALF_UP_TEX
#define READ_SPINOR_DOWN READ_SPINOR_HALF_DOWN_TEX
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexHalf
#endif
#endif
#define WRITE_HALF_SPINOR WRITE_HALF_SPINOR_SHORT4
  template <int dim, int dagger, int face_num>
  static inline __device__ void packTwistedFaceWilsonCore(short4 *out, float *outNorm, const short4 *in, const float *inNorm, float a, float b,
                                                          const int &idx, const int &face_idx,
                                                          const int &face_volume,
                                                          const PackParam<short4> &param)
  {
    if (dagger) {
#include "wilson_pack_twisted_face_dagger_core.h"
    } else {
#include "wilson_pack_twisted_face_core.h"
    }
  }
#undef READ_SPINOR
#undef READ_SPINOR_UP
#undef READ_SPINOR_DOWN
#undef SPINORTEX
#undef WRITE_HALF_SPINOR

  template <int dagger, typename FloatN, typename Float>
  __global__ void packTwistedFaceWilsonKernel(Float a, Float b, PackParam<FloatN> param)
  {
    const int nFace = 1; // 1 face for Wilson

    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are packing
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the output is located
    // compute an index into the local volume from the index into the face
    // read spinor, spin-project, and write half spinor to face
    if (dim == 0) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      const int face_num = (face_idx >= nFace*param.ghostFace[0]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[0];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X);
        packTwistedFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[0], param);
      } else {
        const int idx = indexFromFaceIndex<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X);
        packTwistedFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[0], param);
      }
    } else if (dim == 1) {
      const int face_num = (face_idx >= nFace*param.ghostFace[1]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[1];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X);
        packTwistedFaceWilsonCore<1,dagger,0>(param.out[2], param.outNorm[2], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[1], param);
      } else {
        const int idx = indexFromFaceIndex<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X);
        packTwistedFaceWilsonCore<1,dagger,1>(param.out[3], param.outNorm[3], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[1], param);
      }
    } else if (dim == 2) {
      const int face_num = (face_idx >= nFace*param.ghostFace[2]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[2];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X);
        packTwistedFaceWilsonCore<2,dagger,0>(param.out[4], param.outNorm[4], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[2], param);
      } else {
        const int idx = indexFromFaceIndex<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X);
        packTwistedFaceWilsonCore<2,dagger,1>(param.out[5], param.outNorm[5], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[2], param);
      }
    } else {
      const int face_num = (face_idx >= nFace*param.ghostFace[3]) ? 1 : 0;
      face_idx -= face_num*nFace*param.ghostFace[3];
      if (face_num == 0) {
        const int idx = indexFromFaceIndex<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X);
        packTwistedFaceWilsonCore<3,dagger,0>(param.out[6], param.outNorm[6], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[3], param);
      } else {
        const int idx = indexFromFaceIndex<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X);
        packTwistedFaceWilsonCore<3,dagger,1>(param.out[7], param.outNorm[7], param.in,
                                              param.inNorm, a, b, idx, face_idx, param.ghostFace[3], param);
      }
    }

  }

#endif // GPU_WILSON_DIRAC || GPU_TWISTED_MASS_DIRAC

  template <typename FloatN, typename Float>
  class PackFace : public Tunable {

  protected:
    FloatN *faces;
    const cudaColorSpinorField *in;
    const int dagger;
    const int parity;
    const int nFace;
    const int dim;
    const int face_num;

    // compute how many threads we need in total for the face packing
    unsigned int threads() const {
      unsigned int threads = 0;
      if (dim < 0) { // if dim is negative, pack all dimensions
        for (int i=0; i<4; i++) {
          if (!commDim[i]) continue;
          if (i==3 && !(getKernelPackT() || getTwistPack())) continue;
          threads += 2*nFace*in->GhostFace()[i]; // 2 for forwards and backwards faces
        }
      } else { // pack only in dim dimension
        if ((commDim[dim] && dim!=3) || getKernelPackT() || getTwistPack()) {
          threads = nFace*in->GhostFace()[dim];
          if (face_num==2) threads *= 2; // sending data forwards and backwards
        }
      }
      return threads;
    }
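
    // Example (illustrative arithmetic): for a 16^3 x 32 local volume with only
    // the t dimension partitioned and kernel T-packing enabled, a Wilson pack
    // (nFace = 1, dim < 0) needs
    //   threads() = 2 * 1 * GhostFace()[3] = 2 * (16*16*16)/2 = 4096 threads.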

    virtual int inputPerSite() const = 0;
    virtual int outputPerSite() const = 0;

    // prepare the param struct with kernel arguments
    PackParam<FloatN> prepareParam(int dim=-1, int face_num=2) {
      PackParam<FloatN> param;
      param.in = (FloatN*)in->V();
      param.inNorm = (float*)in->Norm();
      param.dim = dim;
      param.face_num = face_num;
      param.parity = parity;
      for (int d=0; d<QUDA_MAX_DIM; d++) param.X[d] = in->X()[d];
      param.X[0] *= 2;

#ifdef USE_TEXTURE_OBJECTS
      param.inTex = in->Tex();
      param.inTexNorm = in->TexNorm();
#endif

      param.threads = threads();
      param.sp_stride = in->Stride();

      int prev = -1; // previous dimension that was partitioned
      for (int i=0; i<4; i++) {
        param.threadDimMapLower[i] = 0;
        param.threadDimMapUpper[i] = 0;
        if (!commDim[i]) continue;
        param.threadDimMapLower[i] = (prev >= 0 ? param.threadDimMapUpper[prev] : 0);
        param.threadDimMapUpper[i] = param.threadDimMapLower[i] + 2*nFace*in->GhostFace()[i];

        size_t faceBytes = nFace*outputPerSite()*in->GhostFace()[i]*sizeof(faces->x);

        if (typeid(FloatN) == typeid(short4) || typeid(FloatN) == typeid(short2)) {
          faceBytes += nFace*in->GhostFace()[i]*sizeof(float);
          param.out[2*i] = (FloatN*)((char*)faces +
                                     (outputPerSite()*sizeof(faces->x) + sizeof(float))*param.threadDimMapLower[i]);
          param.outNorm[2*i] = (float*)((char*)param.out[2*i] +
                                        nFace*outputPerSite()*in->GhostFace()[i]*sizeof(faces->x));
        } else {
          param.out[2*i] = (FloatN*)((char*)faces + outputPerSite()*sizeof(faces->x)*param.threadDimMapLower[i]);
        }

        param.out[2*i+1] = (FloatN*)((char*)param.out[2*i] + faceBytes);
        param.outNorm[2*i+1] = (float*)((char*)param.outNorm[2*i] + faceBytes);

        prev = i;
      }

      param.ghostFace[0] = param.X[1]*param.X[2]*param.X[3]/2;
      param.ghostFace[1] = param.X[0]*param.X[2]*param.X[3]/2;
      param.ghostFace[2] = param.X[0]*param.X[1]*param.X[3]/2;
      param.ghostFace[3] = param.X[0]*param.X[1]*param.X[2]/2;
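
      // each ghostFace entry is a checkerboarded face volume, hence the /2
      // (param.X[0] was doubled above because X()[0] stores the checkerboarded
      // extent); e.g. for a full 8^3 x 16 local lattice, ghostFace[3] = 8*8*8/2 = 256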

      return param;
    }

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    unsigned int minThreads() const { return threads(); }

    void fillAux() {
      strcpy(aux, in->AuxString());
      char comm[5];
      comm[0] = (commDim[0] ? '1' : '0');
      comm[1] = (commDim[1] ? '1' : '0');
      comm[2] = (commDim[2] ? '1' : '0');
      comm[3] = (commDim[3] ? '1' : '0');
      comm[4] = '\0';
      strcat(aux, ",comm=");
      strcat(aux, comm);
      if (getKernelPackT() || getTwistPack()) { strcat(aux, ",kernelPackT"); }
    }

  public:
    PackFace(FloatN *faces, const cudaColorSpinorField *in,
             const int dagger, const int parity, const int nFace, const int dim=-1, const int face_num=2)
      : faces(faces), in(in), dagger(dagger),
        parity(parity), nFace(nFace), dim(dim), face_num(face_num)
    {
      fillAux();
      bindSpinorTex<FloatN>(in);
    }

    virtual ~PackFace() {
      unbindSpinorTex<FloatN>(in);
    }

    virtual int tuningIter() const { return 3; }

    virtual TuneKey tuneKey() const {
      return TuneKey(in->VolString(), typeid(*this).name(), aux);
    }

    virtual void apply(const cudaStream_t &stream) = 0;

    long long bytes() const {
      size_t faceBytes = (inputPerSite() + outputPerSite())*this->threads()*sizeof(((FloatN*)0)->x);
      if (sizeof(((FloatN*)0)->x) == QUDA_HALF_PRECISION)
        faceBytes += 2*this->threads()*sizeof(float); // 2 is from input and output
      return faceBytes;
    }
  };

  template <typename FloatN, typename Float>
  class PackFaceWilson : public PackFace<FloatN, Float> {

  private:

    int inputPerSite() const { return 24; } // input is full spinor
    int outputPerSite() const { return 12; } // output is spin projected

  public:
    PackFaceWilson(FloatN *faces, const cudaColorSpinorField *in,
                   const int dagger, const int parity)
      : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
    virtual ~PackFaceWilson() { }

    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_WILSON_DIRAC
      PackParam<FloatN> param = this->prepareParam();
      if (this->dagger) {
        packFaceWilsonKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
      } else {
        packFaceWilsonKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
      }
#else
      errorQuda("Wilson face packing kernel is not built");
#endif
    }

    long long flops() const { return outputPerSite()*this->threads(); }
  };

  void packFaceWilson(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
                      const int parity, const cudaStream_t &stream) {

    switch(in.Precision()) {
    case QUDA_DOUBLE_PRECISION:
      {
        PackFaceWilson<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
        pack.apply(stream);
      }
      break;
    case QUDA_SINGLE_PRECISION:
      {
        PackFaceWilson<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
        pack.apply(stream);
      }
      break;
    case QUDA_HALF_PRECISION:
      {
        PackFaceWilson<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
        pack.apply(stream);
      }
      break;
    }
  }
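
  // Note the vector types dispatched per precision above: double2, float4 and
  // short4 match the internal ordering of Wilson-type spinor fields, whereas
  // the staggered routines further down use two-component types
  // (double2/float2/short2).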

  template <typename FloatN, typename Float>
  class PackFaceTwisted : public PackFace<FloatN, Float> {

  private:

    int inputPerSite() const { return 24; } // input is full spinor
    int outputPerSite() const { return 12; } // output is spin projected
    Float a;
    Float b;

  public:
    PackFaceTwisted(FloatN *faces, const cudaColorSpinorField *in,
                    const int dagger, const int parity, Float a, Float b)
      : PackFace<FloatN, Float>(faces, in, dagger, parity, 1), a(a), b(b) { }
    virtual ~PackFaceTwisted() { }

    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_TWISTED_MASS_DIRAC
      PackParam<FloatN> param = this->prepareParam();
      if (this->dagger) {
        packTwistedFaceWilsonKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(a, b, param);
      } else {
        packTwistedFaceWilsonKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(a, b, param);
      }
#else
      errorQuda("Twisted face packing kernel is not built");
#endif
    }

    long long flops() const { return outputPerSite()*this->threads(); }
  };

  void packTwistedFaceWilson(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
                             const int parity, const double a, const double b, const cudaStream_t &stream) {

    switch(in.Precision()) {
    case QUDA_DOUBLE_PRECISION:
      {
        PackFaceTwisted<double2, double> pack((double2*)ghost_buf, &in, dagger, parity, a, b);
        pack.apply(stream);
      }
      break;
    case QUDA_SINGLE_PRECISION:
      {
        PackFaceTwisted<float4, float> pack((float4*)ghost_buf, &in, dagger, parity, (float)a, (float)b);
        pack.apply(stream);
      }
      break;
    case QUDA_HALF_PRECISION:
      {
        PackFaceTwisted<short4, float> pack((short4*)ghost_buf, &in, dagger, parity, (float)a, (float)b);
        pack.apply(stream);
      }
      break;
    }
  }

#ifdef GPU_STAGGERED_DIRAC

#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEXDOUBLE param.inTex
#define SPINORTEXSINGLE param.inTex
#define SPINORTEXHALF param.inTex
#define SPINORTEXHALFNORM param.inTexNorm
#else
#define SPINORTEXDOUBLE spinorTexDouble
#define SPINORTEXSINGLE spinorTexSingle2
#define SPINORTEXHALF spinorTexHalf2
#define SPINORTEXHALFNORM spinorTexHalf2Norm
#endif

  template <typename Float2>
  __device__ void packFaceStaggeredCore(Float2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const Float2 *in, const float *inNorm,
                                        const int in_idx, const int in_stride) {
    out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
    out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
    out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
  }
  template<>
  __device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const short2 *in, const float *inNorm,
                                        const int in_idx, const int in_stride) {
    out[out_idx + 0*out_stride] = in[in_idx + 0*in_stride];
    out[out_idx + 1*out_stride] = in[in_idx + 1*in_stride];
    out[out_idx + 2*out_stride] = in[in_idx + 2*in_stride];
    outNorm[out_idx] = inNorm[in_idx];
  }
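
  // A staggered face site is a full color vector of three complex components,
  // hence the three strided copies above; the short2 specialization also
  // copies the per-site norm needed to reconstruct half-precision values.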

#if (defined DIRECT_ACCESS_PACK) || (defined FERMI_NO_DBLE_TEX)
  template <typename Float2>
  __device__ void packFaceStaggeredCore(Float2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const Float2 *in, const float *inNorm,
                                        const int in_idx, const PackParam<double2> &param) {
    out[out_idx + 0*out_stride] = in[in_idx + 0*param.sp_stride];
    out[out_idx + 1*out_stride] = in[in_idx + 1*param.sp_stride];
    out[out_idx + 2*out_stride] = in[in_idx + 2*param.sp_stride];
  }
  template<>
  __device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const short2 *in, const float *inNorm,
                                        const int in_idx, const PackParam<double2> &param) {
    out[out_idx + 0*out_stride] = in[in_idx + 0*param.sp_stride];
    out[out_idx + 1*out_stride] = in[in_idx + 1*param.sp_stride];
    out[out_idx + 2*out_stride] = in[in_idx + 2*param.sp_stride];
    outNorm[out_idx] = inNorm[in_idx];
  }


#else
#if __COMPUTE_CAPABILITY__ >= 130
  __device__ void packFaceStaggeredCore(double2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const double2 *in, const float *inNorm,
                                        const int in_idx, const PackParam<double2> &param) {
    out[out_idx + 0*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 0*param.sp_stride);
    out[out_idx + 1*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 1*param.sp_stride);
    out[out_idx + 2*out_stride] = fetch_double2(SPINORTEXDOUBLE, in_idx + 2*param.sp_stride);
  }
#endif
  __device__ void packFaceStaggeredCore(float2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const float2 *in,
                                        const float *inNorm, const int in_idx,
                                        const PackParam<float2> &param) {
    out[out_idx + 0*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 0*param.sp_stride);
    out[out_idx + 1*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 1*param.sp_stride);
    out[out_idx + 2*out_stride] = TEX1DFETCH(float2, SPINORTEXSINGLE, in_idx + 2*param.sp_stride);
  }

  // this is rather dumb: undoing the texture load because cudaNormalizedReadMode is used
  // should really bind to an appropriate texture instead of reusing
  static inline __device__ short2 float22short2(float c, float2 a) {
    return make_short2((short)(a.x*c*MAX_SHORT), (short)(a.y*c*MAX_SHORT));
  }

  __device__ void packFaceStaggeredCore(short2 *out, float *outNorm, const int out_idx,
                                        const int out_stride, const short2 *in,
                                        const float *inNorm, const int in_idx,
                                        const PackParam<short2> &param) {
    out[out_idx + 0*out_stride] = float22short2(1.0f, TEX1DFETCH(float2, SPINORTEXHALF, in_idx + 0*param.sp_stride));
    out[out_idx + 1*out_stride] = float22short2(1.0f, TEX1DFETCH(float2, SPINORTEXHALF, in_idx + 1*param.sp_stride));
    out[out_idx + 2*out_stride] = float22short2(1.0f, TEX1DFETCH(float2, SPINORTEXHALF, in_idx + 2*param.sp_stride));
    outNorm[out_idx] = TEX1DFETCH(float, SPINORTEXHALFNORM, in_idx);
  }
#endif
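
  // The half-precision texture path reads through a normalized-float texture,
  // so each fetch returns components scaled to [-1, 1]; float22short2 rescales
  // by MAX_SHORT to recover the stored short2 values (the "undoing" referred
  // to in the comment above).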


  template <typename FloatN, int nFace>
  __global__ void packFaceStaggeredKernel(PackParam<FloatN> param)
  {
    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are packing
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the output is located
    // compute an index into the local volume from the index into the face
    // read spinor and write to face
    if (dim == 0) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexStaggered<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X);
        packFaceStaggeredCore(param.out[0], param.outNorm[0], face_idx,
                              nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexStaggered<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X);
        packFaceStaggeredCore(param.out[1], param.outNorm[1], face_idx,
                              nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
      }
    } else if (dim == 1) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexStaggered<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X);
        packFaceStaggeredCore(param.out[2], param.outNorm[2], face_idx,
                              nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexStaggered<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X);
        packFaceStaggeredCore(param.out[3], param.outNorm[3], face_idx,
                              nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
      }
    } else if (dim == 2) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexStaggered<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X);
        packFaceStaggeredCore(param.out[4], param.outNorm[4], face_idx,
                              nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexStaggered<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X);
        packFaceStaggeredCore(param.out[5], param.outNorm[5], face_idx,
                              nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
      }
    } else {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexStaggered<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X);
        packFaceStaggeredCore(param.out[6], param.outNorm[6], face_idx,
                              nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexStaggered<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X);
        packFaceStaggeredCore(param.out[7], param.outNorm[7], face_idx,
                              nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
      }
    }

  }


  template <typename FloatN, int nFace>
  __global__ void packFaceExtendedStaggeredKernel(PackExtendedParam<FloatN> param)
  {
    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are packing
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the output is located
    // compute an index into the local volume from the index into the face
    // read spinor and write to face
    if (dim == 0) {
      // face_num determines which end of the lattice we are packing: 0 = start, 1 = end
      // if param.face_num==2 pack both the start and the end, otherwise pack the region of the
      // lattice specified by param.face_num
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[0], param.outNorm[0], face_idx,
                              nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[1], param.outNorm[1], face_idx,
                              nFace*param.ghostFace[0], param.in, param.inNorm, idx, param);
      }
    } else if (dim == 1) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[2], param.outNorm[2], face_idx,
                              nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[3], param.outNorm[3], face_idx,
                              nFace*param.ghostFace[1], param.in, param.inNorm, idx, param);
      }
    } else if (dim == 2) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[4], param.outNorm[4], face_idx,
                              nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[5], param.outNorm[5], face_idx,
                              nFace*param.ghostFace[2], param.in, param.inNorm, idx, param);
      }
    } else {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[6], param.outNorm[6], face_idx,
                              nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.out[7], param.outNorm[7], face_idx,
                              nFace*param.ghostFace[3], param.in, param.inNorm, idx, param);
      }
    }

  }


  template <typename FloatN, int nFace>
  __global__ void unpackFaceExtendedStaggeredKernel(PackExtendedParam<FloatN> param)
  {
    int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (face_idx >= param.threads) return;

    // determine which dimension we are unpacking
    const int dim = dimFromFaceIndex(face_idx, param);

    // compute where the input face data is located
    // compute an index into the local volume from the index into the face
    // read from the face buffer and write back into the local volume
    if (dim == 0) {
      // face_num determines which end of the lattice we are unpacking: 0 = start, 1 = end
      // if param.face_num==2 unpack both the start and the end, otherwise unpack the region of the
      // lattice specified by param.face_num
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[0]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[0];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,0>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[0], param.outNorm[0], face_idx, nFace*param.ghostFace[0]);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<0,nFace,1>(face_idx,param.ghostFace[0],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[1], param.outNorm[1], face_idx, nFace*param.ghostFace[0]);
      }
    } else if (dim == 1) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[1]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[1];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,0>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[2], param.outNorm[2], face_idx, nFace*param.ghostFace[1]);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<1,nFace,1>(face_idx,param.ghostFace[1],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[3], param.outNorm[3], face_idx, nFace*param.ghostFace[1]);
      }
    } else if (dim == 2) {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[2]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[2];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,0>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[4], param.outNorm[4], face_idx, nFace*param.ghostFace[2]);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<2,nFace,1>(face_idx,param.ghostFace[2],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[5], param.outNorm[5], face_idx, nFace*param.ghostFace[2]);
      }
    } else {
      const int face_num = (param.face_num==2) ? ((face_idx >= nFace*param.ghostFace[3]) ? 1 : 0) : param.face_num;
      if (param.face_num==2) face_idx -= face_num*nFace*param.ghostFace[3];
      if (face_num == 0) {
        const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,0>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[6], param.outNorm[6], face_idx, nFace*param.ghostFace[3]);
      } else {
        const int idx = indexFromFaceIndexExtendedStaggered<3,nFace,1>(face_idx,param.ghostFace[3],param.parity,param.X,param.R);
        packFaceStaggeredCore(param.in, param.inNorm, idx,
                              param.sp_stride, param.out[7], param.outNorm[7], face_idx, nFace*param.ghostFace[3]);
      }
    }

  }
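
  // Note that the extended "unpack" kernel above reuses packFaceStaggeredCore
  // with source and destination swapped: it reads from the ghost buffer
  // (param.out[2*dim+face_num] at face_idx) and scatters into the local field
  // (param.in at idx, stride param.sp_stride).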


#undef SPINORTEXDOUBLE
#undef SPINORTEXSINGLE
#undef SPINORTEXHALF
#undef SPINORTEXHALFNORM

#endif // GPU_STAGGERED_DIRAC


  template <typename FloatN, typename Float>
  class PackFaceStaggered : public PackFace<FloatN, Float> {

  private:
    const int* R; // boundary dimensions for extended field
    const bool unpack;

    int inputPerSite() const { return 6; } // input is a full staggered color vector
    int outputPerSite() const { return 6; } // output is a full staggered color vector


  public:
    PackFaceStaggered(FloatN *faces, const cudaColorSpinorField *in,
                      const int nFace, const int dagger, const int parity,
                      const int dim, const int face_num, const int* R=NULL, const bool unpack=false)
      : PackFace<FloatN, Float>(faces, in, dagger, parity, nFace, dim, face_num), R(R), unpack(unpack) { }
    virtual ~PackFaceStaggered() { }

    void apply(const cudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());

#ifdef GPU_STAGGERED_DIRAC

      PackParam<FloatN> param = this->prepareParam(this->dim, this->face_num);
      if (!R) {
        if (PackFace<FloatN,Float>::nFace==1) {
          packFaceStaggeredKernel<FloatN, 1> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
        } else {
          packFaceStaggeredKernel<FloatN, 3> <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
        }
      } else { // R!=NULL => this is an extended field
        PackExtendedParam<FloatN> extendedParam(param);
        for (int d=0; d<QUDA_MAX_DIM; ++d) extendedParam.R[d] = R[d]; // needed by both pack and unpack
        if (!unpack) {
          switch (PackFace<FloatN,Float>::nFace) {
          case 1:
            packFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          case 2:
            packFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          case 3:
            packFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          case 4:
            packFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          default:
            errorQuda("Unsupported boundary width");
            break;
          }
        } else { // extended field unpack
          switch (PackFace<FloatN,Float>::nFace) {
          case 1:
            unpackFaceExtendedStaggeredKernel<FloatN,1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          case 2:
            unpackFaceExtendedStaggeredKernel<FloatN,2><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          case 3:
            unpackFaceExtendedStaggeredKernel<FloatN,3><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          case 4:
            unpackFaceExtendedStaggeredKernel<FloatN,4><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(extendedParam);
            break;
          default:
            errorQuda("Unsupported boundary width");
            break;
          }
        }
      }
#else
      errorQuda("Staggered face packing kernel is not built");
#endif
    }

    long long flops() const { return 0; }
  };


  void packFaceStaggered(void *ghost_buf, cudaColorSpinorField &in, int nFace,
                         int dagger, int parity, const int dim, const int face_num, const cudaStream_t &stream) {

    switch(in.Precision()) {
    case QUDA_DOUBLE_PRECISION:
      {
#if __COMPUTE_CAPABILITY__ >= 130
        PackFaceStaggered<double2, double> pack((double2*)ghost_buf, &in, nFace, dagger, parity, dim, face_num);
        pack.apply(stream);
#endif
      }
      break;
    case QUDA_SINGLE_PRECISION:
      {
        PackFaceStaggered<float2, float> pack((float2*)ghost_buf, &in, nFace, dagger, parity, dim, face_num);
        pack.apply(stream);
      }
      break;
    case QUDA_HALF_PRECISION:
      {
        PackFaceStaggered<short2, float> pack((short2*)ghost_buf, &in, nFace, dagger, parity, dim, face_num);
        pack.apply(stream);
      }
      break;
    }
  }
1353 
1354  void packFaceExtendedStaggered(void *buffer, cudaColorSpinorField &field, const int nFace, const int R[],
1355  int dagger, int parity, const int dim, const int face_num, const cudaStream_t &stream, bool unpack=false)
1356  {
1357  switch(field.Precision()){
1358  case QUDA_DOUBLE_PRECISION:
1359  {
1360 #if __COMPUTE_CAPABILITY__ >= 130
1361  PackFaceStaggered<double2,double> pack(static_cast<double2*>(buffer), &field, nFace, dagger, parity, dim, face_num, R, unpack);
1362  pack.apply(stream);
1363 #endif
1364  }
1365  break;
1366  case QUDA_SINGLE_PRECISION:
1367  {
1368  PackFaceStaggered<float2,float> pack(static_cast<float2*>(buffer), &field, nFace, dagger, parity, dim, face_num, R, unpack);
1369  pack.apply(stream);
1370  }
1371  break;
1372  case QUDA_HALF_PRECISION:
1373  {
1374  PackFaceStaggered<short2,float> pack(static_cast<short2*>(buffer), &field, nFace, dagger, parity, dim, face_num, R, unpack);
1375  pack.apply(stream);
1376  }
1377  break;
1378 
1379  } // switch(field.Precision())
1380  }
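 // --- Editorial sketch, not part of the original file ---
 // How the extended-field entry point above might be driven: R[] holds the
 // border width per dimension, nFace the matching boundary width, and unpack
 // selects the copy direction. The helper name, the border width of 2, and
 // the loop over both faces are assumptions for illustration only.
 void fillExtendedStaggeredBorders(void *buffer, cudaColorSpinorField &field,
 int parity, const cudaStream_t &stream, bool unpack) {
 int R[QUDA_MAX_DIM] = {2, 2, 2, 2}; // hypothetical 2-site border in each dimension
 const int nFace = 2; // boundary width, matching R above
 for (int d = 0; d < 4; d++)
 for (int f = 0; f < 2; f++) // lower and upper face
 packFaceExtendedStaggered(buffer, field, nFace, R, 0, parity, d, f, stream, unpack);
 }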
1381 
1382 #ifdef GPU_DOMAIN_WALL_DIRAC
1383  template <int dagger, typename FloatN>
1384  __global__ void packFaceDWKernel(PackParam<FloatN> param)
1385  {
1386  const int nFace = 1; // 1 face for DWF
1387 
1388  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1389  if (face_idx >= param.threads) return;
1390 
1391  // determine which dimension we are packing
1392  const int dim = dimFromFaceIndex(face_idx, param);
1393 
1394  const int Ls = param.X[4];
1395 
1396  // compute where the output is located
1397  // compute an index into the local volume from the index into the face
1398  // read spinor, spin-project, and write half spinor to face
1399  if (dim == 0) {
1400  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
1401  // FIXME these param.ghostFace constants do not include the Ls dimension
1402  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[0]) ? 1 : 0;
1403  face_idx -= face_num*nFace*Ls*param.ghostFace[0];
1404  if (face_num == 0) {
1405  const int idx = indexFromDWFaceIndex<0,nFace,0>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
1406  packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1407  param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
1408  } else {
1409  const int idx = indexFromDWFaceIndex<0,nFace,1>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
1410  packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1411  param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
1412  }
1413  } else if (dim == 1) {
1414  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[1]) ? 1 : 0;
1415  face_idx -= face_num*nFace*Ls*param.ghostFace[1];
1416  if (face_num == 0) {
1417  const int idx = indexFromDWFaceIndex<1,nFace,0>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
1418  packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1419  param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
1420  } else {
1421  const int idx = indexFromDWFaceIndex<1,nFace,1>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
1422  packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1423  param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
1424  }
1425  } else if (dim == 2) {
1426  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[2]) ? 1 : 0;
1427  face_idx -= face_num*nFace*Ls*param.ghostFace[2];
1428  if (face_num == 0) {
1429  const int idx = indexFromDWFaceIndex<2,nFace,0>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
1430  packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1431  param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
1432  } else {
1433  const int idx = indexFromDWFaceIndex<2,nFace,1>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
1434  packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1435  param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
1436  }
1437  } else {
1438  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[3]) ? 1 : 0;
1439  face_idx -= face_num*nFace*Ls*param.ghostFace[3];
1440  if (face_num == 0) {
1441  const int idx = indexFromDWFaceIndex<3,nFace,0>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
1442  packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1443  param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
1444  } else {
1445  const int idx = indexFromDWFaceIndex<3,nFace,1>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
1446  packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1447  param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
1448  }
1449  }
1450  }
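 // Worked example of the decomposition above (editorial, assuming the usual
 // even-odd convention in which ghostFace[d] is the per-parity 4D face volume
 // without the Ls factor): for a local volume X = {8,8,8,8} with Ls = 16,
 // ghostFace[0] = (8*8*8)/2 = 256, so dimension 0 owns
 // 2*nFace*Ls*ghostFace[0] = 2*1*16*256 = 8192 threads; a thread with
 // face_idx >= 4096 packs the upper face (face_num = 1) and is re-based by
 // subtracting 4096 before the spinor index is computed.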
1451 
1452 
1453  template <int dagger, typename FloatN>
1454  __global__ void packFaceDW4DKernel(PackParam<FloatN> param)
1455  {
1456  const int nFace = 1; // 1 face for 4D-preconditioned DWF
1457 
1458  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1459  if (face_idx >= param.threads) return;
1460 
1461  const int Ls = param.X[4];
1462 
1463  // determine which dimension we are packing
1464  const int dim = dimFromFaceIndex(face_idx, param);
1465 
1466  // compute where the output is located
1467  // compute an index into the local volume from the index into the face
1468  // read spinor, spin-project, and write half spinor to face
1469  if (dim == 0) {
1470  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
1471  // FIXME these param.ghostFace constants do not include the Ls dimension
1472  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[0]) ? 1 : 0;
1473  face_idx -= face_num*nFace*Ls*param.ghostFace[0];
1474  if (face_num == 0) {
1475  const int idx = indexFromDW4DFaceIndex<0,nFace,0>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
1476  packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1477  param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
1478  } else {
1479  const int idx = indexFromDW4DFaceIndex<0,nFace,1>(face_idx,Ls*param.ghostFace[0],param.parity,param.X);
1480  packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1481  param.inNorm, idx, face_idx, Ls*param.ghostFace[0], param);
1482  }
1483  } else if (dim == 1) {
1484  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[1]) ? 1 : 0;
1485  face_idx -= face_num*nFace*Ls*param.ghostFace[1];
1486  if (face_num == 0) {
1487  const int idx = indexFromDW4DFaceIndex<1,nFace,0>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
1488  packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1489  param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
1490  } else {
1491  const int idx = indexFromDW4DFaceIndex<1,nFace,1>(face_idx,Ls*param.ghostFace[1],param.parity,param.X);
1492  packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1493  param.inNorm, idx, face_idx, Ls*param.ghostFace[1], param);
1494  }
1495  } else if (dim == 2) {
1496  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[2]) ? 1 : 0;
1497  face_idx -= face_num*nFace*Ls*param.ghostFace[2];
1498  if (face_num == 0) {
1499  const int idx = indexFromDW4DFaceIndex<2,nFace,0>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
1500  packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1501  param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
1502  } else {
1503  const int idx = indexFromDW4DFaceIndex<2,nFace,1>(face_idx,Ls*param.ghostFace[2],param.parity,param.X);
1504  packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1505  param.inNorm, idx, face_idx, Ls*param.ghostFace[2], param);
1506  }
1507  } else {
1508  const int face_num = (face_idx >= nFace*Ls*param.ghostFace[3]) ? 1 : 0;
1509  face_idx -= face_num*nFace*Ls*param.ghostFace[3];
1510  if (face_num == 0) {
1511  const int idx = indexFromDW4DFaceIndex<3,nFace,0>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
1512  packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1513  param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
1514  } else {
1515  const int idx = indexFromDW4DFaceIndex<3,nFace,1>(face_idx,Ls*param.ghostFace[3],param.parity,param.X);
1516  packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1517  param.inNorm, idx, face_idx, Ls*param.ghostFace[3], param);
1518  }
1519  }
1520  }
1521 
1522 #endif
1523 
1524  template <typename FloatN, typename Float>
1525  class PackFaceDW : public PackFace<FloatN, Float> {
1526 
1527  private:
1528 
1529  int inputPerSite() const { return 24; } // input is full spinor
1530  int outputPerSite() const { return 12; } // output is spin projected
1531 
1532  public:
1533  PackFaceDW(FloatN *faces, const cudaColorSpinorField *in,
1534  const int dagger, const int parity)
1535  : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
1536  virtual ~PackFaceDW() { }
1537 
1538  void apply(const cudaStream_t &stream) {
1539  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1540 
1541 #ifdef GPU_DOMAIN_WALL_DIRAC
1542  PackParam<FloatN> param = this->prepareParam();
1543  if (this->dagger) {
1544  packFaceDWKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1545  } else {
1546  packFaceDWKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1547  }
1548 #else
1549  errorQuda("DW face packing kernel is not built");
1550 #endif
1551  }
1552 
1553  long long flops() const { return outputPerSite()*this->threads(); }
1554  };
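 // Editorial note: the flop count above books roughly one floating-point
 // operation per projected output real (outputPerSite() = 12) for every face
 // site handled, i.e. the additions performed by the spin projection; the
 // staggered packer earlier reports zero because it only moves data.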
1555 
1556  template <typename FloatN, typename Float>
1557  class PackFaceDW4D : public PackFace<FloatN, Float> {
1558 
1559  private:
1560 
1561  int inputPerSite() const { return 24; } // input is full spinor
1562  int outputPerSite() const { return 12; } // output is spin projected
1563 
1564  public:
1565  PackFaceDW4D(FloatN *faces, const cudaColorSpinorField *in,
1566  const int dagger, const int parity)
1567  : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
1568  virtual ~PackFaceDW4D() { }
1569 
1570  void apply(const cudaStream_t &stream) {
1571  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1572 
1573 #ifdef GPU_DOMAIN_WALL_DIRAC
1574  PackParam<FloatN> param = this->prepareParam();
1575  if (this->dagger) {
1576  packFaceDW4DKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1577  } else {
1578  packFaceDW4DKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1579  }
1580 #else
1581  errorQuda("4D preconditioned DW face packing kernel is not built");
1582 #endif
1583  }
1584 
1585  long long flops() const { return outputPerSite()*this->threads(); }
1586  };
1587 
1588  void packFaceDW(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
1589  const int parity, const cudaStream_t &stream) {
1590 
1591 
1592  if(in.DWFPCtype() == QUDA_4D_PC)
1593  {
1594  switch(in.Precision()) {
1595  case QUDA_DOUBLE_PRECISION:
1596  {
1597  PackFaceDW4D<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
1598  pack.apply(stream);
1599  }
1600  break;
1601  case QUDA_SINGLE_PRECISION:
1602  {
1603  PackFaceDW4D<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
1604  pack.apply(stream);
1605  }
1606  break;
1607  case QUDA_HALF_PRECISION:
1608  {
1609  PackFaceDW4D<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
1610  pack.apply(stream);
1611  }
1612  break;
1613  }
1614  }
1615  else
1616  {
1617  switch(in.Precision()) {
1618  case QUDA_DOUBLE_PRECISION:
1619  {
1620  PackFaceDW<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
1621  pack.apply(stream);
1622  }
1623  break;
1624  case QUDA_SINGLE_PRECISION:
1625  {
1626  PackFaceDW<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
1627  pack.apply(stream);
1628  }
1629  break;
1630  case QUDA_HALF_PRECISION:
1631  {
1632  PackFaceDW<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
1633  pack.apply(stream);
1634  }
1635  break;
1636  }
1637  }
1638  }
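 // Editorial note: the Wilson-type packers use double2/float4/short4 vector
 // types because a full spinor carries 24 reals per site, which divides evenly
 // into float4 loads, whereas the staggered packers above use float2/short2
 // for their 6-real-per-site fields; double2 is the widest double-precision
 // vector load in either case.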
1639 
1640 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1641  template <int dagger, typename FloatN>
1642  __global__ void packFaceNdegTMKernel(PackParam<FloatN> param)
1643  {
1644  const int nFace = 1; // 1 face for the Wilson-type twisted-mass operator
1645  const int Nf = 2; // two flavors in the non-degenerate doublet
1646 
1647  int face_idx = blockIdx.x*blockDim.x + threadIdx.x;
1648  if (face_idx >= param.threads) return;
1649 
1650  // determine which dimension we are packing
1651  const int dim = dimFromFaceIndex(face_idx, param);
1652 
1653  // compute where the output is located
1654  // compute an index into the local volume from the index into the face
1655  // read spinor, spin-project, and write half spinor to face
1656  if (dim == 0) {
1657  // face_num determines which end of the lattice we are packing: 0 = beginning, 1 = end
1658  // FIXME these param.ghostFace constants do not include the Nf dimension
1659  const int face_num = (face_idx >= nFace*Nf*param.ghostFace[0]) ? 1 : 0;
1660  face_idx -= face_num*nFace*Nf*param.ghostFace[0];
1661  if (face_num == 0) {
1662  const int idx = indexFromNdegTMFaceIndex<0,nFace,0>(face_idx,Nf*param.ghostFace[0],param.parity,param.X);
1663  packFaceWilsonCore<0,dagger,0>(param.out[0], param.outNorm[0], param.in,
1664  param.inNorm, idx, face_idx, Nf*param.ghostFace[0], param);
1665  } else {
1666  const int idx = indexFromNdegTMFaceIndex<0,nFace,1>(face_idx,Nf*param.ghostFace[0],param.parity,param.X);
1667  packFaceWilsonCore<0,dagger,1>(param.out[1], param.outNorm[1], param.in,
1668  param.inNorm, idx, face_idx, Nf*param.ghostFace[0], param);
1669  }
1670  } else if (dim == 1) {
1671  const int face_num = (face_idx >= nFace*Nf*param.ghostFace[1]) ? 1 : 0;
1672  face_idx -= face_num*nFace*Nf*param.ghostFace[1];
1673  if (face_num == 0) {
1674  const int idx = indexFromNdegTMFaceIndex<1,nFace,0>(face_idx,Nf*param.ghostFace[1],param.parity,param.X);
1675  packFaceWilsonCore<1, dagger,0>(param.out[2], param.outNorm[2], param.in,
1676  param.inNorm, idx, face_idx, Nf*param.ghostFace[1], param);
1677  } else {
1678  const int idx = indexFromNdegTMFaceIndex<1,nFace,1>(face_idx,Nf*param.ghostFace[1],param.parity,param.X);
1679  packFaceWilsonCore<1, dagger,1>(param.out[3], param.outNorm[3], param.in,
1680  param.inNorm, idx, face_idx, Nf*param.ghostFace[1], param);
1681  }
1682  } else if (dim == 2) {
1683  const int face_num = (face_idx >= nFace*Nf*param.ghostFace[2]) ? 1 : 0;
1684  face_idx -= face_num*nFace*Nf*param.ghostFace[2];
1685  if (face_num == 0) {
1686  const int idx = indexFromNdegTMFaceIndex<2,nFace,0>(face_idx,Nf*param.ghostFace[2],param.parity,param.X);
1687  packFaceWilsonCore<2, dagger,0>(param.out[4], param.outNorm[4], param.in,
1688  param.inNorm, idx, face_idx, Nf*param.ghostFace[2], param);
1689  } else {
1690  const int idx = indexFromNdegTMFaceIndex<2,nFace,1>(face_idx,Nf*param.ghostFace[2],param.parity,param.X);
1691  packFaceWilsonCore<2, dagger,1>(param.out[5], param.outNorm[5], param.in,
1692  param.inNorm, idx, face_idx, Nf*param.ghostFace[2], param);
1693  }
1694  } else {
1695  const int face_num = (face_idx >= nFace*Nf*param.ghostFace[3]) ? 1 : 0;
1696  face_idx -= face_num*nFace*Nf*param.ghostFace[3];
1697  if (face_num == 0) {
1698  const int idx = indexFromNdegTMFaceIndex<3,nFace,0>(face_idx,Nf*param.ghostFace[3],param.parity,param.X);
1699  packFaceWilsonCore<3, dagger,0>(param.out[6], param.outNorm[6], param.in,
1700  param.inNorm, idx, face_idx, Nf*param.ghostFace[3], param);
1701  } else {
1702  const int idx = indexFromNdegTMFaceIndex<3,nFace,1>(face_idx,Nf*param.ghostFace[3],param.parity,param.X);
1703  packFaceWilsonCore<3, dagger,1>(param.out[7], param.outNorm[7], param.in,
1704  param.inNorm, idx, face_idx, Nf*param.ghostFace[3], param);
1705  }
1706  }
1707  }
1708 #endif
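 // Editorial note: the non-degenerate twisted-mass kernel above is structurally
 // identical to packFaceDWKernel, with the flavor count Nf = 2 playing the role
 // that Ls plays for domain-wall fermions; as the FIXME comments record, the
 // ghostFace constants include neither factor, so both kernels scale it in by
 // hand.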
1709 
1710  template <typename FloatN, typename Float>
1711  class PackFaceNdegTM : public PackFace<FloatN, Float> {
1712 
1713  private:
1714 
1715  int inputPerSite() const { return 24; } // input is full spinor
1716  int outputPerSite() const { return 12; } // output is spin projected
1717 
1718  public:
1719  PackFaceNdegTM(FloatN *faces, const cudaColorSpinorField *in,
1720  const int dagger, const int parity)
1721  : PackFace<FloatN, Float>(faces, in, dagger, parity, 1) { }
1722  virtual ~PackFaceNdegTM() { }
1723 
1724  void apply(const cudaStream_t &stream) {
1725  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
1726 
1727 #ifdef GPU_NDEG_TWISTED_MASS_DIRAC
1728  PackParam<FloatN> param = this->prepareParam();
1729  if (this->dagger) {
1730  packFaceNdegTMKernel<1><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1731  } else {
1732  packFaceNdegTMKernel<0><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(param);
1733  }
1734 #else
1735  errorQuda("Non-degenerate twisted mass face packing kernel is not built");
1736 #endif
1737  }
1738 
1739  long long flops() const { return outputPerSite()*this->threads(); }
1740  };
1741 
1742  void packFaceNdegTM(void *ghost_buf, cudaColorSpinorField &in, const int dagger,
1743  const int parity, const cudaStream_t &stream) {
1744 
1745  switch(in.Precision()) {
1746  case QUDA_DOUBLE_PRECISION:
1747  {
1748  PackFaceNdegTM<double2, double> pack((double2*)ghost_buf, &in, dagger, parity);
1749  pack.apply(stream);
1750  }
1751  break;
1752  case QUDA_SINGLE_PRECISION:
1753  {
1754  PackFaceNdegTM<float4, float> pack((float4*)ghost_buf, &in, dagger, parity);
1755  pack.apply(stream);
1756  }
1757  break;
1758  case QUDA_HALF_PRECISION:
1759  {
1760  PackFaceNdegTM<short4, float> pack((short4*)ghost_buf, &in, dagger, parity);
1761  pack.apply(stream);
1762  }
1763  break;
1764  }
1765  }
1766 
1767  void packFace(void *ghost_buf, cudaColorSpinorField &in, const int nFace,
1768  const int dagger, const int parity,
1769  const int dim, const int face_num,
1770  const cudaStream_t &stream,
1771  const double a, const double b)
1772  {
1773  int nDimPack = 0;
1774  if(dim < 0){
1775  for (int d=0; d<4; d++) {
1776  if(!commDim[d]) continue;
1777  if (d != 3 || getKernelPackT() || a != 0.0 || b != 0.0) nDimPack++;
1778  }
1779  }else{
1780  if(commDim[dim]){
1781  if (dim != 3 || getKernelPackT() || a != 0.0 || b != 0.0) nDimPack++;
1782  }
1783  }
1784  if (!nDimPack) return; // if zero then we have nothing to pack
1785 
1786  if (nFace != 1 && in.Nspin() != 1)
1787  errorQuda("Unsupported number of faces %d", nFace);
1788 
1789  // Need to update this logic for other multi-src dslash packing
1790  if (in.Nspin() == 1) {
1791  packFaceStaggered(ghost_buf, in, nFace, dagger, parity, dim, face_num, stream);
1792  } else if (a != 0.0 || b != 0.0) {
1794  if(in.TwistFlavor() == QUDA_TWIST_PLUS || in.TwistFlavor() == QUDA_TWIST_MINUS) {
1795  packTwistedFaceWilson(ghost_buf, in, dagger, parity, a, b, stream);
1796  } else {
1797  errorQuda("Twisted face packing is not supported for this twist flavor");
1798  }
1799  } else if (in.Ndim() == 5) {
1800  if(in.TwistFlavor() == QUDA_TWIST_INVALID) {
1801  packFaceDW(ghost_buf, in, dagger, parity, stream);
1802  } else {
1803  packFaceNdegTM(ghost_buf, in, dagger, parity, stream);
1804  }
1805  } else {
1806  packFaceWilson(ghost_buf, in, dagger, parity, stream);
1807  }
1808  }
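 // Editorial summary of the dispatch above:
 //   Nspin == 1                          -> packFaceStaggered (nFace > 1 allowed)
 //   a != 0 or b != 0, twist flavor +/-  -> packTwistedFaceWilson
 //   Ndim == 5, TwistFlavor invalid      -> packFaceDW (4D- or 5D-preconditioned)
 //   Ndim == 5, otherwise                -> packFaceNdegTM
 //   otherwise                           -> packFaceWilson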
1809 
1810 
1811 
1812  void packFaceExtended(void* buffer, cudaColorSpinorField &field, const int nFace, const int R[],
1813  const int dagger, const int parity, const int dim, const int face_num,
1814  const cudaStream_t &stream, const bool unpack)
1815  {
1816  int nDimPack = 0;
1817  if(dim < 0){
1818  for(int d=0; d<4; d++){
1819  if(R[d]) nDimPack++;
1820  }
1821  }else{
1822  if(R[dim]) nDimPack++;
1823  }
1824 
1825  if(!nDimPack) return; // if zero then we have nothing to pack
1826  if(field.Nspin() == 1){
1827  packFaceExtendedStaggered(buffer, field, nFace, R, dagger, parity, dim, face_num, stream, unpack);
1828  }else{
1829  errorQuda("Extended face packing is only supported for staggered fields");
1830  }
1831 
1832  }
1833 
1834 #endif // MULTI_GPU
1835 
1836 } // end namespace quda