QUDA v0.7.0
A library for QCD on GPUs

gauge_field_order.h
#include <tune_quda.h>
#include <assert.h>
#include <register_traits.h>

namespace quda {

  // a += sign * b*c
  template <typename Float>
  __device__ __host__ inline void accumulateComplexProduct(Float *a, const Float *b, const Float *c, Float sign) {
    a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
    a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
  }

  // a = b*c
  template <typename Float>
  __device__ __host__ inline void complexProduct(Float *a, const Float *b, const Float *c) {
    a[0] = b[0]*c[0] - b[1]*c[1];
    a[1] = b[0]*c[1] + b[1]*c[0];
  }

  // a = conj(b)*c
  template <typename Float>
  __device__ __host__ inline void complexDotProduct(Float *a, const Float *b, const Float *c) {
    a[0] = b[0]*c[0] + b[1]*c[1];
    a[1] = b[0]*c[1] - b[1]*c[0];
  }

  // a = b/c
  template <typename Float>
  __device__ __host__ inline void complexQuotient(Float *a, const Float *b, const Float *c) {
    complexDotProduct(a, c, b); // a = conj(c)*b
    Float denom = c[0]*c[0] + c[1]*c[1];
    a[0] /= denom;
    a[1] /= denom;
  }

  // a += sign * conj(b)*conj(c); sign may be an integral sign or a real
  // coefficient (Reconstruct<13> below passes a real coefficient, which an
  // int parameter would silently truncate)
  template <typename Float, typename T>
  __device__ __host__ inline void accumulateConjugateProduct(Float *a, const Float *b, const Float *c, T sign) {
    a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
    a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
  }

  // a = conj(b)*conj(c)
  template <typename Float>
  __device__ __host__ inline void complexConjugateProduct(Float *a, const Float *b, const Float *c) {
    a[0] = b[0]*c[0] - b[1]*c[1];
    a[1] = -b[0]*c[1] - b[1]*c[0];
  }

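These helpers treat a complex number as an interleaved {re, im} pair of reals. As a quick sanity check of the conventions, here is a minimal standalone sketch (not part of this header; it replicates the helper arithmetic locally so it compiles on its own) applied to (1+2i) and (3+4i):

  #include <cstdio>

  // same arithmetic as quda::complexProduct / quda::complexDotProduct above
  void complexProduct(double *a, const double *b, const double *c) {
    a[0] = b[0]*c[0] - b[1]*c[1];
    a[1] = b[0]*c[1] + b[1]*c[0];
  }
  void complexDotProduct(double *a, const double *b, const double *c) {
    a[0] = b[0]*c[0] + b[1]*c[1];
    a[1] = b[0]*c[1] - b[1]*c[0];
  }

  int main() {
    double a[2];
    const double b[2] = {1.0, 2.0}; // 1 + 2i
    const double c[2] = {3.0, 4.0}; // 3 + 4i
    complexProduct(a, b, c);        // (1+2i)(3+4i) = -5 + 10i
    printf("b*c       = (%g, %g)\n", a[0], a[1]);
    complexDotProduct(a, b, c);     // conj(1+2i)(3+4i) = 11 - 2i
    printf("conj(b)*c = (%g, %g)\n", a[0], a[1]);
  }
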
  template <int N, typename Float>
  struct Reconstruct {
    typedef typename mapper<Float>::type RegType;
    Reconstruct(const GaugeField &u) { ; }

    __device__ __host__ inline void Pack(RegType out[N], const RegType in[N], int idx) const {
      for (int i=0; i<N; i++) out[i] = in[i];
    }

    __device__ __host__ inline void Unpack(RegType out[N], const RegType in[N], int idx, int dir, const RegType phase) const {
      for (int i=0; i<N; i++) out[i] = in[i];
    }

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18]) const {
      *phase = 0;
    }
  };

  template <typename Float>
  struct Reconstruct<19,Float> {
    typedef typename mapper<Float>::type RegType;
    RegType scale;
    Reconstruct(const GaugeField &u) : scale(u.LinkMax()) { ; }

    __device__ __host__ inline void Pack(RegType out[18], const RegType in[18], int idx) const {
      for (int i=0; i<18; i++) out[i] = in[i] / scale;
    }

    __device__ __host__ inline void Unpack(RegType out[18], const RegType in[18],
                                           int idx, int dir, const RegType phase) const {
      for (int i=0; i<18; i++) out[i] = scale * in[i];
    }

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18]) const { *phase = 0; return; }
  };

  // non-extended variant; the body mirrors the first branch of the extended
  // variant below
  template <typename Float>
  __device__ __host__ inline Float timeBoundary(int idx, const int X[QUDA_MAX_DIM], QudaTboundary tBoundary,
                                                bool isFirstTimeSlice, bool isLastTimeSlice) {
    if ( idx >= X[3]*X[2]*X[1]*X[0]/2 ) { // halo region on the first time slice
      return isFirstTimeSlice ? tBoundary : 1.0;
    } else if ( idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ) { // last link on the last time slice
      return isLastTimeSlice ? tBoundary : 1.0;
    } else {
      return 1.0;
    }
  }

  template <typename Float>
  __device__ __host__ inline Float timeBoundary(int idx, const int X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
                                                QudaTboundary tBoundary, bool isFirstTimeSlice, bool isLastTimeSlice,
                                                QudaGhostExchange ghostExchange) {
    if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
      if ( idx >= X[3]*X[2]*X[1]*X[0]/2 ) { // halo region on the first time slice
        return isFirstTimeSlice ? tBoundary : 1.0;
      } else if ( idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ) { // last link on the last time slice
        return isLastTimeSlice ? tBoundary : 1.0;
      } else {
        return 1.0;
      }
    } else {
      if ( idx >= (R[3]-1)*X[0]*X[1]*X[2]/2 && idx < R[3]*X[0]*X[1]*X[2]/2 ) {
        // the boundary condition is on the R[3]-1 time slice
        return isFirstTimeSlice ? tBoundary : 1.0;
      } else if ( idx >= (X[3]-R[3]-1)*X[0]*X[1]*X[2]/2 && idx < (X[3]-R[3])*X[0]*X[1]*X[2]/2 ) {
        // the boundary condition lies on the X[3]-R[3]-1 time slice
        return isLastTimeSlice ? tBoundary : 1.0;
      } else {
        return 1.0;
      }
    }
  }

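Both variants compare the checkerboard site index against time-slice offsets: with even/odd packing, each time slice holds X[0]*X[1]*X[2]/2 sites. A standalone sketch (illustrative lattice dimensions only, not part of this header) of where those thresholds land:

  #include <cstdio>

  int main() {
    const int X[4] = {4, 4, 4, 8};         // hypothetical local lattice
    const int sliceCB = X[0]*X[1]*X[2]/2;  // checkerboard sites per time slice
    printf("sites per slice (CB): %d\n", sliceCB);
    printf("last time slice starts at idx %d\n", (X[3]-1)*sliceCB);
    printf("halo region starts at idx    %d\n", X[3]*sliceCB);
  }
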
  template <typename Float>
  struct Reconstruct<12,Float> {
    typedef typename mapper<Float>::type RegType;
    int X[QUDA_MAX_DIM];
    int R[QUDA_MAX_DIM];
    const RegType anisotropy;
    const QudaTboundary tBoundary;
    bool isFirstTimeSlice;
    bool isLastTimeSlice;
    QudaGhostExchange ghostExchange;

    Reconstruct(const GaugeField &u) : anisotropy(u.Anisotropy()), tBoundary(u.TBoundary()),
      isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
      isLastTimeSlice(comm_coord(3) == comm_dim(3)-1 ? true : false),
      ghostExchange(u.GhostExchange()) {
      for (int i=0; i<QUDA_MAX_DIM; i++) {
        X[i] = u.X()[i];
        R[i] = u.R()[i];
      }
    }

    __device__ __host__ inline void Pack(RegType out[12], const RegType in[18], int idx) const {
      for (int i=0; i<12; i++) out[i] = in[i];
    }

    __device__ __host__ inline void Unpack(RegType out[18], const RegType in[12],
                                           int idx, int dir, const RegType phase) const {
      for (int i=0; i<12; i++) out[i] = in[i];
      for (int i=12; i<18; i++) out[i] = 0.0;

      // third row = conj(row 0) x conj(row 1)
      accumulateConjugateProduct(&out[12], &out[2], &out[10], +1);
      accumulateConjugateProduct(&out[12], &out[4], &out[8], -1);
      accumulateConjugateProduct(&out[14], &out[4], &out[6], +1);
      accumulateConjugateProduct(&out[14], &out[0], &out[10], -1);
      accumulateConjugateProduct(&out[16], &out[0], &out[8], +1);
      accumulateConjugateProduct(&out[16], &out[2], &out[6], -1);

      RegType u0 = dir < 3 ? anisotropy :
        timeBoundary<RegType>(idx, X, R, tBoundary, isFirstTimeSlice, isLastTimeSlice, ghostExchange);

      for (int i=12; i<18; i++) out[i] *= u0;
    }

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18]) { *phase = 0; return; }
  };

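The six accumulateConjugateProduct calls above implement row2 = conj(row0) x conj(row1), which restores the third row of any SU(3) matrix from the first two. A standalone sketch (a hypothetical test file, not part of this header; the helper is replicated locally) checking this on the identity matrix:

  #include <cstdio>

  // same arithmetic as quda::accumulateConjugateProduct above
  void accumulateConjugateProduct(double *a, const double *b, const double *c, double sign) {
    a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
    a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
  }

  int main() {
    // the identity is in SU(3); rows stored as interleaved re/im pairs
    double U[18] = {1,0, 0,0, 0,0,   // row 0
                    0,0, 1,0, 0,0,   // row 1
                    0,0, 0,0, 0,0};  // row 2, reconstructed below

    accumulateConjugateProduct(&U[12], &U[2], &U[10], +1);
    accumulateConjugateProduct(&U[12], &U[4], &U[8],  -1);
    accumulateConjugateProduct(&U[14], &U[4], &U[6],  +1);
    accumulateConjugateProduct(&U[14], &U[0], &U[10], -1);
    accumulateConjugateProduct(&U[16], &U[0], &U[8],  +1);
    accumulateConjugateProduct(&U[16], &U[2], &U[6],  -1);

    // expect (0,0) (0,0) (1,0)
    printf("row 2: (%g,%g) (%g,%g) (%g,%g)\n", U[12], U[13], U[14], U[15], U[16], U[17]);
  }
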

  // FIX ME - 11 is a misnomer to avoid confusion in template instantiation
  template <typename Float>
  struct Reconstruct<11,Float> {
    typedef typename mapper<Float>::type RegType;

    Reconstruct(const GaugeField &u) { ; }

    __device__ __host__ inline void Pack(RegType out[10], const RegType in[18], int idx) const {
      for (int i=0; i<4; i++) out[i] = in[i+2];
      out[4] = in[10];
      out[5] = in[11];
      out[6] = in[1];
      out[7] = in[9];
      out[8] = in[17];
      out[9] = 0.0;
    }

    __device__ __host__ inline void Unpack(RegType out[18], const RegType in[10],
                                           int idx, int dir, const RegType phase) const {
      out[0] = 0.0;
      out[1] = in[6];
      for (int i=0; i<4; i++) out[i+2] = in[i];
      out[6] = -out[2];
      out[7] = out[3];
      out[8] = 0.0;
      out[9] = in[7];
      out[10] = in[4];
      out[11] = in[5];
      out[12] = -out[4];
      out[13] = out[5];
      out[14] = -out[10];
      out[15] = out[11];
      out[16] = 0.0;
      out[17] = in[8];
    }

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18])
    { *phase = 0; return; }
  };

  template <typename Float>
  struct Reconstruct<13,Float> {
    typedef typename mapper<Float>::type RegType;
    const Reconstruct<12,Float> reconstruct_12;
    const RegType scale;

    Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()) {}

    __device__ __host__ inline void Pack(RegType out[12], const RegType in[18], int idx) const {
      reconstruct_12.Pack(out, in, idx);
    }

    __device__ __host__ inline void Unpack(RegType out[18], const RegType in[12], int idx, int dir, const RegType phase) const {
      for (int i=0; i<12; ++i) out[i] = in[i];
      for (int i=12; i<18; ++i) out[i] = 0.0;

      const RegType coeff = 1./scale;

      accumulateConjugateProduct(&out[12], &out[2], &out[10], +coeff);
      accumulateConjugateProduct(&out[12], &out[4], &out[8], -coeff);
      accumulateConjugateProduct(&out[14], &out[4], &out[6], +coeff);
      accumulateConjugateProduct(&out[14], &out[0], &out[10], -coeff);
      accumulateConjugateProduct(&out[16], &out[0], &out[8], +coeff);
      accumulateConjugateProduct(&out[16], &out[2], &out[6], -coeff);

      // multiply the third row by exp(I*3*phase)
      RegType cos_sin[2];
      Trig<isHalf<RegType>::value>::SinCos(static_cast<RegType>(3.*phase), &cos_sin[1], &cos_sin[0]);
      RegType tmp[2];
      complexProduct(tmp, cos_sin, &out[12]); out[12] = tmp[0]; out[13] = tmp[1];
      complexProduct(tmp, cos_sin, &out[14]); out[14] = tmp[0]; out[15] = tmp[1];
      complexProduct(tmp, cos_sin, &out[16]); out[16] = tmp[0]; out[17] = tmp[1];
    }

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18]) const {
      RegType denom[2];
      // denominator = conj(U[0][0]*U[1][1] - U[0][1]*U[1][0])
      complexProduct(denom, in, in+8);
      accumulateComplexProduct(denom, in+2, in+6, static_cast<RegType>(-1.0));

      denom[0] /= scale;
      denom[1] /= (-scale); // complex conjugate

      RegType expI3Phase[2];
      // numerator = U[2][2]
      complexQuotient(expI3Phase, in+16, denom);

      *phase = Trig<isHalf<RegType>::value>::Atan2(expI3Phase[1], expI3Phase[0])/3.;
      return;
    }
  };

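getPhase recovers phi from a U(3) link written as scale*exp(I*phi)*V with V in SU(3): for such a V the 2x2 sub-determinant satisfies V[0][0]*V[1][1] - V[0][1]*V[1][0] = conj(V[2][2]), so the conjugated sub-determinant of U divided by scale equals scale*exp(-2*I*phi)*V[2][2], and dividing U[2][2] by it leaves exp(3*I*phi). A standalone sketch (hypothetical scale and phase; helpers replicated locally) verifying this for U = scale*exp(I*phi)*I:

  #include <cmath>
  #include <cstdio>

  // same arithmetic as quda::complexProduct / quda::complexQuotient above
  void complexProduct(double *a, const double *b, const double *c) {
    a[0] = b[0]*c[0] - b[1]*c[1];
    a[1] = b[0]*c[1] + b[1]*c[0];
  }
  void complexQuotient(double *a, const double *b, const double *c) {
    a[0] = c[0]*b[0] + c[1]*b[1];  // conj(c)*b ...
    a[1] = c[0]*b[1] - c[1]*b[0];
    double denom = c[0]*c[0] + c[1]*c[1];
    a[0] /= denom; a[1] /= denom;  // ... divided by |c|^2 gives b/c
  }

  int main() {
    const double s = 2.0, phi = 0.3; // hypothetical scale and phase
    // U = s*exp(I*phi)*I, so every diagonal entry is s*exp(I*phi)
    const double u00[2] = {s*std::cos(phi), s*std::sin(phi)};

    // denominator = conj(U[0][0]*U[1][1] - U[0][1]*U[1][0]) / s (off-diagonals vanish here)
    double denom[2];
    complexProduct(denom, u00, u00);
    denom[0] /= s; denom[1] /= -s;

    // U[2][2] / denominator = exp(3*I*phi)
    double expI3Phase[2];
    complexQuotient(expI3Phase, u00, denom);
    printf("phi = %g (expect %g)\n", std::atan2(expI3Phase[1], expI3Phase[0])/3., phi);
  }
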
  template <typename Float>
  struct Reconstruct<8,Float> {
    typedef typename mapper<Float>::type RegType;
    int X[QUDA_MAX_DIM];
    int R[QUDA_MAX_DIM];
    const RegType anisotropy;
    const QudaTboundary tBoundary;
    bool isFirstTimeSlice;
    bool isLastTimeSlice;
    QudaGhostExchange ghostExchange;

    Reconstruct(const GaugeField &u) : anisotropy(u.Anisotropy()), tBoundary(u.TBoundary()),
      isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
      isLastTimeSlice(comm_coord(3) == comm_dim(3)-1 ? true : false),
      ghostExchange(u.GhostExchange()) {
      for (int i=0; i<QUDA_MAX_DIM; i++) {
        X[i] = u.X()[i];
        R[i] = u.R()[i];
      }
    }

    __device__ __host__ inline void Pack(RegType out[8], const RegType in[18], int idx) const {
      out[0] = Trig<isHalf<Float>::value>::Atan2(in[1], in[0]);
      out[1] = Trig<isHalf<Float>::value>::Atan2(in[13], in[12]);
      for (int i=2; i<8; i++) out[i] = in[i];
    }

    __device__ __host__ inline void Unpack(RegType out[18], const RegType in[8],
                                           int idx, int dir, const RegType phase) const {
      // first reconstruct the first row
      RegType row_sum = 0.0;
      for (int i=2; i<6; i++) {
        out[i] = in[i];
        row_sum += in[i]*in[i];
      }

      RegType u0 = dir < 3 ? anisotropy :
        timeBoundary<RegType>(idx, X, R, tBoundary, isFirstTimeSlice, isLastTimeSlice, ghostExchange);

      RegType diff = 1.0/(u0*u0) - row_sum;
      RegType U00_mag = sqrt(diff >= 0 ? diff : 0.0);

      out[0] = U00_mag * Trig<isHalf<Float>::value>::Cos(in[0]);
      out[1] = U00_mag * Trig<isHalf<Float>::value>::Sin(in[0]);

      // now reconstruct the first column
      RegType column_sum = 0.0;
      for (int i=0; i<2; i++) column_sum += out[i]*out[i];
      for (int i=6; i<8; i++) {
        out[i] = in[i];
        column_sum += in[i]*in[i];
      }
      diff = 1.f/(u0*u0) - column_sum;
      RegType U20_mag = sqrt(diff >= 0 ? diff : 0.0);

      out[12] = U20_mag * Trig<isHalf<Float>::value>::Cos(in[1]);
      out[13] = U20_mag * Trig<isHalf<Float>::value>::Sin(in[1]);
      // first column now restored

      // finally, reconstruct the remaining elements from the SU(2) rotation
      RegType r_inv2 = 1.0/(u0*row_sum);

      // U11
      RegType A[2];
      complexDotProduct(A, out+0, out+6);
      complexConjugateProduct(out+8, out+12, out+4);
      accumulateComplexProduct(out+8, A, out+2, u0);
      out[8] *= -r_inv2;
      out[9] *= -r_inv2;

      // U12
      complexConjugateProduct(out+10, out+12, out+2);
      accumulateComplexProduct(out+10, A, out+4, -u0);
      out[10] *= r_inv2;
      out[11] *= r_inv2;

      // U21
      complexDotProduct(A, out+0, out+12);
      complexConjugateProduct(out+14, out+6, out+4);
      accumulateComplexProduct(out+14, A, out+2, -u0);
      out[14] *= r_inv2;
      out[15] *= r_inv2;

      // U22
      complexConjugateProduct(out+16, out+6, out+2);
      accumulateComplexProduct(out+16, A, out+4, u0);
      out[16] *= -r_inv2;
      out[17] *= -r_inv2;
    }

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18]) { *phase = 0; return; }
  };

  template <typename Float>
  struct Reconstruct<9,Float> {
    typedef typename mapper<Float>::type RegType;
    const Reconstruct<8,Float> reconstruct_8;
    const RegType scale;

    Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()) {}

    __device__ __host__ inline void getPhase(RegType *phase, const RegType in[18]) const {
      RegType denom[2];
      // denominator = conj(U[0][0]*U[1][1] - U[0][1]*U[1][0])
      complexProduct(denom, in, in+8);
      accumulateComplexProduct(denom, in+2, in+6, static_cast<RegType>(-1.0));

      denom[0] /= scale;
      denom[1] /= (-scale); // complex conjugate

      RegType expI3Phase[2];
      // numerator = U[2][2]
      complexQuotient(expI3Phase, in+16, denom);

      *phase = Trig<isHalf<RegType>::value>::Atan2(expI3Phase[1], expI3Phase[0])/3.;
    }

    __device__ __host__ inline void Pack(RegType out[8], const RegType in[18], int idx) const {
      RegType phase;
      getPhase(&phase, in);
      RegType cos_sin[2];
      sincos(-phase, &cos_sin[1], &cos_sin[0]);
      // Rescale the U(3) input matrix by exp(-I*phase) to obtain an SU(3) matrix
      // multiplied by a real scale factor, which the macros in read_gauge.h can handle.
      // NB: only 5 complex matrix elements are used by the reconstruct-8 packing
      // routine, so only those elements need to be rescaled.
      RegType su3[18];
      for (int i=0; i<4; ++i) {
        complexProduct(su3 + 2*i, cos_sin, in + 2*i);
      }
      complexProduct(&su3[12], cos_sin, &in[12]);
      reconstruct_8.Pack(out, su3, idx);
    }

    __device__ __host__ inline void Unpack(RegType out[18], const RegType in[8], int idx, int dir, const RegType phase) const {
      reconstruct_8.Unpack(out, in, idx, dir, phase);
      RegType cos_sin[2];
      Trig<isHalf<RegType>::value>::SinCos(phase, &cos_sin[1], &cos_sin[0]);
      RegType tmp[2];
      cos_sin[0] *= scale;
      cos_sin[1] *= scale;

      // rescale the matrix by exp(I*phase)*scale
      complexProduct(tmp, cos_sin, &out[0]);  out[0] = tmp[0];  out[1] = tmp[1];
      complexProduct(tmp, cos_sin, &out[2]);  out[2] = tmp[0];  out[3] = tmp[1];
      complexProduct(tmp, cos_sin, &out[4]);  out[4] = tmp[0];  out[5] = tmp[1];
      complexProduct(tmp, cos_sin, &out[6]);  out[6] = tmp[0];  out[7] = tmp[1];
      complexProduct(tmp, cos_sin, &out[8]);  out[8] = tmp[0];  out[9] = tmp[1];
      complexProduct(tmp, cos_sin, &out[10]); out[10] = tmp[0]; out[11] = tmp[1];
      complexProduct(tmp, cos_sin, &out[12]); out[12] = tmp[0]; out[13] = tmp[1];
      complexProduct(tmp, cos_sin, &out[14]); out[14] = tmp[0]; out[15] = tmp[1];
      complexProduct(tmp, cos_sin, &out[16]); out[16] = tmp[0]; out[17] = tmp[1];
    }
  };

  template <typename Float, int length, int N, int reconLen>
  struct FloatNOrder {
    typedef typename mapper<Float>::type RegType;
    Reconstruct<reconLen,Float> reconstruct;
    Float *gauge[2];  // even/odd halves of the gauge field
    Float *ghost[4];
    int faceVolumeCB[4];
    const int volumeCB;
    const int stride;
    const int geometry;
#if __COMPUTE_CAPABILITY__ >= 200
    const int hasPhase;
    const size_t phaseOffset;
#endif

    FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
      reconstruct(u), volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry())
#if __COMPUTE_CAPABILITY__ >= 200
      , hasPhase((u.Reconstruct() == QUDA_RECONSTRUCT_9 || u.Reconstruct() == QUDA_RECONSTRUCT_13) ? 1 : 0),
      phaseOffset(u.PhaseOffset())
#endif
    {
      if (gauge_) {
        gauge[0] = gauge_;
        gauge[1] = (Float*)((char*)gauge_ + u.Bytes()/2);
      } else {
        gauge[0] = (Float*)u.Gauge_p();
        gauge[1] = (Float*)((char*)u.Gauge_p() + u.Bytes()/2);
      }

      for (int i=0; i<4; i++) {
        ghost[i] = ghost_ ? ghost_[i] : 0;
        faceVolumeCB[i] = u.SurfaceCB(i)*u.Nface(); // face volume equals surface area times depth
      }
    }

    FloatNOrder(const FloatNOrder &order)
      : reconstruct(order.reconstruct), volumeCB(order.volumeCB), stride(order.stride),
        geometry(order.geometry)
#if __COMPUTE_CAPABILITY__ >= 200
      , hasPhase(order.hasPhase), phaseOffset(order.phaseOffset)
#endif
    {
      gauge[0] = order.gauge[0];
      gauge[1] = order.gauge[1];
      for (int i=0; i<4; i++) {
        ghost[i] = order.ghost[i];
        faceVolumeCB[i] = order.faceVolumeCB[i];
      }
    }

    virtual ~FloatNOrder() { ; }

    __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
      const int M = reconLen / N;
      RegType tmp[reconLen];
      for (int i=0; i<M; i++) {
        for (int j=0; j<N; j++) {
          int intIdx = i*N + j; // internal dof index
          int padIdx = intIdx / N;
          copy(tmp[i*N+j], gauge[parity][dir*stride*M*N + (padIdx*stride + x)*N + intIdx%N]);
        }
      }
      RegType phase = 0.;
#if __COMPUTE_CAPABILITY__ >= 200
      // the phases come after the ghost matrices
      if (hasPhase) copy(phase, gauge[parity][phaseOffset/sizeof(Float) + stride*dir + x]);
#endif
      reconstruct.Unpack(v, tmp, x, dir, 2.*M_PI*phase);
    }

    __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
      const int M = reconLen / N;
      RegType tmp[reconLen];
      reconstruct.Pack(tmp, v, x);
      for (int i=0; i<M; i++) {
        for (int j=0; j<N; j++) {
          int intIdx = i*N + j;
          int padIdx = intIdx / N;
          copy(gauge[parity][dir*stride*M*N + (padIdx*stride + x)*N + intIdx%N], tmp[i*N+j]);
        }
      }
#if __COMPUTE_CAPABILITY__ >= 200
      if (hasPhase) {
        RegType phase;
        reconstruct.getPhase(&phase, v);
        copy(gauge[parity][phaseOffset/sizeof(Float) + dir*stride + x], static_cast<RegType>(phase/(2.*M_PI)));
      }
#endif
    }

    __device__ __host__ inline void loadGhost(RegType v[length], int x, int dir, int parity) const {
      if (!ghost[dir]) { // load from the main field, not a separate array
        load(v, volumeCB+x, dir, parity); // an offset of size volumeCB puts us at the padded region
        // this also works when phases are stored, since the phase offset is unchanged
      } else {
        const int M = reconLen / N;
        RegType tmp[reconLen];
        for (int i=0; i<M; i++) {
          for (int j=0; j<N; j++) {
            int intIdx = i*N + j; // internal dof index
            int padIdx = intIdx / N;
#if __COMPUTE_CAPABILITY__ < 200
            const int hasPhase = 0;
#endif
            copy(tmp[i*N+j], ghost[dir][parity*faceVolumeCB[dir]*(M*N + hasPhase) + (padIdx*faceVolumeCB[dir]+x)*N + intIdx%N]);
          }
        }
        RegType phase = 0.;
#if __COMPUTE_CAPABILITY__ >= 200
        if (hasPhase) copy(phase, ghost[dir][parity*faceVolumeCB[dir]*(M*N + 1) + faceVolumeCB[dir]*M*N + x]);
#endif
        reconstruct.Unpack(v, tmp, x, dir, 2.*M_PI*phase);
      }
    }

    __device__ __host__ inline void saveGhost(const RegType v[length], int x, int dir, int parity) {
      if (!ghost[dir]) { // store in the main field, not a separate array
        save(v, volumeCB+x, dir, parity); // an offset of size volumeCB puts us at the padded region
      } else {
        const int M = reconLen / N;
        RegType tmp[reconLen];
        reconstruct.Pack(tmp, v, x);
        for (int i=0; i<M; i++) {
          for (int j=0; j<N; j++) {
            int intIdx = i*N + j;
            int padIdx = intIdx / N;
#if __COMPUTE_CAPABILITY__ < 200
            const int hasPhase = 0;
#endif
            copy(ghost[dir][parity*faceVolumeCB[dir]*(M*N + hasPhase) + (padIdx*faceVolumeCB[dir]+x)*N + intIdx%N], tmp[i*N+j]);
          }
        }

#if __COMPUTE_CAPABILITY__ >= 200
        if (hasPhase) {
          RegType phase = 0.;
          reconstruct.getPhase(&phase, v);
          copy(ghost[dir][parity*faceVolumeCB[dir]*(M*N + 1) + faceVolumeCB[dir]*M*N + x], static_cast<RegType>(phase/(2.*M_PI)));
        }
#endif
      }
    }

    __device__ __host__ inline void loadGhostEx(RegType v[length], int buff_idx, int extended_idx, int dir,
                                                int dim, int g, int parity, const int R[]) const {
#if __COMPUTE_CAPABILITY__ < 200
      const int hasPhase = 0;
#endif
      const int M = reconLen / N;
      RegType tmp[reconLen];
      for (int i=0; i<M; i++) {
        for (int j=0; j<N; j++) {
          int intIdx = i*N + j; // internal dof index
          int padIdx = intIdx / N;
          copy(tmp[i*N+j], ghost[dim][((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase)
                                      + (padIdx*R[dim]*faceVolumeCB[dim]+buff_idx)*N + intIdx%N]);
        }
      }
      RegType phase = 0.;
      if (hasPhase) copy(phase, ghost[dim][((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + 1)
                                           + R[dim]*faceVolumeCB[dim]*M*N + buff_idx]);

      // use extended_idx to determine the boundary condition
      reconstruct.Unpack(v, tmp, extended_idx, g, 2.*M_PI*phase);
    }

    __device__ __host__ inline void saveGhostEx(const RegType v[length], int buff_idx, int extended_idx,
                                                int dir, int dim, int g, int parity, const int R[]) {
#if __COMPUTE_CAPABILITY__ < 200
      const int hasPhase = 0;
#endif
      const int M = reconLen / N;
      RegType tmp[reconLen];
      // use extended_idx to determine the boundary condition
      reconstruct.Pack(tmp, v, extended_idx);
      for (int i=0; i<M; i++) {
        for (int j=0; j<N; j++) {
          int intIdx = i*N + j;
          int padIdx = intIdx / N;
          copy(ghost[dim][((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase)
                          + (padIdx*R[dim]*faceVolumeCB[dim]+buff_idx)*N + intIdx%N], tmp[i*N+j]);
        }
      }
      if (hasPhase) {
        RegType phase = 0.;
        reconstruct.getPhase(&phase, v);
        copy(ghost[dim][((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + 1) + R[dim]*faceVolumeCB[dim]*M*N + buff_idx],
             static_cast<RegType>(phase/(2.*M_PI)));
      }
    }

    size_t Bytes() const { return reconLen * sizeof(Float); }
  };

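In load()/save() above, component intIdx = i*N + j of the link at checkerboard site x in direction dir lives at offset dir*stride*M*N + (i*stride + x)*N + j: each packed matrix is split into M = reconLen/N chunks of N floats, and each chunk is stored site-major with stride padding, so consecutive j are contiguous and vectorized float-N accesses coalesce across threads. A standalone sketch (illustrative sizes only, not part of this header) of the addressing:

  #include <cstdio>

  // same index arithmetic as FloatNOrder::load/save, with padIdx = intIdx/N = i
  int floatNOffset(int x, int dir, int intIdx, int stride, int M, int N) {
    int padIdx = intIdx / N;
    return dir*stride*M*N + (padIdx*stride + x)*N + intIdx%N;
  }

  int main() {
    const int N = 2, reconLen = 12, M = reconLen/N, stride = 64; // hypothetical
    // the two floats of one complex pair sit next to each other...
    printf("%d %d\n", floatNOffset(5, 3, 0, stride, M, N), floatNOffset(5, 3, 1, stride, M, N));
    // ...while successive chunks of the same site are a full stride apart
    printf("%d\n", floatNOffset(5, 3, 2, stride, M, N));
  }
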
  template <typename Float, int length>
  struct LegacyOrder {
    typedef typename mapper<Float>::type RegType;
    Float *ghost[QUDA_MAX_DIM];
    int faceVolumeCB[QUDA_MAX_DIM];
    const int volumeCB;
    const int stride;
    const int geometry;
    const int hasPhase;

    LegacyOrder(const GaugeField &u, Float **ghost_)
      : volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()), hasPhase(0) {
      for (int i=0; i<4; i++) {
        ghost[i] = (ghost_) ? ghost_[i] : (Float*)(u.Ghost()[i]);
        faceVolumeCB[i] = u.SurfaceCB(i)*u.Nface(); // face volume equals surface area times depth
      }
    }

    LegacyOrder(const LegacyOrder &order)
      : volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry), hasPhase(0) {
      for (int i=0; i<4; i++) {
        ghost[i] = order.ghost[i];
        faceVolumeCB[i] = order.faceVolumeCB[i];
      }
    }

    virtual ~LegacyOrder() { ; }

    __device__ __host__ inline void loadGhost(RegType v[length], int x, int dir, int parity) const {
      for (int i=0; i<length; i++) v[i] = ghost[dir][(parity*faceVolumeCB[dir] + x)*length + i];
    }

    __device__ __host__ inline void saveGhost(const RegType v[length], int x, int dir, int parity) {
      for (int i=0; i<length; i++) ghost[dir][(parity*faceVolumeCB[dir] + x)*length + i] = v[i];
    }

    __device__ __host__ inline void loadGhostEx(RegType v[length], int x, int dummy, int dir,
                                                int dim, int g, int parity, const int R[]) const {
      for (int i=0; i<length; i++) {
        v[i] = ghost[dim][(((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g)*length + i];
      }
    }

    __device__ __host__ inline void saveGhostEx(const RegType v[length], int x, int dummy,
                                                int dir, int dim, int g, int parity, const int R[]) {
      for (int i=0; i<length; i++) {
        ghost[dim][(((dir*2+parity)*R[dim]*faceVolumeCB[dim] + x)*geometry+g)*length + i] = v[i];
      }
    }
  };

  template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float,length> {
    typedef typename mapper<Float>::type RegType;
    Float *gauge[QUDA_MAX_DIM];
    const int volumeCB;
    QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
      : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
    { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
    QDPOrder(const QDPOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
      for (int i=0; i<4; i++) gauge[i] = order.gauge[i];
    }
    virtual ~QDPOrder() { ; }

    __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
      for (int i=0; i<length; i++) {
        v[i] = (RegType)gauge[dir][(parity*volumeCB + x)*length + i];
      }
    }

    __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
      for (int i=0; i<length; i++) {
        gauge[dir][(parity*volumeCB + x)*length + i] = (Float)v[i];
      }
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };

  template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float,length> {
    typedef typename mapper<Float>::type RegType;
    Float *gauge[QUDA_MAX_DIM];
    const int volumeCB;
    QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
      : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
    { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
    QDPJITOrder(const QDPJITOrder &order) : LegacyOrder<Float,length>(order), volumeCB(order.volumeCB) {
      for (int i=0; i<4; i++) gauge[i] = order.gauge[i];
    }
    virtual ~QDPJITOrder() { ; }

    __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
      for (int i=0; i<length; i++) {
        int z = i%2;      // real or imaginary part
        int rolcol = i/2; // combined row-column index
        v[i] = (RegType)gauge[dir][((z*(length/2) + rolcol)*2 + parity)*volumeCB + x];
      }
    }

    __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
      for (int i=0; i<length; i++) {
        int z = i%2;
        int rolcol = i/2;
        gauge[dir][((z*(length/2) + rolcol)*2 + parity)*volumeCB + x] = (Float)v[i];
      }
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };

  template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float,length> {
    typedef typename mapper<Float>::type RegType;
    Float *gauge;
    const int volumeCB;
    const int geometry;
    MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
      LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
      volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
    MILCOrder(const MILCOrder &order) : LegacyOrder<Float,length>(order),
      gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry)
    { ; }
    virtual ~MILCOrder() { ; }

    __device__ __host__ inline void load(RegType v[length], int x, int dir, int parity) const {
      for (int i=0; i<length; i++) {
        v[i] = (RegType)gauge[((parity*volumeCB+x)*geometry + dir)*length + i];
      }
    }

    __device__ __host__ inline void save(const RegType v[length], int x, int dir, int parity) {
      for (int i=0; i<length; i++) {
        gauge[((parity*volumeCB+x)*geometry + dir)*length + i] = (Float)v[i];
      }
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };

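The legacy orders differ only in how the (parity, site, direction, component) tuple is linearized: MILC keeps all directions of a site contiguous within a single array, while QDP keeps one array per direction. A standalone sketch (hypothetical sizes, not part of this header) of the two offset computations:

  #include <cstdio>

  // MILCOrder: one array, directions contiguous within a site
  int milcOffset(int parity, int x, int dir, int i, int volumeCB, int geometry, int length) {
    return ((parity*volumeCB + x)*geometry + dir)*length + i;
  }

  // QDPOrder: one array per direction, so dir only selects the array
  int qdpOffset(int parity, int x, int i, int volumeCB, int length) {
    return (parity*volumeCB + x)*length + i;
  }

  int main() {
    const int volumeCB = 64, geometry = 4, length = 18; // toy lattice, 3x3 complex links
    printf("MILC: %d\n", milcOffset(1, 10, 2, 0, volumeCB, geometry, length));
    printf("QDP:  %d (within gauge[2])\n", qdpOffset(1, 10, 0, volumeCB, length));
  }
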
  template <typename Float, int length> struct CPSOrder : LegacyOrder<Float,length> {
    typedef typename mapper<Float>::type RegType;
    Float *gauge;
    const int volumeCB;
    const Float anisotropy;
    const int Nc;
    const int geometry;
    CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
      : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
        volumeCB(u.VolumeCB()), anisotropy(u.Anisotropy()), Nc(3),
        geometry(u.Geometry())
    { if (length != 18) errorQuda("Gauge length %d not supported", length); }
    CPSOrder(const CPSOrder &order) : LegacyOrder<Float,length>(order), gauge(order.gauge),
      volumeCB(order.volumeCB), anisotropy(order.anisotropy), Nc(3), geometry(order.geometry)
    { ; }
    virtual ~CPSOrder() { ; }

    // we need to transpose and scale for CPS ordering
    __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
      for (int i=0; i<Nc; i++) {
        for (int j=0; j<Nc; j++) {
          for (int z=0; z<2; z++) {
            v[(i*Nc+j)*2+z] =
              (RegType)(gauge[((((parity*volumeCB+x)*geometry + dir)*Nc + j)*Nc + i)*2 + z] / anisotropy);
          }
        }
      }
    }

    __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
      for (int i=0; i<Nc; i++) {
        for (int j=0; j<Nc; j++) {
          for (int z=0; z<2; z++) {
            gauge[((((parity*volumeCB+x)*geometry + dir)*Nc + j)*Nc + i)*2 + z] =
              (Float)(anisotropy * v[(i*Nc+j)*2+z]);
          }
        }
      }
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };

  template <typename Float, int length> struct BQCDOrder : LegacyOrder<Float,length> {
    typedef typename mapper<Float>::type RegType;
    Float *gauge;
    const int volumeCB;
    int exVolumeCB; // extended checkerboard volume
    const int Nc;
    BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
      : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()), volumeCB(u.VolumeCB()), Nc(3) {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
      // compute the extended checkerboard volume: volumeCB plus halo region
      exVolumeCB = u.X()[0]/2 + 2;
      for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2;
    }
    BQCDOrder(const BQCDOrder &order) : LegacyOrder<Float,length>(order), gauge(order.gauge),
      volumeCB(order.volumeCB), exVolumeCB(order.exVolumeCB), Nc(3) {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    virtual ~BQCDOrder() { ; }

    // we need to transpose for BQCD ordering
    __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
      for (int i=0; i<Nc; i++) {
        for (int j=0; j<Nc; j++) {
          for (int z=0; z<2; z++) {
            v[(i*Nc+j)*2+z] = (RegType)gauge[((((dir*2+parity)*exVolumeCB + x)*Nc + j)*Nc + i)*2 + z];
          }
        }
      }
    }

    __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
      for (int i=0; i<Nc; i++) {
        for (int j=0; j<Nc; j++) {
          for (int z=0; z<2; z++) {
            gauge[((((dir*2+parity)*exVolumeCB + x)*Nc + j)*Nc + i)*2 + z] = (Float)v[(i*Nc+j)*2+z];
          }
        }
      }
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };

  template <typename Float, int length> struct TIFROrder : LegacyOrder<Float,length> {
    typedef typename mapper<Float>::type RegType;
    Float *gauge;
    const int volumeCB;
    const int Nc;
    const Float scale;
    TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
      : LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
        volumeCB(u.VolumeCB()), Nc(3), scale(u.Scale()) {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }
    TIFROrder(const TIFROrder &order)
      : LegacyOrder<Float,length>(order), gauge(order.gauge), volumeCB(order.volumeCB), Nc(3), scale(order.scale) {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    virtual ~TIFROrder() { ; }

    // we need to transpose and scale for TIFR ordering
    __device__ __host__ inline void load(RegType v[18], int x, int dir, int parity) const {
      for (int i=0; i<Nc; i++) {
        for (int j=0; j<Nc; j++) {
          for (int z=0; z<2; z++) {
            v[(i*Nc+j)*2+z] = (RegType)gauge[((((dir*2+parity)*volumeCB + x)*Nc + j)*Nc + i)*2 + z] / scale;
          }
        }
      }
    }

    __device__ __host__ inline void save(const RegType v[18], int x, int dir, int parity) {
      for (int i=0; i<Nc; i++) {
        for (int j=0; j<Nc; j++) {
          for (int z=0; z<2; z++) {
            gauge[((((dir*2+parity)*volumeCB + x)*Nc + j)*Nc + i)*2 + z] = (Float)v[(i*Nc+j)*2+z] * scale;
          }
        }
      }
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };

} // namespace quda