QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
pack_gauge.h
Go to the documentation of this file.
1 // Routines used to pack the gauge field matrices
2 
3 #include <math.h>
4 
5 #define SHORT_LENGTH 65536
6 #define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
7 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
8 
9 template <typename Float>
10 inline short FloatToShort(const Float &a) {
11  return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
12 }
13 
14 template <typename Float>
15 inline void ShortToFloat(Float &a, const short &b) {
16  a = ((Float)b/SCALE_FLOAT-SHIFT_FLOAT);
17 }
18 
19 /*template <int N, typename FloatN, typename Float>
20 inline void pack8(FloatN *res, Float *g, int dir, int V) {
21  Float *r = res + N*dir*4*V;
22  r[0] = atan2(g[1], g[0]);
23  r[1] = atan2(g[13], g[12]);
24 }*/
25 
26 template <typename Float>
27 inline void pack8(double2 *res, Float *g, int dir, int V) {
28  double2 *r = res + dir*4*V;
29  r[0].x = atan2(g[1], g[0]);
30  r[0].y = atan2(g[13], g[12]);
31  for (int j=1; j<4; j++) {
32  r[j*V].x = g[2*j+0];
33  r[j*V].y = g[2*j+1];
34  }
35 }
36 
37 template <typename Float>
38 inline void pack8(float4 *res, Float *g, int dir, int V) {
39  float4 *r = res + dir*2*V;
40  r[0].x = atan2(g[1], g[0]);
41  r[0].y = atan2(g[13], g[12]);
42  r[0].z = g[2];
43  r[0].w = g[3];
44  r[V].x = g[4];
45  r[V].y = g[5];
46  r[V].z = g[6];
47  r[V].w = g[7];
48 }
49 
50 template <typename Float>
51 inline void pack8(float2 *res, Float *g, int dir, int V) {
52  float2 *r = res + dir*4*V;
53  r[0].x = atan2(g[1], g[0]);
54  r[0].y = atan2(g[13], g[12]);
55  for (int j=1; j<4; j++) {
56  r[j*V].x = g[2*j+0];
57  r[j*V].y = g[2*j+1];
58  }
59 }
60 
61 template <typename Float>
62 inline void pack8(short4 *res, Float *g, int dir, int V) {
63  short4 *r = res + dir*2*V;
64  r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
65  r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
66  r[0].z = FloatToShort(g[2]);
67  r[0].w = FloatToShort(g[3]);
68  r[V].x = FloatToShort(g[4]);
69  r[V].y = FloatToShort(g[5]);
70  r[V].z = FloatToShort(g[6]);
71  r[V].w = FloatToShort(g[7]);
72 }
73 
74 template <typename Float>
75 inline void pack8(short2 *res, Float *g, int dir, int V) {
76  short2 *r = res + dir*4*V;
77  r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
78  r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
79  for (int j=1; j<4; j++) {
80  r[j*V].x = FloatToShort(g[2*j+0]);
81  r[j*V].y = FloatToShort(g[2*j+1]);
82  }
83 }
84 
85 template <typename Float>
86 inline void pack12(double2 *res, Float *g, int dir, int V) {
87  double2 *r = res + dir*6*V;
88  for (int j=0; j<6; j++) {
89  r[j*V].x = g[j*2+0];
90  r[j*V].y = g[j*2+1];
91  }
92 }
93 
94 template <typename Float>
95 inline void pack12(float4 *res, Float *g, int dir, int V) {
96  float4 *r = res + dir*3*V;
97  for (int j=0; j<3; j++) {
98  r[j*V].x = g[j*4+0];
99  r[j*V].y = g[j*4+1];
100  r[j*V].z = g[j*4+2];
101  r[j*V].w = g[j*4+3];
102  }
103 }
104 
105 template <typename Float>
106 inline void pack12(float2 *res, Float *g, int dir, int V) {
107  float2 *r = res + dir*6*V;
108  for (int j=0; j<6; j++) {
109  r[j*V].x = g[j*2+0];
110  r[j*V].y = g[j*2+1];
111  }
112 }
113 
114 template <typename Float>
115 inline void pack12(short4 *res, Float *g, int dir, int V) {
116  short4 *r = res + dir*3*V;
117  for (int j=0; j<3; j++) {
118  r[j*V].x = FloatToShort(g[j*4+0]);
119  r[j*V].y = FloatToShort(g[j*4+1]);
120  r[j*V].z = FloatToShort(g[j*4+2]);
121  r[j*V].w = FloatToShort(g[j*4+3]);
122  }
123 }
124 
125 template <typename Float>
126 inline void pack12(short2 *res, Float *g, int dir, int V) {
127  short2 *r = res + dir*6*V;
128  for (int j=0; j<6; j++) {
129  r[j*V].x = FloatToShort(g[j*2+0]);
130  r[j*V].y = FloatToShort(g[j*2+1]);
131  }
132 }
133 
134 template <typename Float>
135 inline void pack18(double2 *res, Float *g, int dir, int V) {
136  double2 *r = res + dir*9*V;
137  for (int j=0; j<9; j++) {
138  r[j*V].x = g[j*2+0];
139  r[j*V].y = g[j*2+1];
140  }
141 }
142 
143 template <typename Float>
144 inline void pack18(float4 *res, Float *g, int dir, int V) {
145  float4 *r = res + dir*5*V;
146  for (int j=0; j<4; j++) {
147  r[j*V].x = g[j*4+0];
148  r[j*V].y = g[j*4+1];
149  r[j*V].z = g[j*4+2];
150  r[j*V].w = g[j*4+3];
151  }
152  r[4*V].x = g[16];
153  r[4*V].y = g[17];
154  r[4*V].z = 0.0;
155  r[4*V].w = 0.0;
156 }
157 
158 template <typename Float>
159 inline void pack18(float2 *res, Float *g, int dir, int V) {
160  float2 *r = res + dir*9*V;
161  for (int j=0; j<9; j++) {
162  r[j*V].x = g[j*2+0];
163  r[j*V].y = g[j*2+1];
164  }
165 }
166 
167 template <typename Float>
168 inline void pack18(short4 *res, Float *g, int dir, int V) {
169  short4 *r = res + dir*5*V;
170  for (int j=0; j<4; j++) {
171  r[j*V].x = FloatToShort(g[j*4+0]);
172  r[j*V].y = FloatToShort(g[j*4+1]);
173  r[j*V].z = FloatToShort(g[j*4+2]);
174  r[j*V].w = FloatToShort(g[j*4+3]);
175  }
176  r[4*V].x = FloatToShort(g[16]);
177  r[4*V].y = FloatToShort(g[17]);
178  r[4*V].z = (short)0;
179  r[4*V].w = (short)0;
180 }
181 
182 template <typename Float>
183 inline void pack18(short2 *res, Float *g, int dir, int V)
184 {
185  short2 *r = res + dir*9*V;
186  for (int j=0; j<9; j++) {
187  r[j*V].x = FloatToShort(g[j*2+0]);
188  r[j*V].y = FloatToShort(g[j*2+1]);
189  }
190 }
191 
192 template<typename Float>
193 inline void fatlink_short_pack18(short2 *d_gauge, Float *h_gauge, int dir, int V)
194 {
195  short2 *dg = d_gauge + dir*9*V;
196  for (int j=0; j<9; j++) {
197  dg[j*V].x = FloatToShort((h_gauge[j*2+0]/fat_link_max_));
198  dg[j*V].y = FloatToShort((h_gauge[j*2+1]/fat_link_max_));
199  }
200 }
201 
202 
203 
204 // a += b*c
205 template <typename Float>
206 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
207  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
208  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
209 }
210 
211 // a = conj(b)*c
212 template <typename Float>
213 inline void complexDotProduct(Float *a, Float *b, Float *c) {
214  a[0] = b[0]*c[0] + b[1]*c[1];
215  a[1] = b[0]*c[1] - b[1]*c[0];
216 }
217 
218 // a += conj(b) * conj(c)
219 template <typename Float>
220 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
221  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
222  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
223 }
224 
225 // a = conj(b)*conj(c)
226 template <typename Float>
227 inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
228  a[0] = b[0]*c[0] - b[1]*c[1];
229  a[1] = -b[0]*c[1] - b[1]*c[0];
230 }
231 
232 
233 // Routines used to unpack the gauge field matrices
234 template <typename Float>
235 inline void reconstruct8(Float *mat, int dir, int idx) {
236  // First reconstruct first row
237  Float row_sum = 0.0;
238  row_sum += mat[2]*mat[2];
239  row_sum += mat[3]*mat[3];
240  row_sum += mat[4]*mat[4];
241  row_sum += mat[5]*mat[5];
242  Float u0 = (dir < 3 ? anisotropy_ : (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
243  Float diff = 1.f/(u0*u0) - row_sum;
244  Float U00_mag = sqrt(diff >= 0 ? diff : 0.0);
245 
246  mat[14] = mat[0];
247  mat[15] = mat[1];
248 
249  mat[0] = U00_mag * cos(mat[14]);
250  mat[1] = U00_mag * sin(mat[14]);
251 
252  Float column_sum = 0.0;
253  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
254  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
255  diff = 1.f/(u0*u0) - column_sum;
256  Float U20_mag = sqrt(diff >= 0 ? diff : 0.0);
257 
258  mat[12] = U20_mag * cos(mat[15]);
259  mat[13] = U20_mag * sin(mat[15]);
260 
261  // First column now restored
262 
263  // finally reconstruct last elements from SU(2) rotation
264  Float r_inv2 = 1.0/(u0*row_sum);
265 
266  // U11
267  Float A[2];
268  complexDotProduct(A, mat+0, mat+6);
269  complexConjugateProduct(mat+8, mat+12, mat+4);
270  accumulateComplexProduct(mat+8, A, mat+2, u0);
271  mat[8] *= -r_inv2;
272  mat[9] *= -r_inv2;
273 
274  // U12
275  complexConjugateProduct(mat+10, mat+12, mat+2);
276  accumulateComplexProduct(mat+10, A, mat+4, -u0);
277  mat[10] *= r_inv2;
278  mat[11] *= r_inv2;
279 
280  // U21
281  complexDotProduct(A, mat+0, mat+12);
282  complexConjugateProduct(mat+14, mat+6, mat+4);
283  accumulateComplexProduct(mat+14, A, mat+2, -u0);
284  mat[14] *= r_inv2;
285  mat[15] *= r_inv2;
286 
287  // U12
288  complexConjugateProduct(mat+16, mat+6, mat+2);
289  accumulateComplexProduct(mat+16, A, mat+4, u0);
290  mat[16] *= -r_inv2;
291  mat[17] *= -r_inv2;
292 }
293 
294 template <typename Float>
295 inline void unpack8(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
296  double2 *dg = d_gauge + dir*4*V;
297  for (int j=0; j<4; j++) {
298  h_gauge[2*j+0] = dg[j*V].x;
299  h_gauge[2*j+1] = dg[j*V].y;
300  }
301  reconstruct8(h_gauge, dir, idx);
302 }
303 
304 template <typename Float>
305 inline void unpack8(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
306  float4 *dg = d_gauge + dir*2*V;
307  h_gauge[0] = dg[0].x;
308  h_gauge[1] = dg[0].y;
309  h_gauge[2] = dg[0].z;
310  h_gauge[3] = dg[0].w;
311  h_gauge[4] = dg[V].x;
312  h_gauge[5] = dg[V].y;
313  h_gauge[6] = dg[V].z;
314  h_gauge[7] = dg[V].w;
315  reconstruct8(h_gauge, dir, idx);
316 }
317 
318 template <typename Float>
319 inline void unpack8(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
320  float2 *dg = d_gauge + dir*4*V;
321  for (int j=0; j<4; j++) {
322  h_gauge[2*j+0] = dg[j*V].x;
323  h_gauge[2*j+1] = dg[j*V].y;
324  }
325  reconstruct8(h_gauge, dir, idx);
326 }
327 
328 template <typename Float>
329 inline void unpack8(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
330  short4 *dg = d_gauge + dir*2*V;
331  ShortToFloat(h_gauge[0], dg[0].x);
332  ShortToFloat(h_gauge[1], dg[0].y);
333  ShortToFloat(h_gauge[2], dg[0].z);
334  ShortToFloat(h_gauge[3], dg[0].w);
335  ShortToFloat(h_gauge[4], dg[V].x);
336  ShortToFloat(h_gauge[5], dg[V].y);
337  ShortToFloat(h_gauge[6], dg[V].z);
338  ShortToFloat(h_gauge[7], dg[V].w);
339  h_gauge[0] *= M_PI;
340  h_gauge[1] *= M_PI;
341  reconstruct8(h_gauge, dir, idx);
342 }
343 
344 template <typename Float>
345 inline void unpack8(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
346  short2 *dg = d_gauge + dir*4*V;
347  for (int j=0; j<4; j++) {
348  ShortToFloat(h_gauge[2*j+0], dg[j*V].x);
349  ShortToFloat(h_gauge[2*j+1], dg[j*V].y);
350  }
351  h_gauge[0] *= M_PI;
352  h_gauge[1] *= M_PI;
353  reconstruct8(h_gauge, dir, idx);
354 }
355 
356 // do this using complex numbers (simplifies)
357 template <typename Float>
358 inline void reconstruct12(Float *mat, int dir, int idx) {
359  Float *u = &mat[0*(3*2)];
360  Float *v = &mat[1*(3*2)];
361  Float *w = &mat[2*(3*2)];
362  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
363  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
364  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
365  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
366  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
367  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
368  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
369  Float u0 = (dir < 3 ? anisotropy_ :
370  (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
371  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
372 }
373 
374 template <typename Float>
375 inline void unpack12(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
376  double2 *dg = d_gauge + dir*6*V;
377  for (int j=0; j<6; j++) {
378  h_gauge[j*2+0] = dg[j*V].x;
379  h_gauge[j*2+1] = dg[j*V].y;
380  }
381  reconstruct12(h_gauge, dir, idx);
382 }
383 
384 template <typename Float>
385 inline void unpack12(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
386  float4 *dg = d_gauge + dir*3*V;
387  for (int j=0; j<3; j++) {
388  h_gauge[j*4+0] = dg[j*V].x;
389  h_gauge[j*4+1] = dg[j*V].y;
390  h_gauge[j*4+2] = dg[j*V].z;
391  h_gauge[j*4+3] = dg[j*V].w;
392  }
393  reconstruct12(h_gauge, dir, idx);
394 }
395 
396 template <typename Float>
397 inline void unpack12(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
398  float2 *dg = d_gauge + dir*6*V;
399  for (int j=0; j<6; j++) {
400  h_gauge[j*2+0] = dg[j*V].x;
401  h_gauge[j*2+1] = dg[j*V].y;
402  }
403  reconstruct12(h_gauge, dir, idx);
404 }
405 
406 template <typename Float>
407 inline void unpack12(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
408  short4 *dg = d_gauge + dir*3*V;
409  for (int j=0; j<3; j++) {
410  ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
411  ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
412  ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
413  ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
414  }
415  reconstruct12(h_gauge, dir, idx);
416 }
417 
418 template <typename Float>
419 inline void unpack12(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
420  short2 *dg = d_gauge + dir*6*V;
421  for (int j=0; j<6; j++) {
422  ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
423  ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
424  }
425  reconstruct12(h_gauge, dir, idx);
426 }
427 
428 template <typename Float>
429 inline void unpack18(Float *h_gauge, double2 *d_gauge, int dir, int V) {
430  double2 *dg = d_gauge + dir*9*V;
431  for (int j=0; j<9; j++) {
432  h_gauge[j*2+0] = dg[j*V].x;
433  h_gauge[j*2+1] = dg[j*V].y;
434  }
435 }
436 
437 template <typename Float>
438 inline void unpack18(Float *h_gauge, float4 *d_gauge, int dir, int V) {
439  float4 *dg = d_gauge + dir*5*V;
440  for (int j=0; j<4; j++) {
441  h_gauge[j*4+0] = dg[j*V].x;
442  h_gauge[j*4+1] = dg[j*V].y;
443  h_gauge[j*4+2] = dg[j*V].z;
444  h_gauge[j*4+3] = dg[j*V].w;
445  }
446  h_gauge[16] = dg[4*V].x;
447  h_gauge[17] = dg[4*V].y;
448 }
449 
450 template <typename Float>
451 inline void unpack18(Float *h_gauge, float2 *d_gauge, int dir, int V) {
452  float2 *dg = d_gauge + dir*9*V;
453  for (int j=0; j<9; j++) {
454  h_gauge[j*2+0] = dg[j*V].x;
455  h_gauge[j*2+1] = dg[j*V].y;
456  }
457 }
458 
459 template <typename Float>
460 inline void unpack18(Float *h_gauge, short4 *d_gauge, int dir, int V) {
461  short4 *dg = d_gauge + dir*5*V;
462  for (int j=0; j<4; j++) {
463  ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
464  ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
465  ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
466  ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
467  }
468  ShortToFloat(h_gauge[16],dg[4*V].x);
469  ShortToFloat(h_gauge[17],dg[4*V].y);
470 
471 }
472 
473 template <typename Float>
474 inline void unpack18(Float *h_gauge, short2 *d_gauge, int dir, int V) {
475  short2 *dg = d_gauge + dir*9*V;
476  for (int j=0; j<9; j++) {
477  ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
478  ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
479  }
480 }
481 
482 
483 
484 // Assume the gauge field is "QDP" ordered: directions outside of
485 // space-time, row-column ordering, even-odd space-time
486 // offset = 0 for body
487 // offset = Vh for face
488 // voxels = Vh for body
489 // voxels[i] = face volume[i]
490 template <typename Float, typename FloatN>
491 static void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit,
492  QudaReconstructType reconstruct, int Vh, int *voxels,
493  int pad, int offset, int nFace, QudaLinkType type) {
494  if (reconstruct == QUDA_RECONSTRUCT_12) {
495  for (int dir = 0; dir < 4; dir++) {
496  int nMat = nFace*voxels[dir];
497  Float *g = h_gauge[dir] + oddBit*nMat*18;
498  for (int i = 0; i < nMat; i++) pack12(d_gauge+offset+i, g+i*18, dir, Vh+pad);
499  }
500  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
501  for (int dir = 0; dir < 4; dir++) {
502  int nMat = nFace*voxels[dir];
503  Float *g = h_gauge[dir] + oddBit*nMat*18;
504  for (int i = 0; i < nMat; i++) pack8(d_gauge+offset+i, g+i*18, dir, Vh+pad);
505  }
506  } else {
507  for (int dir = 0; dir < 4; dir++) {
508  int nMat = nFace*voxels[dir];
509  Float *g = h_gauge[dir] + oddBit*nMat*18;
510  if(type == QUDA_ASQTAD_FAT_LINKS && typeid(FloatN) == typeid(short2) ){
511  //we know it is half precison with fatlink at this stage
512  for (int i = 0; i < nMat; i++)
513  fatlink_short_pack18((short2*)(d_gauge+offset+i), g+i*18, dir, Vh+pad);
514  }else{
515  for (int i = 0; i < nMat; i++) pack18(d_gauge+offset+i, g+i*18, dir, Vh+pad);
516  }
517  }
518  }
519 }
520 
521 // Assume the gauge field is "QDP" ordered: directions outside of
522 // space-time, row-column ordering, even-odd space-time
523 template <typename Float, typename FloatN>
524 static void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit,
525  QudaReconstructType reconstruct, int V, int pad) {
526  if (reconstruct == QUDA_RECONSTRUCT_12) {
527  for (int dir = 0; dir < 4; dir++) {
528  Float *g = h_gauge[dir] + oddBit*V*18;
529  for (int i = 0; i < V; i++) unpack12(g+i*18, d_gauge+i, dir, V+pad, i);
530  }
531  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
532  for (int dir = 0; dir < 4; dir++) {
533  Float *g = h_gauge[dir] + oddBit*V*18;
534  for (int i = 0; i < V; i++) unpack8(g+i*18, d_gauge+i, dir, V+pad, i);
535  }
536  } else {
537  for (int dir = 0; dir < 4; dir++) {
538  Float *g = h_gauge[dir] + oddBit*V*18;
539  for (int i = 0; i < V; i++) unpack18(g+i*18, d_gauge+i, dir, V+pad);
540  }
541  }
542 }
543 
544 // transpose and scale the matrix
545 template <typename Float, typename Float2>
546 static void transposeScale(Float *gT, Float *g, const Float2 &a) {
547  for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
548  gT[(ic*3+jc)*2+r] = a*g[(jc*3+ic)*2+r];
549 }
550 
551 // Assume the gauge field is "Wilson" ordered directions inside of
552 // space-time column-row ordering even-odd space-time
553 template <typename Float, typename FloatN>
554 static void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit,
555  QudaReconstructType reconstruct, int V, int pad) {
556  Float gT[18];
557  if (reconstruct == QUDA_RECONSTRUCT_12) {
558  for (int dir = 0; dir < 4; dir++) {
559  Float *g = h_gauge + (oddBit*V*4+dir)*18;
560  for (int i = 0; i < V; i++) {
561  transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
562  pack12(d_gauge+i, gT, dir, V+pad);
563  }
564  }
565  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
566  for (int dir = 0; dir < 4; dir++) {
567  Float *g = h_gauge + (oddBit*V*4+dir)*18;
568  for (int i = 0; i < V; i++) {
569  transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
570  pack8(d_gauge+i, gT, dir, V+pad);
571  }
572  }
573  } else {
574  for (int dir = 0; dir < 4; dir++) {
575  Float *g = h_gauge + (oddBit*V*4+dir)*18;
576  for (int i = 0; i < V; i++) {
577  transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
578  pack18(d_gauge+i, gT, dir, V+pad);
579  }
580  }
581  }
582 
583 }
584 
585 // Assume the gauge field is "Wilson" ordered directions inside of
586 // space-time column-row ordering even-odd space-time
587 template <typename Float, typename FloatN>
588 static void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
589  QudaReconstructType reconstruct, int V, int pad) {
590  Float gT[18];
591  if (reconstruct == QUDA_RECONSTRUCT_12) {
592  for (int dir = 0; dir < 4; dir++) {
593  Float *hg = h_gauge + (oddBit*V*4+dir)*18;
594  for (int i = 0; i < V; i++) {
595  unpack12(gT, d_gauge+i, dir, V+pad, i);
596  transposeScale(hg+4*i*18, gT, anisotropy_);
597  }
598  }
599  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
600  for (int dir = 0; dir < 4; dir++) {
601  Float *hg = h_gauge + (oddBit*V*4+dir)*18;
602  for (int i = 0; i < V; i++) {
603  unpack8(gT, d_gauge+i, dir, V+pad, i);
604  transposeScale(hg+4*i*18, gT, anisotropy_);
605  }
606  }
607  } else {
608  for (int dir = 0; dir < 4; dir++) {
609  Float *hg = h_gauge + (oddBit*V*4+dir)*18;
610  for (int i = 0; i < V; i++) {
611  unpack18(gT, d_gauge+i, dir, V+pad);
612  transposeScale(hg+4*i*18, gT, anisotropy_);
613  }
614  }
615  }
616 
617 }
618 
619 
620 // Assume the gauge field is MILC ordered: directions inside of
621 // space-time row-column ordering even-odd space-time
622 template <typename Float, typename FloatN>
623 void packMILCGaugeField(FloatN *res, Float *gauge, int oddBit,
624  QudaReconstructType reconstruct, int Vh, int pad)
625 {
626  int dir, i;
627  if (reconstruct == QUDA_RECONSTRUCT_12) {
628  for (dir = 0; dir < 4; dir++) {
629  Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
630  for (i = 0; i < Vh; i++) {
631  pack12(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
632  }
633  }
634  } else if (reconstruct == QUDA_RECONSTRUCT_8){
635  for (dir = 0; dir < 4; dir++) {
636  Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
637  for (i = 0; i < Vh; i++) {
638  pack8(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
639  }
640  }
641  }else{
642  for (dir = 0; dir < 4; dir++) {
643  Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
644  for (i = 0; i < Vh; i++) {
645  pack18(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
646  }
647  }
648  }
649 }
650 
651 // Assume the gauge field is MILC ordered: directions inside of
652 // space-time row-column ordering even-odd space-time
653 template <typename Float, typename FloatN>
654 static void unpackMILCGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
655  QudaReconstructType reconstruct, int V, int pad) {
656  if (reconstruct == QUDA_RECONSTRUCT_12) {
657  for (int dir = 0; dir < 4; dir++) {
658  Float *hg = h_gauge + (oddBit*V*4+dir)*18;
659  for (int i = 0; i < V; i++) {
660  unpack12(hg+4*i*18, d_gauge+i, dir, V+pad, i);
661  }
662  }
663  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
664  for (int dir = 0; dir < 4; dir++) {
665  Float *hg = h_gauge + (oddBit*V*4+dir)*18;
666  for (int i = 0; i < V; i++) {
667  unpack8(hg+4*i*18, d_gauge+i, dir, V+pad, i);
668  }
669  }
670  } else {
671  for (int dir = 0; dir < 4; dir++) {
672  Float *hg = h_gauge + (oddBit*V*4+dir)*18;
673  for (int i = 0; i < V; i++) {
674  unpack18(hg+4*i*18, d_gauge+i, dir, V+pad);
675  }
676  }
677  }
678 
679 }
680 
681 // Assume the gauge field is BQCD ordered: 1-d array with
682 // [mu][even-odd][spacetime+halos][column][row]
683 template <typename Float, typename FloatN>
684 void packBQCDGaugeField(FloatN *res, Float *gauge, int oddBit,
685  QudaReconstructType reconstruct, int Vh, int pad)
686 {
687 
688  // need to add on halo region
689  int mu_offset = X_[0]/2 + 2;
690  for (int i=1; i<4; i++) mu_offset *= (X_[i] + 2);
691  Float gT[18];
692 
693  int dir, i;
694  if (reconstruct == QUDA_RECONSTRUCT_12) {
695  for (dir = 0; dir < 4; dir++) {
696  Float *g = gauge + (dir*2+oddBit)*mu_offset*gaugeSiteSize;
697  for (i = 0; i < Vh; i++) {
698  transposeScale(gT, g+i*18, 1.0);
699  pack12(res+i, gT, dir, Vh+pad);
700  }
701  }
702  } else if (reconstruct == QUDA_RECONSTRUCT_8){
703  for (dir = 0; dir < 4; dir++) {
704  Float *g = gauge + (dir*2+oddBit)*mu_offset*gaugeSiteSize;
705  //Float *g = gauge + (dir*2+oddBit)*Vh*gaugeSiteSize;
706  for (i = 0; i < Vh; i++) {
707  transposeScale(gT, g+i*18, 1.0);
708  pack8(res+i, gT, dir, Vh+pad);
709  }
710  }
711  }else{
712  // FIXME - need to workout row-col order
713  for (dir = 0; dir < 4; dir++) {
714  Float *g = gauge + (dir*2+oddBit)*mu_offset*gaugeSiteSize;
715  //Float *g = gauge + (dir*2+oddBit)*Vh*gaugeSiteSize;
716  for (i = 0; i < Vh; i++) {
717  transposeScale(gT, g+i*18, 1.0);
718  pack18(res+i, gT, dir, Vh+pad);
719  //pack18(res+i, g+i*gaugeSiteSize, dir, Vh+pad);
720  }
721  }
722  }
723 }
724 
725 // Assume the gauge field is BQCD ordered: 1-d array with
726 // [mu][even-odd][spacetime+halos][column][row]
727 template <typename Float, typename FloatN>
728 static void unpackBQCDGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
729  QudaReconstructType reconstruct, int V, int pad) {
730  // need to add on halo region
731  int mu_offset = X_[0]/2 + 2;
732  for (int i=1; i<4; i++) mu_offset *= (X_[i] + 2);
733  Float gT[18];
734 
735  if (reconstruct == QUDA_RECONSTRUCT_12) {
736  for (int dir = 0; dir < 4; dir++) {
737  Float *hg = h_gauge + (dir*2+oddBit)*mu_offset*gaugeSiteSize;
738  for (int i = 0; i < V; i++) {
739  unpack12(gT, d_gauge+i, dir, V+pad, i);
740  transposeScale(hg+i*18, gT, 1.0);
741  }
742  }
743  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
744  for (int dir = 0; dir < 4; dir++) {
745  Float *hg = h_gauge + (dir*2+oddBit)*mu_offset*gaugeSiteSize;
746  for (int i = 0; i < V; i++) {
747  unpack8(gT, d_gauge+i, dir, V+pad, i);
748  transposeScale(hg+i*18, gT, 1.0);
749  }
750  }
751  } else {
752  for (int dir = 0; dir < 4; dir++) {
753  Float *hg = h_gauge + (dir*2+oddBit)*mu_offset*gaugeSiteSize;
754  for (int i = 0; i < V; i++) {
755  unpack18(gT, d_gauge+i, dir, V+pad);
756  transposeScale(hg+i*18, gT, 1.0);
757  }
758  }
759  }
760 
761 }
762 
763 /*
764  Momentum packing/unpacking routines: these are for length 10
765  vectors, stored in Float2 format.
766  */
767 
768 template <typename Float, typename Float2>
769  inline void pack10(Float2 *res, Float *m, int dir, int Vh, int pad)
770 {
771  int stride = Vh + pad;
772  Float2 *r = res + dir*5*stride;
773  for (int j=0; j<5; j++) {
774  r[j*stride].x = m[j*2+0];
775  r[j*stride].y = m[j*2+1];
776  }
777 }
778 
779 template <typename Float, typename Float2>
780  void packMomField(Float2 *res, Float *mom, int oddBit, int Vh, int pad)
781 {
782  for (int dir = 0; dir < 4; dir++) {
783  Float *g = mom + (oddBit*Vh*4 + dir)*10;
784  for (int i = 0; i < Vh; i++) {
785  pack10(res+i, g + 4*i*10, dir, Vh, pad);
786  }
787  }
788 }
789 
790 template <typename Float, typename Float2>
791  inline void unpack10(Float* m, Float2 *res, int dir, int Vh, int pad)
792 {
793  int stride = Vh + pad;
794  Float2 *r = res + dir*5*stride;
795  for (int j=0; j<5; j++) {
796  m[j*2+0] = r[j*stride].x;
797  m[j*2+1] = r[j*stride].y;
798  }
799 }
800 
801 template <typename Float, typename Float2>
802  void unpackMomField(Float* mom, Float2 *res, int oddBit, int Vh, int pad)
803 {
804  int dir, i;
805  Float *m = mom + oddBit*Vh*10*4;
806 
807  for (i = 0; i < Vh; i++) {
808  for (dir = 0; dir < 4; dir++) {
809  Float* thismom = m + (4*i+dir)*10;
810  unpack10(thismom, res+i, dir, Vh, pad);
811  }
812  }
813 }