QUDA v0.4.0
A library for QCD on GPUs
// Routines used to pack the gauge field matrices

#include <math.h>
#include <typeinfo> // needed for the typeid() dispatch in packQDPGaugeField below

// 16-bit fixed-point representation used for half-precision storage:
// values in [-1, 1] are mapped linearly onto the full range of a short.
#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))

template <typename Float>
inline short FloatToShort(const Float &a) {
  return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
}

template <typename Float>
inline void ShortToFloat(Float &a, const short &b) {
  a = ((Float)b/SCALE_FLOAT-SHIFT_FLOAT);
}
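// Minimal host-side sketch (illustrative only, not part of the library;
// exampleShortRoundTrip is a hypothetical name) of the fixed-point mapping
// above: values in [-1, 1] survive a FloatToShort/ShortToFloat round trip
// with an absolute error of order 1/SHORT_LENGTH.
#include <stdio.h>

static void exampleShortRoundTrip() {
  for (double x = -1.0; x <= 1.0; x += 0.25) {
    short s = FloatToShort(x);
    double y;
    ShortToFloat(y, s);
    // expect |x - y| of order 2.0/(SHORT_LENGTH-1)
    printf("x = % .6f -> s = %6d -> y = % .6f (err = %.2e)\n",
           x, (int)s, y, fabs(x - y));
  }
}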
/*template <int N, typename FloatN, typename Float>
inline void pack8(FloatN *res, Float *g, int dir, int V) {
  Float *r = res + N*dir*4*V;
  r[0] = atan2(g[1], g[0]);
  r[1] = atan2(g[13], g[12]);
}*/

// Pack an SU(3) link into 8 real numbers: the phases of U00 and U20,
// plus the complex elements U01, U02 and U10 (g[2..7]).
template <typename Float>
inline void pack8(double2 *res, Float *g, int dir, int V) {
  double2 *r = res + dir*4*V;
  r[0].x = atan2(g[1], g[0]);
  r[0].y = atan2(g[13], g[12]);
  for (int j=1; j<4; j++) {
    r[j*V].x = g[2*j+0];
    r[j*V].y = g[2*j+1];
  }
}

template <typename Float>
inline void pack8(float4 *res, Float *g, int dir, int V) {
  float4 *r = res + dir*2*V;
  r[0].x = atan2(g[1], g[0]);
  r[0].y = atan2(g[13], g[12]);
  r[0].z = g[2];
  r[0].w = g[3];
  r[V].x = g[4];
  r[V].y = g[5];
  r[V].z = g[6];
  r[V].w = g[7];
}

template <typename Float>
inline void pack8(float2 *res, Float *g, int dir, int V) {
  float2 *r = res + dir*4*V;
  r[0].x = atan2(g[1], g[0]);
  r[0].y = atan2(g[13], g[12]);
  for (int j=1; j<4; j++) {
    r[j*V].x = g[2*j+0];
    r[j*V].y = g[2*j+1];
  }
}

// Half precision: phases are normalized by M_PI so that all stored values
// lie in [-1, 1] before the short conversion.
template <typename Float>
inline void pack8(short4 *res, Float *g, int dir, int V) {
  short4 *r = res + dir*2*V;
  r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
  r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
  r[0].z = FloatToShort(g[2]);
  r[0].w = FloatToShort(g[3]);
  r[V].x = FloatToShort(g[4]);
  r[V].y = FloatToShort(g[5]);
  r[V].z = FloatToShort(g[6]);
  r[V].w = FloatToShort(g[7]);
}

template <typename Float>
inline void pack8(short2 *res, Float *g, int dir, int V) {
  short2 *r = res + dir*4*V;
  r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
  r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
  for (int j=1; j<4; j++) {
    r[j*V].x = FloatToShort(g[2*j+0]);
    r[j*V].y = FloatToShort(g[2*j+1]);
  }
}

// Pack the first two rows of the link (12 real numbers).
template <typename Float>
inline void pack12(double2 *res, Float *g, int dir, int V) {
  double2 *r = res + dir*6*V;
  for (int j=0; j<6; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack12(float4 *res, Float *g, int dir, int V) {
  float4 *r = res + dir*3*V;
  for (int j=0; j<3; j++) {
    r[j*V].x = g[j*4+0];
    r[j*V].y = g[j*4+1];
    r[j*V].z = g[j*4+2];
    r[j*V].w = g[j*4+3];
  }
}

template <typename Float>
inline void pack12(float2 *res, Float *g, int dir, int V) {
  float2 *r = res + dir*6*V;
  for (int j=0; j<6; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack12(short4 *res, Float *g, int dir, int V) {
  short4 *r = res + dir*3*V;
  for (int j=0; j<3; j++) {
    r[j*V].x = FloatToShort(g[j*4+0]);
    r[j*V].y = FloatToShort(g[j*4+1]);
    r[j*V].z = FloatToShort(g[j*4+2]);
    r[j*V].w = FloatToShort(g[j*4+3]);
  }
}

template <typename Float>
inline void pack12(short2 *res, Float *g, int dir, int V) {
  short2 *r = res + dir*6*V;
  for (int j=0; j<6; j++) {
    r[j*V].x = FloatToShort(g[j*2+0]);
    r[j*V].y = FloatToShort(g[j*2+1]);
  }
}

// Pack all 18 real numbers of the link (no reconstruction). For the
// 4-component types, 18 does not divide evenly, so the fifth vector
// carries the last two elements plus zero padding.
template <typename Float>
inline void pack18(double2 *res, Float *g, int dir, int V) {
  double2 *r = res + dir*9*V;
  for (int j=0; j<9; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack18(float4 *res, Float *g, int dir, int V) {
  float4 *r = res + dir*5*V;
  for (int j=0; j<4; j++) {
    r[j*V].x = g[j*4+0];
    r[j*V].y = g[j*4+1];
    r[j*V].z = g[j*4+2];
    r[j*V].w = g[j*4+3];
  }
  r[4*V].x = g[16];
  r[4*V].y = g[17];
  r[4*V].z = 0.0;
  r[4*V].w = 0.0;
}

template <typename Float>
inline void pack18(float2 *res, Float *g, int dir, int V) {
  float2 *r = res + dir*9*V;
  for (int j=0; j<9; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack18(short4 *res, Float *g, int dir, int V) {
  short4 *r = res + dir*5*V;
  for (int j=0; j<4; j++) {
    r[j*V].x = FloatToShort(g[j*4+0]);
    r[j*V].y = FloatToShort(g[j*4+1]);
    r[j*V].z = FloatToShort(g[j*4+2]);
    r[j*V].w = FloatToShort(g[j*4+3]);
  }
  r[4*V].x = FloatToShort(g[16]);
  r[4*V].y = FloatToShort(g[17]);
  r[4*V].z = (short)0;
  r[4*V].w = (short)0;
}

template <typename Float>
inline void pack18(short2 *res, Float *g, int dir, int V)
{
  short2 *r = res + dir*9*V;
  for (int j=0; j<9; j++) {
    r[j*V].x = FloatToShort(g[j*2+0]);
    r[j*V].y = FloatToShort(g[j*2+1]);
  }
}

// Fat links are not unitary, so their elements are rescaled by the global
// maximum fat_link_max_ to fit the [-1, 1] range of the short format.
template<typename Float>
inline void fatlink_short_pack18(short2 *d_gauge, Float *h_gauge, int dir, int V)
{
  short2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    dg[j*V].x = FloatToShort((h_gauge[j*2+0]/fat_link_max_));
    dg[j*V].y = FloatToShort((h_gauge[j*2+1]/fat_link_max_));
  }
}


// a += sign*b*c (complex multiply-accumulate on re/im pairs)
template <typename Float>
inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
}

// a = conj(b)*c
template <typename Float>
inline void complexDotProduct(Float *a, Float *b, Float *c) {
  a[0] = b[0]*c[0] + b[1]*c[1];
  a[1] = b[0]*c[1] - b[1]*c[0];
}

// a += sign * conj(b) * conj(c)
template <typename Float>
inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
}

// a = conj(b)*conj(c)
template <typename Float>
inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
  a[0] = b[0]*c[0] - b[1]*c[1];
  a[1] = -b[0]*c[1] - b[1]*c[0];
}
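// A small standalone check (illustrative only; exampleConjugateProduct is a
// hypothetical name) of the helper above: with sign = +1,
// accumulateConjugateProduct adds conj(b)*conj(c) to a.
static void exampleConjugateProduct() {
  double a[2] = {0.0, 0.0};
  double b[2] = {1.0, 2.0};   // b = 1 + 2i
  double c[2] = {3.0, -1.0};  // c = 3 - i
  accumulateConjugateProduct(a, b, c, +1);
  // conj(b)*conj(c) = (1 - 2i)(3 + i) = 5 - 5i, so a is now {5.0, -5.0}
}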
// Routines used to unpack the gauge field matrices

// Reconstruct the full 18-real link from the 8 stored numbers. The magnitudes
// of U00 and U20 are fixed by the (u0-scaled) normalization of the first row
// and first column; the remaining four elements follow from an SU(2) rotation.
template <typename Float>
inline void reconstruct8(Float *mat, int dir, int idx) {
  // First reconstruct first row
  Float row_sum = 0.0;
  row_sum += mat[2]*mat[2];
  row_sum += mat[3]*mat[3];
  row_sum += mat[4]*mat[4];
  row_sum += mat[5]*mat[5];
  Float u0 = (dir < 3 ? anisotropy_ :
              (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
  Float diff = 1.f/(u0*u0) - row_sum;
  Float U00_mag = sqrt(diff >= 0 ? diff : 0.0);

  // mat[0] and mat[1] arrive holding the phases of U00 and U20; stash them
  // in the not-yet-used third-row slots before overwriting them.
  mat[14] = mat[0];
  mat[15] = mat[1];

  mat[0] = U00_mag * cos(mat[14]);
  mat[1] = U00_mag * sin(mat[14]);

  Float column_sum = 0.0;
  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
  diff = 1.f/(u0*u0) - column_sum;
  Float U20_mag = sqrt(diff >= 0 ? diff : 0.0);

  mat[12] = U20_mag * cos(mat[15]);
  mat[13] = U20_mag * sin(mat[15]);

  // First column now restored

  // finally reconstruct last elements from SU(2) rotation
  Float r_inv2 = 1.0/(u0*row_sum);

  // U11
  Float A[2];
  complexDotProduct(A, mat+0, mat+6);
  complexConjugateProduct(mat+8, mat+12, mat+4);
  accumulateComplexProduct(mat+8, A, mat+2, u0);
  mat[8] *= -r_inv2;
  mat[9] *= -r_inv2;

  // U12
  complexConjugateProduct(mat+10, mat+12, mat+2);
  accumulateComplexProduct(mat+10, A, mat+4, -u0);
  mat[10] *= r_inv2;
  mat[11] *= r_inv2;

  // U21
  complexDotProduct(A, mat+0, mat+12);
  complexConjugateProduct(mat+14, mat+6, mat+4);
  accumulateComplexProduct(mat+14, A, mat+2, -u0);
  mat[14] *= r_inv2;
  mat[15] *= r_inv2;

  // U22
  complexConjugateProduct(mat+16, mat+6, mat+2);
  accumulateComplexProduct(mat+16, A, mat+4, u0);
  mat[16] *= -r_inv2;
  mat[17] *= -r_inv2;
}

template <typename Float>
inline void unpack8(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
  double2 *dg = d_gauge + dir*4*V;
  for (int j=0; j<4; j++) {
    h_gauge[2*j+0] = dg[j*V].x;
    h_gauge[2*j+1] = dg[j*V].y;
  }
  reconstruct8(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack8(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
  float4 *dg = d_gauge + dir*2*V;
  h_gauge[0] = dg[0].x;
  h_gauge[1] = dg[0].y;
  h_gauge[2] = dg[0].z;
  h_gauge[3] = dg[0].w;
  h_gauge[4] = dg[V].x;
  h_gauge[5] = dg[V].y;
  h_gauge[6] = dg[V].z;
  h_gauge[7] = dg[V].w;
  reconstruct8(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack8(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
  float2 *dg = d_gauge + dir*4*V;
  for (int j=0; j<4; j++) {
    h_gauge[2*j+0] = dg[j*V].x;
    h_gauge[2*j+1] = dg[j*V].y;
  }
  reconstruct8(h_gauge, dir, idx);
}

// Half precision: undo the 1/M_PI normalization of the two stored phases.
template <typename Float>
inline void unpack8(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
  short4 *dg = d_gauge + dir*2*V;
  ShortToFloat(h_gauge[0], dg[0].x);
  ShortToFloat(h_gauge[1], dg[0].y);
  ShortToFloat(h_gauge[2], dg[0].z);
  ShortToFloat(h_gauge[3], dg[0].w);
  ShortToFloat(h_gauge[4], dg[V].x);
  ShortToFloat(h_gauge[5], dg[V].y);
  ShortToFloat(h_gauge[6], dg[V].z);
  ShortToFloat(h_gauge[7], dg[V].w);
  h_gauge[0] *= M_PI;
  h_gauge[1] *= M_PI;
  reconstruct8(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack8(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
  short2 *dg = d_gauge + dir*4*V;
  for (int j=0; j<4; j++) {
    ShortToFloat(h_gauge[2*j+0], dg[j*V].x);
    ShortToFloat(h_gauge[2*j+1], dg[j*V].y);
  }
  h_gauge[0] *= M_PI;
  h_gauge[1] *= M_PI;
  reconstruct8(h_gauge, dir, idx);
}
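// Hedged usage sketch (not library code; exampleUnpack8 and its arguments are
// illustrative): recovering one full 18-real link from the 8-real compressed
// device layout. h_gauge must provide all 18 slots, since reconstruct8 writes
// beyond the 8 unpacked values.
static void exampleUnpack8(float4 *d_gauge, int dir, int V, int idx) {
  double h_gauge[18]; // 3x3 complex link, row-major re/im pairs
  unpack8(h_gauge, d_gauge, dir, V, idx);
  // h_gauge[0..5]: first row, h_gauge[6..11]: second, h_gauge[12..17]: third
}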
// Reconstruct the third row as the complex-conjugated cross product of the
// first two rows, w_i = conj(u_j v_k - u_k v_j); doing this with complex
// numbers simplifies the expressions. The u0 factor restores the anisotropy
// or temporal boundary scaling.
template <typename Float>
inline void reconstruct12(Float *mat, int dir, int idx) {
  Float *u = &mat[0*(3*2)];
  Float *v = &mat[1*(3*2)];
  Float *w = &mat[2*(3*2)];
  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
  Float u0 = (dir < 3 ? anisotropy_ :
              (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
}
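// A standalone sketch of the idea (illustrative, hypothetical function name):
// for U in SU(3), row 3 = conj(row 1 x row 2). With the identity's first two
// rows, the reconstruction above yields the identity's third row.
static void exampleThirdRowFromCrossProduct() {
  double mat[18] = { 1,0, 0,0, 0,0,    // u = (1, 0, 0)
                     0,0, 1,0, 0,0,    // v = (0, 1, 0)
                     0,0, 0,0, 0,0 };  // w, reconstructed below
  double *u = mat, *v = mat + 6, *w = mat + 12;
  accumulateConjugateProduct(w+0, u+2, v+4, +1); // w0 += conj(u1)*conj(v2)
  accumulateConjugateProduct(w+0, u+4, v+2, -1); // w0 -= conj(u2)*conj(v1)
  accumulateConjugateProduct(w+2, u+4, v+0, +1);
  accumulateConjugateProduct(w+2, u+0, v+4, -1);
  accumulateConjugateProduct(w+4, u+0, v+2, +1);
  accumulateConjugateProduct(w+4, u+2, v+0, -1);
  // w is now (0, 0, 1), completing the identity matrix
}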
template <typename Float>
inline void unpack12(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
  double2 *dg = d_gauge + dir*6*V;
  for (int j=0; j<6; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
  float4 *dg = d_gauge + dir*3*V;
  for (int j=0; j<3; j++) {
    h_gauge[j*4+0] = dg[j*V].x;
    h_gauge[j*4+1] = dg[j*V].y;
    h_gauge[j*4+2] = dg[j*V].z;
    h_gauge[j*4+3] = dg[j*V].w;
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
  float2 *dg = d_gauge + dir*6*V;
  for (int j=0; j<6; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
  short4 *dg = d_gauge + dir*3*V;
  for (int j=0; j<3; j++) {
    ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
    ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
    ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
  short2 *dg = d_gauge + dir*6*V;
  for (int j=0; j<6; j++) {
    ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack18(Float *h_gauge, double2 *d_gauge, int dir, int V) {
  double2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
}

template <typename Float>
inline void unpack18(Float *h_gauge, float4 *d_gauge, int dir, int V) {
  float4 *dg = d_gauge + dir*5*V;
  for (int j=0; j<4; j++) {
    h_gauge[j*4+0] = dg[j*V].x;
    h_gauge[j*4+1] = dg[j*V].y;
    h_gauge[j*4+2] = dg[j*V].z;
    h_gauge[j*4+3] = dg[j*V].w;
  }
  h_gauge[16] = dg[4*V].x;
  h_gauge[17] = dg[4*V].y;
}

template <typename Float>
inline void unpack18(Float *h_gauge, float2 *d_gauge, int dir, int V) {
  float2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
}

template <typename Float>
inline void unpack18(Float *h_gauge, short4 *d_gauge, int dir, int V) {
  short4 *dg = d_gauge + dir*5*V;
  for (int j=0; j<4; j++) {
    ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
    ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
    ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
  }
  ShortToFloat(h_gauge[16], dg[4*V].x);
  ShortToFloat(h_gauge[17], dg[4*V].y);
}

template <typename Float>
inline void unpack18(Float *h_gauge, short2 *d_gauge, int dir, int V) {
  short2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
  }
}


// Assume the gauge field is "QDP" ordered: directions outside of
// space-time, row-column ordering, even-odd space-time.
// offset = 0 and voxels[i] = Vh for the body;
// offset = Vh and voxels[i] = the i-th face volume for the faces.
template <typename Float, typename FloatN>
static void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit,
                              QudaReconstructType reconstruct, int Vh, int *voxels,
                              int pad, int offset, int nFace, QudaLinkType type) {
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      int nMat = nFace*voxels[dir];
      Float *g = h_gauge[dir] + oddBit*nMat*18;
      for (int i = 0; i < nMat; i++) pack12(d_gauge+offset+i, g+i*18, dir, Vh+pad);
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      int nMat = nFace*voxels[dir];
      Float *g = h_gauge[dir] + oddBit*nMat*18;
      for (int i = 0; i < nMat; i++) pack8(d_gauge+offset+i, g+i*18, dir, Vh+pad);
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      int nMat = nFace*voxels[dir];
      Float *g = h_gauge[dir] + oddBit*nMat*18;
      if (type == QUDA_ASQTAD_FAT_LINKS && typeid(FloatN) == typeid(short2)) {
        // we know it is half precision with fat links at this stage
        for (int i = 0; i < nMat; i++)
          fatlink_short_pack18((short2*)(d_gauge+offset+i), g+i*18, dir, Vh+pad);
      } else {
        for (int i = 0; i < nMat; i++) pack18(d_gauge+offset+i, g+i*18, dir, Vh+pad);
      }
    }
  }
}

// Assume the gauge field is "QDP" ordered: directions outside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
static void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit,
                                QudaReconstructType reconstruct, int V, int pad) {
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge[dir] + oddBit*V*18;
      for (int i = 0; i < V; i++) unpack12(g+i*18, d_gauge+i, dir, V+pad, i);
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge[dir] + oddBit*V*18;
      for (int i = 0; i < V; i++) unpack8(g+i*18, d_gauge+i, dir, V+pad, i);
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge[dir] + oddBit*V*18;
      for (int i = 0; i < V; i++) unpack18(g+i*18, d_gauge+i, dir, V+pad);
    }
  }
}
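// Hedged usage sketch (parameter choices and names are illustrative, not
// prescriptive): packing the even-parity body of a QDP-ordered field into a
// reconstruct-12 device layout. Each h_gauge[dir] holds 18 reals per site;
// selecting the body means voxels[i] = Vh, offset = 0 and nFace = 1.
// QUDA_WILSON_LINKS is assumed to be the QudaLinkType for ordinary links
// (only QUDA_ASQTAD_FAT_LINKS triggers the special half-precision path above).
static void examplePackQDPBody(float4 *d_even, double **h_gauge, int Vh, int pad) {
  int voxels[4] = {Vh, Vh, Vh, Vh};
  packQDPGaugeField(d_even, h_gauge, 0 /*even parity*/, QUDA_RECONSTRUCT_12,
                    Vh, voxels, pad, 0 /*offset: body*/, 1 /*nFace*/,
                    QUDA_WILSON_LINKS);
}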
// transpose and scale the matrix: gT = a * transpose(g)
template <typename Float, typename Float2>
static void transposeScale(Float *gT, Float *g, const Float2 &a) {
  for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
    gT[(ic*3+jc)*2+r] = a*g[(jc*3+ic)*2+r];
}

// Assume the gauge field is "Wilson" (CPS) ordered: directions inside of
// space-time, column-row ordering, even-odd space-time
template <typename Float, typename FloatN>
static void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit,
                              QudaReconstructType reconstruct, int V, int pad) {
  Float gT[18];
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
        pack12(d_gauge+i, gT, dir, V+pad);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
        pack8(d_gauge+i, gT, dir, V+pad);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
        pack18(d_gauge+i, gT, dir, V+pad);
      }
    }
  }
}

// Assume the gauge field is "Wilson" (CPS) ordered: directions inside of
// space-time, column-row ordering, even-odd space-time
template <typename Float, typename FloatN>
static void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
                                QudaReconstructType reconstruct, int V, int pad) {
  Float gT[18];
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack12(gT, d_gauge+i, dir, V+pad, i);
        transposeScale(hg+4*i*18, gT, anisotropy_);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack8(gT, d_gauge+i, dir, V+pad, i);
        transposeScale(hg+4*i*18, gT, anisotropy_);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack18(gT, d_gauge+i, dir, V+pad);
        transposeScale(hg+4*i*18, gT, anisotropy_);
      }
    }
  }
}


// Assume the gauge field is MILC ordered: directions inside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
void packMILCGaugeField(FloatN *res, Float *gauge, int oddBit,
                        QudaReconstructType reconstruct, int Vh, int pad)
{
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
      for (int i = 0; i < Vh; i++) {
        pack12(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
      for (int i = 0; i < Vh; i++) {
        pack8(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
      for (int i = 0; i < Vh; i++) {
        pack18(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
      }
    }
  }
}
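// Quick illustrative check (hypothetical function name) of transposeScale,
// defined above: it writes gT = a * g^T for a 3x3 complex matrix stored as
// re/im pairs in row-major order, which is how the CPS column-row convention
// is converted to the internal row-column convention.
static void exampleTransposeScale() {
  double g[18], gT[18];
  for (int i = 0; i < 18; i++) g[i] = (double)i;
  transposeScale(gT, g, 2.0);
  // element (ic,jc) of gT is twice element (jc,ic) of g, e.g.
  // gT[(0*3+1)*2+0] == 2.0*g[(1*3+0)*2+0]
}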
// Assume the gauge field is MILC ordered: directions inside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
static void unpackMILCGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
                                 QudaReconstructType reconstruct, int V, int pad) {
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack12(hg+4*i*18, d_gauge+i, dir, V+pad, i);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack8(hg+4*i*18, d_gauge+i, dir, V+pad, i);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack18(hg+4*i*18, d_gauge+i, dir, V+pad);
      }
    }
  }
}

/*
  Momentum packing/unpacking routines: these are for length-10
  vectors, stored in Float2 format.
*/

template <typename Float, typename Float2>
inline void pack10(Float2 *res, Float *m, int dir, int Vh, int pad)
{
  int stride = Vh + pad;
  Float2 *r = res + dir*5*stride;
  for (int j=0; j<5; j++) {
    r[j*stride].x = m[j*2+0];
    r[j*stride].y = m[j*2+1];
  }
}

template <typename Float, typename Float2>
void packMomField(Float2 *res, Float *mom, int oddBit, int Vh, int pad)
{
  for (int dir = 0; dir < 4; dir++) {
    Float *g = mom + (oddBit*Vh*4 + dir)*10;
    for (int i = 0; i < Vh; i++) {
      pack10(res+i, g + 4*i*10, dir, Vh, pad);
    }
  }
}

template <typename Float, typename Float2>
inline void unpack10(Float *m, Float2 *res, int dir, int Vh, int pad)
{
  int stride = Vh + pad;
  Float2 *r = res + dir*5*stride;
  for (int j=0; j<5; j++) {
    m[j*2+0] = r[j*stride].x;
    m[j*2+1] = r[j*stride].y;
  }
}

template <typename Float, typename Float2>
void unpackMomField(Float *mom, Float2 *res, int oddBit, int Vh, int pad)
{
  Float *m = mom + oddBit*Vh*10*4;
  for (int i = 0; i < Vh; i++) {
    for (int dir = 0; dir < 4; dir++) {
      Float *thismom = m + (4*i+dir)*10;
      unpack10(thismom, res+i, dir, Vh, pad);
    }
  }
}
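// Hedged round-trip sketch (illustrative; exampleMomRoundTrip is not library
// code): each site and direction carries 10 reals, stored device-side as five
// Float2 pairs with stride Vh+pad. Packing and then unpacking one parity
// reproduces the host momentum field exactly, since only copies are involved.
static void exampleMomRoundTrip(double *mom, double2 *d_mom, int Vh, int pad) {
  packMomField(d_mom, mom, 0 /*even parity*/, Vh, pad);
  unpackMomField(mom, d_mom, 0 /*even parity*/, Vh, pad);
}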