QUDA v0.3.2
A library for QCD on GPUs

quda/lib/gauge_quda.cpp

Go to the documentation of this file.
00001 #include <stdlib.h>
00002 #include <stdio.h>
00003 #include <math.h>
00004 #include <quda.h>
00005 #include <gauge_quda.h>
00006 #include <quda_internal.h>
00007 
00008 #define SHORT_LENGTH 65536
00009 #define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
00010 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))
00011 
00012 static double Anisotropy;
00013 static QudaTboundary tBoundary;
00014 static int X[4];
00015 
00016 template <typename Float>
00017 inline short FloatToShort(const Float &a) {
00018   return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
00019 }
00020 
00021 template <typename Float>
00022 inline void ShortToFloat(Float &a, const short &b) {
00023   a = ((Float)b/SCALE_FLOAT-SHIFT_FLOAT);
00024 }
00025 
00026 // Routines used to pack the gauge field matrices
00027 
00028 template <typename Float>
00029 inline void pack8(double2 *res, Float *g, int dir, int V) {
00030   double2 *r = res + dir*4*V;
00031   r[0].x = atan2(g[1], g[0]);
00032   r[0].y = atan2(g[13], g[12]);
00033   for (int j=1; j<4; j++) {
00034     r[j*V].x = g[2*j+0];
00035     r[j*V].y = g[2*j+1];
00036   }
00037 }
00038 
00039 template <typename Float>
00040 inline void pack8(float4 *res, Float *g, int dir, int V) {
00041   float4 *r = res + dir*2*V;
00042   r[0].x = atan2(g[1], g[0]);
00043   r[0].y = atan2(g[13], g[12]);
00044   r[0].z = g[2];
00045   r[0].w = g[3];
00046   r[V].x = g[4];
00047   r[V].y = g[5];
00048   r[V].z = g[6];
00049   r[V].w = g[7];
00050 }
00051 
00052 template <typename Float>
00053 inline void pack8(float2 *res, Float *g, int dir, int V) {
00054   float2 *r = res + dir*4*V;
00055   r[0].x = atan2(g[1], g[0]);
00056   r[0].y = atan2(g[13], g[12]);
00057   for (int j=1; j<4; j++) {
00058     r[j*V].x = g[2*j+0];
00059     r[j*V].y = g[2*j+1];
00060   }
00061 }
00062 
00063 template <typename Float>
00064 inline void pack8(short4 *res, Float *g, int dir, int V) {
00065   short4 *r = res + dir*2*V;
00066   r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
00067   r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
00068   r[0].z = FloatToShort(g[2]);
00069   r[0].w = FloatToShort(g[3]);
00070   r[V].x = FloatToShort(g[4]);
00071   r[V].y = FloatToShort(g[5]);
00072   r[V].z = FloatToShort(g[6]);
00073   r[V].w = FloatToShort(g[7]);
00074 }
00075 
00076 template <typename Float>
00077 inline void pack8(short2 *res, Float *g, int dir, int V) {
00078   short2 *r = res + dir*4*V;
00079   r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
00080   r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
00081   for (int j=1; j<4; j++) {
00082     r[j*V].x = FloatToShort(g[2*j+0]);
00083     r[j*V].y = FloatToShort(g[2*j+1]);
00084   }
00085 }
00086 
00087 template <typename Float>
00088 inline void pack12(double2 *res, Float *g, int dir, int V) {
00089   double2 *r = res + dir*6*V;
00090   for (int j=0; j<6; j++) {
00091     r[j*V].x = g[j*2+0];
00092     r[j*V].y = g[j*2+1];
00093   }
00094 }
00095 
00096 template <typename Float>
00097 inline void pack12(float4 *res, Float *g, int dir, int V) {
00098   float4 *r = res + dir*3*V;
00099   for (int j=0; j<3; j++) {
00100     r[j*V].x = g[j*4+0]; 
00101     r[j*V].y = g[j*4+1];
00102     r[j*V].z = g[j*4+2]; 
00103     r[j*V].w = g[j*4+3];
00104   }
00105 }
00106 
00107 template <typename Float>
00108 inline void pack12(float2 *res, Float *g, int dir, int V) {
00109   float2 *r = res + dir*6*V;
00110   for (int j=0; j<6; j++) {
00111     r[j*V].x = g[j*2+0];
00112     r[j*V].y = g[j*2+1];
00113   }
00114 }
00115 
00116 template <typename Float>
00117 inline void pack12(short4 *res, Float *g, int dir, int V) {
00118   short4 *r = res + dir*3*V;
00119   for (int j=0; j<3; j++) {
00120     r[j*V].x = FloatToShort(g[j*4+0]); 
00121     r[j*V].y = FloatToShort(g[j*4+1]);
00122     r[j*V].z = FloatToShort(g[j*4+2]);
00123     r[j*V].w = FloatToShort(g[j*4+3]);
00124   }
00125 }
00126 
00127 template <typename Float>
00128 inline void pack12(short2 *res, Float *g, int dir, int V) {
00129   short2 *r = res + dir*6*V;
00130   for (int j=0; j<6; j++) {
00131     r[j*V].x = FloatToShort(g[j*2+0]);
00132     r[j*V].y = FloatToShort(g[j*2+1]);
00133   }
00134 }
00135 
00136 template <typename Float>
00137 inline void pack18(double2 *res, Float *g, int dir, int V) {
00138   double2 *r = res + dir*9*V;
00139   for (int j=0; j<9; j++) {
00140     r[j*V].x = g[j*2+0]; 
00141     r[j*V].y = g[j*2+1]; 
00142   }
00143 }
00144 
00145 template <typename Float>
00146 inline void pack18(float4 *res, Float *g, int dir, int V) {
00147   float4 *r = res + dir*5*V;
00148   for (int j=0; j<4; j++) {
00149     r[j*V].x = g[j*4+0]; 
00150     r[j*V].y = g[j*4+1]; 
00151     r[j*V].z = g[j*4+2]; 
00152     r[j*V].w = g[j*4+3]; 
00153   }
00154   r[4*V].x = g[16]; 
00155   r[4*V].y = g[17]; 
00156   r[4*V].z = 0.0;
00157   r[4*V].w = 0.0;
00158 }
00159 
00160 template <typename Float>
00161 inline void pack18(float2 *res, Float *g, int dir, int V) {
00162   float2 *r = res + dir*9*V;
00163   for (int j=0; j<9; j++) {
00164     r[j*V].x = g[j*2+0]; 
00165     r[j*V].y = g[j*2+1]; 
00166   }
00167 }
00168 
00169 template <typename Float>
00170 inline void pack18(short4 *res, Float *g, int dir, int V) {
00171   short4 *r = res + dir*5*V;
00172   for (int j=0; j<4; j++) {
00173     r[j*V].x = FloatToShort(g[j*4+0]); 
00174     r[j*V].y = FloatToShort(g[j*4+1]); 
00175     r[j*V].z = FloatToShort(g[j*4+2]); 
00176     r[j*V].w = FloatToShort(g[j*4+3]); 
00177   }
00178   r[4*V].x = FloatToShort(g[16]); 
00179   r[4*V].y = FloatToShort(g[17]); 
00180   r[4*V].z = (short)0;
00181   r[4*V].w = (short)0;
00182 }
00183 
00184 template <typename Float>
00185 inline void pack18(short2 *res, Float *g, int dir, int V) 
00186 {
00187   short2 *r = res + dir*9*V;
00188   for (int j=0; j<9; j++) {
00189     r[j*V].x = FloatToShort(g[j*2+0]); 
00190     r[j*V].y = FloatToShort(g[j*2+1]); 
00191   }
00192 }
00193 
// a += sign * (b*c), complex arithmetic on {re, im} pairs.
template <typename Float>
inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
  const Float re = b[0]*c[0] - b[1]*c[1];
  const Float im = b[0]*c[1] + b[1]*c[0];
  a[0] += sign * re;
  a[1] += sign * im;
}
00200 
// a = conj(b) * c, complex arithmetic on {re, im} pairs.
template <typename Float>
inline void complexDotProduct(Float *a, Float *b, Float *c) {
  const Float re = b[0]*c[0] + b[1]*c[1];
  const Float im = b[0]*c[1] - b[1]*c[0];
  a[0] = re;
  a[1] = im;
}
00207 
// a += sign * conj(b) * conj(c), complex arithmetic on {re, im} pairs.
template <typename Float>
inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
  const Float re = b[0]*c[0] - b[1]*c[1];
  const Float im = b[0]*c[1] + b[1]*c[0];
  a[0] += sign * re;
  a[1] -= sign * im;
}
00214 
// a = conj(b) * conj(c), complex arithmetic on {re, im} pairs.
template <typename Float>
inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
  const Float re = b[0]*c[0] - b[1]*c[1];
  const Float im = b[0]*c[1] + b[1]*c[0];
  a[0] = re;
  a[1] = -im;
}
00221 
00222 
00223 // Routines used to unpack the gauge field matrices
// Rebuild the full 18-real (3x3 complex) link matrix in place from the
// 8-real compressed form produced by pack8. On entry mat[0..7] hold
// {phase(U00), phase(U20), U01, U02} (phases already rescaled by pi by the
// half-precision unpackers); on exit mat[0..17] is the full matrix.
// NOTE(review): relies on file-scope globals Anisotropy, tBoundary and X[]
// being set by createGaugeField before any unpacking happens.
template <typename Float>
inline void reconstruct8(Float *mat, int dir, int idx) {
  // First reconstruct first row
  Float row_sum = 0.0;
  row_sum += mat[2]*mat[2];
  row_sum += mat[3]*mat[3];
  row_sum += mat[4]*mat[4];
  row_sum += mat[5]*mat[5];
  // Scale factor baked into the stored links: anisotropy for spatial dirs;
  // for the time direction, the boundary factor on the last timeslice
  // (idx is an even/odd sub-lattice site index, hence the /2).
  Float u0 = (dir < 3 ? Anisotropy : (idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ? tBoundary : 1));
  Float diff = 1.f/(u0*u0) - row_sum;
  Float U00_mag = sqrt(diff >= 0 ? diff : 0.0);  // clamp: guard against rounding below zero

  // Stash the two stored phases in (as yet unused) slots 14/15.
  mat[14] = mat[0];
  mat[15] = mat[1];

  // U00 = |U00| * exp(i*phase)
  mat[0] = U00_mag * cos(mat[14]);
  mat[1] = U00_mag * sin(mat[14]);

  Float column_sum = 0.0;
  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
  diff = 1.f/(u0*u0) - column_sum;
  Float U20_mag = sqrt(diff >= 0 ? diff : 0.0);

  // U20 = |U20| * exp(i*phase)
  mat[12] = U20_mag * cos(mat[15]);
  mat[13] = U20_mag * sin(mat[15]);

  // First column now restored

  // finally reconstruct last elements from SU(2) rotation
  Float r_inv2 = 1.0/(u0*row_sum);

  // U11
  Float A[2];
  complexDotProduct(A, mat+0, mat+6);
  complexConjugateProduct(mat+8, mat+12, mat+4);
  accumulateComplexProduct(mat+8, A, mat+2, u0);
  mat[8] *= -r_inv2;
  mat[9] *= -r_inv2;

  // U12
  complexConjugateProduct(mat+10, mat+12, mat+2);
  accumulateComplexProduct(mat+10, A, mat+4, -u0);
  mat[10] *= r_inv2;
  mat[11] *= r_inv2;

  // U21
  complexDotProduct(A, mat+0, mat+12);
  complexConjugateProduct(mat+14, mat+6, mat+4);
  accumulateComplexProduct(mat+14, A, mat+2, -u0);
  mat[14] *= r_inv2;
  mat[15] *= r_inv2;

  // U22 (mat+16 is the last element; earlier comment said U12 in error)
  complexConjugateProduct(mat+16, mat+6, mat+2);
  accumulateComplexProduct(mat+16, A, mat+4, u0);
  mat[16] *= -r_inv2;
  mat[17] *= -r_inv2;
}
00283 
00284 template <typename Float>
00285 inline void unpack8(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
00286   double2 *dg = d_gauge + dir*4*V;
00287   for (int j=0; j<4; j++) {
00288     h_gauge[2*j+0] = dg[j*V].x;
00289     h_gauge[2*j+1] = dg[j*V].y;
00290   }
00291   reconstruct8(h_gauge, dir, idx);
00292 }
00293 
00294 template <typename Float>
00295 inline void unpack8(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
00296   float4 *dg = d_gauge + dir*2*V;
00297   h_gauge[0] = dg[0].x;
00298   h_gauge[1] = dg[0].y;
00299   h_gauge[2] = dg[0].z;
00300   h_gauge[3] = dg[0].w;
00301   h_gauge[4] = dg[V].x;
00302   h_gauge[5] = dg[V].y;
00303   h_gauge[6] = dg[V].z;
00304   h_gauge[7] = dg[V].w;
00305   reconstruct8(h_gauge, dir, idx);
00306 }
00307 
00308 template <typename Float>
00309 inline void unpack8(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
00310   float2 *dg = d_gauge + dir*4*V;
00311   for (int j=0; j<4; j++) {
00312     h_gauge[2*j+0] = dg[j*V].x;
00313     h_gauge[2*j+1] = dg[j*V].y;
00314   }
00315   reconstruct8(h_gauge, dir, idx);
00316 }
00317 
00318 template <typename Float>
00319 inline void unpack8(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
00320   short4 *dg = d_gauge + dir*2*V;
00321   ShortToFloat(h_gauge[0], dg[0].x);
00322   ShortToFloat(h_gauge[1], dg[0].y);
00323   ShortToFloat(h_gauge[2], dg[0].z);
00324   ShortToFloat(h_gauge[3], dg[0].w);
00325   ShortToFloat(h_gauge[4], dg[V].x);
00326   ShortToFloat(h_gauge[5], dg[V].y);
00327   ShortToFloat(h_gauge[6], dg[V].z);
00328   ShortToFloat(h_gauge[7], dg[V].w);
00329   h_gauge[0] *= M_PI;
00330   h_gauge[1] *= M_PI;
00331   reconstruct8(h_gauge, dir, idx);
00332 }
00333 
00334 template <typename Float>
00335 inline void unpack8(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
00336   short2 *dg = d_gauge + dir*4*V;
00337   for (int j=0; j<4; j++) {
00338     ShortToFloat(h_gauge[2*j+0], dg[j*V].x);
00339     ShortToFloat(h_gauge[2*j+1], dg[j*V].y);
00340   }
00341   h_gauge[0] *= M_PI;
00342   h_gauge[1] *= M_PI;
00343   reconstruct8(h_gauge, dir, idx);
00344 }
00345 
// do this using complex numbers (simplifies)
// Recover the third row of the link matrix from the first two: each element
// of w is built from conjugate cross-products of rows u and v, then the row
// is scaled by the anisotropy (spatial dirs) or the temporal boundary factor
// on the last timeslice. Relies on file-scope Anisotropy/tBoundary/X[].
template <typename Float>
inline void reconstruct12(Float *mat, int dir, int idx) {
  Float *u = &mat[0*(3*2)];
  Float *v = &mat[1*(3*2)];
  Float *w = &mat[2*(3*2)];
  // Third row starts cleared; the accumulate calls below build it up.
  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
  // Same scale-factor rule as reconstruct8 (idx is an even/odd site index).
  Float u0 = (dir < 3 ? Anisotropy :
              (idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ? tBoundary : 1));
  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
}
00363 
00364 template <typename Float>
00365 inline void unpack12(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
00366   double2 *dg = d_gauge + dir*6*V;
00367   for (int j=0; j<6; j++) {
00368     h_gauge[j*2+0] = dg[j*V].x;
00369     h_gauge[j*2+1] = dg[j*V].y; 
00370   }
00371   reconstruct12(h_gauge, dir, idx);
00372 }
00373 
00374 template <typename Float>
00375 inline void unpack12(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
00376   float4 *dg = d_gauge + dir*3*V;
00377   for (int j=0; j<3; j++) {
00378     h_gauge[j*4+0] = dg[j*V].x;
00379     h_gauge[j*4+1] = dg[j*V].y; 
00380     h_gauge[j*4+2] = dg[j*V].z;
00381     h_gauge[j*4+3] = dg[j*V].w; 
00382   }
00383   reconstruct12(h_gauge, dir, idx);
00384 }
00385 
00386 template <typename Float>
00387 inline void unpack12(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
00388   float2 *dg = d_gauge + dir*6*V;
00389   for (int j=0; j<6; j++) {
00390     h_gauge[j*2+0] = dg[j*V].x;
00391     h_gauge[j*2+1] = dg[j*V].y; 
00392   }
00393   reconstruct12(h_gauge, dir, idx);
00394 }
00395 
00396 template <typename Float>
00397 inline void unpack12(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
00398   short4 *dg = d_gauge + dir*3*V;
00399   for (int j=0; j<3; j++) {
00400     ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
00401     ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
00402     ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
00403     ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
00404   }
00405   reconstruct12(h_gauge, dir, idx);
00406 }
00407 
00408 template <typename Float>
00409 inline void unpack12(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
00410   short2 *dg = d_gauge + dir*6*V;
00411   for (int j=0; j<6; j++) {
00412     ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
00413     ShortToFloat(h_gauge[j*2+1], dg[j*V].y); 
00414   }
00415   reconstruct12(h_gauge, dir, idx);
00416 }
00417 
00418 template <typename Float>
00419 inline void unpack18(Float *h_gauge, double2 *d_gauge, int dir, int V) {
00420   double2 *dg = d_gauge + dir*9*V;
00421   for (int j=0; j<9; j++) {
00422     h_gauge[j*2+0] = dg[j*V].x; 
00423     h_gauge[j*2+1] = dg[j*V].y;
00424   }
00425 }
00426 
00427 template <typename Float>
00428 inline void unpack18(Float *h_gauge, float4 *d_gauge, int dir, int V) {
00429   float4 *dg = d_gauge + dir*5*V;
00430   for (int j=0; j<4; j++) {
00431     h_gauge[j*4+0] = dg[j*V].x; 
00432     h_gauge[j*4+1] = dg[j*V].y;
00433     h_gauge[j*4+2] = dg[j*V].z; 
00434     h_gauge[j*4+3] = dg[j*V].w;
00435   }
00436   h_gauge[16] = dg[4*V].x; 
00437   h_gauge[17] = dg[4*V].y;
00438 }
00439 
00440 template <typename Float>
00441 inline void unpack18(Float *h_gauge, float2 *d_gauge, int dir, int V) {
00442   float2 *dg = d_gauge + dir*9*V;
00443   for (int j=0; j<9; j++) {
00444     h_gauge[j*2+0] = dg[j*V].x; 
00445     h_gauge[j*2+1] = dg[j*V].y;
00446   }
00447 }
00448 
00449 template <typename Float>
00450 inline void unpack18(Float *h_gauge, short4 *d_gauge, int dir, int V) {
00451   short4 *dg = d_gauge + dir*5*V;
00452   for (int j=0; j<4; j++) {
00453     ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
00454     ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
00455     ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
00456     ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
00457   }
00458   ShortToFloat(h_gauge[16],dg[4*V].x);
00459   ShortToFloat(h_gauge[17],dg[4*V].y);
00460 
00461 }
00462 
00463 template <typename Float>
00464 inline void unpack18(Float *h_gauge, short2 *d_gauge, int dir, int V) {
00465   short2 *dg = d_gauge + dir*9*V;
00466   for (int j=0; j<9; j++) {
00467     ShortToFloat(h_gauge[j*2+0], dg[j*V].x); 
00468     ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
00469   }
00470 }
00471 
00472 // Assume the gauge field is "QDP" ordered: directions outside of
00473 // space-time, row-column ordering, even-odd space-time
00474 template <typename Float, typename FloatN>
00475 static void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit, 
00476                        ReconstructType reconstruct, int V, int pad) {
00477   if (reconstruct == QUDA_RECONSTRUCT_12) {
00478     for (int dir = 0; dir < 4; dir++) {
00479       Float *g = h_gauge[dir] + oddBit*V*18;
00480       for (int i = 0; i < V; i++) pack12(d_gauge+i, g+i*18, dir, V+pad);
00481     }
00482   } else if (reconstruct == QUDA_RECONSTRUCT_8) {
00483     for (int dir = 0; dir < 4; dir++) {
00484       Float *g = h_gauge[dir] + oddBit*V*18;
00485       for (int i = 0; i < V; i++) pack8(d_gauge+i, g+i*18, dir, V+pad);
00486     }
00487   } else {
00488     for (int dir = 0; dir < 4; dir++) {
00489       Float *g = h_gauge[dir] + oddBit*V*18;
00490       for (int i = 0; i < V; i++) pack18(d_gauge+i, g+i*18, dir, V+pad);
00491     }
00492   }
00493 }
00494 
00495 // Assume the gauge field is "QDP" ordered: directions outside of
00496 // space-time, row-column ordering, even-odd space-time
00497 template <typename Float, typename FloatN>
00498 static void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit, 
00499                                 ReconstructType reconstruct, int V, int pad) {
00500   if (reconstruct == QUDA_RECONSTRUCT_12) {
00501     for (int dir = 0; dir < 4; dir++) {
00502       Float *g = h_gauge[dir] + oddBit*V*18;
00503       for (int i = 0; i < V; i++) unpack12(g+i*18, d_gauge+i, dir, V+pad, i);
00504     }
00505   } else if (reconstruct == QUDA_RECONSTRUCT_8) {
00506     for (int dir = 0; dir < 4; dir++) {
00507       Float *g = h_gauge[dir] + oddBit*V*18;
00508       for (int i = 0; i < V; i++) unpack8(g+i*18, d_gauge+i, dir, V+pad, i);
00509     }
00510   } else {
00511     for (int dir = 0; dir < 4; dir++) {
00512       Float *g = h_gauge[dir] + oddBit*V*18;
00513       for (int i = 0; i < V; i++) unpack18(g+i*18, d_gauge+i, dir, V+pad);
00514     }
00515   }
00516 }
00517 
// Transpose a 3x3 complex matrix (row-major, interleaved re/im) while
// scaling every element: gT = a * g^T.
template <typename Float, typename Float2>
static void transposeScale(Float *gT, Float *g, const Float2 &a) {
  for (int ic = 0; ic < 3; ic++) {
    for (int jc = 0; jc < 3; jc++) {
      gT[(ic*3+jc)*2 + 0] = a*g[(jc*3+ic)*2 + 0];
      gT[(ic*3+jc)*2 + 1] = a*g[(jc*3+ic)*2 + 1];
    }
  }
}
00524 
00525 // Assume the gauge field is "Wilson" ordered directions inside of
00526 // space-time column-row ordering even-odd space-time
00527 template <typename Float, typename FloatN>
00528 static void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit, 
00529                               ReconstructType reconstruct, int V, int pad) {
00530   Float gT[18];
00531   if (reconstruct == QUDA_RECONSTRUCT_12) {
00532     for (int dir = 0; dir < 4; dir++) {
00533       Float *g = h_gauge + (oddBit*V*4+dir)*18;
00534       for (int i = 0; i < V; i++) {
00535         transposeScale(gT, g, 1.0 / Anisotropy);
00536         pack12(d_gauge+i, gT, dir, V+pad);
00537       }
00538     } 
00539   } else if (reconstruct == QUDA_RECONSTRUCT_8) {
00540     for (int dir = 0; dir < 4; dir++) {
00541       Float *g = h_gauge + (oddBit*V*4+dir)*18;
00542       for (int i = 0; i < V; i++) {
00543         transposeScale(gT, g, 1.0 / Anisotropy);
00544         pack8(d_gauge+i, gT, dir, V+pad);
00545       }
00546     }
00547   } else {
00548     for (int dir = 0; dir < 4; dir++) {
00549       Float *g = h_gauge + (oddBit*V*4+dir)*18;
00550       for (int i = 0; i < V; i++) {
00551         transposeScale(gT, g, 1.0 / Anisotropy);
00552         pack18(d_gauge+i, gT, dir, V+pad);
00553       }
00554     }
00555   }
00556 
00557 }
00558 
00559 // Assume the gauge field is "Wilson" ordered directions inside of
00560 // space-time column-row ordering even-odd space-time
00561 template <typename Float, typename FloatN>
00562 static void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit, 
00563                                 ReconstructType reconstruct, int V, int pad) {
00564   Float gT[18];
00565   if (reconstruct == QUDA_RECONSTRUCT_12) {
00566     for (int dir = 0; dir < 4; dir++) {
00567       Float *hg = h_gauge + (oddBit*V*4+dir)*18;
00568       for (int i = 0; i < V; i++) {
00569         unpack12(gT, d_gauge+i, dir, V+pad, i);
00570         transposeScale(hg, gT, Anisotropy);
00571       }
00572     } 
00573   } else if (reconstruct == QUDA_RECONSTRUCT_8) {
00574     for (int dir = 0; dir < 4; dir++) {
00575       Float *hg = h_gauge + (oddBit*V*4+dir)*18;
00576       for (int i = 0; i < V; i++) {
00577         unpack8(gT, d_gauge+i, dir, V+pad, i);
00578         transposeScale(hg, gT, Anisotropy);
00579       }
00580     }
00581   } else {
00582     for (int dir = 0; dir < 4; dir++) {
00583       Float *hg = h_gauge + (oddBit*V*4+dir)*18;
00584       for (int i = 0; i < V; i++) {
00585         unpack18(gT, d_gauge+i, dir, V+pad);
00586         transposeScale(hg, gT, Anisotropy);
00587       }
00588     }
00589   }
00590 
00591 }
00592 
00593 static void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct, QudaPrecision precision) {
00594 
00595   cudaGauge->reconstruct = reconstruct;
00596   cudaGauge->precision = precision;
00597 
00598   cudaGauge->Nc = 3;
00599 
00600   int floatSize;
00601   if (precision == QUDA_DOUBLE_PRECISION) floatSize = sizeof(double);
00602   else if (precision == QUDA_SINGLE_PRECISION) floatSize = sizeof(float);
00603   else floatSize = sizeof(float)/2;
00604 
00605   int elements;
00606   switch(reconstruct){
00607   case QUDA_RECONSTRUCT_8:
00608       elements = 8;
00609       break;
00610   case QUDA_RECONSTRUCT_12:
00611       elements = 12;
00612       break;
00613   case QUDA_RECONSTRUCT_NO:
00614       elements = 18;
00615       break;
00616   default:
00617       errorQuda("Invalid reconstruct value");
00618   }
00619   
00620   cudaGauge->bytes = 4*cudaGauge->stride*elements*floatSize;
00621 
00622   if (!cudaGauge->even) {
00623     if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
00624       errorQuda("Error allocating even gauge field");
00625     }
00626   }
00627   cudaMemset(cudaGauge->even, 0, cudaGauge->bytes);
00628 
00629   if (!cudaGauge->odd) {
00630     if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->bytes) == cudaErrorMemoryAllocation) {
00631       errorQuda("Error allocating even odd gauge field");
00632     }
00633   }
00634   cudaMemset(cudaGauge->odd, 0, cudaGauge->bytes);
00635 
00636   checkCudaError();
00637 }
00638 
00639 void freeGaugeField(FullGauge *cudaGauge) {
00640   if (cudaGauge->even) cudaFree(cudaGauge->even);
00641   if (cudaGauge->odd) cudaFree(cudaGauge->odd);
00642   cudaGauge->even = NULL;
00643   cudaGauge->odd = NULL;
00644 }
00645 
00646 template <typename Float, typename FloatN>
00647 static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, GaugeFieldOrder gauge_order,
00648                            ReconstructType reconstruct, int bytes, int Vh, int pad) {
00649 
00650   // Use pinned memory
00651   FloatN *packedEven, *packedOdd;
00652     
00653 #ifndef __DEVICE_EMULATION__
00654   cudaMallocHost((void**)&packedEven, bytes);
00655   cudaMallocHost((void**)&packedOdd, bytes);
00656 #else
00657   packedEven = (FloatN*)malloc(bytes);
00658   packedOdd = (FloatN*)malloc(bytes);
00659 #endif
00660     
00661   if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
00662     packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, Vh, pad);
00663     packQDPGaugeField(packedOdd,  (Float**)cpuGauge, 1, reconstruct, Vh, pad);
00664   } else if (gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
00665     packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh, pad);
00666     packCPSGaugeField(packedOdd,  (Float*)cpuGauge, 1, reconstruct, Vh, pad);    
00667   } else {
00668     errorQuda("Invalid gauge_order");
00669   }
00670 
00671   cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
00672   checkCudaError();
00673     
00674   cudaMemcpy(odd,  packedOdd, bytes, cudaMemcpyHostToDevice);
00675   checkCudaError();
00676   
00677 #ifndef __DEVICE_EMULATION__
00678   cudaFreeHost(packedEven);
00679   cudaFreeHost(packedOdd);
00680 #else
00681   free(packedEven);
00682   free(packedOdd);
00683 #endif
00684 
00685 }
00686 
00687 template <typename Float, typename FloatN>
00688 static void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, GaugeFieldOrder gauge_order,
00689                                ReconstructType reconstruct, int bytes, int Vh, int pad) {
00690 
00691   // Use pinned memory
00692   FloatN *packedEven, *packedOdd;
00693     
00694 #ifndef __DEVICE_EMULATION__
00695   cudaMallocHost((void**)&packedEven, bytes);
00696   cudaMallocHost((void**)&packedOdd, bytes);
00697 #else
00698   packedEven = (FloatN*)malloc(bytes);
00699   packedOdd = (FloatN*)malloc(bytes);
00700 #endif
00701     
00702   cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost);
00703   cudaMemcpy(packedOdd, odd, bytes, cudaMemcpyDeviceToHost);    
00704     
00705   if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
00706     unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, Vh, pad);
00707     unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, Vh, pad);
00708   } else if (gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
00709     unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, Vh, pad);
00710     unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, Vh, pad);
00711   } else {
00712     errorQuda("Invalid gauge_order");
00713   }
00714     
00715 #ifndef __DEVICE_EMULATION__
00716   cudaFreeHost(packedEven);
00717   cudaFreeHost(packedOdd);
00718 #else
00719   free(packedEven);
00720   free(packedOdd);
00721 #endif
00722 
00723 }
00724 
// Build a device gauge field from a host field: records the lattice
// geometry and scale factors in file-scope globals and on cudaGauge,
// allocates the device buffers, then dispatches loadGaugeField on the
// (cuda_prec, cpu_prec, reconstruct) combination. The device vector type
// is double2 for double, float2/float4 for single and short2/short4 for
// half, with the 2-component types used when no reconstruction is applied.
void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, QudaPrecision cuda_prec, QudaPrecision cpu_prec,
                      GaugeFieldOrder gauge_order, ReconstructType reconstruct, GaugeFixed gauge_fixed,
                      Tboundary t_boundary, int *XX, double anisotropy, double tadpole_coeff, int pad)
{
  if (cpu_prec == QUDA_HALF_PRECISION) {
    errorQuda("Half precision not supported on CPU");
  }

  // Publish to the file-scope globals read by reconstruct8/reconstruct12.
  Anisotropy = anisotropy;
  tBoundary = t_boundary;

  cudaGauge->anisotropy = anisotropy;
  cudaGauge->tadpole_coeff = tadpole_coeff;
  cudaGauge->volume = 1;
  for (int d=0; d<4; d++) {
    cudaGauge->X[d] = XX[d];
    cudaGauge->volume *= XX[d];
    X[d] = XX[d];
  }
  cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
  cudaGauge->volume /= 2;
  cudaGauge->pad = pad;
  cudaGauge->stride = cudaGauge->volume + cudaGauge->pad;
  cudaGauge->gauge_fixed = gauge_fixed;
  cudaGauge->t_boundary = t_boundary;
  
  allocateGaugeField(cudaGauge, reconstruct, cuda_prec);

  if (cuda_prec == QUDA_DOUBLE_PRECISION) {

    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (double*)cpuGauge, 
                     gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (float*)cpuGauge, 
                     gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
    }

  } else if (cuda_prec == QUDA_SINGLE_PRECISION) {

    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      // float2 for the uncompressed 18-real layout, float4 otherwise
      if (reconstruct == QUDA_RECONSTRUCT_NO) {
        loadGaugeField((float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), (double*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);        
      } else {
        loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (double*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
      }
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      if (reconstruct == QUDA_RECONSTRUCT_NO) {
        loadGaugeField((float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), (float*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
      } else {
        loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (float*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
      }
    }

  } else if (cuda_prec == QUDA_HALF_PRECISION) {

    if (cpu_prec == QUDA_DOUBLE_PRECISION){
      // short2 for the uncompressed layout, short4 otherwise
      if (reconstruct == QUDA_RECONSTRUCT_NO) {
        loadGaugeField((short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), (double*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
      } else {
        loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (double*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);        
      }
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      if (reconstruct == QUDA_RECONSTRUCT_NO) {
        loadGaugeField((short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), (float*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);
      } else {
        loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (float*)cpuGauge, 
                       gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad);        
      }
    }

  }
}
00805 
// Copy a device gauge field back into a host field, dispatching on the
// stored device precision/reconstruction and the requested CPU precision.
// Mirrors the type selection of createGaugeField: 2-component device
// vectors for the uncompressed layout, 4-component otherwise.
void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge, QudaPrecision cpu_prec, GaugeFieldOrder gauge_order)
{
  if (cpu_prec == QUDA_HALF_PRECISION) {
    errorQuda("Half precision not supported on CPU");
  }

  if (cudaGauge->precision == QUDA_DOUBLE_PRECISION) {

    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      retrieveGaugeField((double*)cpuGauge, (double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), 
                         gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      retrieveGaugeField((float*)cpuGauge, (double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), 
                         gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
    }

  } else if (cudaGauge->precision == QUDA_SINGLE_PRECISION) {

    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) {
        retrieveGaugeField((double*)cpuGauge, (float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      } else {
        retrieveGaugeField((double*)cpuGauge, (float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      }
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) {
        retrieveGaugeField((float*)cpuGauge, (float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      } else {
        retrieveGaugeField((float*)cpuGauge, (float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      }
    }

  } else if (cudaGauge->precision == QUDA_HALF_PRECISION) {

    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) {
        retrieveGaugeField((double*)cpuGauge, (short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      } else {
        retrieveGaugeField((double*)cpuGauge, (short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      }
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) {
        retrieveGaugeField((float*)cpuGauge, (short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      } else {
        retrieveGaugeField((float*)cpuGauge, (short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), 
                           gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad);
      }
    }

  }

}
00865 
00866 
00867 /********************** the following functions are used in force computation and link fattening**************/
00868 
00869 /******************************** Functions to manipulate Staple ****************************/
00870 
00871 static void 
00872 allocateStapleQuda(FullStaple *cudaStaple, QudaPrecision precision) 
00873 {
00874     cudaStaple->precision = precision;    
00875     cudaStaple->Nc = 3;
00876     
00877     int floatSize;
00878     if (precision == QUDA_DOUBLE_PRECISION) {
00879         floatSize = sizeof(double);
00880     }
00881     else if (precision == QUDA_SINGLE_PRECISION) {
00882         floatSize = sizeof(float);
00883     }else{
00884         printf("ERROR: stape does not support half precision\n");
00885         exit(1);
00886     }
00887     
00888     int elements = 18;
00889     
00890     cudaStaple->bytes = cudaStaple->volume*elements*floatSize;
00891     
00892     cudaMalloc((void **)&cudaStaple->even, cudaStaple->bytes);
00893     cudaMalloc((void **)&cudaStaple->odd, cudaStaple->bytes);       
00894 }
00895 
00896 void
00897 createStapleQuda(FullStaple* cudaStaple, QudaGaugeParam* param)
00898 {
00899     QudaPrecision cpu_prec = param->cpu_prec;
00900     QudaPrecision cuda_prec= param->cuda_prec;
00901     
00902     if (cpu_prec == QUDA_HALF_PRECISION) {
00903         printf("ERROR: %s:  half precision not supported on cpu\n", __FUNCTION__);
00904         exit(-1);
00905     }
00906     
00907     if (cuda_prec == QUDA_DOUBLE_PRECISION && param->cpu_prec != QUDA_DOUBLE_PRECISION) {
00908         printf("Error: can only create a double GPU gauge field from a double CPU gauge field\n");
00909         exit(-1);
00910     }
00911     
00912     cudaStaple->volume = 1;
00913     for (int d=0; d<4; d++) {
00914         cudaStaple->X[d] = param->X[d];
00915         cudaStaple->volume *= param->X[d];
00916     }
00917     cudaStaple->X[0] /= 2; // actually store the even-odd sublattice dimensions
00918     cudaStaple->volume /= 2;    
00919     
00920     allocateStapleQuda(cudaStaple,  param->cuda_prec);
00921     
00922     return;
00923 }
00924 
00925 
00926 void
00927 freeStapleQuda(FullStaple *cudaStaple) 
00928 {
00929     if (cudaStaple->even) {
00930         cudaFree(cudaStaple->even);
00931     }
00932     if (cudaStaple->odd) {
00933         cudaFree(cudaStaple->odd);
00934     }
00935     cudaStaple->even = NULL;
00936     cudaStaple->odd = NULL;
00937 }
00938 
00939 
00940 
00941 
00942 /******************************** Functions to manipulate Mom ****************************/
00943 
00944 static void 
00945 allocateMomQuda(FullMom *cudaMom, QudaPrecision precision) 
00946 {
00947     cudaMom->precision = precision;    
00948     
00949     int floatSize;
00950     if (precision == QUDA_DOUBLE_PRECISION) {
00951         floatSize = sizeof(double);
00952     }
00953     else if (precision == QUDA_SINGLE_PRECISION) {
00954         floatSize = sizeof(float);
00955     }else{
00956         printf("ERROR: stape does not support half precision\n");
00957         exit(1);
00958     }
00959     
00960     int elements = 10;
00961      
00962     cudaMom->bytes = cudaMom->volume*elements*floatSize*4;
00963     
00964     cudaMalloc((void **)&cudaMom->even, cudaMom->bytes);
00965     cudaMalloc((void **)&cudaMom->odd, cudaMom->bytes);             
00966 }
00967 
00968 void
00969 createMomQuda(FullMom* cudaMom, QudaGaugeParam* param)
00970 {
00971     QudaPrecision cpu_prec = param->cpu_prec;
00972     QudaPrecision cuda_prec= param->cuda_prec;
00973     
00974     if (cpu_prec == QUDA_HALF_PRECISION) {
00975         printf("ERROR: %s:  half precision not supported on cpu\n", __FUNCTION__);
00976         exit(-1);
00977     }
00978     
00979     if (cuda_prec == QUDA_DOUBLE_PRECISION && param->cpu_prec != QUDA_DOUBLE_PRECISION) {
00980         printf("Error: can only create a double GPU gauge field from a double CPU gauge field\n");
00981         exit(-1);
00982     }
00983     
00984     cudaMom->volume = 1;
00985     for (int d=0; d<4; d++) {
00986         cudaMom->X[d] = param->X[d];
00987         cudaMom->volume *= param->X[d];
00988     }
00989     cudaMom->X[0] /= 2; // actually store the even-odd sublattice dimensions
00990     cudaMom->volume /= 2;    
00991     
00992     allocateMomQuda(cudaMom,  param->cuda_prec);
00993     
00994     return;
00995 }
00996 
00997 
00998 void
00999 freeMomQuda(FullMom *cudaMom) 
01000 {
01001     if (cudaMom->even) {
01002         cudaFree(cudaMom->even);
01003     }
01004     if (cudaMom->odd) {
01005         cudaFree(cudaMom->odd);
01006     }
01007     cudaMom->even = NULL;
01008     cudaMom->odd = NULL;
01009 }
01010 
template <typename Float, typename Float2>
inline void pack10(Float2 *res, Float *m, int dir, int Vh) 
{
    // Pack the 10 reals of one momentum site/direction into 5 two-component
    // vectors, strided by Vh so consecutive sites interleave for the GPU.
    //
    // res  site base of the packed parity buffer (caller adds the site index)
    // m    10 consecutive source reals
    // dir  direction index (0..3); selects the dir*5*Vh sub-block
    // Vh   half-lattice volume (stride between a site's 5 components)
    //
    // Fix: the original cast both components through (float), silently
    // truncating to single precision even when Float2 is double2.
    Float2 *r = res + dir*5*Vh;
    for (int j=0; j<5; j++) {
        r[j*Vh].x = m[j*2+0]; 
        r[j*Vh].y = m[j*2+1]; 
    }
}
01020 
01021 template <typename Float, typename Float2>
01022 void packMomField(Float2 *res, Float *mom, int oddBit, int Vh) 
01023 {    
01024     for (int dir = 0; dir < 4; dir++) {
01025         Float *g = mom + (oddBit*Vh*4 + dir)*momSiteSize;
01026         for (int i = 0; i < Vh; i++) {
01027             pack10(res+i, g + 4*i*momSiteSize, dir, Vh);
01028         }
01029     }      
01030 }
01031 
01032 template <typename Float, typename Float2>
01033 void loadMomField(Float2 *even, Float2 *odd, Float *mom,
01034                   int bytes, int Vh) 
01035 {
01036     
01037     Float2 *packedEven, *packedOdd;
01038     cudaMallocHost((void**)&packedEven, bytes); 
01039     cudaMallocHost((void**)&packedOdd, bytes);
01040     
01041     packMomField(packedEven, (Float*)mom, 0, Vh);
01042     packMomField(packedOdd,  (Float*)mom, 1, Vh);
01043     
01044     cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
01045     cudaMemcpy(odd,  packedOdd, bytes, cudaMemcpyHostToDevice); 
01046     
01047     cudaFreeHost(packedEven);
01048     cudaFreeHost(packedOdd);
01049 }
01050 
01051 
01052 
01053 
01054 void
01055 loadMomToGPU(FullMom cudaMom, void* mom, QudaGaugeParam* param)
01056 {
01057     if (param->cuda_prec == QUDA_DOUBLE_PRECISION) {
01058         //loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (double*)cpuGauge, 
01059         //cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
01060     } else { //single precision
01061         loadMomField((float2*)(cudaMom.even), (float2*)(cudaMom.odd), (float*)mom, 
01062                      cudaMom.bytes, cudaMom.volume);
01063         
01064     }
01065 }
01066 
01067 
template <typename Float, typename Float2>
inline void unpack10(Float* m, Float2 *res, int dir, int Vh) 
{
    // Inverse of pack10: gather the 5 Vh-strided two-component vectors of one
    // momentum site/direction back into 10 consecutive reals.
    const Float2 *src = res + dir*5*Vh;
    for (int j = 0; j < 5; j++, src += Vh) {
        m[2*j]     = src->x;
        m[2*j + 1] = src->y;
    }
}
01078 
01079 template <typename Float, typename Float2>
01080 void 
01081 unpackMomField(Float* mom, Float2 *res, int oddBit, int Vh) 
01082 {
01083     int dir, i;
01084     Float *m = mom + oddBit*Vh*momSiteSize*4;
01085     
01086     for (i = 0; i < Vh; i++) {
01087         for (dir = 0; dir < 4; dir++) { 
01088             Float* thismom = m + (4*i+dir)*momSiteSize;
01089             unpack10(thismom, res+i, dir, Vh);
01090         }
01091     }
01092 }
01093 
01094 template <typename Float, typename Float2>
01095 void 
01096 storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd, 
01097                    int bytes, int Vh) 
01098 {    
01099     Float2 *packedEven, *packedOdd;   
01100     cudaMallocHost((void**)&packedEven, bytes); 
01101     cudaMallocHost((void**)&packedOdd, bytes); 
01102     cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost); 
01103     cudaMemcpy(packedOdd, odd, bytes, cudaMemcpyDeviceToHost);  
01104 
01105     unpackMomField((Float*)mom, packedEven,0, Vh);
01106     unpackMomField((Float*)mom, packedOdd, 1, Vh);
01107         
01108     cudaFreeHost(packedEven); 
01109     cudaFreeHost(packedOdd); 
01110 }
01111 
01112 void 
01113 storeMomToCPU(void* mom, FullMom cudaMom, QudaGaugeParam* param)
01114 {
01115     QudaPrecision cpu_prec = param->cpu_prec;
01116     QudaPrecision cuda_prec= param->cuda_prec;
01117     
01118     if (cpu_prec != cuda_prec){
01119         printf("Error:%s: cpu and gpu precison has to be the same at this moment \n", __FUNCTION__);
01120         exit(1);        
01121     }
01122     
01123     if (cpu_prec == QUDA_HALF_PRECISION){
01124         printf("ERROR: %s:  half precision is not supported at this moment\n", __FUNCTION__);
01125         exit(1);
01126     }
01127     
01128     if (cpu_prec == QUDA_DOUBLE_PRECISION){
01129         
01130     }else { //SINGLE PRECISIONS
01131         storeMomToCPUArray( (float*)mom, (float2*) cudaMom.even, (float2*)cudaMom.odd, 
01132                             cudaMom.bytes, cudaMom.volume);     
01133     }
01134     
01135 }
01136 
01137 /**************************************************************************************************/
01138 template <typename Float, typename FloatN>
01139 void 
01140 packGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType reconstruct, int Vh) 
01141 {
01142     int dir, i;
01143     if (reconstruct == QUDA_RECONSTRUCT_12) {
01144         for (dir = 0; dir < 4; dir++) {
01145             Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
01146             for (i = 0; i < Vh; i++) {
01147                 pack12(res+i, g+4*i*gaugeSiteSize+dir*gaugeSiteSize, dir, Vh);
01148             }
01149         }
01150     } else if (reconstruct == QUDA_RECONSTRUCT_8){
01151         for (dir = 0; dir < 4; dir++) {
01152             Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
01153             for (i = 0; i < Vh; i++) {
01154                 pack8(res+i, g+4*i*gaugeSiteSize + dir*gaugeSiteSize, dir, Vh);
01155             }
01156         }
01157     }else{
01158         for (dir = 0; dir < 4; dir++) {
01159             Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
01160             for (i = 0; i < Vh; i++) {
01161               pack18(res+i, g+4*i*gaugeSiteSize+dir*gaugeSiteSize, dir, Vh);
01162             }
01163         }      
01164     }
01165 }
01166 
01167 template <typename Float, typename FloatN>
01168 void 
01169 loadGaugeFromCPUArrayQuda(FloatN *even, FloatN *odd, Float *cpuGauge, 
01170                           ReconstructType reconstruct, int bytes, int Vh) 
01171 {
01172     
01173     // Use pinned memory
01174     
01175     FloatN *packedEven, *packedOdd;    
01176     cudaMallocHost((void**)&packedEven, bytes);
01177     cudaMallocHost((void**)&packedOdd, bytes);
01178     
01179     
01180     packGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh);
01181     packGaugeField(packedOdd,  (Float*)cpuGauge, 1, reconstruct, Vh);
01182     
01183     
01184     cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
01185     cudaMemcpy(odd,  packedOdd, bytes, cudaMemcpyHostToDevice);    
01186     
01187     cudaFreeHost(packedEven);
01188     cudaFreeHost(packedOdd);
01189 }
01190 
01191 
01192 
01193 
01194 void
01195 createLinkQuda(FullGauge* cudaGauge, QudaGaugeParam* param)
01196 {
01197     QudaPrecision cpu_prec = param->cpu_prec;
01198     QudaPrecision cuda_prec= param->cuda_prec;
01199     
01200     if (cpu_prec == QUDA_HALF_PRECISION) {
01201         printf("ERROR: %s:  half precision not supported on cpu\n", __FUNCTION__);
01202         exit(-1);
01203     }
01204     
01205     if (cuda_prec == QUDA_DOUBLE_PRECISION && param->cpu_prec != QUDA_DOUBLE_PRECISION) {
01206         printf("Error: can only create a double GPU gauge field from a double CPU gauge field\n");
01207         exit(-1);
01208     }
01209         
01210     cudaGauge->anisotropy = param->anisotropy;
01211     cudaGauge->volume = 1;
01212     for (int d=0; d<4; d++) {
01213         cudaGauge->X[d] = param->X[d];
01214         cudaGauge->volume *= param->X[d];
01215     }
01216     cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
01217     cudaGauge->volume /= 2;    
01218     cudaGauge->stride = cudaGauge->volume + cudaGauge->pad;
01219     cudaGauge->reconstruct = param->reconstruct;
01220 
01221     allocateGaugeField(cudaGauge, param->reconstruct, param->cuda_prec);
01222     
01223     return;
01224 }
01225 
01226 void 
01227 loadLinkToGPU(FullGauge cudaGauge, void *cpuGauge, QudaGaugeParam* param)
01228 {
01229     QudaPrecision cpu_prec = param->cpu_prec;
01230     QudaPrecision cuda_prec= param->cuda_prec;
01231     
01232     if (cuda_prec == QUDA_DOUBLE_PRECISION) {
01233         loadGaugeFromCPUArrayQuda((double2*)(cudaGauge.even), (double2*)(cudaGauge.odd), (double*)cpuGauge, 
01234                                   cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01235     } else if (cuda_prec == QUDA_SINGLE_PRECISION) {
01236         if (cpu_prec == QUDA_DOUBLE_PRECISION){
01237             if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO){
01238                 loadGaugeFromCPUArrayQuda((float4*)(cudaGauge.even), (float4*)(cudaGauge.odd), (double*)cpuGauge, 
01239                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01240             }else{
01241                 loadGaugeFromCPUArrayQuda((float2*)(cudaGauge.even), (float2*)(cudaGauge.odd), (double*)cpuGauge, 
01242                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);            
01243             }
01244         }
01245         else if (cpu_prec == QUDA_SINGLE_PRECISION){
01246             if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO){
01247                 loadGaugeFromCPUArrayQuda((float4*)(cudaGauge.even), (float4*)(cudaGauge.odd), (float*)cpuGauge, 
01248                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01249             }else{
01250                 loadGaugeFromCPUArrayQuda((float2*)(cudaGauge.even), (float2*)(cudaGauge.odd), (float*)cpuGauge, 
01251                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01252             }
01253         }
01254     } else if (cuda_prec == QUDA_HALF_PRECISION) {
01255         if (cpu_prec == QUDA_DOUBLE_PRECISION){
01256             if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO){
01257                 loadGaugeFromCPUArrayQuda((short4*)(cudaGauge.even), (short4*)(cudaGauge.odd), (double*)cpuGauge, 
01258                                cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01259             }else{
01260                 loadGaugeFromCPUArrayQuda((short2*)(cudaGauge.even), (short2*)(cudaGauge.odd), (double*)cpuGauge, 
01261                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01262             }
01263         }
01264         else if (cpu_prec == QUDA_SINGLE_PRECISION){
01265             if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO){
01266                 loadGaugeFromCPUArrayQuda((short4*)(cudaGauge.even), (short4*)(cudaGauge.odd), (float*)cpuGauge,
01267                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
01268             }else{
01269                 loadGaugeFromCPUArrayQuda((short2*)(cudaGauge.even), (short2*)(cudaGauge.odd), (float*)cpuGauge,
01270                                           cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);            
01271             }
01272         }
01273     }
01274 }
01275 /*****************************************************************/
01276 /********************** store link data to cpu memory ************/
// NOTE(review): unimplemented placeholders.  The reconstruct-12/8 unpacking
// needed when storing a compressed device field back to the CPU has not been
// written; these stubs abort at runtime if they are ever instantiated and
// called (see the commented-out call sites in unpackGaugeField).
template <typename Float>
inline void unpack12(Float* g, float2 *res, int dir, int V){printf("ERROR: %s is called\n", __FUNCTION__); exit(1);}
// Same placeholder for the 8-real reconstruction.
template <typename Float>
inline void unpack8(Float* g, float2 *res,  int dir, int V){printf("ERROR: %s is called\n", __FUNCTION__); exit(1);}
01281 
01282 template <typename Float, typename FloatN>
01283 void 
01284 unpackGaugeField(Float* gauge, FloatN *res, int oddBit, ReconstructType reconstruct, int Vh) 
01285 {
01286     int dir, i;
01287     if (reconstruct == QUDA_RECONSTRUCT_12) {
01288         for (dir = 0; dir < 4; dir++) {
01289             //Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
01290             for (i = 0; i < Vh; i++) {
01291                 //unpack12(g+4*i*gaugeSiteSize+dir*gaugeSiteSize, res+i, dir, Vh);
01292             }
01293         }
01294     } else if (reconstruct == QUDA_RECONSTRUCT_8){
01295         for (dir = 0; dir < 4; dir++) {
01296             //Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
01297             for (i = 0; i < Vh; i++) {
01298                 //unpack8(g+4*i*gaugeSiteSize + dir*gaugeSiteSize, res+i, dir, Vh);
01299             }
01300         }
01301     }else{
01302         for (dir = 0; dir < 4; dir++) {
01303           Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
01304             for (i = 0; i < Vh; i++) {
01305                 unpack18(g+4*i*gaugeSiteSize+dir*gaugeSiteSize, res+i,dir, Vh);
01306             }
01307         }      
01308     }
01309 }
01310 
01311 template <typename Float, typename FloatN>
01312 void 
01313 storeGaugeToCPUArray(Float* cpuGauge, FloatN *even, FloatN *odd, 
01314                      ReconstructType reconstruct, int bytes, int Vh) 
01315 {
01316     
01317     // Use pinned memory
01318     
01319     FloatN *packedEven, *packedOdd;    
01320     cudaMallocHost((void**)&packedEven, bytes); 
01321     cudaMallocHost((void**)&packedOdd, bytes); 
01322     cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost); 
01323     cudaMemcpy(packedOdd, odd, bytes, cudaMemcpyDeviceToHost);  
01324     
01325     
01326     unpackGaugeField((Float*)cpuGauge, packedEven,0, reconstruct, Vh);
01327     unpackGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, Vh);
01328     
01329     cudaFreeHost(packedEven); 
01330     cudaFreeHost(packedOdd); 
01331 }
01332 
01333 
01334 void 
01335 storeLinkToCPU(void* cpuGauge, FullGauge *cudaGauge, QudaGaugeParam* param)
01336 {
01337     
01338   QudaPrecision cpu_prec = param->cpu_prec;
01339   QudaPrecision cuda_prec= param->cuda_prec;
01340     
01341   if (cuda_prec == QUDA_HALF_PRECISION || cpu_prec == QUDA_HALF_PRECISION){
01342     printf("ERROR: %s:  half precision is not supported at this moment\n", __FUNCTION__);
01343     exit(1);
01344   }
01345     
01346   if (cpu_prec == QUDA_DOUBLE_PRECISION){
01347     if (cuda_prec == QUDA_DOUBLE_PRECISION){//double and double
01348       storeGaugeToCPUArray( (double*)cpuGauge, (double2*) cudaGauge->even, (double2*)cudaGauge->odd, 
01349                             cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
01350     }else{ //double and single
01351       if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO){
01352         storeGaugeToCPUArray( (double*)cpuGauge, (float2*) cudaGauge->even, (float2*)cudaGauge->odd, 
01353                               cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
01354       }else{
01355         storeGaugeToCPUArray( (double*)cpuGauge, (float4*) cudaGauge->even, (float4*)cudaGauge->odd, 
01356                               cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);         
01357       }  
01358     }
01359   }else { //SINGLE PRECISIONS
01360     if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO){
01361       storeGaugeToCPUArray( (float*)cpuGauge, (float2*) cudaGauge->even, (float2*)cudaGauge->odd, 
01362                             cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
01363     }else{
01364       storeGaugeToCPUArray( (float*)cpuGauge, (float4*) cudaGauge->even, (float4*)cudaGauge->odd, 
01365                             cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);           
01366     }
01367         
01368   }
01369 }
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Friends Defines