QUDA v0.4.0
A library for QCD on GPUs
// Routines used to pack the gauge field matrices

#include <math.h>
#include <typeinfo> // needed for the typeid() dispatch in packQDPGaugeField below

// 16-bit fixed-point representation used for half-precision storage:
// values in [-1, 1] are mapped linearly onto the full range of a short.
#define SHORT_LENGTH 65536
#define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f)
#define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1))

template <typename Float>
inline short FloatToShort(const Float &a) {
  return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT);
}

template <typename Float>
inline void ShortToFloat(Float &a, const short &b) {
  a = ((Float)b/SCALE_FLOAT-SHIFT_FLOAT);
}
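// Minimal host-side sketch (illustrative only, not part of the library;
// exampleShortRoundTrip is a hypothetical name) of the fixed-point mapping
// above: values in [-1, 1] survive a FloatToShort/ShortToFloat round trip
// with an absolute error of order 1/SHORT_LENGTH.
#include <stdio.h>

static void exampleShortRoundTrip() {
  for (double x = -1.0; x <= 1.0; x += 0.25) {
    short s = FloatToShort(x);
    double y;
    ShortToFloat(y, s);
    // expect |x - y| of order 2.0/(SHORT_LENGTH-1)
    printf("x = % .6f -> s = %6d -> y = % .6f (err = %.2e)\n",
           x, (int)s, y, fabs(x - y));
  }
}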
/*template <int N, typename FloatN, typename Float>
inline void pack8(FloatN *res, Float *g, int dir, int V) {
  Float *r = res + N*dir*4*V;
  r[0] = atan2(g[1], g[0]);
  r[1] = atan2(g[13], g[12]);
}*/

// Pack an SU(3) link into 8 real numbers: the phases of U00 and U20,
// plus the complex elements U01, U02 and U10 (g[2..7]).
template <typename Float>
inline void pack8(double2 *res, Float *g, int dir, int V) {
  double2 *r = res + dir*4*V;
  r[0].x = atan2(g[1], g[0]);
  r[0].y = atan2(g[13], g[12]);
  for (int j=1; j<4; j++) {
    r[j*V].x = g[2*j+0];
    r[j*V].y = g[2*j+1];
  }
}

template <typename Float>
inline void pack8(float4 *res, Float *g, int dir, int V) {
  float4 *r = res + dir*2*V;
  r[0].x = atan2(g[1], g[0]);
  r[0].y = atan2(g[13], g[12]);
  r[0].z = g[2];
  r[0].w = g[3];
  r[V].x = g[4];
  r[V].y = g[5];
  r[V].z = g[6];
  r[V].w = g[7];
}

template <typename Float>
inline void pack8(float2 *res, Float *g, int dir, int V) {
  float2 *r = res + dir*4*V;
  r[0].x = atan2(g[1], g[0]);
  r[0].y = atan2(g[13], g[12]);
  for (int j=1; j<4; j++) {
    r[j*V].x = g[2*j+0];
    r[j*V].y = g[2*j+1];
  }
}

// Half precision: phases are normalized by M_PI so that all stored values
// lie in [-1, 1] before the short conversion.
template <typename Float>
inline void pack8(short4 *res, Float *g, int dir, int V) {
  short4 *r = res + dir*2*V;
  r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
  r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
  r[0].z = FloatToShort(g[2]);
  r[0].w = FloatToShort(g[3]);
  r[V].x = FloatToShort(g[4]);
  r[V].y = FloatToShort(g[5]);
  r[V].z = FloatToShort(g[6]);
  r[V].w = FloatToShort(g[7]);
}

template <typename Float>
inline void pack8(short2 *res, Float *g, int dir, int V) {
  short2 *r = res + dir*4*V;
  r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI);
  r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI);
  for (int j=1; j<4; j++) {
    r[j*V].x = FloatToShort(g[2*j+0]);
    r[j*V].y = FloatToShort(g[2*j+1]);
  }
}

// Pack the first two rows of the link (12 real numbers).
template <typename Float>
inline void pack12(double2 *res, Float *g, int dir, int V) {
  double2 *r = res + dir*6*V;
  for (int j=0; j<6; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack12(float4 *res, Float *g, int dir, int V) {
  float4 *r = res + dir*3*V;
  for (int j=0; j<3; j++) {
    r[j*V].x = g[j*4+0];
    r[j*V].y = g[j*4+1];
    r[j*V].z = g[j*4+2];
    r[j*V].w = g[j*4+3];
  }
}

template <typename Float>
inline void pack12(float2 *res, Float *g, int dir, int V) {
  float2 *r = res + dir*6*V;
  for (int j=0; j<6; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack12(short4 *res, Float *g, int dir, int V) {
  short4 *r = res + dir*3*V;
  for (int j=0; j<3; j++) {
    r[j*V].x = FloatToShort(g[j*4+0]);
    r[j*V].y = FloatToShort(g[j*4+1]);
    r[j*V].z = FloatToShort(g[j*4+2]);
    r[j*V].w = FloatToShort(g[j*4+3]);
  }
}

template <typename Float>
inline void pack12(short2 *res, Float *g, int dir, int V) {
  short2 *r = res + dir*6*V;
  for (int j=0; j<6; j++) {
    r[j*V].x = FloatToShort(g[j*2+0]);
    r[j*V].y = FloatToShort(g[j*2+1]);
  }
}

// Pack all 18 real numbers of the link (no reconstruction). For the
// 4-component types, 18 does not divide evenly, so the fifth vector
// carries the last two elements plus zero padding.
template <typename Float>
inline void pack18(double2 *res, Float *g, int dir, int V) {
  double2 *r = res + dir*9*V;
  for (int j=0; j<9; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack18(float4 *res, Float *g, int dir, int V) {
  float4 *r = res + dir*5*V;
  for (int j=0; j<4; j++) {
    r[j*V].x = g[j*4+0];
    r[j*V].y = g[j*4+1];
    r[j*V].z = g[j*4+2];
    r[j*V].w = g[j*4+3];
  }
  r[4*V].x = g[16];
  r[4*V].y = g[17];
  r[4*V].z = 0.0;
  r[4*V].w = 0.0;
}

template <typename Float>
inline void pack18(float2 *res, Float *g, int dir, int V) {
  float2 *r = res + dir*9*V;
  for (int j=0; j<9; j++) {
    r[j*V].x = g[j*2+0];
    r[j*V].y = g[j*2+1];
  }
}

template <typename Float>
inline void pack18(short4 *res, Float *g, int dir, int V) {
  short4 *r = res + dir*5*V;
  for (int j=0; j<4; j++) {
    r[j*V].x = FloatToShort(g[j*4+0]);
    r[j*V].y = FloatToShort(g[j*4+1]);
    r[j*V].z = FloatToShort(g[j*4+2]);
    r[j*V].w = FloatToShort(g[j*4+3]);
  }
  r[4*V].x = FloatToShort(g[16]);
  r[4*V].y = FloatToShort(g[17]);
  r[4*V].z = (short)0;
  r[4*V].w = (short)0;
}

template <typename Float>
inline void pack18(short2 *res, Float *g, int dir, int V)
{
  short2 *r = res + dir*9*V;
  for (int j=0; j<9; j++) {
    r[j*V].x = FloatToShort(g[j*2+0]);
    r[j*V].y = FloatToShort(g[j*2+1]);
  }
}

// Fat links are not unitary, so their elements are rescaled by the global
// maximum fat_link_max_ to fit the [-1, 1] range of the short format.
template<typename Float>
inline void fatlink_short_pack18(short2 *d_gauge, Float *h_gauge, int dir, int V)
{
  short2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    dg[j*V].x = FloatToShort((h_gauge[j*2+0]/fat_link_max_));
    dg[j*V].y = FloatToShort((h_gauge[j*2+1]/fat_link_max_));
  }
}


// a += sign*b*c (complex multiply-accumulate on re/im pairs)
template <typename Float>
inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) {
  a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
  a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
}

// a = conj(b)*c
template <typename Float>
inline void complexDotProduct(Float *a, Float *b, Float *c) {
  a[0] = b[0]*c[0] + b[1]*c[1];
  a[1] = b[0]*c[1] - b[1]*c[0];
}

// a += sign * conj(b) * conj(c)
template <typename Float>
inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) {
  a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
  a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
}

// a = conj(b)*conj(c)
template <typename Float>
inline void complexConjugateProduct(Float *a, Float *b, Float *c) {
  a[0] = b[0]*c[0] - b[1]*c[1];
  a[1] = -b[0]*c[1] - b[1]*c[0];
}
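// A small standalone check (illustrative only; exampleConjugateProduct is a
// hypothetical name) of the helper above: with sign = +1,
// accumulateConjugateProduct adds conj(b)*conj(c) to a.
static void exampleConjugateProduct() {
  double a[2] = {0.0, 0.0};
  double b[2] = {1.0, 2.0};   // b = 1 + 2i
  double c[2] = {3.0, -1.0};  // c = 3 - i
  accumulateConjugateProduct(a, b, c, +1);
  // conj(b)*conj(c) = (1 - 2i)(3 + i) = 5 - 5i, so a is now {5.0, -5.0}
}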
// Routines used to unpack the gauge field matrices

// Reconstruct the full 18-real link from the 8 stored numbers. The magnitudes
// of U00 and U20 are fixed by the (u0-scaled) normalization of the first row
// and first column; the remaining four elements follow from an SU(2) rotation.
template <typename Float>
inline void reconstruct8(Float *mat, int dir, int idx) {
  // First reconstruct first row
  Float row_sum = 0.0;
  row_sum += mat[2]*mat[2];
  row_sum += mat[3]*mat[3];
  row_sum += mat[4]*mat[4];
  row_sum += mat[5]*mat[5];
  Float u0 = (dir < 3 ? anisotropy_ :
              (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
  Float diff = 1.f/(u0*u0) - row_sum;
  Float U00_mag = sqrt(diff >= 0 ? diff : 0.0);

  // mat[0] and mat[1] arrive holding the phases of U00 and U20; stash them
  // in the not-yet-used third-row slots before overwriting them.
  mat[14] = mat[0];
  mat[15] = mat[1];

  mat[0] = U00_mag * cos(mat[14]);
  mat[1] = U00_mag * sin(mat[14]);

  Float column_sum = 0.0;
  for (int i=0; i<2; i++) column_sum += mat[i]*mat[i];
  for (int i=6; i<8; i++) column_sum += mat[i]*mat[i];
  diff = 1.f/(u0*u0) - column_sum;
  Float U20_mag = sqrt(diff >= 0 ? diff : 0.0);

  mat[12] = U20_mag * cos(mat[15]);
  mat[13] = U20_mag * sin(mat[15]);

  // First column now restored

  // finally reconstruct last elements from SU(2) rotation
  Float r_inv2 = 1.0/(u0*row_sum);

  // U11
  Float A[2];
  complexDotProduct(A, mat+0, mat+6);
  complexConjugateProduct(mat+8, mat+12, mat+4);
  accumulateComplexProduct(mat+8, A, mat+2, u0);
  mat[8] *= -r_inv2;
  mat[9] *= -r_inv2;

  // U12
  complexConjugateProduct(mat+10, mat+12, mat+2);
  accumulateComplexProduct(mat+10, A, mat+4, -u0);
  mat[10] *= r_inv2;
  mat[11] *= r_inv2;

  // U21
  complexDotProduct(A, mat+0, mat+12);
  complexConjugateProduct(mat+14, mat+6, mat+4);
  accumulateComplexProduct(mat+14, A, mat+2, -u0);
  mat[14] *= r_inv2;
  mat[15] *= r_inv2;

  // U22
  complexConjugateProduct(mat+16, mat+6, mat+2);
  accumulateComplexProduct(mat+16, A, mat+4, u0);
  mat[16] *= -r_inv2;
  mat[17] *= -r_inv2;
}

template <typename Float>
inline void unpack8(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
  double2 *dg = d_gauge + dir*4*V;
  for (int j=0; j<4; j++) {
    h_gauge[2*j+0] = dg[j*V].x;
    h_gauge[2*j+1] = dg[j*V].y;
  }
  reconstruct8(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack8(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
  float4 *dg = d_gauge + dir*2*V;
  h_gauge[0] = dg[0].x;
  h_gauge[1] = dg[0].y;
  h_gauge[2] = dg[0].z;
  h_gauge[3] = dg[0].w;
  h_gauge[4] = dg[V].x;
  h_gauge[5] = dg[V].y;
  h_gauge[6] = dg[V].z;
  h_gauge[7] = dg[V].w;
  reconstruct8(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack8(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
  float2 *dg = d_gauge + dir*4*V;
  for (int j=0; j<4; j++) {
    h_gauge[2*j+0] = dg[j*V].x;
    h_gauge[2*j+1] = dg[j*V].y;
  }
  reconstruct8(h_gauge, dir, idx);
}

// Half precision: undo the 1/M_PI normalization of the two stored phases.
template <typename Float>
inline void unpack8(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
  short4 *dg = d_gauge + dir*2*V;
  ShortToFloat(h_gauge[0], dg[0].x);
  ShortToFloat(h_gauge[1], dg[0].y);
  ShortToFloat(h_gauge[2], dg[0].z);
  ShortToFloat(h_gauge[3], dg[0].w);
  ShortToFloat(h_gauge[4], dg[V].x);
  ShortToFloat(h_gauge[5], dg[V].y);
  ShortToFloat(h_gauge[6], dg[V].z);
  ShortToFloat(h_gauge[7], dg[V].w);
  h_gauge[0] *= M_PI;
  h_gauge[1] *= M_PI;
  reconstruct8(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack8(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
  short2 *dg = d_gauge + dir*4*V;
  for (int j=0; j<4; j++) {
    ShortToFloat(h_gauge[2*j+0], dg[j*V].x);
    ShortToFloat(h_gauge[2*j+1], dg[j*V].y);
  }
  h_gauge[0] *= M_PI;
  h_gauge[1] *= M_PI;
  reconstruct8(h_gauge, dir, idx);
}
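// Hedged usage sketch (not library code; exampleUnpack8 and its arguments are
// illustrative): recovering one full 18-real link from the 8-real compressed
// device layout. h_gauge must provide all 18 slots, since reconstruct8 writes
// beyond the 8 unpacked values.
static void exampleUnpack8(float4 *d_gauge, int dir, int V, int idx) {
  double h_gauge[18]; // 3x3 complex link, row-major re/im pairs
  unpack8(h_gauge, d_gauge, dir, V, idx);
  // h_gauge[0..5]: first row, h_gauge[6..11]: second, h_gauge[12..17]: third
}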
// Reconstruct the third row as the complex-conjugated cross product of the
// first two rows, w_i = conj(u_j v_k - u_k v_j); doing this with complex
// numbers simplifies the expressions. The u0 factor restores the anisotropy
// or temporal boundary scaling.
template <typename Float>
inline void reconstruct12(Float *mat, int dir, int idx) {
  Float *u = &mat[0*(3*2)];
  Float *v = &mat[1*(3*2)];
  Float *w = &mat[2*(3*2)];
  w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0;
  accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1);
  accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1);
  accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1);
  accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1);
  accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1);
  accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1);
  Float u0 = (dir < 3 ? anisotropy_ :
              (idx >= (X_[3]-1)*X_[0]*X_[1]*X_[2]/2 ? t_boundary_ : 1));
  w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0;
}
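// A standalone sketch of the idea (illustrative, hypothetical function name):
// for U in SU(3), row 3 = conj(row 1 x row 2). With the identity's first two
// rows, the reconstruction above yields the identity's third row.
static void exampleThirdRowFromCrossProduct() {
  double mat[18] = { 1,0, 0,0, 0,0,    // u = (1, 0, 0)
                     0,0, 1,0, 0,0,    // v = (0, 1, 0)
                     0,0, 0,0, 0,0 };  // w, reconstructed below
  double *u = mat, *v = mat + 6, *w = mat + 12;
  accumulateConjugateProduct(w+0, u+2, v+4, +1); // w0 += conj(u1)*conj(v2)
  accumulateConjugateProduct(w+0, u+4, v+2, -1); // w0 -= conj(u2)*conj(v1)
  accumulateConjugateProduct(w+2, u+4, v+0, +1);
  accumulateConjugateProduct(w+2, u+0, v+4, -1);
  accumulateConjugateProduct(w+4, u+0, v+2, +1);
  accumulateConjugateProduct(w+4, u+2, v+0, -1);
  // w is now (0, 0, 1), completing the identity matrix
}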
template <typename Float>
inline void unpack12(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) {
  double2 *dg = d_gauge + dir*6*V;
  for (int j=0; j<6; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) {
  float4 *dg = d_gauge + dir*3*V;
  for (int j=0; j<3; j++) {
    h_gauge[j*4+0] = dg[j*V].x;
    h_gauge[j*4+1] = dg[j*V].y;
    h_gauge[j*4+2] = dg[j*V].z;
    h_gauge[j*4+3] = dg[j*V].w;
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) {
  float2 *dg = d_gauge + dir*6*V;
  for (int j=0; j<6; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) {
  short4 *dg = d_gauge + dir*3*V;
  for (int j=0; j<3; j++) {
    ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
    ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
    ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack12(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) {
  short2 *dg = d_gauge + dir*6*V;
  for (int j=0; j<6; j++) {
    ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
  }
  reconstruct12(h_gauge, dir, idx);
}

template <typename Float>
inline void unpack18(Float *h_gauge, double2 *d_gauge, int dir, int V) {
  double2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
}

template <typename Float>
inline void unpack18(Float *h_gauge, float4 *d_gauge, int dir, int V) {
  float4 *dg = d_gauge + dir*5*V;
  for (int j=0; j<4; j++) {
    h_gauge[j*4+0] = dg[j*V].x;
    h_gauge[j*4+1] = dg[j*V].y;
    h_gauge[j*4+2] = dg[j*V].z;
    h_gauge[j*4+3] = dg[j*V].w;
  }
  h_gauge[16] = dg[4*V].x;
  h_gauge[17] = dg[4*V].y;
}

template <typename Float>
inline void unpack18(Float *h_gauge, float2 *d_gauge, int dir, int V) {
  float2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    h_gauge[j*2+0] = dg[j*V].x;
    h_gauge[j*2+1] = dg[j*V].y;
  }
}

template <typename Float>
inline void unpack18(Float *h_gauge, short4 *d_gauge, int dir, int V) {
  short4 *dg = d_gauge + dir*5*V;
  for (int j=0; j<4; j++) {
    ShortToFloat(h_gauge[j*4+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*4+1], dg[j*V].y);
    ShortToFloat(h_gauge[j*4+2], dg[j*V].z);
    ShortToFloat(h_gauge[j*4+3], dg[j*V].w);
  }
  ShortToFloat(h_gauge[16], dg[4*V].x);
  ShortToFloat(h_gauge[17], dg[4*V].y);
}

template <typename Float>
inline void unpack18(Float *h_gauge, short2 *d_gauge, int dir, int V) {
  short2 *dg = d_gauge + dir*9*V;
  for (int j=0; j<9; j++) {
    ShortToFloat(h_gauge[j*2+0], dg[j*V].x);
    ShortToFloat(h_gauge[j*2+1], dg[j*V].y);
  }
}


// Assume the gauge field is "QDP" ordered: directions outside of
// space-time, row-column ordering, even-odd space-time.
// offset = 0 and voxels[i] = Vh for the body;
// offset = Vh and voxels[i] = the i-th face volume for the faces.
template <typename Float, typename FloatN>
static void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit,
                              QudaReconstructType reconstruct, int Vh, int *voxels,
                              int pad, int offset, int nFace, QudaLinkType type) {
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      int nMat = nFace*voxels[dir];
      Float *g = h_gauge[dir] + oddBit*nMat*18;
      for (int i = 0; i < nMat; i++) pack12(d_gauge+offset+i, g+i*18, dir, Vh+pad);
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      int nMat = nFace*voxels[dir];
      Float *g = h_gauge[dir] + oddBit*nMat*18;
      for (int i = 0; i < nMat; i++) pack8(d_gauge+offset+i, g+i*18, dir, Vh+pad);
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      int nMat = nFace*voxels[dir];
      Float *g = h_gauge[dir] + oddBit*nMat*18;
      if (type == QUDA_ASQTAD_FAT_LINKS && typeid(FloatN) == typeid(short2)) {
        // we know it is half precision with fat links at this stage
        for (int i = 0; i < nMat; i++)
          fatlink_short_pack18((short2*)(d_gauge+offset+i), g+i*18, dir, Vh+pad);
      } else {
        for (int i = 0; i < nMat; i++) pack18(d_gauge+offset+i, g+i*18, dir, Vh+pad);
      }
    }
  }
}

// Assume the gauge field is "QDP" ordered: directions outside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
static void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit,
                                QudaReconstructType reconstruct, int V, int pad) {
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge[dir] + oddBit*V*18;
      for (int i = 0; i < V; i++) unpack12(g+i*18, d_gauge+i, dir, V+pad, i);
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge[dir] + oddBit*V*18;
      for (int i = 0; i < V; i++) unpack8(g+i*18, d_gauge+i, dir, V+pad, i);
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge[dir] + oddBit*V*18;
      for (int i = 0; i < V; i++) unpack18(g+i*18, d_gauge+i, dir, V+pad);
    }
  }
}
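// Hedged usage sketch (parameter choices and names are illustrative, not
// prescriptive): packing the even-parity body of a QDP-ordered field into a
// reconstruct-12 device layout. Each h_gauge[dir] holds 18 reals per site;
// selecting the body means voxels[i] = Vh, offset = 0 and nFace = 1.
// QUDA_WILSON_LINKS is assumed to be the QudaLinkType for ordinary links
// (only QUDA_ASQTAD_FAT_LINKS triggers the special half-precision path above).
static void examplePackQDPBody(float4 *d_even, double **h_gauge, int Vh, int pad) {
  int voxels[4] = {Vh, Vh, Vh, Vh};
  packQDPGaugeField(d_even, h_gauge, 0 /*even parity*/, QUDA_RECONSTRUCT_12,
                    Vh, voxels, pad, 0 /*offset: body*/, 1 /*nFace*/,
                    QUDA_WILSON_LINKS);
}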
// transpose and scale the matrix: gT = a * transpose(g)
template <typename Float, typename Float2>
static void transposeScale(Float *gT, Float *g, const Float2 &a) {
  for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++)
    gT[(ic*3+jc)*2+r] = a*g[(jc*3+ic)*2+r];
}

// Assume the gauge field is "Wilson" (CPS) ordered: directions inside of
// space-time, column-row ordering, even-odd space-time
template <typename Float, typename FloatN>
static void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit,
                              QudaReconstructType reconstruct, int V, int pad) {
  Float gT[18];
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
        pack12(d_gauge+i, gT, dir, V+pad);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
        pack8(d_gauge+i, gT, dir, V+pad);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        transposeScale(gT, g+4*i*18, 1.0 / anisotropy_);
        pack18(d_gauge+i, gT, dir, V+pad);
      }
    }
  }
}

// Assume the gauge field is "Wilson" (CPS) ordered: directions inside of
// space-time, column-row ordering, even-odd space-time
template <typename Float, typename FloatN>
static void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
                                QudaReconstructType reconstruct, int V, int pad) {
  Float gT[18];
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack12(gT, d_gauge+i, dir, V+pad, i);
        transposeScale(hg+4*i*18, gT, anisotropy_);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack8(gT, d_gauge+i, dir, V+pad, i);
        transposeScale(hg+4*i*18, gT, anisotropy_);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack18(gT, d_gauge+i, dir, V+pad);
        transposeScale(hg+4*i*18, gT, anisotropy_);
      }
    }
  }
}


// Assume the gauge field is MILC ordered: directions inside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
void packMILCGaugeField(FloatN *res, Float *gauge, int oddBit,
                        QudaReconstructType reconstruct, int Vh, int pad)
{
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
      for (int i = 0; i < Vh; i++) {
        pack12(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
      for (int i = 0; i < Vh; i++) {
        pack8(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
      for (int i = 0; i < Vh; i++) {
        pack18(res+i, g+(4*i+dir)*gaugeSiteSize, dir, Vh+pad);
      }
    }
  }
}
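// Quick illustrative check (hypothetical function name) of transposeScale,
// defined above: it writes gT = a * g^T for a 3x3 complex matrix stored as
// re/im pairs in row-major order, which is how the CPS column-row convention
// is converted to the internal row-column convention.
static void exampleTransposeScale() {
  double g[18], gT[18];
  for (int i = 0; i < 18; i++) g[i] = (double)i;
  transposeScale(gT, g, 2.0);
  // element (ic,jc) of gT is twice element (jc,ic) of g, e.g.
  // gT[(0*3+1)*2+0] == 2.0*g[(1*3+0)*2+0]
}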
// Assume the gauge field is MILC ordered: directions inside of
// space-time, row-column ordering, even-odd space-time
template <typename Float, typename FloatN>
static void unpackMILCGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit,
                                 QudaReconstructType reconstruct, int V, int pad) {
  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack12(hg+4*i*18, d_gauge+i, dir, V+pad, i);
      }
    }
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack8(hg+4*i*18, d_gauge+i, dir, V+pad, i);
      }
    }
  } else {
    for (int dir = 0; dir < 4; dir++) {
      Float *hg = h_gauge + (oddBit*V*4+dir)*18;
      for (int i = 0; i < V; i++) {
        unpack18(hg+4*i*18, d_gauge+i, dir, V+pad);
      }
    }
  }
}

/*
  Momentum packing/unpacking routines: these are for length-10
  vectors, stored in Float2 format.
*/

template <typename Float, typename Float2>
inline void pack10(Float2 *res, Float *m, int dir, int Vh, int pad)
{
  int stride = Vh + pad;
  Float2 *r = res + dir*5*stride;
  for (int j=0; j<5; j++) {
    r[j*stride].x = m[j*2+0];
    r[j*stride].y = m[j*2+1];
  }
}

template <typename Float, typename Float2>
void packMomField(Float2 *res, Float *mom, int oddBit, int Vh, int pad)
{
  for (int dir = 0; dir < 4; dir++) {
    Float *g = mom + (oddBit*Vh*4 + dir)*10;
    for (int i = 0; i < Vh; i++) {
      pack10(res+i, g + 4*i*10, dir, Vh, pad);
    }
  }
}

template <typename Float, typename Float2>
inline void unpack10(Float *m, Float2 *res, int dir, int Vh, int pad)
{
  int stride = Vh + pad;
  Float2 *r = res + dir*5*stride;
  for (int j=0; j<5; j++) {
    m[j*2+0] = r[j*stride].x;
    m[j*2+1] = r[j*stride].y;
  }
}

template <typename Float, typename Float2>
void unpackMomField(Float *mom, Float2 *res, int oddBit, int Vh, int pad)
{
  Float *m = mom + oddBit*Vh*10*4;
  for (int i = 0; i < Vh; i++) {
    for (int dir = 0; dir < 4; dir++) {
      Float *thismom = m + (4*i+dir)*10;
      unpack10(thismom, res+i, dir, Vh, pad);
    }
  }
}
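// Hedged round-trip sketch (illustrative; exampleMomRoundTrip is not library
// code): each site and direction carries 10 reals, stored device-side as five
// Float2 pairs with stride Vh+pad. Packing and then unpacking one parity
// reproduces the host momentum field exactly, since only copies are involved.
static void exampleMomRoundTrip(double *mom, double2 *d_mom, int Vh, int pad) {
  packMomField(d_mom, mom, 0 /*even parity*/, Vh, pad);
  unpackMomField(mom, d_mom, 0 /*even parity*/, Vh, pad);
}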