/*
   QUDA v0.3.2
   A library for QCD on GPUs
*/
00001 #include <stdlib.h> 00002 #include <stdio.h> 00003 #include <math.h> 00004 #include <quda.h> 00005 #include <gauge_quda.h> 00006 #include <quda_internal.h> 00007 00008 #define SHORT_LENGTH 65536 00009 #define SCALE_FLOAT ((SHORT_LENGTH-1) / 2.f) 00010 #define SHIFT_FLOAT (-1.f / (SHORT_LENGTH-1)) 00011 00012 static double Anisotropy; 00013 static QudaTboundary tBoundary; 00014 static int X[4]; 00015 00016 template <typename Float> 00017 inline short FloatToShort(const Float &a) { 00018 return (short)((a+SHIFT_FLOAT)*SCALE_FLOAT); 00019 } 00020 00021 template <typename Float> 00022 inline void ShortToFloat(Float &a, const short &b) { 00023 a = ((Float)b/SCALE_FLOAT-SHIFT_FLOAT); 00024 } 00025 00026 // Routines used to pack the gauge field matrices 00027 00028 template <typename Float> 00029 inline void pack8(double2 *res, Float *g, int dir, int V) { 00030 double2 *r = res + dir*4*V; 00031 r[0].x = atan2(g[1], g[0]); 00032 r[0].y = atan2(g[13], g[12]); 00033 for (int j=1; j<4; j++) { 00034 r[j*V].x = g[2*j+0]; 00035 r[j*V].y = g[2*j+1]; 00036 } 00037 } 00038 00039 template <typename Float> 00040 inline void pack8(float4 *res, Float *g, int dir, int V) { 00041 float4 *r = res + dir*2*V; 00042 r[0].x = atan2(g[1], g[0]); 00043 r[0].y = atan2(g[13], g[12]); 00044 r[0].z = g[2]; 00045 r[0].w = g[3]; 00046 r[V].x = g[4]; 00047 r[V].y = g[5]; 00048 r[V].z = g[6]; 00049 r[V].w = g[7]; 00050 } 00051 00052 template <typename Float> 00053 inline void pack8(float2 *res, Float *g, int dir, int V) { 00054 float2 *r = res + dir*4*V; 00055 r[0].x = atan2(g[1], g[0]); 00056 r[0].y = atan2(g[13], g[12]); 00057 for (int j=1; j<4; j++) { 00058 r[j*V].x = g[2*j+0]; 00059 r[j*V].y = g[2*j+1]; 00060 } 00061 } 00062 00063 template <typename Float> 00064 inline void pack8(short4 *res, Float *g, int dir, int V) { 00065 short4 *r = res + dir*2*V; 00066 r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI); 00067 r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI); 00068 r[0].z = 
FloatToShort(g[2]); 00069 r[0].w = FloatToShort(g[3]); 00070 r[V].x = FloatToShort(g[4]); 00071 r[V].y = FloatToShort(g[5]); 00072 r[V].z = FloatToShort(g[6]); 00073 r[V].w = FloatToShort(g[7]); 00074 } 00075 00076 template <typename Float> 00077 inline void pack8(short2 *res, Float *g, int dir, int V) { 00078 short2 *r = res + dir*4*V; 00079 r[0].x = FloatToShort(atan2(g[1], g[0]) / M_PI); 00080 r[0].y = FloatToShort(atan2(g[13], g[12]) / M_PI); 00081 for (int j=1; j<4; j++) { 00082 r[j*V].x = FloatToShort(g[2*j+0]); 00083 r[j*V].y = FloatToShort(g[2*j+1]); 00084 } 00085 } 00086 00087 template <typename Float> 00088 inline void pack12(double2 *res, Float *g, int dir, int V) { 00089 double2 *r = res + dir*6*V; 00090 for (int j=0; j<6; j++) { 00091 r[j*V].x = g[j*2+0]; 00092 r[j*V].y = g[j*2+1]; 00093 } 00094 } 00095 00096 template <typename Float> 00097 inline void pack12(float4 *res, Float *g, int dir, int V) { 00098 float4 *r = res + dir*3*V; 00099 for (int j=0; j<3; j++) { 00100 r[j*V].x = g[j*4+0]; 00101 r[j*V].y = g[j*4+1]; 00102 r[j*V].z = g[j*4+2]; 00103 r[j*V].w = g[j*4+3]; 00104 } 00105 } 00106 00107 template <typename Float> 00108 inline void pack12(float2 *res, Float *g, int dir, int V) { 00109 float2 *r = res + dir*6*V; 00110 for (int j=0; j<6; j++) { 00111 r[j*V].x = g[j*2+0]; 00112 r[j*V].y = g[j*2+1]; 00113 } 00114 } 00115 00116 template <typename Float> 00117 inline void pack12(short4 *res, Float *g, int dir, int V) { 00118 short4 *r = res + dir*3*V; 00119 for (int j=0; j<3; j++) { 00120 r[j*V].x = FloatToShort(g[j*4+0]); 00121 r[j*V].y = FloatToShort(g[j*4+1]); 00122 r[j*V].z = FloatToShort(g[j*4+2]); 00123 r[j*V].w = FloatToShort(g[j*4+3]); 00124 } 00125 } 00126 00127 template <typename Float> 00128 inline void pack12(short2 *res, Float *g, int dir, int V) { 00129 short2 *r = res + dir*6*V; 00130 for (int j=0; j<6; j++) { 00131 r[j*V].x = FloatToShort(g[j*2+0]); 00132 r[j*V].y = FloatToShort(g[j*2+1]); 00133 } 00134 } 00135 00136 template 
<typename Float> 00137 inline void pack18(double2 *res, Float *g, int dir, int V) { 00138 double2 *r = res + dir*9*V; 00139 for (int j=0; j<9; j++) { 00140 r[j*V].x = g[j*2+0]; 00141 r[j*V].y = g[j*2+1]; 00142 } 00143 } 00144 00145 template <typename Float> 00146 inline void pack18(float4 *res, Float *g, int dir, int V) { 00147 float4 *r = res + dir*5*V; 00148 for (int j=0; j<4; j++) { 00149 r[j*V].x = g[j*4+0]; 00150 r[j*V].y = g[j*4+1]; 00151 r[j*V].z = g[j*4+2]; 00152 r[j*V].w = g[j*4+3]; 00153 } 00154 r[4*V].x = g[16]; 00155 r[4*V].y = g[17]; 00156 r[4*V].z = 0.0; 00157 r[4*V].w = 0.0; 00158 } 00159 00160 template <typename Float> 00161 inline void pack18(float2 *res, Float *g, int dir, int V) { 00162 float2 *r = res + dir*9*V; 00163 for (int j=0; j<9; j++) { 00164 r[j*V].x = g[j*2+0]; 00165 r[j*V].y = g[j*2+1]; 00166 } 00167 } 00168 00169 template <typename Float> 00170 inline void pack18(short4 *res, Float *g, int dir, int V) { 00171 short4 *r = res + dir*5*V; 00172 for (int j=0; j<4; j++) { 00173 r[j*V].x = FloatToShort(g[j*4+0]); 00174 r[j*V].y = FloatToShort(g[j*4+1]); 00175 r[j*V].z = FloatToShort(g[j*4+2]); 00176 r[j*V].w = FloatToShort(g[j*4+3]); 00177 } 00178 r[4*V].x = FloatToShort(g[16]); 00179 r[4*V].y = FloatToShort(g[17]); 00180 r[4*V].z = (short)0; 00181 r[4*V].w = (short)0; 00182 } 00183 00184 template <typename Float> 00185 inline void pack18(short2 *res, Float *g, int dir, int V) 00186 { 00187 short2 *r = res + dir*9*V; 00188 for (int j=0; j<9; j++) { 00189 r[j*V].x = FloatToShort(g[j*2+0]); 00190 r[j*V].y = FloatToShort(g[j*2+1]); 00191 } 00192 } 00193 00194 // a += b*c 00195 template <typename Float> 00196 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) { 00197 a[0] += sign*(b[0]*c[0] - b[1]*c[1]); 00198 a[1] += sign*(b[0]*c[1] + b[1]*c[0]); 00199 } 00200 00201 // a = conj(b)*c 00202 template <typename Float> 00203 inline void complexDotProduct(Float *a, Float *b, Float *c) { 00204 a[0] = b[0]*c[0] + b[1]*c[1]; 
00205 a[1] = b[0]*c[1] - b[1]*c[0]; 00206 } 00207 00208 // a += conj(b) * conj(c) 00209 template <typename Float> 00210 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) { 00211 a[0] += sign * (b[0]*c[0] - b[1]*c[1]); 00212 a[1] -= sign * (b[0]*c[1] + b[1]*c[0]); 00213 } 00214 00215 // a = conj(b)*conj(c) 00216 template <typename Float> 00217 inline void complexConjugateProduct(Float *a, Float *b, Float *c) { 00218 a[0] = b[0]*c[0] - b[1]*c[1]; 00219 a[1] = -b[0]*c[1] - b[1]*c[0]; 00220 } 00221 00222 00223 // Routines used to unpack the gauge field matrices 00224 template <typename Float> 00225 inline void reconstruct8(Float *mat, int dir, int idx) { 00226 // First reconstruct first row 00227 Float row_sum = 0.0; 00228 row_sum += mat[2]*mat[2]; 00229 row_sum += mat[3]*mat[3]; 00230 row_sum += mat[4]*mat[4]; 00231 row_sum += mat[5]*mat[5]; 00232 Float u0 = (dir < 3 ? Anisotropy : (idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ? tBoundary : 1)); 00233 Float diff = 1.f/(u0*u0) - row_sum; 00234 Float U00_mag = sqrt(diff >= 0 ? diff : 0.0); 00235 00236 mat[14] = mat[0]; 00237 mat[15] = mat[1]; 00238 00239 mat[0] = U00_mag * cos(mat[14]); 00240 mat[1] = U00_mag * sin(mat[14]); 00241 00242 Float column_sum = 0.0; 00243 for (int i=0; i<2; i++) column_sum += mat[i]*mat[i]; 00244 for (int i=6; i<8; i++) column_sum += mat[i]*mat[i]; 00245 diff = 1.f/(u0*u0) - column_sum; 00246 Float U20_mag = sqrt(diff >= 0 ? 
diff : 0.0); 00247 00248 mat[12] = U20_mag * cos(mat[15]); 00249 mat[13] = U20_mag * sin(mat[15]); 00250 00251 // First column now restored 00252 00253 // finally reconstruct last elements from SU(2) rotation 00254 Float r_inv2 = 1.0/(u0*row_sum); 00255 00256 // U11 00257 Float A[2]; 00258 complexDotProduct(A, mat+0, mat+6); 00259 complexConjugateProduct(mat+8, mat+12, mat+4); 00260 accumulateComplexProduct(mat+8, A, mat+2, u0); 00261 mat[8] *= -r_inv2; 00262 mat[9] *= -r_inv2; 00263 00264 // U12 00265 complexConjugateProduct(mat+10, mat+12, mat+2); 00266 accumulateComplexProduct(mat+10, A, mat+4, -u0); 00267 mat[10] *= r_inv2; 00268 mat[11] *= r_inv2; 00269 00270 // U21 00271 complexDotProduct(A, mat+0, mat+12); 00272 complexConjugateProduct(mat+14, mat+6, mat+4); 00273 accumulateComplexProduct(mat+14, A, mat+2, -u0); 00274 mat[14] *= r_inv2; 00275 mat[15] *= r_inv2; 00276 00277 // U12 00278 complexConjugateProduct(mat+16, mat+6, mat+2); 00279 accumulateComplexProduct(mat+16, A, mat+4, u0); 00280 mat[16] *= -r_inv2; 00281 mat[17] *= -r_inv2; 00282 } 00283 00284 template <typename Float> 00285 inline void unpack8(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) { 00286 double2 *dg = d_gauge + dir*4*V; 00287 for (int j=0; j<4; j++) { 00288 h_gauge[2*j+0] = dg[j*V].x; 00289 h_gauge[2*j+1] = dg[j*V].y; 00290 } 00291 reconstruct8(h_gauge, dir, idx); 00292 } 00293 00294 template <typename Float> 00295 inline void unpack8(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) { 00296 float4 *dg = d_gauge + dir*2*V; 00297 h_gauge[0] = dg[0].x; 00298 h_gauge[1] = dg[0].y; 00299 h_gauge[2] = dg[0].z; 00300 h_gauge[3] = dg[0].w; 00301 h_gauge[4] = dg[V].x; 00302 h_gauge[5] = dg[V].y; 00303 h_gauge[6] = dg[V].z; 00304 h_gauge[7] = dg[V].w; 00305 reconstruct8(h_gauge, dir, idx); 00306 } 00307 00308 template <typename Float> 00309 inline void unpack8(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) { 00310 float2 *dg = d_gauge + dir*4*V; 00311 for (int 
j=0; j<4; j++) { 00312 h_gauge[2*j+0] = dg[j*V].x; 00313 h_gauge[2*j+1] = dg[j*V].y; 00314 } 00315 reconstruct8(h_gauge, dir, idx); 00316 } 00317 00318 template <typename Float> 00319 inline void unpack8(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) { 00320 short4 *dg = d_gauge + dir*2*V; 00321 ShortToFloat(h_gauge[0], dg[0].x); 00322 ShortToFloat(h_gauge[1], dg[0].y); 00323 ShortToFloat(h_gauge[2], dg[0].z); 00324 ShortToFloat(h_gauge[3], dg[0].w); 00325 ShortToFloat(h_gauge[4], dg[V].x); 00326 ShortToFloat(h_gauge[5], dg[V].y); 00327 ShortToFloat(h_gauge[6], dg[V].z); 00328 ShortToFloat(h_gauge[7], dg[V].w); 00329 h_gauge[0] *= M_PI; 00330 h_gauge[1] *= M_PI; 00331 reconstruct8(h_gauge, dir, idx); 00332 } 00333 00334 template <typename Float> 00335 inline void unpack8(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) { 00336 short2 *dg = d_gauge + dir*4*V; 00337 for (int j=0; j<4; j++) { 00338 ShortToFloat(h_gauge[2*j+0], dg[j*V].x); 00339 ShortToFloat(h_gauge[2*j+1], dg[j*V].y); 00340 } 00341 h_gauge[0] *= M_PI; 00342 h_gauge[1] *= M_PI; 00343 reconstruct8(h_gauge, dir, idx); 00344 } 00345 00346 // do this using complex numbers (simplifies) 00347 template <typename Float> 00348 inline void reconstruct12(Float *mat, int dir, int idx) { 00349 Float *u = &mat[0*(3*2)]; 00350 Float *v = &mat[1*(3*2)]; 00351 Float *w = &mat[2*(3*2)]; 00352 w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0; 00353 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1); 00354 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1); 00355 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1); 00356 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1); 00357 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1); 00358 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1); 00359 Float u0 = (dir < 3 ? Anisotropy : 00360 (idx >= (X[3]-1)*X[0]*X[1]*X[2]/2 ? 
tBoundary : 1)); 00361 w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0; 00362 } 00363 00364 template <typename Float> 00365 inline void unpack12(Float *h_gauge, double2 *d_gauge, int dir, int V, int idx) { 00366 double2 *dg = d_gauge + dir*6*V; 00367 for (int j=0; j<6; j++) { 00368 h_gauge[j*2+0] = dg[j*V].x; 00369 h_gauge[j*2+1] = dg[j*V].y; 00370 } 00371 reconstruct12(h_gauge, dir, idx); 00372 } 00373 00374 template <typename Float> 00375 inline void unpack12(Float *h_gauge, float4 *d_gauge, int dir, int V, int idx) { 00376 float4 *dg = d_gauge + dir*3*V; 00377 for (int j=0; j<3; j++) { 00378 h_gauge[j*4+0] = dg[j*V].x; 00379 h_gauge[j*4+1] = dg[j*V].y; 00380 h_gauge[j*4+2] = dg[j*V].z; 00381 h_gauge[j*4+3] = dg[j*V].w; 00382 } 00383 reconstruct12(h_gauge, dir, idx); 00384 } 00385 00386 template <typename Float> 00387 inline void unpack12(Float *h_gauge, float2 *d_gauge, int dir, int V, int idx) { 00388 float2 *dg = d_gauge + dir*6*V; 00389 for (int j=0; j<6; j++) { 00390 h_gauge[j*2+0] = dg[j*V].x; 00391 h_gauge[j*2+1] = dg[j*V].y; 00392 } 00393 reconstruct12(h_gauge, dir, idx); 00394 } 00395 00396 template <typename Float> 00397 inline void unpack12(Float *h_gauge, short4 *d_gauge, int dir, int V, int idx) { 00398 short4 *dg = d_gauge + dir*3*V; 00399 for (int j=0; j<3; j++) { 00400 ShortToFloat(h_gauge[j*4+0], dg[j*V].x); 00401 ShortToFloat(h_gauge[j*4+1], dg[j*V].y); 00402 ShortToFloat(h_gauge[j*4+2], dg[j*V].z); 00403 ShortToFloat(h_gauge[j*4+3], dg[j*V].w); 00404 } 00405 reconstruct12(h_gauge, dir, idx); 00406 } 00407 00408 template <typename Float> 00409 inline void unpack12(Float *h_gauge, short2 *d_gauge, int dir, int V, int idx) { 00410 short2 *dg = d_gauge + dir*6*V; 00411 for (int j=0; j<6; j++) { 00412 ShortToFloat(h_gauge[j*2+0], dg[j*V].x); 00413 ShortToFloat(h_gauge[j*2+1], dg[j*V].y); 00414 } 00415 reconstruct12(h_gauge, dir, idx); 00416 } 00417 00418 template <typename Float> 00419 inline void unpack18(Float *h_gauge, double2 
*d_gauge, int dir, int V) { 00420 double2 *dg = d_gauge + dir*9*V; 00421 for (int j=0; j<9; j++) { 00422 h_gauge[j*2+0] = dg[j*V].x; 00423 h_gauge[j*2+1] = dg[j*V].y; 00424 } 00425 } 00426 00427 template <typename Float> 00428 inline void unpack18(Float *h_gauge, float4 *d_gauge, int dir, int V) { 00429 float4 *dg = d_gauge + dir*5*V; 00430 for (int j=0; j<4; j++) { 00431 h_gauge[j*4+0] = dg[j*V].x; 00432 h_gauge[j*4+1] = dg[j*V].y; 00433 h_gauge[j*4+2] = dg[j*V].z; 00434 h_gauge[j*4+3] = dg[j*V].w; 00435 } 00436 h_gauge[16] = dg[4*V].x; 00437 h_gauge[17] = dg[4*V].y; 00438 } 00439 00440 template <typename Float> 00441 inline void unpack18(Float *h_gauge, float2 *d_gauge, int dir, int V) { 00442 float2 *dg = d_gauge + dir*9*V; 00443 for (int j=0; j<9; j++) { 00444 h_gauge[j*2+0] = dg[j*V].x; 00445 h_gauge[j*2+1] = dg[j*V].y; 00446 } 00447 } 00448 00449 template <typename Float> 00450 inline void unpack18(Float *h_gauge, short4 *d_gauge, int dir, int V) { 00451 short4 *dg = d_gauge + dir*5*V; 00452 for (int j=0; j<4; j++) { 00453 ShortToFloat(h_gauge[j*4+0], dg[j*V].x); 00454 ShortToFloat(h_gauge[j*4+1], dg[j*V].y); 00455 ShortToFloat(h_gauge[j*4+2], dg[j*V].z); 00456 ShortToFloat(h_gauge[j*4+3], dg[j*V].w); 00457 } 00458 ShortToFloat(h_gauge[16],dg[4*V].x); 00459 ShortToFloat(h_gauge[17],dg[4*V].y); 00460 00461 } 00462 00463 template <typename Float> 00464 inline void unpack18(Float *h_gauge, short2 *d_gauge, int dir, int V) { 00465 short2 *dg = d_gauge + dir*9*V; 00466 for (int j=0; j<9; j++) { 00467 ShortToFloat(h_gauge[j*2+0], dg[j*V].x); 00468 ShortToFloat(h_gauge[j*2+1], dg[j*V].y); 00469 } 00470 } 00471 00472 // Assume the gauge field is "QDP" ordered: directions outside of 00473 // space-time, row-column ordering, even-odd space-time 00474 template <typename Float, typename FloatN> 00475 static void packQDPGaugeField(FloatN *d_gauge, Float **h_gauge, int oddBit, 00476 ReconstructType reconstruct, int V, int pad) { 00477 if (reconstruct == 
QUDA_RECONSTRUCT_12) { 00478 for (int dir = 0; dir < 4; dir++) { 00479 Float *g = h_gauge[dir] + oddBit*V*18; 00480 for (int i = 0; i < V; i++) pack12(d_gauge+i, g+i*18, dir, V+pad); 00481 } 00482 } else if (reconstruct == QUDA_RECONSTRUCT_8) { 00483 for (int dir = 0; dir < 4; dir++) { 00484 Float *g = h_gauge[dir] + oddBit*V*18; 00485 for (int i = 0; i < V; i++) pack8(d_gauge+i, g+i*18, dir, V+pad); 00486 } 00487 } else { 00488 for (int dir = 0; dir < 4; dir++) { 00489 Float *g = h_gauge[dir] + oddBit*V*18; 00490 for (int i = 0; i < V; i++) pack18(d_gauge+i, g+i*18, dir, V+pad); 00491 } 00492 } 00493 } 00494 00495 // Assume the gauge field is "QDP" ordered: directions outside of 00496 // space-time, row-column ordering, even-odd space-time 00497 template <typename Float, typename FloatN> 00498 static void unpackQDPGaugeField(Float **h_gauge, FloatN *d_gauge, int oddBit, 00499 ReconstructType reconstruct, int V, int pad) { 00500 if (reconstruct == QUDA_RECONSTRUCT_12) { 00501 for (int dir = 0; dir < 4; dir++) { 00502 Float *g = h_gauge[dir] + oddBit*V*18; 00503 for (int i = 0; i < V; i++) unpack12(g+i*18, d_gauge+i, dir, V+pad, i); 00504 } 00505 } else if (reconstruct == QUDA_RECONSTRUCT_8) { 00506 for (int dir = 0; dir < 4; dir++) { 00507 Float *g = h_gauge[dir] + oddBit*V*18; 00508 for (int i = 0; i < V; i++) unpack8(g+i*18, d_gauge+i, dir, V+pad, i); 00509 } 00510 } else { 00511 for (int dir = 0; dir < 4; dir++) { 00512 Float *g = h_gauge[dir] + oddBit*V*18; 00513 for (int i = 0; i < V; i++) unpack18(g+i*18, d_gauge+i, dir, V+pad); 00514 } 00515 } 00516 } 00517 00518 // transpose and scale the matrix 00519 template <typename Float, typename Float2> 00520 static void transposeScale(Float *gT, Float *g, const Float2 &a) { 00521 for (int ic=0; ic<3; ic++) for (int jc=0; jc<3; jc++) for (int r=0; r<2; r++) 00522 gT[(ic*3+jc)*2+r] = a*g[(jc*3+ic)*2+r]; 00523 } 00524 00525 // Assume the gauge field is "Wilson" ordered directions inside of 00526 // space-time 
column-row ordering even-odd space-time 00527 template <typename Float, typename FloatN> 00528 static void packCPSGaugeField(FloatN *d_gauge, Float *h_gauge, int oddBit, 00529 ReconstructType reconstruct, int V, int pad) { 00530 Float gT[18]; 00531 if (reconstruct == QUDA_RECONSTRUCT_12) { 00532 for (int dir = 0; dir < 4; dir++) { 00533 Float *g = h_gauge + (oddBit*V*4+dir)*18; 00534 for (int i = 0; i < V; i++) { 00535 transposeScale(gT, g, 1.0 / Anisotropy); 00536 pack12(d_gauge+i, gT, dir, V+pad); 00537 } 00538 } 00539 } else if (reconstruct == QUDA_RECONSTRUCT_8) { 00540 for (int dir = 0; dir < 4; dir++) { 00541 Float *g = h_gauge + (oddBit*V*4+dir)*18; 00542 for (int i = 0; i < V; i++) { 00543 transposeScale(gT, g, 1.0 / Anisotropy); 00544 pack8(d_gauge+i, gT, dir, V+pad); 00545 } 00546 } 00547 } else { 00548 for (int dir = 0; dir < 4; dir++) { 00549 Float *g = h_gauge + (oddBit*V*4+dir)*18; 00550 for (int i = 0; i < V; i++) { 00551 transposeScale(gT, g, 1.0 / Anisotropy); 00552 pack18(d_gauge+i, gT, dir, V+pad); 00553 } 00554 } 00555 } 00556 00557 } 00558 00559 // Assume the gauge field is "Wilson" ordered directions inside of 00560 // space-time column-row ordering even-odd space-time 00561 template <typename Float, typename FloatN> 00562 static void unpackCPSGaugeField(Float *h_gauge, FloatN *d_gauge, int oddBit, 00563 ReconstructType reconstruct, int V, int pad) { 00564 Float gT[18]; 00565 if (reconstruct == QUDA_RECONSTRUCT_12) { 00566 for (int dir = 0; dir < 4; dir++) { 00567 Float *hg = h_gauge + (oddBit*V*4+dir)*18; 00568 for (int i = 0; i < V; i++) { 00569 unpack12(gT, d_gauge+i, dir, V+pad, i); 00570 transposeScale(hg, gT, Anisotropy); 00571 } 00572 } 00573 } else if (reconstruct == QUDA_RECONSTRUCT_8) { 00574 for (int dir = 0; dir < 4; dir++) { 00575 Float *hg = h_gauge + (oddBit*V*4+dir)*18; 00576 for (int i = 0; i < V; i++) { 00577 unpack8(gT, d_gauge+i, dir, V+pad, i); 00578 transposeScale(hg, gT, Anisotropy); 00579 } 00580 } 00581 } else { 00582 
for (int dir = 0; dir < 4; dir++) { 00583 Float *hg = h_gauge + (oddBit*V*4+dir)*18; 00584 for (int i = 0; i < V; i++) { 00585 unpack18(gT, d_gauge+i, dir, V+pad); 00586 transposeScale(hg, gT, Anisotropy); 00587 } 00588 } 00589 } 00590 00591 } 00592 00593 static void allocateGaugeField(FullGauge *cudaGauge, ReconstructType reconstruct, QudaPrecision precision) { 00594 00595 cudaGauge->reconstruct = reconstruct; 00596 cudaGauge->precision = precision; 00597 00598 cudaGauge->Nc = 3; 00599 00600 int floatSize; 00601 if (precision == QUDA_DOUBLE_PRECISION) floatSize = sizeof(double); 00602 else if (precision == QUDA_SINGLE_PRECISION) floatSize = sizeof(float); 00603 else floatSize = sizeof(float)/2; 00604 00605 int elements; 00606 switch(reconstruct){ 00607 case QUDA_RECONSTRUCT_8: 00608 elements = 8; 00609 break; 00610 case QUDA_RECONSTRUCT_12: 00611 elements = 12; 00612 break; 00613 case QUDA_RECONSTRUCT_NO: 00614 elements = 18; 00615 break; 00616 default: 00617 errorQuda("Invalid reconstruct value"); 00618 } 00619 00620 cudaGauge->bytes = 4*cudaGauge->stride*elements*floatSize; 00621 00622 if (!cudaGauge->even) { 00623 if (cudaMalloc((void **)&cudaGauge->even, cudaGauge->bytes) == cudaErrorMemoryAllocation) { 00624 errorQuda("Error allocating even gauge field"); 00625 } 00626 } 00627 cudaMemset(cudaGauge->even, 0, cudaGauge->bytes); 00628 00629 if (!cudaGauge->odd) { 00630 if (cudaMalloc((void **)&cudaGauge->odd, cudaGauge->bytes) == cudaErrorMemoryAllocation) { 00631 errorQuda("Error allocating even odd gauge field"); 00632 } 00633 } 00634 cudaMemset(cudaGauge->odd, 0, cudaGauge->bytes); 00635 00636 checkCudaError(); 00637 } 00638 00639 void freeGaugeField(FullGauge *cudaGauge) { 00640 if (cudaGauge->even) cudaFree(cudaGauge->even); 00641 if (cudaGauge->odd) cudaFree(cudaGauge->odd); 00642 cudaGauge->even = NULL; 00643 cudaGauge->odd = NULL; 00644 } 00645 00646 template <typename Float, typename FloatN> 00647 static void loadGaugeField(FloatN *even, FloatN *odd, 
Float *cpuGauge, GaugeFieldOrder gauge_order, 00648 ReconstructType reconstruct, int bytes, int Vh, int pad) { 00649 00650 // Use pinned memory 00651 FloatN *packedEven, *packedOdd; 00652 00653 #ifndef __DEVICE_EMULATION__ 00654 cudaMallocHost((void**)&packedEven, bytes); 00655 cudaMallocHost((void**)&packedOdd, bytes); 00656 #else 00657 packedEven = (FloatN*)malloc(bytes); 00658 packedOdd = (FloatN*)malloc(bytes); 00659 #endif 00660 00661 if (gauge_order == QUDA_QDP_GAUGE_ORDER) { 00662 packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, Vh, pad); 00663 packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct, Vh, pad); 00664 } else if (gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) { 00665 packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh, pad); 00666 packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, Vh, pad); 00667 } else { 00668 errorQuda("Invalid gauge_order"); 00669 } 00670 00671 cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice); 00672 checkCudaError(); 00673 00674 cudaMemcpy(odd, packedOdd, bytes, cudaMemcpyHostToDevice); 00675 checkCudaError(); 00676 00677 #ifndef __DEVICE_EMULATION__ 00678 cudaFreeHost(packedEven); 00679 cudaFreeHost(packedOdd); 00680 #else 00681 free(packedEven); 00682 free(packedOdd); 00683 #endif 00684 00685 } 00686 00687 template <typename Float, typename FloatN> 00688 static void retrieveGaugeField(Float *cpuGauge, FloatN *even, FloatN *odd, GaugeFieldOrder gauge_order, 00689 ReconstructType reconstruct, int bytes, int Vh, int pad) { 00690 00691 // Use pinned memory 00692 FloatN *packedEven, *packedOdd; 00693 00694 #ifndef __DEVICE_EMULATION__ 00695 cudaMallocHost((void**)&packedEven, bytes); 00696 cudaMallocHost((void**)&packedOdd, bytes); 00697 #else 00698 packedEven = (FloatN*)malloc(bytes); 00699 packedOdd = (FloatN*)malloc(bytes); 00700 #endif 00701 00702 cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost); 00703 cudaMemcpy(packedOdd, odd, bytes, 
cudaMemcpyDeviceToHost); 00704 00705 if (gauge_order == QUDA_QDP_GAUGE_ORDER) { 00706 unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, Vh, pad); 00707 unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, Vh, pad); 00708 } else if (gauge_order == QUDA_CPS_WILSON_GAUGE_ORDER) { 00709 unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, Vh, pad); 00710 unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, Vh, pad); 00711 } else { 00712 errorQuda("Invalid gauge_order"); 00713 } 00714 00715 #ifndef __DEVICE_EMULATION__ 00716 cudaFreeHost(packedEven); 00717 cudaFreeHost(packedOdd); 00718 #else 00719 free(packedEven); 00720 free(packedOdd); 00721 #endif 00722 00723 } 00724 00725 void createGaugeField(FullGauge *cudaGauge, void *cpuGauge, QudaPrecision cuda_prec, QudaPrecision cpu_prec, 00726 GaugeFieldOrder gauge_order, ReconstructType reconstruct, GaugeFixed gauge_fixed, 00727 Tboundary t_boundary, int *XX, double anisotropy, double tadpole_coeff, int pad) 00728 { 00729 if (cpu_prec == QUDA_HALF_PRECISION) { 00730 errorQuda("Half precision not supported on CPU"); 00731 } 00732 00733 Anisotropy = anisotropy; 00734 tBoundary = t_boundary; 00735 00736 cudaGauge->anisotropy = anisotropy; 00737 cudaGauge->tadpole_coeff = tadpole_coeff; 00738 cudaGauge->volume = 1; 00739 for (int d=0; d<4; d++) { 00740 cudaGauge->X[d] = XX[d]; 00741 cudaGauge->volume *= XX[d]; 00742 X[d] = XX[d]; 00743 } 00744 cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions 00745 cudaGauge->volume /= 2; 00746 cudaGauge->pad = pad; 00747 cudaGauge->stride = cudaGauge->volume + cudaGauge->pad; 00748 cudaGauge->gauge_fixed = gauge_fixed; 00749 cudaGauge->t_boundary = t_boundary; 00750 00751 allocateGaugeField(cudaGauge, reconstruct, cuda_prec); 00752 00753 if (cuda_prec == QUDA_DOUBLE_PRECISION) { 00754 00755 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00756 loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), 
(double*)cpuGauge, 00757 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00758 } else if (cpu_prec == QUDA_SINGLE_PRECISION) { 00759 loadGaugeField((double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), (float*)cpuGauge, 00760 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00761 } 00762 00763 } else if (cuda_prec == QUDA_SINGLE_PRECISION) { 00764 00765 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00766 if (reconstruct == QUDA_RECONSTRUCT_NO) { 00767 loadGaugeField((float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), (double*)cpuGauge, 00768 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00769 } else { 00770 loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (double*)cpuGauge, 00771 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00772 } 00773 } else if (cpu_prec == QUDA_SINGLE_PRECISION) { 00774 if (reconstruct == QUDA_RECONSTRUCT_NO) { 00775 loadGaugeField((float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), (float*)cpuGauge, 00776 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00777 } else { 00778 loadGaugeField((float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), (float*)cpuGauge, 00779 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00780 } 00781 } 00782 00783 } else if (cuda_prec == QUDA_HALF_PRECISION) { 00784 00785 if (cpu_prec == QUDA_DOUBLE_PRECISION){ 00786 if (reconstruct == QUDA_RECONSTRUCT_NO) { 00787 loadGaugeField((short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), (double*)cpuGauge, 00788 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00789 } else { 00790 loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (double*)cpuGauge, 00791 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00792 } 00793 } else if (cpu_prec == QUDA_SINGLE_PRECISION) { 00794 if 
(reconstruct == QUDA_RECONSTRUCT_NO) { 00795 loadGaugeField((short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), (float*)cpuGauge, 00796 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00797 } else { 00798 loadGaugeField((short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), (float*)cpuGauge, 00799 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, pad); 00800 } 00801 } 00802 00803 } 00804 } 00805 00806 void restoreGaugeField(void *cpuGauge, FullGauge *cudaGauge, QudaPrecision cpu_prec, GaugeFieldOrder gauge_order) 00807 { 00808 if (cpu_prec == QUDA_HALF_PRECISION) { 00809 errorQuda("Half precision not supported on CPU"); 00810 } 00811 00812 if (cudaGauge->precision == QUDA_DOUBLE_PRECISION) { 00813 00814 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00815 retrieveGaugeField((double*)cpuGauge, (double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), 00816 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00817 } else if (cpu_prec == QUDA_SINGLE_PRECISION) { 00818 retrieveGaugeField((float*)cpuGauge, (double2*)(cudaGauge->even), (double2*)(cudaGauge->odd), 00819 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00820 } 00821 00822 } else if (cudaGauge->precision == QUDA_SINGLE_PRECISION) { 00823 00824 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00825 if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) { 00826 retrieveGaugeField((double*)cpuGauge, (float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), 00827 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00828 } else { 00829 retrieveGaugeField((double*)cpuGauge, (float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), 00830 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00831 } 00832 } else if (cpu_prec == QUDA_SINGLE_PRECISION) { 00833 if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) { 00834 
retrieveGaugeField((float*)cpuGauge, (float2*)(cudaGauge->even), (float2*)(cudaGauge->odd), 00835 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00836 } else { 00837 retrieveGaugeField((float*)cpuGauge, (float4*)(cudaGauge->even), (float4*)(cudaGauge->odd), 00838 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00839 } 00840 } 00841 00842 } else if (cudaGauge->precision == QUDA_HALF_PRECISION) { 00843 00844 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00845 if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) { 00846 retrieveGaugeField((double*)cpuGauge, (short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), 00847 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00848 } else { 00849 retrieveGaugeField((double*)cpuGauge, (short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), 00850 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00851 } 00852 } else if (cpu_prec == QUDA_SINGLE_PRECISION) { 00853 if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) { 00854 retrieveGaugeField((float*)cpuGauge, (short2*)(cudaGauge->even), (short2*)(cudaGauge->odd), 00855 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00856 } else { 00857 retrieveGaugeField((float*)cpuGauge, (short4*)(cudaGauge->even), (short4*)(cudaGauge->odd), 00858 gauge_order, cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume, cudaGauge->pad); 00859 } 00860 } 00861 00862 } 00863 00864 } 00865 00866 00867 /********************** the following functions are used in force computation and link fattening**************/ 00868 00869 /******************************** Functions to manipulate Staple ****************************/ 00870 00871 static void 00872 allocateStapleQuda(FullStaple *cudaStaple, QudaPrecision precision) 00873 { 00874 cudaStaple->precision = precision; 00875 cudaStaple->Nc = 3; 00876 00877 int 
floatSize;
  if (precision == QUDA_DOUBLE_PRECISION) {
    floatSize = sizeof(double);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    floatSize = sizeof(float);
  } else {
    // BUG FIX: typo "stape" -> "staple" in the error message
    printf("ERROR: staple does not support half precision\n");
    exit(1);
  }

  // A staple is stored unreconstructed: 18 real numbers per site.
  int elements = 18;

  cudaStaple->bytes = cudaStaple->volume*elements*floatSize;

  cudaMalloc((void **)&cudaStaple->even, cudaStaple->bytes);
  cudaMalloc((void **)&cudaStaple->odd, cudaStaple->bytes);
}

// Shared precision sanity checks for the create*Quda() routines: half
// precision is not supported on the CPU, and a double-precision device
// field can only be built from a double-precision host field.
// 'func' is the calling function's name, used in the error message so
// the output matches what each caller used to print via __FUNCTION__.
static void
checkCreatePrecision(QudaGaugeParam *param, const char *func)
{
  if (param->cpu_prec == QUDA_HALF_PRECISION) {
    printf("ERROR: %s: half precision not supported on cpu\n", func);
    exit(-1);
  }

  if (param->cuda_prec == QUDA_DOUBLE_PRECISION && param->cpu_prec != QUDA_DOUBLE_PRECISION) {
    printf("Error: can only create a double GPU gauge field from a double CPU gauge field\n");
    exit(-1);
  }
}

// Record the even-odd sublattice geometry of the staple field in
// cudaStaple and allocate device storage for its even and odd halves.
void
createStapleQuda(FullStaple* cudaStaple, QudaGaugeParam* param)
{
  checkCreatePrecision(param, __FUNCTION__);

  cudaStaple->volume = 1;
  for (int d = 0; d < 4; d++) {
    cudaStaple->X[d] = param->X[d];
    cudaStaple->volume *= param->X[d];
  }
  cudaStaple->X[0] /= 2; // actually store the even-odd sublattice dimensions
  cudaStaple->volume /= 2;

  allocateStapleQuda(cudaStaple, param->cuda_prec);
}

// Release the device memory held by cudaStaple; safe to call repeatedly
// since the pointers are nulled after freeing.
void
freeStapleQuda(FullStaple *cudaStaple)
{
  if (cudaStaple->even) cudaFree(cudaStaple->even);
  if (cudaStaple->odd) cudaFree(cudaStaple->odd);
  cudaStaple->even = NULL;
  cudaStaple->odd = NULL;
}




/******************************** Functions to manipulate Mom ****************************/

// Allocate device storage for the even/odd halves of the momentum field.
// cudaMom->volume must already hold the even-odd sublattice volume.
static void
allocateMomQuda(FullMom *cudaMom, QudaPrecision precision)
{
  cudaMom->precision = precision;

  int floatSize;
  if (precision == QUDA_DOUBLE_PRECISION) {
    floatSize = sizeof(double);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    floatSize = sizeof(float);
  } else {
    // BUG FIX: the old message was copy-pasted from the staple allocator
    // ("stape"); this is the momentum field.
    printf("ERROR: momentum field does not support half precision\n");
    exit(1);
  }

  // 10 real numbers per link (see pack10()), 4 directions per site.
  int elements = 10;

  cudaMom->bytes = cudaMom->volume*elements*floatSize*4;

  cudaMalloc((void **)&cudaMom->even, cudaMom->bytes);
  cudaMalloc((void **)&cudaMom->odd, cudaMom->bytes);
}

// Record the even-odd sublattice geometry of the momentum field in
// cudaMom and allocate device storage for its even and odd halves.
void
createMomQuda(FullMom* cudaMom, QudaGaugeParam* param)
{
  checkCreatePrecision(param, __FUNCTION__);

  cudaMom->volume = 1;
  for (int d = 0; d < 4; d++) {
    cudaMom->X[d] = param->X[d];
    cudaMom->volume *= param->X[d];
  }
  cudaMom->X[0] /= 2; // actually store the even-odd sublattice dimensions
  cudaMom->volume /= 2;

  allocateMomQuda(cudaMom, param->cuda_prec);
}

// Release the device memory held by cudaMom; safe to call repeatedly
// since the pointers are nulled after freeing.
void
freeMomQuda(FullMom *cudaMom)
{
  if (cudaMom->even) cudaFree(cudaMom->even);
  if (cudaMom->odd) cudaFree(cudaMom->odd);
  cudaMom->even = NULL;
  cudaMom->odd = NULL;
}

// Pack one momentum link (10 reals, laid out as 5 complex pairs in m)
// into the strided device layout: component j of site i, direction dir
// lives at res[dir*5*Vh + j*Vh + i].
template <typename Float, typename Float2>
inline void pack10(Float2 *res, Float *m, int dir, int Vh)
{
  Float2 *r = res + dir*5*Vh;
  for (int j = 0; j < 5; j++) {
    // BUG FIX: the old code cast through (float) unconditionally, which
    // silently truncated double-precision momenta; let the assignment
    // convert to the destination component type instead.
    r[j*Vh].x = m[j*2+0];
    r[j*Vh].y = m[j*2+1];
  }
}

// Pack the oddBit sublattice of the host momentum field 'mom' (site-major,
// momSiteSize reals per link, 4 links per site) into the strided layout
// expected on the device.
template <typename Float, typename Float2>
void packMomField(Float2 *res, Float *mom, int oddBit, int Vh)
{
  for (int dir = 0; dir < 4; dir++) {
    Float *g = mom + (oddBit*Vh*4 + dir)*momSiteSize;
    for (int i = 0; i < Vh; i++) {
      pack10(res + i, g + 4*i*momSiteSize, dir, Vh);
    }
  }
}

// Stage the host momentum field through pinned buffers and copy both
// even and odd halves to the device.  'bytes' is the size of one parity
// sublattice on the device.
template <typename Float, typename Float2>
void loadMomField(Float2 *even, Float2 *odd, Float *mom,
                  int bytes, int Vh)
{
  // Pinned host memory gives full-bandwidth transfers.
  Float2 *packedEven, *packedOdd;
  cudaMallocHost((void**)&packedEven, bytes);
  cudaMallocHost((void**)&packedOdd, bytes);

  packMomField(packedEven, (Float*)mom, 0, Vh);
  packMomField(packedOdd,  (Float*)mom, 1, Vh);

  cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(odd,  packedOdd,  bytes, cudaMemcpyHostToDevice);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

// Copy a host momentum field into an already-created device field.
// Only single precision is implemented.
void
loadMomToGPU(FullMom cudaMom, void* mom, QudaGaugeParam* param)
{
  if (param->cuda_prec == QUDA_DOUBLE_PRECISION) {
    // BUG FIX: this branch used to be an empty stub that silently left
    // the device field untouched; fail loudly until it is implemented.
    printf("ERROR: %s: double precision is not supported\n", __FUNCTION__);
    exit(1);
  } else { // single precision
    loadMomField((float2*)(cudaMom.even), (float2*)(cudaMom.odd), (float*)mom,
                 cudaMom.bytes, cudaMom.volume);
  }
}


// Inverse of pack10(): gather the 5 strided complex pairs of one link
// back into 10 consecutive reals in m.
template <typename Float, typename Float2>
inline void unpack10(Float* m, Float2 *res, int dir, int Vh)
{
  Float2 *r = res + dir*5*Vh;
  for (int j = 0; j < 5; j++) {
    m[j*2+0] = r[j*Vh].x;
    m[j*2+1] = r[j*Vh].y;
  }
}

// Inverse of packMomField(): scatter a packed device-layout parity
// sublattice back into the site-major host momentum field.
template <typename Float, typename Float2>
void
unpackMomField(Float* mom, Float2 *res, int oddBit, int Vh)
{
  Float *m = mom + oddBit*Vh*momSiteSize*4;

  for (int i = 0; i < Vh; i++) {
    for (int dir = 0; dir < 4; dir++) {
      Float* thismom = m + (4*i + dir)*momSiteSize;
      unpack10(thismom, res + i, dir, Vh);
    }
  }
}

// Stage both parity halves of the device momentum field through pinned
// buffers and unpack them into the host field 'mom'.
template <typename Float, typename Float2>
void
storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd,
                   int bytes, int Vh)
{
  Float2 *packedEven, *packedOdd;
  cudaMallocHost((void**)&packedEven, bytes);
  cudaMallocHost((void**)&packedOdd, bytes);
  cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost);
  cudaMemcpy(packedOdd,  odd,  bytes, cudaMemcpyDeviceToHost);

  unpackMomField((Float*)mom, packedEven, 0, Vh);
  unpackMomField((Float*)mom, packedOdd,  1, Vh);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

// Copy the device momentum field back to the host.  The cpu and gpu
// precisions must match, and only single precision is implemented.
void
storeMomToCPU(void* mom, FullMom cudaMom, QudaGaugeParam* param)
{
  QudaPrecision cpu_prec = param->cpu_prec;
  QudaPrecision cuda_prec = param->cuda_prec;

  if (cpu_prec != cuda_prec) {
    printf("Error:%s: cpu and gpu precison has to be the same at this moment \n", __FUNCTION__);
    exit(1);
  }

  if (cpu_prec == QUDA_HALF_PRECISION) {
    printf("ERROR: %s: half precision is not supported at this moment\n", __FUNCTION__);
    exit(1);
  }

  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
    // BUG FIX: this branch used to be empty, silently leaving 'mom'
    // untouched; fail loudly until double precision is implemented.
    printf("ERROR: %s: double precision is not supported\n", __FUNCTION__);
    exit(1);
  } else { // single precision
    storeMomToCPUArray((float*)mom, (float2*)cudaMom.even, (float2*)cudaMom.odd,
                       cudaMom.bytes, cudaMom.volume);
  }
}

/**************************************************************************************************/

// Pack the oddBit sublattice of the host gauge field into the device
// layout selected by 'reconstruct' (12- or 8-real compressed links, or
// the full 18-real matrix).
template <typename Float, typename FloatN>
void
packGaugeField(FloatN *res, Float *gauge, int oddBit, ReconstructType reconstruct, int Vh)
{
  // Base of the requested parity sublattice; invariant over dir and i
  // (the old code recomputed it once per direction).
  Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;

  if (reconstruct == QUDA_RECONSTRUCT_12) {
    for (int dir = 0; dir < 4; dir++)
      for (int i = 0; i < Vh; i++)
        pack12(res + i, g + (4*i + dir)*gaugeSiteSize, dir, Vh);
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {
    for (int dir = 0; dir < 4; dir++)
      for (int i = 0; i < Vh; i++)
        pack8(res + i, g + (4*i + dir)*gaugeSiteSize, dir, Vh);
  } else {
    for (int dir = 0; dir < 4; dir++)
      for (int i = 0; i < Vh; i++)
        pack18(res + i, g + (4*i + dir)*gaugeSiteSize, dir, Vh);
  }
}

// Stage the host gauge field through pinned buffers and copy both
// parity halves to the device in the layout selected by 'reconstruct'.
template <typename Float, typename FloatN>
void
loadGaugeFromCPUArrayQuda(FloatN *even, FloatN *odd, Float *cpuGauge,
                          ReconstructType reconstruct, int bytes, int Vh)
{
  // Pinned host memory gives full-bandwidth transfers.
  FloatN *packedEven, *packedOdd;
  cudaMallocHost((void**)&packedEven, bytes);
  cudaMallocHost((void**)&packedOdd, bytes);

  packGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, Vh);
  packGaugeField(packedOdd,  (Float*)cpuGauge, 1, reconstruct, Vh);

  cudaMemcpy(even, packedEven, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(odd,  packedOdd,  bytes, cudaMemcpyHostToDevice);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}




// Record the geometry, anisotropy and reconstruction scheme of a gauge
// (link) field in cudaGauge and allocate its device storage.
void
createLinkQuda(FullGauge* cudaGauge, QudaGaugeParam* param)
{
  checkCreatePrecision(param, __FUNCTION__);

  cudaGauge->anisotropy = param->anisotropy;
  cudaGauge->volume = 1;
  for (int d = 0; d < 4; d++) {
    cudaGauge->X[d] = param->X[d];
    cudaGauge->volume *= param->X[d];
  }
  cudaGauge->X[0] /= 2; // actually store the even-odd sublattice dimensions
  cudaGauge->volume /= 2;
  // NOTE(review): cudaGauge->pad is read here but never written in this
  // file -- presumably the caller sets it before createLinkQuda; confirm.
  cudaGauge->stride = cudaGauge->volume + cudaGauge->pad;
  cudaGauge->reconstruct = param->reconstruct;

  allocateGaugeField(cudaGauge, param->reconstruct, param->cuda_prec);
}

// Copy a host gauge field into an already-created device field,
// dispatching on the gpu precision, cpu precision and reconstruction
// scheme to pick the correct packed element type (double2 / float4 /
// float2 / short4 / short2).
void
loadLinkToGPU(FullGauge cudaGauge, void *cpuGauge, QudaGaugeParam* param)
{
  QudaPrecision cpu_prec = param->cpu_prec;
  QudaPrecision cuda_prec = param->cuda_prec;

  if (cuda_prec == QUDA_DOUBLE_PRECISION) {
    // createLinkQuda() guarantees the cpu field is double in this case.
    loadGaugeFromCPUArrayQuda((double2*)(cudaGauge.even), (double2*)(cudaGauge.odd), (double*)cpuGauge,
                              cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
  } else if (cuda_prec == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO) {
        loadGaugeFromCPUArrayQuda((float4*)(cudaGauge.even), (float4*)(cudaGauge.odd), (double*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      } else {
        loadGaugeFromCPUArrayQuda((float2*)(cudaGauge.even), (float2*)(cudaGauge.odd), (double*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      }
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO) {
        loadGaugeFromCPUArrayQuda((float4*)(cudaGauge.even), (float4*)(cudaGauge.odd), (float*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      } else {
        loadGaugeFromCPUArrayQuda((float2*)(cudaGauge.even), (float2*)(cudaGauge.odd), (float*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      }
    }
  } else if (cuda_prec == QUDA_HALF_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO) {
        loadGaugeFromCPUArrayQuda((short4*)(cudaGauge.even), (short4*)(cudaGauge.odd), (double*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      } else {
        loadGaugeFromCPUArrayQuda((short2*)(cudaGauge.even), (short2*)(cudaGauge.odd), (double*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      }
    } else if (cpu_prec == QUDA_SINGLE_PRECISION) {
      if (cudaGauge.reconstruct != QUDA_RECONSTRUCT_NO) {
        loadGaugeFromCPUArrayQuda((short4*)(cudaGauge.even), (short4*)(cudaGauge.odd), (float*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      } else {
        loadGaugeFromCPUArrayQuda((short2*)(cudaGauge.even), (short2*)(cudaGauge.odd), (float*)cpuGauge,
                                  cudaGauge.reconstruct, cudaGauge.bytes, cudaGauge.volume);
      }
    }
  }
}
/*****************************************************************/
/********************** store link data to cpu memory ************/

// Placeholder inverses of pack12/pack8: not implemented, abort if reached.
template <typename Float>
inline void unpack12(Float* g, float2 *res, int dir, int V) { printf("ERROR: %s is called\n", __FUNCTION__); exit(1); }
template <typename Float>
inline void unpack8(Float* g, float2 *res, int dir, int V) { printf("ERROR: %s is called\n", __FUNCTION__); exit(1); }

// Inverse of packGaugeField(): scatter a packed device-layout parity
// sublattice back into the host gauge field.  Only the unreconstructed
// (18-real) layout is implemented.
template <typename Float, typename FloatN>
void
unpackGaugeField(Float* gauge, FloatN *res, int oddBit, ReconstructType reconstruct, int Vh)
{
  if (reconstruct == QUDA_RECONSTRUCT_12 || reconstruct == QUDA_RECONSTRUCT_8) {
    // BUG FIX: these branches used to loop over commented-out calls,
    // silently returning an unmodified host field; fail loudly instead,
    // matching the intent of the unpack12/unpack8 error stubs above.
    printf("ERROR: %s: reconstruct 12/8 is not implemented\n", __FUNCTION__);
    exit(1);
  } else {
    Float *g = gauge + oddBit*Vh*gaugeSiteSize*4;
    for (int dir = 0; dir < 4; dir++)
      for (int i = 0; i < Vh; i++)
        unpack18(g + (4*i + dir)*gaugeSiteSize, res + i, dir, Vh);
  }
}

// Stage both parity halves of the device gauge field through pinned
// buffers and unpack them into the host field 'cpuGauge'.
template <typename Float, typename FloatN>
void
storeGaugeToCPUArray(Float* cpuGauge, FloatN *even, FloatN *odd,
                     ReconstructType reconstruct, int bytes, int Vh)
{
  // Pinned host memory gives full-bandwidth transfers.
  FloatN *packedEven, *packedOdd;
  cudaMallocHost((void**)&packedEven, bytes);
  cudaMallocHost((void**)&packedOdd, bytes);
  cudaMemcpy(packedEven, even, bytes, cudaMemcpyDeviceToHost);
  cudaMemcpy(packedOdd,  odd,  bytes, cudaMemcpyDeviceToHost);

  unpackGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, Vh);
  unpackGaugeField((Float*)cpuGauge, packedOdd,  1, reconstruct, Vh);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}


// Copy the device gauge field back to the host, dispatching on the host
// precision and reconstruction scheme.  Half precision is not supported
// on either side.
void
storeLinkToCPU(void* cpuGauge, FullGauge *cudaGauge, QudaGaugeParam* param)
{
  QudaPrecision cpu_prec = param->cpu_prec;
  QudaPrecision cuda_prec = param->cuda_prec;

  if (cuda_prec == QUDA_HALF_PRECISION || cpu_prec == QUDA_HALF_PRECISION) {
    printf("ERROR: %s: half precision is not supported at this moment\n", __FUNCTION__);
    exit(1);
  }

  if (cpu_prec == QUDA_DOUBLE_PRECISION) {
    if (cuda_prec == QUDA_DOUBLE_PRECISION) { // double cpu, double gpu
      storeGaugeToCPUArray((double*)cpuGauge, (double2*)cudaGauge->even, (double2*)cudaGauge->odd,
                           cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
    } else { // double cpu, single gpu
      if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) {
        storeGaugeToCPUArray((double*)cpuGauge, (float2*)cudaGauge->even, (float2*)cudaGauge->odd,
                             cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
      } else {
        storeGaugeToCPUArray((double*)cpuGauge, (float4*)cudaGauge->even, (float4*)cudaGauge->odd,
                             cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
      }
    }
  } else { // single precision cpu
    if (cudaGauge->reconstruct == QUDA_RECONSTRUCT_NO) {
      storeGaugeToCPUArray((float*)cpuGauge, (float2*)cudaGauge->even, (float2*)cudaGauge->odd,
                           cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
    } else {
      storeGaugeToCPUArray((float*)cpuGauge, (float4*)cudaGauge->even, (float4*)cudaGauge->odd,
                           cudaGauge->reconstruct, cudaGauge->bytes, cudaGauge->volume);
    }
  }
}
1.7.3