|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <complex> 00002 #include <stdlib.h> 00003 #include <stdio.h> 00004 00005 #include <short.h> 00006 00007 #include <wilson_dslash_reference.h> 00008 #include <test_util.h> 00009 00010 #define XUP 0 00011 #define YUP 1 00012 #define ZUP 2 00013 #define TUP 3 00014 00015 using namespace std; 00016 00017 template <typename Float> 00018 static void printVector(Float *v) { 00019 printf("{(%f %f) (%f %f) (%f %f)}\n", v[0], v[1], v[2], v[3], v[4], v[5]); 00020 } 00021 00022 // X indexes the lattice site 00023 void printSpinorElement(void *spinor, int X, QudaPrecision precision) { 00024 if (precision == QUDA_DOUBLE_PRECISION) 00025 for (int s=0; s<4; s++) printVector((double*)spinor+X*24+s*6); 00026 else 00027 for (int s=0; s<4; s++) printVector((float*)spinor+X*24+s*6); 00028 } 00029 00030 // X indexes the full lattice 00031 void printGaugeElement(void *gauge, int X, QudaPrecision precision) { 00032 if (getOddBit(X) == 0) { 00033 if (precision == QUDA_DOUBLE_PRECISION) 00034 for (int m=0; m<3; m++) printVector((double*)gauge +(X/2)*gaugeSiteSize + m*3*2); 00035 else 00036 for (int m=0; m<3; m++) printVector((float*)gauge +(X/2)*gaugeSiteSize + m*3*2); 00037 00038 } else { 00039 if (precision == QUDA_DOUBLE_PRECISION) 00040 for (int m = 0; m < 3; m++) printVector((double*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2); 00041 else 00042 for (int m = 0; m < 3; m++) printVector((float*)gauge + (X/2+Vh)*gaugeSiteSize + m*3*2); 00043 } 00044 } 00045 00046 // returns 0 or 1 if the full lattice index X is even or odd 00047 int getOddBit(int Y) { 00048 int x4 = Y/(Z[2]*Z[1]*Z[0]); 00049 int x3 = (Y/(Z[1]*Z[0])) % Z[2]; 00050 int x2 = (Y/Z[0]) % Z[1]; 00051 int x1 = Y % Z[0]; 00052 return (x4+x3+x2+x1) % 2; 00053 } 00054 00055 // a+=b 00056 template <typename Float> 00057 inline void complexAddTo(Float *a, Float *b) { 00058 a[0] += b[0]; 00059 a[1] += b[1]; 00060 } 00061 00062 // a = b*c 00063 template <typename Float> 00064 inline void complexProduct(Float *a, Float *b, Float *c) { 00065 a[0] = b[0]*c[0] - b[1]*c[1]; 00066 a[1] = b[0]*c[1] + b[1]*c[0]; 00067 } 00068 00069 // a = conj(b)*conj(c) 00070 template <typename Float> 00071 inline void complexConjugateProduct(Float *a, Float *b, Float *c) { 00072 a[0] = b[0]*c[0] - b[1]*c[1]; 00073 a[1] = -b[0]*c[1] - b[1]*c[0]; 00074 } 00075 00076 // a = conj(b)*c 00077 template <typename Float> 00078 inline void complexDotProduct(Float *a, Float *b, Float *c) { 00079 a[0] = b[0]*c[0] + b[1]*c[1]; 00080 a[1] = b[0]*c[1] - b[1]*c[0]; 00081 } 00082 00083 // a += b*c 00084 template <typename Float> 00085 inline void accumulateComplexProduct(Float *a, Float *b, Float *c, Float sign) { 00086 a[0] += sign*(b[0]*c[0] - b[1]*c[1]); 00087 a[1] += sign*(b[0]*c[1] + b[1]*c[0]); 00088 } 00089 00090 // a += conj(b)*c) 00091 template <typename Float> 00092 inline void accumulateComplexDotProduct(Float *a, Float *b, Float *c) { 00093 a[0] += b[0]*c[0] + b[1]*c[1]; 00094 a[1] += b[0]*c[1] - b[1]*c[0]; 00095 } 00096 00097 template <typename Float> 00098 inline void accumulateConjugateProduct(Float *a, Float *b, Float *c, int sign) { 00099 a[0] += sign * (b[0]*c[0] - b[1]*c[1]); 00100 a[1] -= sign * (b[0]*c[1] + b[1]*c[0]); 00101 } 00102 00103 template <typename Float> 00104 inline void su3Construct12(Float *mat) { 00105 Float *w = mat+12; 00106 w[0] = 0.0; 00107 w[1] = 0.0; 00108 w[2] = 0.0; 00109 w[3] = 0.0; 00110 w[4] = 0.0; 00111 w[5] = 0.0; 00112 } 00113 00114 // Stabilized Bunk and Sommer 00115 template <typename Float> 00116 inline void su3Construct8(Float *mat) { 00117 mat[0] = atan2(mat[1], mat[0]); 00118 mat[1] = atan2(mat[13], mat[12]); 00119 for (int i=8; i<18; i++) mat[i] = 0.0; 00120 } 00121 00122 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision) { 00123 if (reconstruct == QUDA_RECONSTRUCT_12) { 00124 if (precision == QUDA_DOUBLE_PRECISION) su3Construct12((double*)mat); 00125 else su3Construct12((float*)mat); 00126 } else { 00127 if (precision == QUDA_DOUBLE_PRECISION) su3Construct8((double*)mat); 00128 else su3Construct8((float*)mat); 00129 } 00130 } 00131 00132 // given first two rows (u,v) of SU(3) matrix mat, reconstruct the third row 00133 // as the cross product of the conjugate vectors: w = u* x v* 00134 // 00135 // 48 flops 00136 template <typename Float> 00137 static void su3Reconstruct12(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) { 00138 Float *u = &mat[0*(3*2)]; 00139 Float *v = &mat[1*(3*2)]; 00140 Float *w = &mat[2*(3*2)]; 00141 w[0] = 0.0; w[1] = 0.0; w[2] = 0.0; w[3] = 0.0; w[4] = 0.0; w[5] = 0.0; 00142 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1); 00143 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1); 00144 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1); 00145 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1); 00146 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1); 00147 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1); 00148 Float u0 = (dir < 3 ? param->anisotropy : 00149 (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1)); 00150 w[0]*=u0; w[1]*=u0; w[2]*=u0; w[3]*=u0; w[4]*=u0; w[5]*=u0; 00151 } 00152 00153 template <typename Float> 00154 static void su3Reconstruct8(Float *mat, int dir, int ga_idx, QudaGaugeParam *param) { 00155 // First reconstruct first row 00156 Float row_sum = 0.0; 00157 row_sum += mat[2]*mat[2]; 00158 row_sum += mat[3]*mat[3]; 00159 row_sum += mat[4]*mat[4]; 00160 row_sum += mat[5]*mat[5]; 00161 Float u0 = (dir < 3 ? param->anisotropy : 00162 (ga_idx >= (Z[3]-1)*Z[0]*Z[1]*Z[2]/2 ? param->t_boundary : 1)); 00163 Float U00_mag = sqrt(1.f/(u0*u0) - row_sum); 00164 00165 mat[14] = mat[0]; 00166 mat[15] = mat[1]; 00167 00168 mat[0] = U00_mag * cos(mat[14]); 00169 mat[1] = U00_mag * sin(mat[14]); 00170 00171 Float column_sum = 0.0; 00172 for (int i=0; i<2; i++) column_sum += mat[i]*mat[i]; 00173 for (int i=6; i<8; i++) column_sum += mat[i]*mat[i]; 00174 Float U20_mag = sqrt(1.f/(u0*u0) - column_sum); 00175 00176 mat[12] = U20_mag * cos(mat[15]); 00177 mat[13] = U20_mag * sin(mat[15]); 00178 00179 // First column now restored 00180 00181 // finally reconstruct last elements from SU(2) rotation 00182 Float r_inv2 = 1.0/(u0*row_sum); 00183 00184 // U11 00185 Float A[2]; 00186 complexDotProduct(A, mat+0, mat+6); 00187 complexConjugateProduct(mat+8, mat+12, mat+4); 00188 accumulateComplexProduct(mat+8, A, mat+2, u0); 00189 mat[8] *= -r_inv2; 00190 mat[9] *= -r_inv2; 00191 00192 // U12 00193 complexConjugateProduct(mat+10, mat+12, mat+2); 00194 accumulateComplexProduct(mat+10, A, mat+4, -u0); 00195 mat[10] *= r_inv2; 00196 mat[11] *= r_inv2; 00197 00198 // U21 00199 complexDotProduct(A, mat+0, mat+12); 00200 complexConjugateProduct(mat+14, mat+6, mat+4); 00201 accumulateComplexProduct(mat+14, A, mat+2, -u0); 00202 mat[14] *= r_inv2; 00203 mat[15] *= r_inv2; 00204 00205 // U12 00206 complexConjugateProduct(mat+16, mat+6, mat+2); 00207 accumulateComplexProduct(mat+16, A, mat+4, u0); 00208 mat[16] *= -r_inv2; 00209 mat[17] *= -r_inv2; 00210 } 00211 00212 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param) { 00213 if (reconstruct == QUDA_RECONSTRUCT_12) { 00214 if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct12((double*)mat, dir, ga_idx, param); 00215 else su3Reconstruct12((float*)mat, dir, ga_idx, param); 00216 } else { 00217 if (precision == QUDA_DOUBLE_PRECISION) su3Reconstruct8((double*)mat, dir, ga_idx, param); 00218 else su3Reconstruct8((float*)mat, dir, ga_idx, param); 00219 } 00220 } 00221 00222 /* 00223 void su3_construct_8_half(float *mat, short *mat_half) { 00224 su3Construct8(mat); 00225 00226 mat_half[0] = floatToShort(mat[0] / M_PI); 00227 mat_half[1] = floatToShort(mat[1] / M_PI); 00228 for (int i=2; i<18; i++) { 00229 mat_half[i] = floatToShort(mat[i]); 00230 } 00231 } 00232 00233 void su3_reconstruct_8_half(float *mat, short *mat_half, int dir, int ga_idx, QudaGaugeParam *param) { 00234 00235 for (int i=0; i<18; i++) { 00236 mat[i] = shortToFloat(mat_half[i]); 00237 } 00238 mat[0] *= M_PI; 00239 mat[1] *= M_PI; 00240 00241 su3Reconstruct8(mat, dir, ga_idx, param); 00242 }*/ 00243 00244 template <typename Float> 00245 static int compareFloats(Float *a, Float *b, int len, double epsilon) { 00246 for (int i = 0; i < len; i++) { 00247 double diff = fabs(a[i] - b[i]); 00248 if (diff > epsilon) { 00249 printf("error: i=%d, a[%d]=%f, b[%d]=%f\n", i, i, a[i], i, b[i]); 00250 return 0; 00251 } 00252 } 00253 return 1; 00254 } 00255 00256 int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision) { 00257 if (precision == QUDA_DOUBLE_PRECISION) return compareFloats((double*)a, (double*)b, len, epsilon); 00258 else return compareFloats((float*)a, (float*)b, len, epsilon); 00259 } 00260 00261 00262 00263 // given a "half index" i into either an even or odd half lattice (corresponding 00264 // to oddBit = {0, 1}), returns the corresponding full lattice index. 00265 int fullLatticeIndex(int i, int oddBit) { 00266 int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2); 00267 return 2*i + (boundaryCrossings + oddBit) % 2; 00268 } 00269 00270 00271 // i represents a "half index" into an even or odd "half lattice". 00272 // when oddBit={0,1} the half lattice is {even,odd}. 00273 // 00274 // the displacements, such as dx, refer to the full lattice coordinates. 00275 // 00276 // neighborIndex() takes a "half index", displaces it, and returns the 00277 // new "half index", which can be an index into either the even or odd lattices. 00278 // displacements of magnitude one always interchange odd and even lattices. 00279 // 00280 00281 int neighborIndex(int i, int oddBit, int dx4, int dx3, int dx2, int dx1) { 00282 int Y = fullLatticeIndex(i, oddBit); 00283 int x4 = Y/(Z[2]*Z[1]*Z[0]); 00284 int x3 = (Y/(Z[1]*Z[0])) % Z[2]; 00285 int x2 = (Y/Z[0]) % Z[1]; 00286 int x1 = Y % Z[0]; 00287 00288 // assert (oddBit == (x+y+z+t)%2); 00289 00290 x4 = (x4+dx4+Z[3]) % Z[3]; 00291 x3 = (x3+dx3+Z[2]) % Z[2]; 00292 x2 = (x2+dx2+Z[1]) % Z[1]; 00293 x1 = (x1+dx1+Z[0]) % Z[0]; 00294 00295 return (x4*(Z[2]*Z[1]*Z[0]) + x3*(Z[1]*Z[0]) + x2*(Z[0]) + x1) / 2; 00296 } 00297 00298 /* 00299 * This is a computation of neighbor using the full index and the displacement in each direction 00300 * 00301 */ 00302 00303 int 00304 neighborIndexFullLattice(int i, int dx4, int dx3, int dx2, int dx1) 00305 { 00306 int oddBit = 0; 00307 int half_idx = i; 00308 if (i >= Vh){ 00309 oddBit =1; 00310 half_idx = i - Vh; 00311 } 00312 00313 int nbr_half_idx = neighborIndex(half_idx, oddBit, dx4,dx3,dx2,dx1); 00314 int oddBitChanged = (dx4+dx3+dx2+dx1)%2; 00315 if (oddBitChanged){ 00316 oddBit = 1 - oddBit; 00317 } 00318 int ret = nbr_half_idx; 00319 if (oddBit){ 00320 ret = Vh + nbr_half_idx; 00321 } 00322 00323 return ret; 00324 } 00325 00326 // 4d checkerboard. 00327 // given a "half index" i into either an even or odd half lattice (corresponding 00328 // to oddBit = {0, 1}), returns the corresponding full lattice index. 00329 // Cf. GPGPU code in dslash_core_ante.h. 00330 // There, i is the thread index. 00331 int fullLatticeIndex_4d(int i, int oddBit) { 00332 if (i >= Vh || i < 0) {printf("i out of range in fullLatticeIndex_4d"); exit(-1);} 00333 int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2); 00334 return 2*i + (boundaryCrossings + oddBit) % 2; 00335 } 00336 00337 // 5d checkerboard. 00338 // given a "half index" i into either an even or odd half lattice (corresponding 00339 // to oddBit = {0, 1}), returns the corresponding full lattice index. 00340 // Cf. GPGPU code in dslash_core_ante.h. 00341 // There, i is the thread index sid. 00342 // This function is used by neighborIndex_5d in dslash_reference.cpp. 00343 //ok 00344 int fullLatticeIndex_5d(int i, int oddBit) { 00345 int boundaryCrossings = i/(Z[0]/2) + i/(Z[1]*Z[0]/2) + i/(Z[2]*Z[1]*Z[0]/2) + i/(Z[3]*Z[2]*Z[1]*Z[0]/2); 00346 return 2*i + (boundaryCrossings + oddBit) % 2; 00347 } 00348 00349 template <typename Float> 00350 static void applyGaugeFieldScaling(Float **gauge, int Vh, QudaGaugeParam *param) { 00351 // Apply spatial scaling factor (u0) to spatial links 00352 for (int d = 0; d < 3; d++) { 00353 for (int i = 0; i < gaugeSiteSize*Vh*2; i++) { 00354 gauge[d][i] /= param->anisotropy; 00355 } 00356 } 00357 00358 // Apply boundary conditions to temporal links 00359 if (param->t_boundary == QUDA_ANTI_PERIODIC_T) { 00360 for (int j = (Z[0]/2)*Z[1]*Z[2]*(Z[3]-1); j < Vh; j++) { 00361 for (int i = 0; i < gaugeSiteSize; i++) { 00362 gauge[3][j*gaugeSiteSize+i] *= -1.0; 00363 gauge[3][(Vh+j)*gaugeSiteSize+i] *= -1.0; 00364 } 00365 } 00366 } 00367 00368 if (param->gauge_fix) { 00369 // set all gauge links (except for the first Z[0]*Z[1]*Z[2]/2) to the identity, 00370 // to simulate fixing to the temporal gauge. 00371 int dir = 3; // time direction only 00372 Float *even = gauge[dir]; 00373 Float *odd = gauge[dir]+Vh*gaugeSiteSize; 00374 for (int i = Z[0]*Z[1]*Z[2]/2; i < Vh; i++) { 00375 for (int m = 0; m < 3; m++) { 00376 for (int n = 0; n < 3; n++) { 00377 even[i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0; 00378 even[i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0; 00379 odd [i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0; 00380 odd [i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0; 00381 } 00382 } 00383 } 00384 } 00385 } 00386 00387 template <typename Float> 00388 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param) 00389 { 00390 00391 int X1h=param->X[0]/2; 00392 int X1 =param->X[0]; 00393 int X2 =param->X[1]; 00394 int X3 =param->X[2]; 00395 int X4 =param->X[3]; 00396 00397 // rescale long links by the appropriate coefficient 00398 for(int d=0; d<4; d++){ 00399 for(int i=0; i < V*gaugeSiteSize; i++){ 00400 gauge[d][i] /= (-24*param->tadpole_coeff*param->tadpole_coeff); 00401 } 00402 } 00403 00404 // apply the staggered phases 00405 for (int d = 0; d < 3; d++) { 00406 00407 //even 00408 for (int i = 0; i < Vh; i++) { 00409 00410 int index = fullLatticeIndex(i, 0); 00411 int i4 = index /(X3*X2*X1); 00412 int i3 = (index - i4*(X3*X2*X1))/(X2*X1); 00413 int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1; 00414 int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1; 00415 int sign=1; 00416 00417 if (d == 0) { 00418 if (i4 % 2 == 1){ 00419 sign= -1; 00420 } 00421 } 00422 00423 if (d == 1){ 00424 if ((i4+i1) % 2 == 1){ 00425 sign= -1; 00426 } 00427 } 00428 if (d == 2){ 00429 if ( (i4+i1+i2) % 2 == 1){ 00430 sign= -1; 00431 } 00432 } 00433 00434 for (int j=0;j < 6; j++){ 00435 gauge[d][i*gaugeSiteSize + 12+ j] *= sign; 00436 } 00437 } 00438 //odd 00439 for (int i = 0; i < Vh; i++) { 00440 int index = fullLatticeIndex(i, 1); 00441 int i4 = index /(X3*X2*X1); 00442 int i3 = (index - i4*(X3*X2*X1))/(X2*X1); 00443 int i2 = (index - i4*(X3*X2*X1) - i3*(X2*X1))/X1; 00444 int i1 = index - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1; 00445 int sign=1; 00446 00447 if (d == 0) { 00448 if (i4 % 2 == 1){ 00449 sign= -1; 00450 } 00451 } 00452 00453 if (d == 1){ 00454 if ((i4+i1) % 2 == 1){ 00455 sign= -1; 00456 } 00457 } 00458 if (d == 2){ 00459 if ( (i4+i1+i2) % 2 == 1){ 00460 sign = -1; 00461 } 00462 } 00463 00464 for (int j=0;j < 6; j++){ 00465 gauge[d][(Vh+i)*gaugeSiteSize + 12 + j] *= sign; 00466 } 00467 } 00468 00469 } 00470 00471 // Apply boundary conditions to temporal links 00472 if (param->t_boundary == QUDA_ANTI_PERIODIC_T) { 00473 for (int j = 0; j < Vh; j++) { 00474 int sign =1; 00475 if (j >= (X4-3)*X1h*X2*X3 ){ 00476 sign= -1; 00477 } 00478 00479 for (int i = 0; i < 6; i++) { 00480 gauge[3][j*gaugeSiteSize+ 12+ i ] *= sign; 00481 gauge[3][(Vh+j)*gaugeSiteSize+12 +i] *= sign; 00482 } 00483 } 00484 } 00485 } 00486 00487 00488 00489 template <typename Float> 00490 static void constructUnitGaugeField(Float **res, QudaGaugeParam *param) { 00491 Float *resOdd[4], *resEven[4]; 00492 for (int dir = 0; dir < 4; dir++) { 00493 resEven[dir] = res[dir]; 00494 resOdd[dir] = res[dir]+Vh*gaugeSiteSize; 00495 } 00496 00497 for (int dir = 0; dir < 4; dir++) { 00498 for (int i = 0; i < Vh; i++) { 00499 for (int m = 0; m < 3; m++) { 00500 for (int n = 0; n < 3; n++) { 00501 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0; 00502 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0; 00503 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = (m==n) ? 1 : 0; 00504 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = 0.0; 00505 } 00506 } 00507 } 00508 } 00509 00510 applyGaugeFieldScaling(res, Vh, param); 00511 } 00512 00513 // normalize the vector a 00514 template <typename Float> 00515 static void normalize(complex<Float> *a, int len) { 00516 double sum = 0.0; 00517 for (int i=0; i<len; i++) sum += norm(a[i]); 00518 for (int i=0; i<len; i++) a[i] /= sqrt(sum); 00519 } 00520 00521 // orthogonalize vector b to vector a 00522 template <typename Float> 00523 static void orthogonalize(complex<Float> *a, complex<Float> *b, int len) { 00524 complex<double> dot = 0.0; 00525 for (int i=0; i<len; i++) dot += conj(a[i])*b[i]; 00526 for (int i=0; i<len; i++) b[i] -= (complex<Float>)dot*a[i]; 00527 } 00528 00529 template <typename Float> 00530 static void constructGaugeField(Float **res, QudaGaugeParam *param) { 00531 Float *resOdd[4], *resEven[4]; 00532 for (int dir = 0; dir < 4; dir++) { 00533 resEven[dir] = res[dir]; 00534 resOdd[dir] = res[dir]+Vh*gaugeSiteSize; 00535 } 00536 00537 for (int dir = 0; dir < 4; dir++) { 00538 for (int i = 0; i < Vh; i++) { 00539 for (int m = 1; m < 3; m++) { // last 2 rows 00540 for (int n = 0; n < 3; n++) { // 3 columns 00541 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX; 00542 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX; 00543 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX; 00544 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX; 00545 } 00546 } 00547 normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3); 00548 orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3); 00549 normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3); 00550 00551 normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3); 00552 orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3); 00553 normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3); 00554 00555 { 00556 Float *w = resEven[dir]+(i*3+0)*3*2; 00557 Float *u = resEven[dir]+(i*3+1)*3*2; 00558 Float *v = resEven[dir]+(i*3+2)*3*2; 00559 00560 for (int n = 0; n < 6; n++) w[n] = 0.0; 00561 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1); 00562 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1); 00563 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1); 00564 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1); 00565 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1); 00566 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1); 00567 } 00568 00569 { 00570 Float *w = resOdd[dir]+(i*3+0)*3*2; 00571 Float *u = resOdd[dir]+(i*3+1)*3*2; 00572 Float *v = resOdd[dir]+(i*3+2)*3*2; 00573 00574 for (int n = 0; n < 6; n++) w[n] = 0.0; 00575 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1); 00576 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1); 00577 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1); 00578 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1); 00579 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1); 00580 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1); 00581 } 00582 00583 } 00584 } 00585 if (param->type == QUDA_WILSON_LINKS){ 00586 applyGaugeFieldScaling(res, Vh, param); 00587 }else if (param->type == QUDA_ASQTAD_LONG_LINKS){ 00588 applyGaugeFieldScaling_long(res, Vh, param); 00589 } 00590 00591 00592 } 00593 00594 template <typename Float> 00595 void constructUnitaryGaugeField(Float **res) 00596 { 00597 Float *resOdd[4], *resEven[4]; 00598 for (int dir = 0; dir < 4; dir++) { 00599 resEven[dir] = res[dir]; 00600 resOdd[dir] = res[dir]+Vh*gaugeSiteSize; 00601 } 00602 00603 for (int dir = 0; dir < 4; dir++) { 00604 for (int i = 0; i < Vh; i++) { 00605 for (int m = 1; m < 3; m++) { // last 2 rows 00606 for (int n = 0; n < 3; n++) { // 3 columns 00607 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX; 00608 resEven[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX; 00609 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 0] = rand() / (Float)RAND_MAX; 00610 resOdd[dir][i*(3*3*2) + m*(3*2) + n*(2) + 1] = rand() / (Float)RAND_MAX; 00611 } 00612 } 00613 normalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), 3); 00614 orthogonalize((complex<Float>*)(resEven[dir] + (i*3+1)*3*2), (complex<Float>*)(resEven[dir] + (i*3+2)*3*2), 3); 00615 normalize((complex<Float>*)(resEven[dir] + (i*3 + 2)*3*2), 3); 00616 00617 normalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), 3); 00618 orthogonalize((complex<Float>*)(resOdd[dir] + (i*3+1)*3*2), (complex<Float>*)(resOdd[dir] + (i*3+2)*3*2), 3); 00619 normalize((complex<Float>*)(resOdd[dir] + (i*3 + 2)*3*2), 3); 00620 00621 { 00622 Float *w = resEven[dir]+(i*3+0)*3*2; 00623 Float *u = resEven[dir]+(i*3+1)*3*2; 00624 Float *v = resEven[dir]+(i*3+2)*3*2; 00625 00626 for (int n = 0; n < 6; n++) w[n] = 0.0; 00627 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1); 00628 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1); 00629 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1); 00630 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1); 00631 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1); 00632 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1); 00633 } 00634 00635 { 00636 Float *w = resOdd[dir]+(i*3+0)*3*2; 00637 Float *u = resOdd[dir]+(i*3+1)*3*2; 00638 Float *v = resOdd[dir]+(i*3+2)*3*2; 00639 00640 for (int n = 0; n < 6; n++) w[n] = 0.0; 00641 accumulateConjugateProduct(w+0*(2), u+1*(2), v+2*(2), +1); 00642 accumulateConjugateProduct(w+0*(2), u+2*(2), v+1*(2), -1); 00643 accumulateConjugateProduct(w+1*(2), u+2*(2), v+0*(2), +1); 00644 accumulateConjugateProduct(w+1*(2), u+0*(2), v+2*(2), -1); 00645 accumulateConjugateProduct(w+2*(2), u+0*(2), v+1*(2), +1); 00646 accumulateConjugateProduct(w+2*(2), u+1*(2), v+0*(2), -1); 00647 } 00648 00649 } 00650 } 00651 } 00652 00653 00654 void construct_gauge_field(void **gauge, int type, QudaPrecision precision, QudaGaugeParam *param) { 00655 if (type == 0) { 00656 if (precision == QUDA_DOUBLE_PRECISION) constructUnitGaugeField((double**)gauge, param); 00657 else constructUnitGaugeField((float**)gauge, param); 00658 } else { 00659 if (precision == QUDA_DOUBLE_PRECISION) constructGaugeField((double**)gauge, param); 00660 else constructGaugeField((float**)gauge, param); 00661 } 00662 } 00663 00664 void 00665 construct_fat_long_gauge_field(void **fatlink, void** longlink, 00666 int type, QudaPrecision precision, QudaGaugeParam* param) 00667 { 00668 if (type == 0) { 00669 if (precision == QUDA_DOUBLE_PRECISION) { 00670 constructUnitGaugeField((double**)fatlink, param); 00671 constructUnitGaugeField((double**)longlink, param); 00672 }else { 00673 constructUnitGaugeField((float**)fatlink, param); 00674 constructUnitGaugeField((float**)longlink, param); 00675 } 00676 } else { 00677 if (precision == QUDA_DOUBLE_PRECISION) { 00678 param->type = QUDA_ASQTAD_FAT_LINKS; 00679 constructGaugeField((double**)fatlink, param); 00680 param->type = QUDA_ASQTAD_LONG_LINKS; 00681 constructGaugeField((double**)longlink, param); 00682 }else { 00683 param->type = QUDA_ASQTAD_FAT_LINKS; 00684 constructGaugeField((float**)fatlink, param); 00685 param->type = QUDA_ASQTAD_LONG_LINKS; 00686 constructGaugeField((float**)longlink, param); 00687 } 00688 } 00689 } 00690 00691 00692 template <typename Float> 00693 static void constructCloverField(Float *res, double norm, double diag) { 00694 00695 Float c = 2.0 * norm / RAND_MAX; 00696 00697 for(int i = 0; i < V; i++) { 00698 for (int j = 0; j < 72; j++) { 00699 res[i*72 + j] = c*rand() - norm; 00700 } 00701 for (int j = 0; j< 6; j++) { 00702 res[i*72 + j] += diag; 00703 res[i*72 + j+36] += diag; 00704 } 00705 } 00706 } 00707 00708 void construct_clover_field(void *clover, double norm, double diag, QudaPrecision precision) { 00709 00710 if (precision == QUDA_DOUBLE_PRECISION) constructCloverField((double *)clover, norm, diag); 00711 else constructCloverField((float *)clover, norm, diag); 00712 } 00713 00714 /*void strong_check(void *spinorRef, void *spinorGPU, int len, QudaPrecision prec) { 00715 printf("Reference:\n"); 00716 printSpinorElement(spinorRef, 0, prec); printf("...\n"); 00717 printSpinorElement(spinorRef, len-1, prec); printf("\n"); 00718 00719 printf("\nCUDA:\n"); 00720 printSpinorElement(spinorGPU, 0, prec); printf("...\n"); 00721 printSpinorElement(spinorGPU, len-1, prec); printf("\n"); 00722 00723 compare_spinor(spinorRef, spinorGPU, len, prec); 00724 }*/ 00725 00726 template <typename Float> 00727 static void checkGauge(Float **oldG, Float **newG, double epsilon) { 00728 00729 int fail_check = 17; 00730 int *fail = new int[fail_check]; 00731 int iter[18]; 00732 for (int i=0; i<fail_check; i++) fail[i] = 0; 00733 for (int i=0; i<18; i++) iter[i] = 0; 00734 00735 for (int d=0; d<4; d++) { 00736 for (int eo=0; eo<2; eo++) { 00737 for (int i=0; i<Vh; i++) { 00738 int ga_idx = (eo*Vh+i); 00739 for (int j=0; j<18; j++) { 00740 double diff = fabs(newG[d][ga_idx*18+j] - oldG[d][ga_idx*18+j]); 00741 00742 for (int f=0; f<fail_check; f++) if (diff > pow(10.0,-(f+1))) fail[f]++; 00743 if (diff > epsilon) iter[j]++; 00744 } 00745 } 00746 } 00747 } 00748 00749 for (int i=0; i<18; i++) printf("%d fails = %d\n", i, iter[i]); 00750 00751 for (int f=0; f<fail_check; f++) { 00752 printf("%e Failures = %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], V*4*18, fail[f] / (double)(4*V*18)); 00753 } 00754 00755 delete []fail; 00756 00757 } 00758 00759 void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precision) { 00760 if (precision == QUDA_DOUBLE_PRECISION) 00761 checkGauge((double**)oldG, (double**)newG, epsilon); 00762 else 00763 checkGauge((float**)oldG, (float**)newG, epsilon); 00764 } 00765 00766 00767 void 00768 createSiteLinkCPU(void* link, QudaPrecision precision, int phase) 00769 { 00770 void* temp[4]; 00771 00772 size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 00773 for(int i=0;i < 4;i++){ 00774 temp[i] = malloc(V*gaugeSiteSize*gSize); 00775 if (temp[i] == NULL){ 00776 fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__); 00777 exit(1); 00778 } 00779 } 00780 00781 if (precision == QUDA_DOUBLE_PRECISION) { 00782 constructUnitaryGaugeField((double**)temp); 00783 }else { 00784 constructUnitaryGaugeField((float**)temp); 00785 } 00786 00787 for(int i=0;i < V;i++){ 00788 for(int dir=0;dir < 4;dir++){ 00789 if (precision == QUDA_DOUBLE_PRECISION){ 00790 double** src= (double**)temp; 00791 double* dst = (double*)link; 00792 for(int k=0; k < gaugeSiteSize; k++){ 00793 dst[ (4*i+dir)*gaugeSiteSize + k ] = src[dir][i*gaugeSiteSize + k]; 00794 00795 } 00796 }else{ 00797 float** src= (float**)temp; 00798 float* dst = (float*)link; 00799 for(int k=0; k < gaugeSiteSize; k++){ 00800 dst[ (4*i+dir)*gaugeSiteSize + k ] = src[dir][i*gaugeSiteSize + k]; 00801 00802 } 00803 00804 } 00805 } 00806 } 00807 00808 if(phase){ 00809 00810 for(int i=0;i < V;i++){ 00811 for(int dir =XUP; dir <= TUP; dir++){ 00812 int idx = i; 00813 int oddBit =0; 00814 if (i >= Vh) { 00815 idx = i - Vh; 00816 oddBit = 1; 00817 } 00818 00819 int X1 = Z[0]; 00820 int X2 = Z[1]; 00821 int X3 = Z[2]; 00822 int X4 = Z[3]; 00823 00824 int full_idx = fullLatticeIndex(idx, oddBit); 00825 int i4 = full_idx /(X3*X2*X1); 00826 int i3 = (full_idx - i4*(X3*X2*X1))/(X2*X1); 00827 int i2 = (full_idx - i4*(X3*X2*X1) - i3*(X2*X1))/X1; 00828 int i1 = full_idx - i4*(X3*X2*X1) - i3*(X2*X1) - i2*X1; 00829 00830 double coeff= 1.0; 00831 switch(dir){ 00832 case XUP: 00833 if ( (i4 & 1) == 1){ 00834 coeff *= -1; 00835 } 00836 break; 00837 00838 case YUP: 00839 if ( ((i4+i1) & 1) == 1){ 00840 coeff *= -1; 00841 } 00842 break; 00843 00844 case ZUP: 00845 if ( ((i4+i1+i2) & 1) == 1){ 00846 coeff *= -1; 00847 } 00848 break; 00849 00850 case TUP: 00851 if (i4 == (X4-1) ){ 00852 coeff *= -1; 00853 } 00854 break; 00855 00856 default: 00857 printf("ERROR: wrong dir(%d)\n", dir); 00858 exit(1); 00859 } 00860 00861 00862 if (precision == QUDA_DOUBLE_PRECISION){ 00863 double* mylink = (double*)link; 00864 mylink = mylink + (4*i + dir)*gaugeSiteSize; 00865 00866 mylink[12] *= coeff; 00867 mylink[13] *= coeff; 00868 mylink[14] *= coeff; 00869 mylink[15] *= coeff; 00870 mylink[16] *= coeff; 00871 mylink[17] *= coeff; 00872 00873 }else{ 00874 float* mylink = (float*)link; 00875 mylink = mylink + (4*i + dir)*gaugeSiteSize; 00876 00877 mylink[12] *= coeff; 00878 mylink[13] *= coeff; 00879 mylink[14] *= coeff; 00880 mylink[15] *= coeff; 00881 mylink[16] *= coeff; 00882 mylink[17] *= coeff; 00883 00884 } 00885 } 00886 } 00887 } 00888 00889 00890 #if 1 00891 for(int i=0;i< 4*V*gaugeSiteSize;i++){ 00892 if (precision ==QUDA_SINGLE_PRECISION){ 00893 float* f = (float*)link; 00894 if (f[i] != f[i] || (fabsf(f[i]) > 1.e+3) ){ 00895 fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__); 00896 exit(1); 00897 } 00898 }else{ 00899 double* f = (double*)link; 00900 if (f[i] != f[i] || (fabs(f[i]) > 1.e+3)){ 00901 fprintf(stderr, "ERROR: %dth: bad number(%f) in function %s \n",i, f[i], __FUNCTION__); 00902 exit(1); 00903 } 00904 00905 } 00906 00907 } 00908 #endif 00909 00910 for(int i=0;i < 4;i++){ 00911 free(temp[i]); 00912 } 00913 return; 00914 } 00915 00916 00917 00918 00919 00920 template <typename Float> 00921 void compareLink(Float *linkA, Float *linkB, int len) { 00922 int fail_check = 16; 00923 int fail[fail_check]; 00924 for (int f=0; f<fail_check; f++) fail[f] = 0; 00925 00926 int iter[18]; 00927 for (int i=0; i<18; i++) iter[i] = 0; 00928 00929 for (int i=0; i<len; i++) { 00930 for (int j=0; j<18; j++) { 00931 int is = i*18+j; 00932 double diff = fabs(linkA[is]-linkB[is]); 00933 for (int f=0; f<fail_check; f++) 00934 if (diff > pow(10.0,-(f+1))) fail[f]++; 00935 //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff); 00936 if (diff > 1e-3) iter[j]++; 00937 } 00938 } 00939 00940 for (int i=0; i<18; i++) printf("%d fails = %d\n", i, iter[i]); 00941 00942 for (int f=0; f<fail_check; f++) { 00943 printf("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*gaugeSiteSize, fail[f] / (double)(len*6)); 00944 } 00945 00946 } 00947 00948 static void 00949 compare_link(void *linkA, void *linkB, int len, QudaPrecision precision) 00950 { 00951 if (precision == QUDA_DOUBLE_PRECISION) compareLink((double*)linkA, (double*)linkB, len); 00952 else compareLink((float*)linkA, (float*)linkB, len); 00953 00954 return; 00955 } 00956 00957 00958 // X indexes the lattice site 00959 static void 00960 printLinkElement(void *link, int X, QudaPrecision precision) 00961 { 00962 if (precision == QUDA_DOUBLE_PRECISION){ 00963 for(int i=0; i < 3;i++){ 00964 printVector((double*)link+ X*gaugeSiteSize + i*6); 00965 } 00966 00967 } 00968 else{ 00969 for(int i=0;i < 3;i++){ 00970 printVector((float*)link+X*gaugeSiteSize + i*6); 00971 } 00972 } 00973 } 00974 00975 void strong_check_link(void * linkA, void *linkB, int len, QudaPrecision prec) 00976 { 00977 printf("LinkA:\n"); 00978 printLinkElement(linkA, 0, prec); 00979 printf("\n"); 00980 printLinkElement(linkA, 1, prec); 00981 printf("...\n"); 00982 printLinkElement(linkA, len-1, prec); 00983 printf("\n"); 00984 00985 printf("\nlinkB:\n"); 00986 printLinkElement(linkB, 0, prec); 00987 printf("\n"); 00988 printLinkElement(linkB, 1, prec); 00989 printf("...\n"); 00990 printLinkElement(linkB, len-1, prec); 00991 printf("\n"); 00992 00993 compare_link(linkA, linkB, len, prec); 00994 } 00995 00996 00997 00998 void 00999 createMomCPU(void* mom, QudaPrecision precision) 01000 { 01001 void* temp; 01002 01003 size_t gSize = (precision == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); 01004 temp = malloc(4*V*gaugeSiteSize*gSize); 01005 if (temp == NULL){ 01006 fprintf(stderr, "Error: malloc failed for temp in function %s\n", __FUNCTION__); 01007 exit(1); 01008 } 01009 01010 01011 01012 for(int i=0;i < V;i++){ 01013 if (precision == QUDA_DOUBLE_PRECISION){ 01014 for(int dir=0;dir < 4;dir++){ 01015 double* thismom = (double*)mom; 01016 for(int k=0; k < momSiteSize; k++){ 01017 thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX; 01018 } 01019 } 01020 }else{ 01021 for(int dir=0;dir < 4;dir++){ 01022 float* thismom=(float*)mom; 01023 for(int k=0; k < momSiteSize; k++){ 01024 thismom[ (4*i+dir)*momSiteSize + k ]= 1.0* rand() /RAND_MAX; 01025 } 01026 } 01027 } 01028 } 01029 01030 free(temp); 01031 return; 01032 } 01033 01034 void 01035 createHwCPU(void* hw, QudaPrecision precision) 01036 { 01037 for(int i=0;i < V;i++){ 01038 if (precision == QUDA_DOUBLE_PRECISION){ 01039 for(int dir=0;dir < 4;dir++){ 01040 double* thishw = (double*)hw; 01041 for(int k=0; k < hwSiteSize; k++){ 01042 thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX; 01043 } 01044 } 01045 }else{ 01046 for(int dir=0;dir < 4;dir++){ 01047 float* thishw=(float*)hw; 01048 for(int k=0; k < hwSiteSize; k++){ 01049 thishw[ (4*i+dir)*hwSiteSize + k ]= 1.0* rand() /RAND_MAX; 01050 } 01051 } 01052 } 01053 } 01054 01055 return; 01056 } 01057 01058 01059 template <typename Float> 01060 void compare_mom(Float *momA, Float *momB, int len) { 01061 int fail_check = 16; 01062 int fail[fail_check]; 01063 for (int f=0; f<fail_check; f++) fail[f] = 0; 01064 01065 int iter[momSiteSize]; 01066 for (int i=0; i<momSiteSize; i++) iter[i] = 0; 01067 01068 for (int i=0; i<len; i++) { 01069 for (int j=0; j<momSiteSize; j++) { 01070 int is = i*momSiteSize+j; 01071 double diff = fabs(momA[is]-momB[is]); 01072 for (int f=0; f<fail_check; f++) 01073 if (diff > pow(10.0,-(f+1))) fail[f]++; 01074 //if (diff > 1e-1) printf("%d %d %e\n", i, j, diff); 01075 if (diff > 1e-3) iter[j]++; 01076 } 01077 } 01078 01079 for (int i=0; i<momSiteSize; i++) printf("%d fails = %d\n", i, iter[i]); 01080 01081 for (int f=0; f<fail_check; f++) { 01082 printf("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)), fail[f], len*momSiteSize, fail[f] / (double)(len*6)); 01083 } 01084 01085 } 01086 01087 static void 01088 printMomElement(void *mom, int X, QudaPrecision precision) 01089 { 01090 if (precision == QUDA_DOUBLE_PRECISION){ 01091 double* thismom = ((double*)mom)+ X*momSiteSize; 01092 printVector(thismom); 01093 printf("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]); 01094 }else{ 01095 float* thismom = ((float*)mom)+ X*momSiteSize; 01096 printVector(thismom); 01097 printf("(%9f,%9f) (%9f,%9f)\n", thismom[6], thismom[7], thismom[8], thismom[9]); 01098 } 01099 } 01100 void strong_check_mom(void * momA, void *momB, int len, QudaPrecision prec) 01101 { 01102 printf("mom:\n"); 01103 printMomElement(momA, 0, prec); 01104 printf("\n"); 01105 printMomElement(momA, 1, prec); 01106 printf("\n"); 01107 printMomElement(momA, 2, prec); 01108 printf("\n"); 01109 printMomElement(momA, 3, prec); 01110 printf("...\n"); 01111 01112 printf("\nreference mom:\n"); 01113 printMomElement(momB, 0, prec); 01114 printf("\n"); 01115 printMomElement(momB, 1, prec); 01116 printf("\n"); 01117 printMomElement(momB, 2, prec); 01118 printf("\n"); 01119 printMomElement(momB, 3, prec); 01120 printf("\n"); 01121 01122 01123 if (prec == QUDA_DOUBLE_PRECISION){ 01124 compare_mom((double*)momA, (double*)momB, len); 01125 }else{ 01126 compare_mom((float*)momA, (float*)momB, len); 01127 } 01128 }
1.7.3