|
QUDA v0.3.2
A library for QCD on GPUs
|
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include <quda_internal.h>
#include <clover_quda.h>

// Allocate device storage for one parity (even/odd checkerboard) of the
// clover term.  X[] holds the dimensions of the even/odd sublattice.
// NOTE(review): allocation is skipped when ret->clover / ret->cloverNorm is
// non-NULL, so this relies on callers zero-initializing the struct before
// the first call -- confirm all callers do so.
void allocateParityClover(ParityClover *ret, int *X, int pad, QudaPrecision precision)
{
  ret->precision = precision;
  ret->volume = 1;
  for (int d=0; d<4; d++) {
    ret->X[d] = X[d];
    ret->volume *= X[d];
  }
  ret->pad = pad;
  ret->stride = ret->volume + ret->pad;

  ret->Nc = 3;
  ret->Ns = 4;
  // The clover term is block-diagonal Hermitian, so only half of the
  // Nc*Nc*Ns*Ns complex entries are stored: 72 reals per site.
  ret->real_length = ret->volume*ret->Nc*ret->Nc*ret->Ns*ret->Ns/2;
  ret->length = ret->stride*ret->Nc*ret->Nc*ret->Ns*ret->Ns/2;

  if (precision == QUDA_DOUBLE_PRECISION) ret->bytes = ret->length*sizeof(double);
  else if (precision == QUDA_SINGLE_PRECISION) ret->bytes = ret->length*sizeof(float);
  else ret->bytes = ret->length*sizeof(short);

  if (!ret->clover) {
    // check against cudaSuccess so that *any* allocation failure is caught,
    // not just cudaErrorMemoryAllocation
    if (cudaMalloc((void**)&(ret->clover), ret->bytes) != cudaSuccess) {
      errorQuda("Error allocating clover term");
    }
  }

  if (!ret->cloverNorm) {
    if (precision == QUDA_HALF_PRECISION) {
      // one float norm per chiral block per site: bytes/18 = 2*stride*sizeof(float)
      if (cudaMalloc((void**)&ret->cloverNorm, ret->bytes/18) != cudaSuccess) {
        errorQuda("Error allocating cloverNorm");
      }
    }
  }

}

// Allocate both parities of a full clover field.
void allocateCloverField(FullClover *ret, int *X, int pad, QudaPrecision precision)
{
  allocateParityClover(&(ret->even), X, pad, precision);
  allocateParityClover(&(ret->odd), X, pad, precision);
}

// Release the device memory held by one clover parity.
void freeParityClover(ParityClover *clover)
{
  cudaFree(clover->clover);
  clover->clover = NULL;

  // fix: the half-precision norm array was previously leaked
  if (clover->cloverNorm) {
    cudaFree(clover->cloverNorm);
    clover->cloverNorm = NULL;
  }
}

// Release both parities of a full clover field.
void freeCloverField(FullClover *clover)
{
  freeParityClover(&clover->even);
  freeParityClover(&clover->odd);
}

// Repack one site's 72-real clover matrix from contiguous CPU order into
// float4 device order with stride Vh, folding in the factor of 1/2 from the
// basis change.
template <typename Float>
static inline void packCloverMatrix(float4* a, Float *b, int Vh)
{
  const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change

  for (int i=0; i<18; i++) {
    a[i*Vh].x = half * b[4*i+0];
    a[i*Vh].y = half * b[4*i+1];
    a[i*Vh].z = half * b[4*i+2];
    a[i*Vh].w = half * b[4*i+3];
  }
}

// double2 variant of the above (36 double2 = 72 reals per site).
template <typename Float>
static inline void packCloverMatrix(double2* a, Float *b, int Vh)
{
  const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change

  for (int i=0; i<36; i++) {
    a[i*Vh].x = half * b[2*i+0];
    a[i*Vh].y = half * b[2*i+1];
  }
}

// Pack a single parity whose CPU source is already in parity (checkerboard)
// order; the device layout uses stride Vh+pad.
template <typename Float, typename FloatN>
static void packParityClover(FloatN *res, Float *clover, int Vh, int pad)
{
  for (int i = 0; i < Vh; i++) {
    packCloverMatrix(res+i, clover+72*i, Vh+pad);
  }
}

// Pack both parities from a lexicographically ordered CPU field.
// NOTE(review): this permanently doubles X[0] in the caller's array (the
// field struct's X) -- confirm nothing downstream depends on the sublattice
// value.
template <typename Float, typename FloatN>
static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X, int pad)
{
  int Vh = X[0]*X[1]*X[2]*X[3];
  X[0] *= 2; // X now contains dimensions of the full lattice

  for (int i=0; i<Vh; i++) {

    // parity of the site's coordinate sum, computed via boundary crossings
    int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]);

    { // even sites
      int k = 2*i + boundaryCrossings%2;
      packCloverMatrix(even+i, clover+72*k, Vh+pad);
    }

    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
      packCloverMatrix(odd+i, clover+72*k, Vh+pad);
    }
  }
}

// Half-precision packing: each chiral block (36 reals) is scaled by the
// block's max magnitude into short range, and the max (times 1/2) is stored
// in the norm array.
// NOTE(review): a block that is identically zero gives max == 0 and an
// infinite scale factor c -- confirm upstream guarantees non-zero blocks.
template<typename Float>
static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh)
{
  const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change
  Float max, a, c;

  // treat the two chiral blocks separately
  for (int chi=0; chi<2; chi++) {
    max = fabs(clover[0]);
    for (int i=1; i<36; i++) {
      if ((a = fabs(clover[i])) > max) max = a;
    }
    c = MAX_SHORT/max;
    for (int i=0; i<9; i++) {
      res[i*Vh].x = (short) (c * clover[4*i+0]);
      res[i*Vh].y = (short) (c * clover[4*i+1]);
      res[i*Vh].z = (short) (c * clover[4*i+2]);
      res[i*Vh].w = (short) (c * clover[4*i+3]);
    }
    norm[chi*Vh] = half*max;
    res += 9*Vh;
    clover += 36;
  }
}

// Half-precision analogue of packParityClover.
template <typename Float>
static void packParityCloverHalf(short4 *res, float *norm, Float *clover, int Vh, int pad)
{
  for (int i = 0; i < Vh; i++) {
    packCloverMatrixHalf(res+i, norm+i, clover+72*i, Vh+pad);
  }
}

// Half-precision analogue of packFullClover (same X[0] mutation caveat).
template <typename Float>
static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm,
			       Float *clover, int *X, int pad)
{
  int Vh = X[0]*X[1]*X[2]*X[3];
  X[0] *= 2; // X now contains dimensions of the full lattice

  for (int i=0; i<Vh; i++) {

    int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]);

    { // even sites
      int k = 2*i + boundaryCrossings%2;
      packCloverMatrixHalf(even+i, evenNorm+i, clover+72*k, Vh+pad);
    }

    { // odd sites
      int k = 2*i + (boundaryCrossings+1)%2;
      packCloverMatrixHalf(odd+i, oddNorm+i, clover+72*k, Vh+pad);
    }
  }
}

// Upload one parity of the clover term from a parity-ordered CPU field,
// converting precision as required.  The device buffers must already have
// been allocated (allocateParityClover).
void loadParityClover(ParityClover ret, void *clover, QudaPrecision cpu_prec,
		      CloverFieldOrder clover_order)
{
  // use pinned memory for the staging buffers
  void *packedClover = NULL, *packedCloverNorm = NULL;

  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover_order");
  }

#ifndef __DEVICE_EMULATION__
  if (cudaMallocHost(&packedClover, ret.bytes) != cudaSuccess) {
    errorQuda("Error allocating clover pinned memory");
  }
  if (ret.precision == QUDA_HALF_PRECISION) {
    if (cudaMallocHost(&packedCloverNorm, ret.bytes/18) != cudaSuccess) {
      errorQuda("Error allocating clover pinned memory");
    }
  }
#else
  packedClover = malloc(ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) packedCloverNorm = malloc(ret.bytes/18);
#endif

  if (ret.precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)clover, ret.volume, ret.pad);
  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)clover, ret.volume, ret.pad);
    } else {
      packParityClover((float4 *)packedClover, (float *)clover, ret.volume, ret.pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
			   (double *)clover, ret.volume, ret.pad);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
			   (float *)clover, ret.volume, ret.pad);
    }
  }

  cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice);
  if (ret.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
#else
  free(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) free(packedCloverNorm);
#endif

}

// Upload both parities from a lexicographically ordered CPU field.
void loadFullClover(FullClover ret, void *clover, QudaPrecision cpu_prec,
		    CloverFieldOrder clover_order)
{
  // use pinned memory for the staging buffers
  void *packedEven = NULL, *packedEvenNorm = NULL, *packedOdd = NULL, *packedOddNorm = NULL;

  if (ret.even.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover order");
  }

#ifndef __DEVICE_EMULATION__
  // fix: allocation failures were previously ignored here (unlike
  // loadParityClover), letting NULL pointers reach the packing routines
  if (cudaMallocHost(&packedEven, ret.even.bytes) != cudaSuccess ||
      cudaMallocHost(&packedOdd, ret.even.bytes) != cudaSuccess) {
    errorQuda("Error allocating clover pinned memory");
  }
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    if (cudaMallocHost(&packedEvenNorm, ret.even.bytes/18) != cudaSuccess ||
        cudaMallocHost(&packedOddNorm, ret.even.bytes/18) != cudaSuccess) {
      errorQuda("Error allocating clover pinned memory");
    }
  }
#else
  packedEven = malloc(ret.even.bytes);
  packedOdd = malloc(ret.even.bytes);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    packedEvenNorm = malloc(ret.even.bytes/18);
    packedOddNorm = malloc(ret.even.bytes/18);
  }
#endif

  if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, ret.even.X, ret.even.pad);
  } else if (ret.even.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, ret.even.X, ret.even.pad);
    } else {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, ret.even.X, ret.even.pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (double *)clover, ret.even.X, ret.even.pad);
    } else {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (float *)clover, ret.even.X, ret.even.pad);
    }
  }

  cudaMemcpy(ret.even.clover, packedEven, ret.even.bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(ret.odd.clover, packedOdd, ret.even.bytes, cudaMemcpyHostToDevice);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.even.cloverNorm, packedEvenNorm, ret.even.bytes/18, cudaMemcpyHostToDevice);
    cudaMemcpy(ret.odd.cloverNorm, packedOddNorm, ret.even.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
#else
  free(packedEven);
  free(packedOdd);
  if (ret.even.precision == QUDA_HALF_PRECISION) {
    free(packedEvenNorm);
    free(packedOddNorm);
  }
#endif

}

// Dispatch a clover-field upload according to the CPU field ordering.
// For parity order, the odd parity begins real_length elements into the
// CPU buffer (half-precision CPU fields are not supported, so only the
// float/double cases are distinguished).
void loadCloverField(FullClover ret, void *clover, QudaPrecision cpu_prec, CloverFieldOrder clover_order)
{
  void *clover_odd;

  if (cpu_prec == QUDA_SINGLE_PRECISION) clover_odd = (float *)clover + ret.even.real_length;
  else clover_odd = (double *)clover + ret.even.real_length;

  if (clover_order == QUDA_LEX_PACKED_CLOVER_ORDER) {
    loadFullClover(ret, clover, cpu_prec, clover_order);
  } else if (clover_order == QUDA_PACKED_CLOVER_ORDER) {
    loadParityClover(ret.even, clover, cpu_prec, clover_order);
    loadParityClover(ret.odd, clover_odd, cpu_prec, clover_order);
  } else {
    errorQuda("Invalid clover_order");
  }
}

/*
void createCloverField(FullClover *cudaClover, void *cpuClover, int *X, Precision precision, QudaInvertParam invert_param)
{
  if (invert_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
    errorQuda("Half precision not supported on CPU");
  }

  // X should contain the dimensions of the even/odd sublattice
  *cudaClover = allocateCloverField(X, precision);
  loadCloverField(*cudaClover, cpuClover, precision, invert_param->clover_order);
}
*/
1.7.3