QUDA v0.4.0
A library for QCD on GPUs
00001 #include <stdlib.h> 00002 #include <stdio.h> 00003 #include <math.h> 00004 00005 #include <quda_internal.h> 00006 #include <clover_field.h> 00007 00008 CloverField::CloverField(const CloverFieldParam ¶m, const QudaFieldLocation &location) : 00009 LatticeField(param, location), bytes(0), norm_bytes(0), nColor(3), nSpin(4) 00010 { 00011 if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim); 00012 00013 real_length = 2*volumeCB*nColor*nColor*nSpin*nSpin/2; // block-diagonal Hermitian (72 reals) 00014 length = 2*stride*nColor*nColor*nSpin*nSpin/2; 00015 00016 bytes = length*precision; 00017 bytes = ALIGNMENT_ADJUST(bytes); 00018 if (precision == QUDA_HALF_PRECISION) { 00019 norm_bytes = sizeof(float)*2*stride*2; // 2 chirality 00020 norm_bytes = ALIGNMENT_ADJUST(norm_bytes); 00021 } 00022 00023 total_bytes = bytes + norm_bytes; 00024 } 00025 00026 CloverField::~CloverField() { 00027 00028 } 00029 00030 cudaCloverField::cudaCloverField(const void *h_clov, const void *h_clov_inv, 00031 const QudaPrecision cpu_prec, 00032 const QudaCloverFieldOrder cpu_order, 00033 const CloverFieldParam ¶m) 00034 : CloverField(param, QUDA_CUDA_FIELD_LOCATION), clover(0), norm(0), cloverInv(0), invNorm(0) 00035 { 00036 00037 00038 if (h_clov) { 00039 if (cudaMalloc((void**)&clover, bytes) == cudaErrorMemoryAllocation) { 00040 errorQuda("Error allocating clover term"); 00041 } 00042 00043 if (precision == QUDA_HALF_PRECISION) { 00044 if (cudaMalloc((void**)&norm, norm_bytes) == cudaErrorMemoryAllocation) { 00045 errorQuda("Error allocating clover norm"); 00046 } 00047 } 00048 00049 even = clover; 00050 odd = (char*)clover + bytes/2; 00051 00052 evenNorm = norm; 00053 oddNorm = (char*)norm + norm_bytes/2; 00054 00055 loadCPUField(clover, norm, h_clov, cpu_prec, cpu_order); 00056 } 00057 00058 if (h_clov_inv) { 00059 if (cudaMalloc((void**)&cloverInv, bytes) == cudaErrorMemoryAllocation) { 00060 errorQuda("Error allocating clover inverse term"); 00061 } 00062 00063 
if (precision == QUDA_HALF_PRECISION) { 00064 if (cudaMalloc((void**)&invNorm, norm_bytes) == cudaErrorMemoryAllocation) { 00065 errorQuda("Error allocating clover inverse norm"); 00066 } 00067 } 00068 00069 evenInv = cloverInv; 00070 oddInv = (char*)cloverInv + bytes/2; 00071 00072 evenInvNorm = invNorm; 00073 oddInvNorm = (char*)invNorm + norm_bytes/2; 00074 00075 total_bytes += bytes + norm_bytes; 00076 00077 loadCPUField(cloverInv, invNorm, h_clov_inv, cpu_prec, cpu_order); 00078 00079 // this is a hack to ensure that we can autotune the clover 00080 // operator when just using symmetric preconditioning 00081 if (!clover) { 00082 clover = cloverInv; 00083 even = evenInv; 00084 odd = oddInv; 00085 } 00086 if (!norm) { 00087 norm = invNorm; 00088 evenNorm = evenInvNorm; 00089 oddNorm = oddInvNorm; 00090 } 00091 } 00092 00093 } 00094 00095 cudaCloverField::~cudaCloverField() { 00096 if (clover != cloverInv) { 00097 if ( clover ) cudaFree(clover); 00098 if ( norm ) cudaFree(norm); 00099 } 00100 00101 if ( cloverInv ) cudaFree(cloverInv); 00102 if ( invNorm ) cudaFree(invNorm); 00103 00104 checkCudaError(); 00105 } 00106 00107 template <typename Float> 00108 static inline void packCloverMatrix(float4* a, Float *b, int Vh) 00109 { 00110 const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change 00111 00112 for (int i=0; i<18; i++) { 00113 a[i*Vh].x = half * b[4*i+0]; 00114 a[i*Vh].y = half * b[4*i+1]; 00115 a[i*Vh].z = half * b[4*i+2]; 00116 a[i*Vh].w = half * b[4*i+3]; 00117 } 00118 } 00119 00120 template <typename Float> 00121 static inline void packCloverMatrix(double2* a, Float *b, int Vh) 00122 { 00123 const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change 00124 00125 for (int i=0; i<36; i++) { 00126 a[i*Vh].x = half * b[2*i+0]; 00127 a[i*Vh].y = half * b[2*i+1]; 00128 } 00129 } 00130 00131 template <typename Float, typename FloatN> 00132 static void packParityClover(FloatN *res, Float *clover, int Vh, int pad) 00133 
{ 00134 for (int i = 0; i < Vh; i++) { 00135 packCloverMatrix(res+i, clover+72*i, Vh+pad); 00136 } 00137 } 00138 00139 template <typename Float, typename FloatN> 00140 static void packFullClover(FloatN *even, FloatN *odd, Float *clover, int *X, int pad) 00141 { 00142 int Vh = X[0]*X[1]*X[2]*X[3]/2; 00143 00144 for (int i=0; i<Vh; i++) { 00145 00146 int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]); 00147 00148 { // even sites 00149 int k = 2*i + boundaryCrossings%2; 00150 packCloverMatrix(even+i, clover+72*k, Vh+pad); 00151 } 00152 00153 { // odd sites 00154 int k = 2*i + (boundaryCrossings+1)%2; 00155 packCloverMatrix(odd+i, clover+72*k, Vh+pad); 00156 } 00157 } 00158 } 00159 00160 template<typename Float> 00161 static inline void packCloverMatrixHalf(short4 *res, float *norm, Float *clover, int Vh) 00162 { 00163 const Float half = 0.5; // pre-include factor of 1/2 introduced by basis change 00164 Float max, a, c; 00165 00166 // treat the two chiral blocks separately 00167 for (int chi=0; chi<2; chi++) { 00168 max = fabs(clover[0]); 00169 for (int i=1; i<36; i++) { 00170 if ((a = fabs(clover[i])) > max) max = a; 00171 } 00172 c = MAX_SHORT/max; 00173 for (int i=0; i<9; i++) { 00174 res[i*Vh].x = (short) (c * clover[4*i+0]); 00175 res[i*Vh].y = (short) (c * clover[4*i+1]); 00176 res[i*Vh].z = (short) (c * clover[4*i+2]); 00177 res[i*Vh].w = (short) (c * clover[4*i+3]); 00178 } 00179 norm[chi*Vh] = half*max; 00180 res += 9*Vh; 00181 clover += 36; 00182 } 00183 } 00184 00185 template <typename Float> 00186 static void packParityCloverHalf(short4 *res, float *norm, Float *clover, int Vh, int pad) 00187 { 00188 for (int i = 0; i < Vh; i++) { 00189 packCloverMatrixHalf(res+i, norm+i, clover+72*i, Vh+pad); 00190 } 00191 } 00192 00193 template <typename Float> 00194 static void packFullCloverHalf(short4 *even, float *evenNorm, short4 *odd, float *oddNorm, 00195 Float *clover, int *X, int pad) 00196 { 00197 int Vh = X[0]*X[1]*X[2]*X[3]/2; 00198 00199 for 
(int i=0; i<Vh; i++) { 00200 00201 int boundaryCrossings = i/X[0] + i/(X[1]*X[0]) + i/(X[2]*X[1]*X[0]); 00202 00203 { // even sites 00204 int k = 2*i + boundaryCrossings%2; 00205 packCloverMatrixHalf(even+i, evenNorm+i, clover+72*k, Vh+pad); 00206 } 00207 00208 { // odd sites 00209 int k = 2*i + (boundaryCrossings+1)%2; 00210 packCloverMatrixHalf(odd+i, oddNorm+i, clover+72*k, Vh+pad); 00211 } 00212 } 00213 } 00214 00215 void cudaCloverField::loadCPUField(void *clover, void *norm, const void *h_clover, 00216 const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order) { 00217 00218 void *h_clover_odd = (char*)h_clover + cpu_prec*real_length/2; 00219 00220 if (cpu_order == QUDA_LEX_PACKED_CLOVER_ORDER) { 00221 loadFullField(clover, norm, (char*)clover+bytes/2, (char*)norm+norm_bytes/2, h_clover, cpu_prec, cpu_order); 00222 } else if (cpu_order == QUDA_PACKED_CLOVER_ORDER) { 00223 loadParityField(clover, norm, h_clover, cpu_prec, cpu_order); 00224 loadParityField((char*)clover+bytes/2, (char*)norm+norm_bytes/2, h_clover_odd, cpu_prec, cpu_order); 00225 } else { 00226 errorQuda("Invalid clover_order"); 00227 } 00228 } 00229 00230 void cudaCloverField::loadParityField(void *clover, void *cloverNorm, const void *h_clover, 00231 const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order) 00232 { 00233 // use pinned memory 00234 void *packedClover, *packedCloverNorm; 00235 00236 if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) { 00237 errorQuda("Cannot have CUDA double precision without CPU double precision"); 00238 } 00239 if (cpu_order != QUDA_PACKED_CLOVER_ORDER) 00240 errorQuda("Invalid clover order %d", cpu_order); 00241 00242 if (cudaMallocHost(&packedClover, bytes/2) == cudaErrorMemoryAllocation) 00243 errorQuda("Error allocating clover pinned memory"); 00244 00245 if (precision == QUDA_HALF_PRECISION) { 00246 if (cudaMallocHost(&packedCloverNorm, norm_bytes/2) == cudaErrorMemoryAllocation) 00247 { 00248 errorQuda("Error 
allocating clover pinned memory"); 00249 } 00250 } 00251 00252 if (precision == QUDA_DOUBLE_PRECISION) { 00253 packParityClover((double2 *)packedClover, (double *)h_clover, volumeCB, pad); 00254 } else if (precision == QUDA_SINGLE_PRECISION) { 00255 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00256 packParityClover((float4 *)packedClover, (double *)h_clover, volumeCB, pad); 00257 } else { 00258 packParityClover((float4 *)packedClover, (float *)h_clover, volumeCB, pad); 00259 } 00260 } else { 00261 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00262 packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, 00263 (double *)h_clover, volumeCB, pad); 00264 } else { 00265 packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, 00266 (float *)h_clover, volumeCB, pad); 00267 } 00268 } 00269 00270 cudaMemcpy(clover, packedClover, bytes/2, cudaMemcpyHostToDevice); 00271 if (precision == QUDA_HALF_PRECISION) 00272 cudaMemcpy(cloverNorm, packedCloverNorm, norm_bytes/2, cudaMemcpyHostToDevice); 00273 00274 cudaFreeHost(packedClover); 00275 if (precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm); 00276 } 00277 00278 void cudaCloverField::loadFullField(void *even, void *evenNorm, void *odd, void *oddNorm, 00279 const void *h_clover, const QudaPrecision cpu_prec, 00280 const CloverFieldOrder cpu_order) 00281 { 00282 // use pinned memory 00283 void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm; 00284 00285 if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) { 00286 errorQuda("Cannot have CUDA double precision without CPU double precision"); 00287 } 00288 if (cpu_order != QUDA_LEX_PACKED_CLOVER_ORDER) { 00289 errorQuda("Invalid clover order"); 00290 } 00291 00292 cudaMallocHost(&packedEven, bytes/2); 00293 cudaMallocHost(&packedOdd, bytes/2); 00294 if (precision == QUDA_HALF_PRECISION) { 00295 cudaMallocHost(&packedEvenNorm, norm_bytes/2); 00296 cudaMallocHost(&packedOddNorm, norm_bytes/2); 00297 } 00298 00299 
if (precision == QUDA_DOUBLE_PRECISION) { 00300 packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, x, pad); 00301 } else if (precision == QUDA_SINGLE_PRECISION) { 00302 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00303 packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, x, pad); 00304 } else { 00305 packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, x, pad); 00306 } 00307 } else { 00308 if (cpu_prec == QUDA_DOUBLE_PRECISION) { 00309 packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd, 00310 (float *) packedOddNorm, (double *)clover, x, pad); 00311 } else { 00312 packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd, 00313 (float * )packedOddNorm, (float *)clover, x, pad); 00314 } 00315 } 00316 00317 cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice); 00318 cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice); 00319 if (precision == QUDA_HALF_PRECISION) { 00320 cudaMemcpy(evenNorm, packedEvenNorm, norm_bytes/2, cudaMemcpyHostToDevice); 00321 cudaMemcpy(oddNorm, packedOddNorm, norm_bytes/2, cudaMemcpyHostToDevice); 00322 } 00323 00324 cudaFreeHost(packedEven); 00325 cudaFreeHost(packedOdd); 00326 if (precision == QUDA_HALF_PRECISION) { 00327 cudaFreeHost(packedEvenNorm); 00328 cudaFreeHost(packedOddNorm); 00329 } 00330 }