QUDA v0.4.0
A library for QCD on GPUs
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include <typeinfo>
#include <iostream>

#include <quda.h>
#include <gauge_field.h>
#include <quda_internal.h>
#include <face_quda.h>
#include <misc_helpers.h>
#include <assert.h>
#include <cuda.h>

static double anisotropy_;
extern float fat_link_max_;
static int X_[4];
static QudaTboundary t_boundary_;

#include <pack_gauge.h>
#include <hisq_force_utils.h>

// The following routines are needed to test the hisq fermion force code.
// Some of them are deprecated, or soon will be, and ought to be removed.
namespace hisq {
namespace fermion_force {

static ParityMatrix
allocateParityMatrix(const int X[4], QudaPrecision precision, bool compressed)
{
  // Compressed storage (3 float4s per matrix) is only defined in single precision.
  if (precision == QUDA_DOUBLE_PRECISION && compressed) {
    errorQuda("Double precision not supported for compressed matrices");
  }

  ParityMatrix ret;

  ret.precision = precision;
  ret.X[0] = X[0]/2;
  ret.volume = X[0]/2;
  for (int d=1; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  if (compressed) {
    // Store the matrices in compressed form: 6 complex numbers per matrix
    // = 12 real numbers, stored as 3 float4s.
    ret.length = ret.volume*ret.Nc*4; // number of real numbers
  } else {
    // Store as float2 or double2.
    ret.length = ret.volume*ret.Nc*ret.Nc*2; // number of real numbers
  }

  if (precision == QUDA_DOUBLE_PRECISION) {
    ret.bytes = ret.length*sizeof(double);
  } else { // QUDA_SINGLE_PRECISION
    ret.bytes = ret.length*sizeof(float);
  }

  if (cudaMalloc((void**)&ret.data, ret.bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating matrix");
  }

  cudaMemset(ret.data, 0, ret.bytes);

  return ret;
}

FullMatrix
createMatQuda(const int X[4], QudaPrecision precision)
{
  FullMatrix ret;
  ret.even = allocateParityMatrix(X, precision, false); // compressed = false
  ret.odd  = allocateParityMatrix(X, precision, false); // compressed = false
  return ret;
}

FullCompMatrix
createCompMatQuda(const int X[4], QudaPrecision precision)
{
  FullCompMatrix ret;
  ret.even = allocateParityMatrix(X, precision, true); // compressed = true
  ret.odd  = allocateParityMatrix(X, precision, true); // compressed = true
  return ret;
}

static void
freeParityMatQuda(ParityMatrix &parity_mat)
{
  // Take the argument by reference so that clearing the pointer is visible
  // to the caller.
  cudaFree(parity_mat.data);
  parity_mat.data = NULL;
}

void
freeMatQuda(FullMatrix mat)
{
  freeParityMatQuda(mat.even);
  freeParityMatQuda(mat.odd);
}

void
freeCompMatQuda(FullCompMatrix mat)
{
  freeParityMatQuda(mat.even);
  freeParityMatQuda(mat.odd);
}

// Pack one 18-real SU(3) matrix into float2 format with layout r[dir][j][i],
// where i is the site index, j labels the nine complex matrix entries, and
// dir is the direction.
static void pack18Oprod(float2 *res, float *g, int dir, int Vh) {
  float2 *r = res + dir*9*Vh;
  for (int j=0; j<9; j++) {
    r[j*Vh].x = g[j*2+0];
    r[j*Vh].y = g[j*2+1];
  }
}
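// Illustrative usage sketch of the matrix allocation interface above. This is
// a hypothetical driver, not part of the original file: the lattice dimensions
// and the function name are assumptions; the calls are the routines defined in
// this file. Compressed matrices are single precision only.
static void exampleMatrixAllocation()
{
  const int X[4] = {16, 16, 16, 32}; // assumed 16^3 x 32 lattice
  FullMatrix mat      = createMatQuda(X, QUDA_SINGLE_PRECISION);     // 18 reals per matrix
  FullCompMatrix cmat = createCompMatQuda(X, QUDA_SINGLE_PRECISION); // 12 reals per matrix
  // ... launch kernels reading mat.even.data / mat.odd.data ...
  freeMatQuda(mat);
  freeCompMatQuda(cmat);
}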
static void packOprodField(float2 *res, float *oprod, int oddBit, int Vh) {
  const int gss = 18; // gaugeSiteSize
  int total_volume = Vh*2;
  for (int dir=0; dir<4; dir++) {
    float *op = oprod + dir*total_volume*gss + oddBit*Vh*gss; // oprod[dir][0/1][x]
    for (int i=0; i<Vh; i++) {
      pack18Oprod(res+i, op+i*gss, dir, Vh);
    }
  }
}

static void packOprodFieldDir(float2 *res, float *oprod, int dir, int oddBit, int Vh) {
  const int gss = 18; // gaugeSiteSize
  int total_volume = Vh*2;
  float *op = oprod + dir*total_volume*gss + oddBit*Vh*gss;
  for (int i=0; i<Vh; i++) {
    pack18Oprod(res+i, op+i*gss, 0, Vh); // dir = 0: res holds a single direction
  }
}

static ParityOprod
allocateParityOprod(int *X, QudaPrecision precision)
{
  if (precision == QUDA_DOUBLE_PRECISION) {
    errorQuda("Double precision not supported for ParityOprod");
  }

  ParityOprod ret;

  ret.precision = precision;
  ret.X[0] = X[0]/2;
  ret.volume = X[0]/2;
  for (int d=1; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  ret.length = ret.volume*ret.Nc*ret.Nc*2; // number of real numbers

  if (precision == QUDA_DOUBLE_PRECISION) {
    ret.bytes = ret.length*sizeof(double);
  } else { // QUDA_SINGLE_PRECISION
    ret.bytes = ret.length*sizeof(float);
  }

  // loop over the four directions
  for (int dir=0; dir<4; dir++) {
    if (cudaMalloc((void**)&(ret.data[dir]), ret.bytes) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating ParityOprod");
    }
    cudaMemset(ret.data[dir], 0, ret.bytes);
  }
  return ret;
}

FullOprod
createOprodQuda(int *X, QudaPrecision precision)
{
  FullOprod ret;
  ret.even = allocateParityOprod(X, precision);
  ret.odd  = allocateParityOprod(X, precision);
  return ret;
}

static void
copyOprodFromCPUArrayQuda(FullOprod cudaOprod, void *cpuOprod,
                          size_t bytes_per_dir, int Vh)
{
  // Use pinned memory for the staging buffers.
  float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes_per_dir);
  cudaMallocHost(&packedOdd, bytes_per_dir);
  for (int dir=0; dir<4; dir++) {
    packOprodFieldDir(packedEven, (float*)cpuOprod, dir, 0, Vh);
    packOprodFieldDir(packedOdd, (float*)cpuOprod, dir, 1, Vh);

    cudaMemset(cudaOprod.even.data[dir], 0, bytes_per_dir);
    cudaMemset(cudaOprod.odd.data[dir], 0, bytes_per_dir);
    checkCudaError();

    cudaMemcpy(cudaOprod.even.data[dir], packedEven, bytes_per_dir, cudaMemcpyHostToDevice);
    cudaMemcpy(cudaOprod.odd.data[dir], packedOdd, bytes_per_dir, cudaMemcpyHostToDevice);
    checkCudaError();
  }
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

void copyOprodToGPU(FullOprod cudaOprod,
                    void *oprod,
                    int half_volume) {
  size_t bytes_per_dir = half_volume*18*sizeof(float);
  copyOprodFromCPUArrayQuda(cudaOprod, oprod, bytes_per_dir, half_volume);
}

static void unpack18Oprod(float *h_oprod, float2 *d_oprod, int dir, int Vh) {
  float2 *dg = d_oprod + dir*9*Vh;
  for (int j=0; j<9; j++) {
    h_oprod[j*2+0] = dg[j*Vh].x;
    h_oprod[j*2+1] = dg[j*Vh].y;
  }
}
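// Illustrative usage sketch of the FullOprod transfer path above. This is a
// hypothetical driver, not part of the original file: the function name and
// argument layout are assumptions. It assumes a single-precision host field
// laid out as oprod[dir][parity][site][18 reals], which is the layout
// packOprodField expects, i.e. 4*2*Vh*18 floats in total.
static void exampleOprodCopy(int X[4], float *cpuOprod)
{
  int Vh = X[0]*X[1]*X[2]*X[3]/2; // half the lattice volume
  FullOprod cudaOprod = createOprodQuda(X, QUDA_SINGLE_PRECISION);
  copyOprodToGPU(cudaOprod, cpuOprod, Vh);
  // ... accumulate the outer product on the device ...
}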
static void unpackOprodField(float *res, float2 *cudaOprod, int oddBit, int Vh) {
  const int gss = 18; // gaugeSiteSize
  int total_volume = Vh*2;
  for (int dir=0; dir<4; dir++) {
    float *res_ptr = res + dir*total_volume*gss + oddBit*Vh*gss;
    for (int i=0; i<Vh; i++) {
      unpack18Oprod(res_ptr + i*gss, cudaOprod+i, dir, Vh);
    }
  }
}

static void
fetchOprodFromGPUArraysQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod, size_t bytes, int Vh)
{
  float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes);
  cudaMallocHost(&packedOdd, bytes);

  cudaMemcpy(packedEven, cudaOprodEven, bytes, cudaMemcpyDeviceToHost);
  checkCudaError();
  cudaMemcpy(packedOdd, cudaOprodOdd, bytes, cudaMemcpyDeviceToHost);
  checkCudaError();

  unpackOprodField((float*)cpuOprod, packedEven, 0, Vh);
  unpackOprodField((float*)cpuOprod, packedOdd, 1, Vh);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

void fetchOprodFromGPU(void *cudaOprodEven,
                       void *cudaOprodOdd,
                       void *oprod,
                       int half_vol) {
  size_t bytes = 4*half_vol*18*sizeof(float);
  fetchOprodFromGPUArraysQuda(cudaOprodEven, cudaOprodOdd, oprod, bytes, half_vol);
}

static void
loadOprodFromCPUArrayQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod,
                          size_t bytes, int Vh)
{
  // Use pinned memory for the staging buffers.
  float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes);
  cudaMallocHost(&packedOdd, bytes);
  checkCudaError();

  packOprodField(packedEven, (float*)cpuOprod, 0, Vh);
  packOprodField(packedOdd, (float*)cpuOprod, 1, Vh);
  checkCudaError();

  cudaMemset(cudaOprodEven, 0, bytes);
  cudaMemset(cudaOprodOdd, 0, bytes);
  checkCudaError();

  cudaMemcpy(cudaOprodEven, packedEven, bytes, cudaMemcpyHostToDevice);
  checkCudaError();
  cudaMemcpy(cudaOprodOdd, packedOdd, bytes, cudaMemcpyHostToDevice);
  checkCudaError();

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

void loadOprodToGPU(void *cudaOprodEven,
                    void *cudaOprodOdd,
                    void *oprod,
                    int vol) {
  checkCudaError();
  size_t bytes = 4*vol*18*sizeof(float);

  std::cout << "vol = " << vol << std::endl;
  std::cout << "bytes = " << bytes << std::endl;
  checkCudaError();
  loadOprodFromCPUArrayQuda(cudaOprodEven, cudaOprodOdd, oprod, bytes, vol);
}

void allocateOprodFields(void **cudaOprodEven, void **cudaOprodOdd, int vol) {
  size_t bytes = 4*vol*18*sizeof(float);

  if (cudaMalloc((void**)cudaOprodEven, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating even outer product field");
  }

  cudaMemset(*cudaOprodEven, 0, bytes);
  checkCudaError();

  if (cudaMalloc((void**)cudaOprodOdd, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating odd outer product field");
  }

  cudaMemset(*cudaOprodOdd, 0, bytes);
  checkCudaError();
}

} // end namespace fermion_force
} // end namespace hisq
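// Illustrative round-trip sketch through the flat even/odd outer-product
// buffers above. This is a hypothetical driver, not part of the original
// file: the lattice size and buffer names are assumptions; the calls are the
// routines defined in this file. It assumes a 16^4 lattice in single
// precision and a host buffer of 4*2*half_vol*18 floats.
void exampleOprodRoundTrip(float *cpuOprod)
{
  const int half_vol = 16*16*16*16/2; // half the lattice volume
  void *cudaOprodEven, *cudaOprodOdd;
  hisq::fermion_force::allocateOprodFields(&cudaOprodEven, &cudaOprodOdd, half_vol);
  hisq::fermion_force::loadOprodToGPU(cudaOprodEven, cudaOprodOdd, cpuOprod, half_vol);
  // ... run the hisq force kernels ...
  hisq::fermion_force::fetchOprodFromGPU(cudaOprodEven, cudaOprodOdd, cpuOprod, half_vol);
  cudaFree(cudaOprodEven);
  cudaFree(cudaOprodOdd);
}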