QUDA v0.4.0
A library for QCD on GPUs
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

#include <typeinfo>
#include <iostream>

#include <quda.h>
#include <gauge_field.h>
#include <quda_internal.h>
#include <face_quda.h>
#include <misc_helpers.h>
#include <assert.h>
#include <cuda.h>

static double anisotropy_;
extern float fat_link_max_;
static int X_[4];
static QudaTboundary t_boundary_;

#include <pack_gauge.h>
#include <hisq_force_utils.h>

// The following routines are needed to test the hisq fermion force code.
// Some of them are deprecated, or soon will be, and ought to be removed.
namespace hisq {
namespace fermion_force {

static ParityMatrix
allocateParityMatrix(const int X[4], QudaPrecision precision, bool compressed)
{
  // Compressed storage (3 float4s per matrix) is only defined in single precision.
  if (precision == QUDA_DOUBLE_PRECISION && compressed) {
    errorQuda("Double precision not supported for compressed matrices");
  }

  ParityMatrix ret;

  ret.precision = precision;
  ret.X[0] = X[0]/2;
  ret.volume = X[0]/2;
  for (int d=1; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  if (compressed) {
    // Store the matrices in compressed form: 6 complex numbers per matrix
    // = 12 real numbers, stored as 3 float4s.
    ret.length = ret.volume*ret.Nc*4; // number of real numbers
  } else {
    // Store as float2 or double2.
    ret.length = ret.volume*ret.Nc*ret.Nc*2; // number of real numbers
  }

  if (precision == QUDA_DOUBLE_PRECISION) {
    ret.bytes = ret.length*sizeof(double);
  } else { // QUDA_SINGLE_PRECISION
    ret.bytes = ret.length*sizeof(float);
  }

  if (cudaMalloc((void**)&ret.data, ret.bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating matrix");
  }

  cudaMemset(ret.data, 0, ret.bytes);

  return ret;
}

FullMatrix
createMatQuda(const int X[4], QudaPrecision precision)
{
  FullMatrix ret;
  ret.even = allocateParityMatrix(X, precision, false); // compressed = false
  ret.odd  = allocateParityMatrix(X, precision, false); // compressed = false
  return ret;
}

FullCompMatrix
createCompMatQuda(const int X[4], QudaPrecision precision)
{
  FullCompMatrix ret;
  ret.even = allocateParityMatrix(X, precision, true); // compressed = true
  ret.odd  = allocateParityMatrix(X, precision, true); // compressed = true
  return ret;
}

static void
freeParityMatQuda(ParityMatrix &parity_mat)
{
  // Take the argument by reference so that clearing the pointer is visible
  // to the caller.
  cudaFree(parity_mat.data);
  parity_mat.data = NULL;
}

void
freeMatQuda(FullMatrix mat)
{
  freeParityMatQuda(mat.even);
  freeParityMatQuda(mat.odd);
}

void
freeCompMatQuda(FullCompMatrix mat)
{
  freeParityMatQuda(mat.even);
  freeParityMatQuda(mat.odd);
}

// Pack one 18-real SU(3) matrix into float2 format with layout r[dir][j][i],
// where i is the site index, j labels the nine complex matrix entries, and
// dir is the direction.
static void pack18Oprod(float2 *res, float *g, int dir, int Vh) {
  float2 *r = res + dir*9*Vh;
  for (int j=0; j<9; j++) {
    r[j*Vh].x = g[j*2+0];
    r[j*Vh].y = g[j*2+1];
  }
}
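// Illustrative usage sketch of the matrix allocation interface above. This is
// a hypothetical driver, not part of the original file: the lattice dimensions
// and the function name are assumptions; the calls are the routines defined in
// this file. Compressed matrices are single precision only.
static void exampleMatrixAllocation()
{
  const int X[4] = {16, 16, 16, 32}; // assumed 16^3 x 32 lattice
  FullMatrix mat      = createMatQuda(X, QUDA_SINGLE_PRECISION);     // 18 reals per matrix
  FullCompMatrix cmat = createCompMatQuda(X, QUDA_SINGLE_PRECISION); // 12 reals per matrix
  // ... launch kernels reading mat.even.data / mat.odd.data ...
  freeMatQuda(mat);
  freeCompMatQuda(cmat);
}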
static void packOprodField(float2 *res, float *oprod, int oddBit, int Vh) {
  const int gss = 18; // gaugeSiteSize
  int total_volume = Vh*2;
  for (int dir=0; dir<4; dir++) {
    float *op = oprod + dir*total_volume*gss + oddBit*Vh*gss; // oprod[dir][0/1][x]
    for (int i=0; i<Vh; i++) {
      pack18Oprod(res+i, op+i*gss, dir, Vh);
    }
  }
}

static void packOprodFieldDir(float2 *res, float *oprod, int dir, int oddBit, int Vh) {
  const int gss = 18; // gaugeSiteSize
  int total_volume = Vh*2;
  float *op = oprod + dir*total_volume*gss + oddBit*Vh*gss;
  for (int i=0; i<Vh; i++) {
    pack18Oprod(res+i, op+i*gss, 0, Vh); // dir = 0: res holds a single direction
  }
}

static ParityOprod
allocateParityOprod(int *X, QudaPrecision precision)
{
  if (precision == QUDA_DOUBLE_PRECISION) {
    errorQuda("Double precision not supported for ParityOprod");
  }

  ParityOprod ret;

  ret.precision = precision;
  ret.X[0] = X[0]/2;
  ret.volume = X[0]/2;
  for (int d=1; d<4; d++) {
    ret.X[d] = X[d];
    ret.volume *= X[d];
  }
  ret.Nc = 3;
  ret.length = ret.volume*ret.Nc*ret.Nc*2; // number of real numbers

  if (precision == QUDA_DOUBLE_PRECISION) {
    ret.bytes = ret.length*sizeof(double);
  } else { // QUDA_SINGLE_PRECISION
    ret.bytes = ret.length*sizeof(float);
  }

  // loop over the four directions
  for (int dir=0; dir<4; dir++) {
    if (cudaMalloc((void**)&(ret.data[dir]), ret.bytes) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating ParityOprod");
    }
    cudaMemset(ret.data[dir], 0, ret.bytes);
  }
  return ret;
}

FullOprod
createOprodQuda(int *X, QudaPrecision precision)
{
  FullOprod ret;
  ret.even = allocateParityOprod(X, precision);
  ret.odd  = allocateParityOprod(X, precision);
  return ret;
}

static void
copyOprodFromCPUArrayQuda(FullOprod cudaOprod, void *cpuOprod,
                          size_t bytes_per_dir, int Vh)
{
  // Use pinned memory for the staging buffers.
  float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes_per_dir);
  cudaMallocHost(&packedOdd, bytes_per_dir);
  for (int dir=0; dir<4; dir++) {
    packOprodFieldDir(packedEven, (float*)cpuOprod, dir, 0, Vh);
    packOprodFieldDir(packedOdd, (float*)cpuOprod, dir, 1, Vh);

    cudaMemset(cudaOprod.even.data[dir], 0, bytes_per_dir);
    cudaMemset(cudaOprod.odd.data[dir], 0, bytes_per_dir);
    checkCudaError();

    cudaMemcpy(cudaOprod.even.data[dir], packedEven, bytes_per_dir, cudaMemcpyHostToDevice);
    cudaMemcpy(cudaOprod.odd.data[dir], packedOdd, bytes_per_dir, cudaMemcpyHostToDevice);
    checkCudaError();
  }
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

void copyOprodToGPU(FullOprod cudaOprod,
                    void *oprod,
                    int half_volume) {
  size_t bytes_per_dir = half_volume*18*sizeof(float);
  copyOprodFromCPUArrayQuda(cudaOprod, oprod, bytes_per_dir, half_volume);
}

static void unpack18Oprod(float *h_oprod, float2 *d_oprod, int dir, int Vh) {
  float2 *dg = d_oprod + dir*9*Vh;
  for (int j=0; j<9; j++) {
    h_oprod[j*2+0] = dg[j*Vh].x;
    h_oprod[j*2+1] = dg[j*Vh].y;
  }
}
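// Illustrative usage sketch of the FullOprod transfer path above. This is a
// hypothetical driver, not part of the original file: the function name and
// argument layout are assumptions. It assumes a single-precision host field
// laid out as oprod[dir][parity][site][18 reals], which is the layout
// packOprodField expects, i.e. 4*2*Vh*18 floats in total.
static void exampleOprodCopy(int X[4], float *cpuOprod)
{
  int Vh = X[0]*X[1]*X[2]*X[3]/2; // half the lattice volume
  FullOprod cudaOprod = createOprodQuda(X, QUDA_SINGLE_PRECISION);
  copyOprodToGPU(cudaOprod, cpuOprod, Vh);
  // ... accumulate the outer product on the device ...
}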
static void unpackOprodField(float *res, float2 *cudaOprod, int oddBit, int Vh) {
  const int gss = 18; // gaugeSiteSize
  int total_volume = Vh*2;
  for (int dir=0; dir<4; dir++) {
    float *res_ptr = res + dir*total_volume*gss + oddBit*Vh*gss;
    for (int i=0; i<Vh; i++) {
      unpack18Oprod(res_ptr + i*gss, cudaOprod+i, dir, Vh);
    }
  }
}

static void
fetchOprodFromGPUArraysQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod, size_t bytes, int Vh)
{
  float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes);
  cudaMallocHost(&packedOdd, bytes);

  cudaMemcpy(packedEven, cudaOprodEven, bytes, cudaMemcpyDeviceToHost);
  checkCudaError();
  cudaMemcpy(packedOdd, cudaOprodOdd, bytes, cudaMemcpyDeviceToHost);
  checkCudaError();

  unpackOprodField((float*)cpuOprod, packedEven, 0, Vh);
  unpackOprodField((float*)cpuOprod, packedOdd, 1, Vh);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

void fetchOprodFromGPU(void *cudaOprodEven,
                       void *cudaOprodOdd,
                       void *oprod,
                       int half_vol) {
  size_t bytes = 4*half_vol*18*sizeof(float);
  fetchOprodFromGPUArraysQuda(cudaOprodEven, cudaOprodOdd, oprod, bytes, half_vol);
}

static void
loadOprodFromCPUArrayQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod,
                          size_t bytes, int Vh)
{
  // Use pinned memory for the staging buffers.
  float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes);
  cudaMallocHost(&packedOdd, bytes);
  checkCudaError();

  packOprodField(packedEven, (float*)cpuOprod, 0, Vh);
  packOprodField(packedOdd, (float*)cpuOprod, 1, Vh);
  checkCudaError();

  cudaMemset(cudaOprodEven, 0, bytes);
  cudaMemset(cudaOprodOdd, 0, bytes);
  checkCudaError();

  cudaMemcpy(cudaOprodEven, packedEven, bytes, cudaMemcpyHostToDevice);
  checkCudaError();
  cudaMemcpy(cudaOprodOdd, packedOdd, bytes, cudaMemcpyHostToDevice);
  checkCudaError();

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

void loadOprodToGPU(void *cudaOprodEven,
                    void *cudaOprodOdd,
                    void *oprod,
                    int vol) {
  checkCudaError();
  size_t bytes = 4*vol*18*sizeof(float);

  std::cout << "vol = " << vol << std::endl;
  std::cout << "bytes = " << bytes << std::endl;
  checkCudaError();
  loadOprodFromCPUArrayQuda(cudaOprodEven, cudaOprodOdd, oprod, bytes, vol);
}

void allocateOprodFields(void **cudaOprodEven, void **cudaOprodOdd, int vol) {
  size_t bytes = 4*vol*18*sizeof(float);

  if (cudaMalloc((void**)cudaOprodEven, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating even outer product field");
  }

  cudaMemset(*cudaOprodEven, 0, bytes);
  checkCudaError();

  if (cudaMalloc((void**)cudaOprodOdd, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating odd outer product field");
  }

  cudaMemset(*cudaOprodOdd, 0, bytes);
  checkCudaError();
}

} // end namespace fermion_force
} // end namespace hisq
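// Illustrative round-trip sketch through the flat even/odd outer-product
// buffers above. This is a hypothetical driver, not part of the original
// file: the lattice size and buffer names are assumptions; the calls are the
// routines defined in this file. It assumes a 16^4 lattice in single
// precision and a host buffer of 4*2*half_vol*18 floats.
void exampleOprodRoundTrip(float *cpuOprod)
{
  const int half_vol = 16*16*16*16/2; // half the lattice volume
  void *cudaOprodEven, *cudaOprodOdd;
  hisq::fermion_force::allocateOprodFields(&cudaOprodEven, &cudaOprodOdd, half_vol);
  hisq::fermion_force::loadOprodToGPU(cudaOprodEven, cudaOprodOdd, cpuOprod, half_vol);
  // ... run the hisq force kernels ...
  hisq::fermion_force::fetchOprodFromGPU(cudaOprodEven, cudaOprodOdd, cpuOprod, half_vol);
  cudaFree(cudaOprodEven);
  cudaFree(cudaOprodOdd);
}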