QUDA v0.4.0
A library for QCD on GPUs
quda/lib/cuda_gauge_field.cpp
Go to the documentation of this file.
00001 #include <gauge_field.h>
00002 #include <face_quda.h>
00003 #include <typeinfo>
00004 #include <misc_helpers.h>
00005 
/**
 * Construct a device-resident gauge field.  Allocates one device buffer
 * holding both parities; `even` aliases the first half and `odd` the second.
 * Only QUDA_NULL_FIELD_CREATE and QUDA_ZERO_FIELD_CREATE are supported.
 */
cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
  GaugeField(param, QUDA_CUDA_FIELD_LOCATION), gauge(0), even(0), odd(0)
{
  if(create != QUDA_NULL_FIELD_CREATE &&  create != QUDA_ZERO_FIELD_CREATE){
    errorQuda("ERROR: create type(%d) not supported yet\n", create);
  }
  
  // Fail on ANY allocation error, not just cudaErrorMemoryAllocation:
  // a sticky error from an earlier async failure would otherwise be
  // silently ignored and leave `gauge` invalid.
  if (cudaMalloc((void **)&gauge, bytes) != cudaSuccess) {
    errorQuda("Error allocating gauge field");
  }
  
  if(create == QUDA_ZERO_FIELD_CREATE){
    cudaMemset(gauge, 0, bytes);
    checkCudaError(); // surface any memset failure immediately
  }
  
  // Two parities live in the two halves of the single allocation.
  even = gauge;
  odd = (char*)gauge + bytes/2;
 
}
00025 
/**
 * Destructor: releases the device allocation.  `even` and `odd` are
 * aliases into `gauge`, so a single free covers the whole field.
 */
cudaGaugeField::~cudaGaugeField() {
  if (gauge != 0) {
    cudaFree(gauge);
  }
  checkCudaError();
}
00030 
// FIXME temporary hack: file-scope globals that smuggle field parameters
// (anisotropy, fat-link scale factor, lattice dimensions, temporal boundary)
// into the pack/unpack routines — presumably read inside pack_gauge.h, which
// is textually included below so these names are in scope there (TODO
// confirm).  load/saveCPUField set them immediately before packing.
static double anisotropy_;
static double fat_link_max_;
static const int *X_;
static QudaTboundary t_boundary_; 
#include <pack_gauge.h>
00037 
/*
  Reorders a host gauge field into the device layout inside a pinned
  staging buffer (even parity in the first half, odd in the second),
  optionally packs exchanged ghost links into the pad regions
  (multi-GPU), then copies the whole buffer to the device in one
  blocking transfer.  For fat links, also computes the global maximum
  element used to scale half-precision encodings.
*/
template <typename Float, typename FloatN>
static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, Float **cpuGhost,
                           GaugeFieldOrder cpu_order, QudaReconstructType reconstruct, 
                           size_t bytes, int volumeCB, int *surfaceCB, int pad, int nFace,
                           QudaLinkType type, double &fat_link_max) {
  // Use pinned memory so the host-to-device copy is fast; check the
  // allocation result itself rather than relying on the pointer checks
  // below (which miss non-allocation errors).
  FloatN *packed, *packedEven, *packedOdd;
  if (cudaMallocHost(&packed, bytes) != cudaSuccess) {
    errorQuda("Error allocating pinned gauge staging buffer");
  }
  packedEven = packed;
  packedOdd = (FloatN*)((char*)packed + bytes/2);

  if( ! packedEven ) errorQuda( "packedEven is borked\n");
  if( ! packedOdd ) errorQuda( "packedOdd is borked\n");
  if( ! even ) errorQuda( "even is borked\n");
  if( ! odd ) errorQuda( "odd is borked\n");
  if( ! cpuGauge ) errorQuda( "cpuGauge is borked\n");

#ifdef MULTI_GPU
  if (cpu_order != QUDA_QDP_GAUGE_ORDER)
    errorQuda("Only QUDA_QDP_GAUGE_ORDER is supported for multi-gpu\n");
#endif

  // For QUDA_ASQTAD_FAT_LINKS, find the maximum element value;
  // fat_link_max will be used in encoding half-precision fat links.
  if(type == QUDA_ASQTAD_FAT_LINKS){
    fat_link_max = 0.0;
    // NOTE(review): indexing cpuGauge as Float** (one array per direction)
    // is only valid for QUDA_QDP_GAUGE_ORDER — confirm fat links cannot
    // arrive in CPS/MILC order on a single-GPU build.
    Float **qdpGauge = (Float**)cpuGauge; // hoisted loop-invariant cast
    for(int dir=0; dir < 4; dir++){
      for(int i=0;i < 2*volumeCB*gaugeSiteSize; i++){
        if( qdpGauge[dir][i] > fat_link_max ){
          fat_link_max = qdpGauge[dir][i];
        }
      }
    }
  }
  
  // Take the maximum across all processes so every rank uses the same
  // half-precision scale factor.
  double fat_link_max_double = fat_link_max;
#ifdef MULTI_GPU
  reduceMaxDouble(fat_link_max_double);
#endif
  fat_link_max = fat_link_max_double;

  int voxels[] = {volumeCB, volumeCB, volumeCB, volumeCB};

  // FIXME - hack for the moment: publish the scale through the
  // file-scope global read by the packing routines.
  fat_link_max_ = fat_link_max;

  int nFaceLocal = 1;
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, volumeCB, 
                      voxels, pad, 0, nFaceLocal, type);
    packQDPGaugeField(packedOdd,  (Float**)cpuGauge, 1, reconstruct, volumeCB, 
                      voxels, pad, 0, nFaceLocal, type);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
    packCPSGaugeField(packedOdd,  (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);    
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
    packMILCGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
    packMILCGaugeField(packedOdd,  (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);    
  } else {
    errorQuda("Invalid gauge_order %d", cpu_order);
  }

#ifdef MULTI_GPU
  // Momentum fields have no ghost zones; everything else gets its
  // exchanged ghost links packed into the padded regions.
  if(type != QUDA_ASQTAD_MOM_LINKS){
    packQDPGaugeField(packedEven, (Float**)cpuGhost, 0, reconstruct, volumeCB,
                      surfaceCB, pad, volumeCB, nFace, type);
    packQDPGaugeField(packedOdd,  (Float**)cpuGhost, 1, reconstruct, volumeCB, 
                      surfaceCB, pad, volumeCB, nFace, type);
  }
#endif

  // Single blocking transfer covering both parities.
  cudaMemcpy(even, packed, bytes, cudaMemcpyHostToDevice);
  checkCudaError();
    
  cudaFreeHost(packed);
}
00116 
/*
  Reorders a host momentum field into device layout (one pinned staging
  buffer per parity) and copies both halves to the device.
*/
template <typename Float, typename Float2>
void loadMomField(Float2 *even, Float2 *odd, Float *mom, int bytes, int Vh, int pad) 
{  
  Float2 *packedEven, *packedOdd;
  // Check the pinned allocations; errorQuda aborts so no cleanup needed.
  if (cudaMallocHost(&packedEven, bytes/2) != cudaSuccess ||
      cudaMallocHost(&packedOdd, bytes/2) != cudaSuccess) {
    errorQuda("Error allocating pinned momentum staging buffers");
  }
    
  packMomField(packedEven, (Float*)mom, 0, Vh, pad);
  packMomField(packedOdd,  (Float*)mom, 1, Vh, pad);
    
  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd,  packedOdd, bytes/2, cudaMemcpyHostToDevice); 
  checkCudaError(); // surface copy failures, consistent with loadGaugeField
  
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
00133 
/**
 * Copy (and reorder) a CPU gauge field into this device field.
 * Dispatches on field type (gauge vs momentum, via reconstruct),
 * device precision, host precision and reconstruction type.
 * Only host-side packing (QUDA_CPU_FIELD_LOCATION) is implemented.
 */
void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, const QudaFieldLocation &pack_location)
{

  checkField(cpu);

  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
    errorQuda("Not implemented"); // awaiting Guochun's new gauge packing
  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) {
    // FIXME - publish field parameters through the file-scope globals
    // consumed by the packing routines.
    anisotropy_ = anisotropy;
    X_ = x;
    t_boundary_ = t_boundary;
    
#ifdef MULTI_GPU
    // Momentum fields carry no ghost zones, so skip the exchange for them.
    if(link_type != QUDA_ASQTAD_MOM_LINKS){ 
      cpu.exchangeGhost();
    }
#endif
    
    if (reconstruct != QUDA_RECONSTRUCT_10) { // gauge field
      if (precision == QUDA_DOUBLE_PRECISION) {
        
        // double-precision device storage always uses double2 ordering
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          loadGaugeField((double2*)(even), (double2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
                         cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          loadGaugeField((double2*)(even), (double2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
                         cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
        } else {
          // previously an unsupported cpu precision fell through silently
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
        
      } else if (precision == QUDA_SINGLE_PRECISION) {
        
        // full links use float2 ordering, reconstructed links use float4
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            loadGaugeField((float2*)(even), (float2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost, 
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);          
          } else {
            loadGaugeField((float4*)(even), (float4*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
          }
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            loadGaugeField((float2*)(even), (float2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
          } else {
            loadGaugeField((float4*)(even), (float4*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
          }
        } else {
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
        
      } else if (precision == QUDA_HALF_PRECISION) {
        
        // full links use short2 ordering, reconstructed links use short4
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION){
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            loadGaugeField((short2*)(even), (short2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
          } else {
            loadGaugeField((short4*)(even), (short4*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);          
          }
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            loadGaugeField((short2*)(even), (short2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
          } else {
            loadGaugeField((short4*)(even), (short4*)(odd), (float*)cpu.gauge, (float**)(cpu.ghost),
                           cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);          
          }
        } else {
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
      } else {
        errorQuda("cuda precision %d not supported", precision);
      }
    } else { // QUDA_RECONSTRUCT_10: momentum field
      if  (precision == QUDA_DOUBLE_PRECISION) {
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          loadMomField((double2*)(even), (double2*)(odd), (double*)cpu.gauge, bytes, volumeCB, pad);
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          loadMomField((double2*)(even), (double2*)(odd), (float*)cpu.gauge, bytes, volumeCB, pad);
        } else {
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
      } else {
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          loadMomField((float2*)(even), (float2*)(odd), (double*)cpu.gauge, bytes, volumeCB, pad);
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          loadMomField((float2*)(even), (float2*)(odd), (float*)cpu.gauge, bytes, volumeCB, pad);
        } else {
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
      }      
    } // gauge or momentum
  } else {
    errorQuda("Invalid pack location %d", pack_location);
  }
    
}
00225   
/* 
   Generic gauge field retrieval.  Copies the cuda gauge field into a
   cpu gauge field.  The reordering takes place on the host and
   supports QDP, CPS and MILC gauge field orders.
 */
template <typename Float, typename FloatN>
static void storeGaugeField(Float *cpuGauge, FloatN *gauge, GaugeFieldOrder cpu_order,
                            QudaReconstructType reconstruct, int bytes, int volumeCB, int pad) {

  // Pinned staging buffer receiving the full device field (both parities).
  FloatN *packed;
  if (cudaMallocHost(&packed, bytes) != cudaSuccess) {
    errorQuda("Error allocating pinned gauge staging buffer");
  }
  cudaMemcpy(packed, gauge, bytes, cudaMemcpyDeviceToHost);
  checkCudaError(); // fail fast if the device-to-host copy went wrong
    
  // Device layout: even parity in the first half, odd in the second.
  FloatN *packedEven = packed;
  FloatN *packedOdd = (FloatN*)((char*)packed + bytes/2);
    
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
    unpackMILCGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackMILCGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else {
    errorQuda("Invalid gauge_order");
  }
    
  cudaFreeHost(packed);
}
00258 
00259 
00260 
00261 /*
00262   Copies the device gauge field to the host.
00263   - no reconstruction support
00264   - device data is always Float2 ordered
00265   - host data is a 1-dimensional array (MILC ordered)
00266   - no support for half precision
00267   - input and output precisions must match
00268  */
00269 template<typename FloatN, typename Float>
00270 static void storeGaugeField(Float* cpuGauge, FloatN *gauge, int bytes, int volumeCB, 
00271                             int stride, QudaPrecision prec) 
00272 {  
00273   cudaStream_t streams[2];
00274   for (int i=0; i<2; i++) cudaStreamCreate(&streams[i]);
00275   
00276   FloatN *even = gauge;
00277   FloatN *odd = (FloatN*)((char*)gauge + bytes/2);
00278 
00279   int datalen = 4*2*volumeCB*gaugeSiteSize*sizeof(Float); // both parities
00280   void *unpacked;  
00281   if(cudaMalloc(&unpacked, datalen) != cudaSuccess){
00282     errorQuda("cudaMalloc() failed for unpacked\n");
00283   }
00284   void *unpackedEven = unpacked;
00285   void *unpackedOdd = (char*)unpacked + datalen/2;
00286   
00287   //unpack even data kernel
00288   link_format_gpu_to_cpu((void*)unpackedEven, (void*)even, volumeCB, stride, prec, streams[0]);
00289 #ifdef GPU_DIRECT
00290   cudaMemcpyAsync(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost, streams[0]);
00291 #else
00292   cudaMemcpy(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost);
00293 #endif
00294   
00295   //unpack odd data kernel
00296   link_format_gpu_to_cpu((void*)unpackedOdd, (void*)odd, volumeCB, stride, prec, streams[1]);
00297 #ifdef GPU_DIRECT
00298   cudaMemcpyAsync(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost, streams[1]);  
00299   for(int i=0; i<2; i++) cudaStreamSynchronize(streams[i]);
00300 #else
00301   cudaMemcpy(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost);  
00302 #endif
00303   
00304   cudaFree(unpacked);
00305   for(int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
00306 }
00307 
/*
  Copies a device momentum field back to the host: stages both parities
  in pinned buffers, then reorders them into the host momentum array.
*/
template <typename Float, typename Float2>
void 
storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd, 
                   int bytes, int V, int pad) 
{    
  const int parityBytes = bytes/2;
  const int Vh = V/2;

  // Pinned staging buffers, one per parity.
  Float2 *evenBuf, *oddBuf;
  cudaMallocHost(&evenBuf, parityBytes);
  cudaMallocHost(&oddBuf, parityBytes);

  cudaMemcpy(evenBuf, even, parityBytes, cudaMemcpyDeviceToHost);
  cudaMemcpy(oddBuf, odd, parityBytes, cudaMemcpyDeviceToHost);

  // Interleave both parities into the single host array.
  unpackMomField((Float*)mom, evenBuf, 0, Vh, pad);
  unpackMomField((Float*)mom, oddBuf, 1, Vh, pad);

  cudaFreeHost(evenBuf);
  cudaFreeHost(oddBuf);
}
00325 
/**
 * Copy this device field back into a CPU gauge field, either reordering
 * on the device (fast path with restricted parameters) or copying then
 * reordering on the host.  Dispatches on field type (via reconstruct),
 * device precision, host precision and reconstruction type.
 */
void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, const QudaFieldLocation &pack_location) const
{

  // do device-side reordering then copy
  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
    // check parameters are suitable for device-side packing
    if (precision != cpu.Precision())
      errorQuda("cpu precision %d and cuda precision %d must be the same", 
                cpu.Precision(), precision );

    if (reconstruct != QUDA_RECONSTRUCT_NO)
      errorQuda("Only no reconstruction supported");

    if (order != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Only QUDA_FLOAT2_GAUGE_ORDER supported");

    if (cpu.Order() != QUDA_MILC_GAUGE_ORDER)
      errorQuda("Only QUDA_MILC_GAUGE_ORDER supported");

    if (precision == QUDA_DOUBLE_PRECISION){
      storeGaugeField((double*)cpu.gauge, (double2*)gauge, bytes, volumeCB, stride, precision);
    } else if (precision == QUDA_SINGLE_PRECISION){
      storeGaugeField((float*)cpu.gauge, (float2*)gauge, bytes, volumeCB, stride, precision);
    } else {
      errorQuda("Half precision not supported");
    }

  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
    
    // FIXME - nasty globals: publish field parameters through the
    // file-scope globals consumed by the unpacking routines.
    anisotropy_ = anisotropy;
    fat_link_max_ = fat_link_max;
    X_ = x;
    t_boundary_ = t_boundary;
    
    if (reconstruct != QUDA_RECONSTRUCT_10) { // gauge field
      if (precision == QUDA_DOUBLE_PRECISION) {
        
        // double-precision device storage always uses double2 ordering
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          storeGaugeField((double*)cpu.gauge, (double2*)(gauge),
                          cpu.Order(), reconstruct, bytes, volumeCB, pad);
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          storeGaugeField((float*)cpu.gauge, (double2*)(gauge),
                          cpu.Order(), reconstruct, bytes, volumeCB, pad);
        } else {
          // previously an unsupported cpu precision fell through silently
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
        
      } else if (precision == QUDA_SINGLE_PRECISION) {
        
        // full links use float2 ordering, reconstructed links use float4
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            storeGaugeField((double*)cpu.gauge, (float2*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          } else {
            storeGaugeField((double*)cpu.gauge, (float4*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          }
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            storeGaugeField((float*)cpu.gauge, (float2*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          } else {
            storeGaugeField((float*)cpu.gauge, (float4*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          }
        } else {
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
        
      } else if (precision == QUDA_HALF_PRECISION) {
        
        // full links use short2 ordering, reconstructed links use short4
        if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            storeGaugeField((double*)cpu.gauge, (short2*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          } else {
            storeGaugeField((double*)cpu.gauge, (short4*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          }
        } else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
          if (reconstruct == QUDA_RECONSTRUCT_NO) {
            storeGaugeField((float*)cpu.gauge, (short2*)(gauge),
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          } else {
            storeGaugeField((float*)cpu.gauge, (short4*)(gauge), 
                            cpu.Order(), reconstruct, bytes, volumeCB, pad);
          }
        } else {
          errorQuda("cpu precision %d not supported", cpu.Precision());
        }
      } else {
        errorQuda("cuda precision %d not supported", precision);
      }
    } else { // QUDA_RECONSTRUCT_10: momentum field

      if (cpu.Precision() != precision)
        errorQuda("cpu and gpu precison has to be the same at this moment");
    
      if (precision == QUDA_HALF_PRECISION)
        errorQuda("half precision is not supported at this moment");
    
      if (cpu.Order() != QUDA_MILC_GAUGE_ORDER)
        errorQuda("Only MILC gauge order supported in momentum unpack, not %d", cpu.Order());

      if (precision == QUDA_DOUBLE_PRECISION) {
        storeMomToCPUArray( (double*)cpu.gauge, (double2*)even, (double2*)odd, bytes, volume, pad);     
      }else { //SINGLE PRECISIONS
        storeMomToCPUArray( (float*)cpu.gauge, (float2*)even, (float2*)odd, bytes, volume, pad);        
      }
    } // reconstruct 10
  } else {
    errorQuda("Invalid pack location %d", pack_location);
  }

}
00434 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines