// QUDA v0.4.0
// A library for QCD on GPUs
#include <gauge_field.h>
#include <face_quda.h>
#include <typeinfo>
#include <misc_helpers.h>

// Construct a device-resident gauge field.  A single contiguous device
// buffer holds both parities: even-parity links occupy the first half,
// odd-parity links the second half.  Only QUDA_NULL_FIELD_CREATE and
// QUDA_ZERO_FIELD_CREATE are supported.
cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
  GaugeField(param, QUDA_CUDA_FIELD_LOCATION), gauge(0), even(0), odd(0)
{
  if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE) {
    errorQuda("ERROR: create type(%d) not supported yet\n", create);
  }

  // Compare against cudaSuccess so that *any* allocation failure is caught,
  // not only cudaErrorMemoryAllocation (a prior sticky error would otherwise
  // slip through and leave gauge invalid).
  if (cudaMalloc((void **)&gauge, bytes) != cudaSuccess) {
    errorQuda("Error allocating gauge field");
  }

  if (create == QUDA_ZERO_FIELD_CREATE) {
    cudaMemset(gauge, 0, bytes);
  }

  even = gauge;
  odd = (char*)gauge + bytes/2;
}

cudaGaugeField::~cudaGaugeField() {
  if (gauge) cudaFree(gauge);
  checkCudaError();
}

// FIXME temporary hack: pack_gauge.h reads these file-scope globals, which
// are set by loadCPUField()/saveCPUField() before the pack routines run.
static double anisotropy_;
static double fat_link_max_;
static const int *X_;
static QudaTboundary t_boundary_;
#include <pack_gauge.h>

// Reorder a host gauge field into the device (even/odd) layout and copy it
// to the GPU in a single host-to-device transfer.
//
//   even, odd    - device pointers to the two parity halves
//   cpuGauge     - host field; interpreted as Float* or Float** per cpu_order
//   cpuGhost     - host ghost zones (QDP layout), used only under MULTI_GPU
//   bytes        - total device field size (both parities, including pad)
//   volumeCB     - checkerboard (single-parity) volume
//   fat_link_max - on output, the global maximum entry of an ASQTAD fat-link
//                  field; used later to encode half-precision fat links
template <typename Float, typename FloatN>
static void loadGaugeField(FloatN *even, FloatN *odd, Float *cpuGauge, Float **cpuGhost,
			   GaugeFieldOrder cpu_order, QudaReconstructType reconstruct,
			   size_t bytes, int volumeCB, int *surfaceCB, int pad, int nFace,
			   QudaLinkType type, double &fat_link_max) {
  // Stage through pinned host memory for the host-to-device transfer
  FloatN *packed, *packedEven, *packedOdd;
  if (cudaMallocHost(&packed, bytes) != cudaSuccess) {
    errorQuda("Error allocating pinned gauge-packing buffer");
  }
  packedEven = packed;
  packedOdd = (FloatN*)((char*)packed + bytes/2);

  if( ! packedEven ) errorQuda( "packedEven is borked\n");
  if( ! packedOdd ) errorQuda( "packedOdd is borked\n");
  if( ! even ) errorQuda( "even is borked\n");
  if( ! odd ) errorQuda( "odd is borked\n");
  if( ! cpuGauge ) errorQuda( "cpuGauge is borked\n");

#ifdef MULTI_GPU
  if (cpu_order != QUDA_QDP_GAUGE_ORDER)
    errorQuda("Only QUDA_QDP_GAUGE_ORDER is supported for multi-gpu\n");
#endif

  // For QUDA_ASQTAD_FAT_LINKS we need the maximum entry of the field;
  // fat_link_max will be used when encoding half-precision fat links.
  if (type == QUDA_ASQTAD_FAT_LINKS) {
    fat_link_max = 0.0;
    Float **links = (Float**)cpuGauge;  // QDP order: one array per direction
    for (int dir = 0; dir < 4; dir++) {
      for (int i = 0; i < 2*volumeCB*gaugeSiteSize; i++) {
	if (links[dir][i] > fat_link_max) {
	  fat_link_max = links[dir][i];
	}
      }
    }
  }

  // Reduce the maximum over all nodes in the multi-GPU case
  double fat_link_max_double = fat_link_max;
#ifdef MULTI_GPU
  reduceMaxDouble(fat_link_max_double);
#endif
  fat_link_max = fat_link_max_double;

  int voxels[] = {volumeCB, volumeCB, volumeCB, volumeCB};

  // FIXME - hack for the moment (pack_gauge.h reads the file-scope global)
  fat_link_max_ = fat_link_max;

  int nFaceLocal = 1;
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    packQDPGaugeField(packedEven, (Float**)cpuGauge, 0, reconstruct, volumeCB,
		      voxels, pad, 0, nFaceLocal, type);
    packQDPGaugeField(packedOdd, (Float**)cpuGauge, 1, reconstruct, volumeCB,
		      voxels, pad, 0, nFaceLocal, type);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    packCPSGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
    packCPSGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
    packMILCGaugeField(packedEven, (Float*)cpuGauge, 0, reconstruct, volumeCB, pad);
    packMILCGaugeField(packedOdd, (Float*)cpuGauge, 1, reconstruct, volumeCB, pad);
  } else {
    errorQuda("Invalid gauge_order %d", cpu_order);
  }

#ifdef MULTI_GPU
  if (type != QUDA_ASQTAD_MOM_LINKS) {
    // pack the ghost zones into the padded regions
    packQDPGaugeField(packedEven, (Float**)cpuGhost, 0, reconstruct, volumeCB,
		      surfaceCB, pad, volumeCB, nFace, type);
    packQDPGaugeField(packedOdd, (Float**)cpuGhost, 1, reconstruct, volumeCB,
		      surfaceCB, pad, volumeCB, nFace, type);
  }
#endif

  // Both parities are contiguous in the staging buffer; one copy suffices
  cudaMemcpy(even, packed, bytes, cudaMemcpyHostToDevice);
  checkCudaError();

  cudaFreeHost(packed);
}

// Reorder a host momentum field into device layout and copy each parity
// half to the GPU through pinned staging buffers.
template <typename Float, typename Float2>
void loadMomField(Float2 *even, Float2 *odd, Float *mom, int bytes, int Vh, int pad)
{
  Float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);

  packMomField(packedEven, (Float*)mom, 0, Vh, pad);
  packMomField(packedOdd, (Float*)mom, 1, Vh, pad);

  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

// Copy a CPU gauge (or momentum) field into this device field, dispatching
// on device precision, host precision, and reconstruction type.  Only
// host-side packing (QUDA_CPU_FIELD_LOCATION) is currently implemented.
void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, const QudaFieldLocation &pack_location)
{
  checkField(cpu);

  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
    errorQuda("Not implemented"); // awaiting Guochun's new gauge packing
  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) {
    // FIXME - set the file-scope globals consumed by pack_gauge.h
    anisotropy_ = anisotropy;
    X_ = x;
    t_boundary_ = t_boundary;

#ifdef MULTI_GPU
    // FIXME: if this is a MOM field, we don't need to exchange data
    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
      cpu.exchangeGhost();
    }
#endif

    if (reconstruct != QUDA_RECONSTRUCT_10) { // gauge field
      if (precision == QUDA_DOUBLE_PRECISION) {

	// double-precision device storage is always double2
	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  loadGaugeField((double2*)(even), (double2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
			 cpu.Order(), reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  loadGaugeField((double2*)(even), (double2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
			 cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	}

      } else if (precision == QUDA_SINGLE_PRECISION) {

	// single precision: float2 when unreconstructed, float4 otherwise
	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    loadGaugeField((float2*)(even), (float2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  } else {
	    loadGaugeField((float4*)(even), (float4*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  }
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    loadGaugeField((float2*)(even), (float2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  } else {
	    loadGaugeField((float4*)(even), (float4*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  }
	}

      } else if (precision == QUDA_HALF_PRECISION) {

	// half precision: short2 when unreconstructed, short4 otherwise
	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    loadGaugeField((short2*)(even), (short2*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  } else {
	    loadGaugeField((short4*)(even), (short4*)(odd), (double*)cpu.gauge, (double**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  }
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    loadGaugeField((short2*)(even), (short2*)(odd), (float*)cpu.gauge, (float**)cpu.ghost,
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  } else {
	    loadGaugeField((short4*)(even), (short4*)(odd), (float*)cpu.gauge, (float**)(cpu.ghost),
			   cpu.order, reconstruct, bytes, volumeCB, surfaceCB, pad, nFace, link_type, fat_link_max);
	  }
	}
      }
    } else { // momentum field (QUDA_RECONSTRUCT_10)
      if (precision == QUDA_DOUBLE_PRECISION) {
	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  loadMomField((double2*)(even), (double2*)(odd), (double*)cpu.gauge, bytes, volumeCB, pad);
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  loadMomField((double2*)(even), (double2*)(odd), (float*)cpu.gauge, bytes, volumeCB, pad);
	}
      } else {
	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  loadMomField((float2*)(even), (float2*)(odd), (double*)cpu.gauge, bytes, volumeCB, pad);
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  loadMomField((float2*)(even), (float2*)(odd), (float*)cpu.gauge, bytes, volumeCB, pad);
	}
      }
    } // gauge or momentum
  } else {
    errorQuda("Invalid pack location %d", pack_location);
  }
}

/*
  Generic gauge field retrieval.  Copies the cuda gauge field into a
  cpu gauge field.  The reordering takes place on the host and
  supports most gauge field options.
*/
template <typename Float, typename FloatN>
static void storeGaugeField(Float *cpuGauge, FloatN *gauge, GaugeFieldOrder cpu_order,
			    QudaReconstructType reconstruct, int bytes, int volumeCB, int pad) {

  // Stage the whole device field through pinned host memory, then unpack
  // each parity half into the requested host order
  FloatN *packed;
  cudaMallocHost(&packed, bytes);
  cudaMemcpy(packed, gauge, bytes, cudaMemcpyDeviceToHost);

  FloatN *packedEven = packed;
  FloatN *packedOdd = (FloatN*)((char*)packed + bytes/2);

  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
    unpackMILCGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackMILCGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else {
    errorQuda("Invalid gauge_order");
  }

  cudaFreeHost(packed);
}

/*
  Copies the device gauge field to the host.
  - no reconstruction support
  - device data is always Float2 ordered
  - host data is a 1-dimensional array (MILC ordered)
  - no support for half precision
  - input and output precisions must match
*/
template<typename FloatN, typename Float>
static void storeGaugeField(Float* cpuGauge, FloatN *gauge, int bytes, int volumeCB,
			    int stride, QudaPrecision prec)
{
  // One stream per parity so the odd-half unpack can overlap the even-half
  // device-to-host copy (when GPU_DIRECT async copies are enabled)
  cudaStream_t streams[2];
  for (int i=0; i<2; i++) cudaStreamCreate(&streams[i]);

  FloatN *even = gauge;
  FloatN *odd = (FloatN*)((char*)gauge + bytes/2);

  int datalen = 4*2*volumeCB*gaugeSiteSize*sizeof(Float); // both parities
  void *unpacked;
  if (cudaMalloc(&unpacked, datalen) != cudaSuccess) {
    errorQuda("cudaMalloc() failed for unpacked\n");
  }
  void *unpackedEven = unpacked;
  void *unpackedOdd = (char*)unpacked + datalen/2;

  // unpack even data kernel
  link_format_gpu_to_cpu((void*)unpackedEven, (void*)even, volumeCB, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost, streams[0]);
#else
  // blocking copy; legacy default-stream semantics order it after the kernel
  cudaMemcpy(cpuGauge, unpackedEven, datalen/2, cudaMemcpyDeviceToHost);
#endif

  // unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpackedOdd, (void*)odd, volumeCB, stride, prec, streams[1]);
#ifdef GPU_DIRECT
  cudaMemcpyAsync(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost, streams[1]);
  for (int i=0; i<2; i++) cudaStreamSynchronize(streams[i]);
#else
  cudaMemcpy(cpuGauge + 4*volumeCB*gaugeSiteSize, unpackedOdd, datalen/2, cudaMemcpyDeviceToHost);
#endif

  cudaFree(unpacked);
  for (int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
}

// Copy a device momentum field back to a host MILC-ordered array, unpacking
// each parity half through a pinned staging buffer.
template <typename Float, typename Float2>
void
storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd,
		   int bytes, int V, int pad)
{
  Float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);
  cudaMemcpy(packedEven, even, bytes/2, cudaMemcpyDeviceToHost);
  cudaMemcpy(packedOdd, odd, bytes/2, cudaMemcpyDeviceToHost);

  unpackMomField((Float*)mom, packedEven, 0, V/2, pad);
  unpackMomField((Float*)mom, packedOdd, 1, V/2, pad);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}

// Copy this device field out to a CPU gauge (or momentum) field.  Packing
// can run on the device (fast path, restricted parameter set) or on the
// host (general path), dispatched on precisions and reconstruction type.
void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, const QudaFieldLocation &pack_location) const
{
  // do device-side reordering then copy
  if (pack_location == QUDA_CUDA_FIELD_LOCATION) {
    // check parameters are suitable for device-side packing
    if (precision != cpu.Precision())
      errorQuda("cpu precision %d and cuda precision %d must be the same",
		cpu.Precision(), precision );

    if (reconstruct != QUDA_RECONSTRUCT_NO)
      errorQuda("Only no reconstruction supported");

    if (order != QUDA_FLOAT2_GAUGE_ORDER)
      errorQuda("Only QUDA_FLOAT2_GAUGE_ORDER supported");

    if (cpu.Order() != QUDA_MILC_GAUGE_ORDER)
      errorQuda("Only QUDA_MILC_GAUGE_ORDER supported");

    if (precision == QUDA_DOUBLE_PRECISION) {
      storeGaugeField((double*)cpu.gauge, (double2*)gauge, bytes, volumeCB, stride, precision);
    } else if (precision == QUDA_SINGLE_PRECISION) {
      storeGaugeField((float*)cpu.gauge, (float2*)gauge, bytes, volumeCB, stride, precision);
    } else {
      errorQuda("Half precision not supported");
    }

  } else if (pack_location == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder

    // FIXME - nasty globals (consumed by pack_gauge.h)
    anisotropy_ = anisotropy;
    fat_link_max_ = fat_link_max;
    X_ = x;
    t_boundary_ = t_boundary;

    if (reconstruct != QUDA_RECONSTRUCT_10) { // gauge field
      if (precision == QUDA_DOUBLE_PRECISION) {

	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  storeGaugeField((double*)cpu.gauge, (double2*)(gauge),
			  cpu.order, reconstruct, bytes, volumeCB, pad);
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  storeGaugeField((float*)cpu.gauge, (double2*)(gauge),
			  cpu.order, reconstruct, bytes, volumeCB, pad);
	}

      } else if (precision == QUDA_SINGLE_PRECISION) {

	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    storeGaugeField((double*)cpu.gauge, (float2*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  } else {
	    storeGaugeField((double*)cpu.gauge, (float4*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  }
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    storeGaugeField((float*)cpu.gauge, (float2*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  } else {
	    storeGaugeField((float*)cpu.gauge, (float4*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  }
	}

      } else if (precision == QUDA_HALF_PRECISION) {

	if (cpu.Precision() == QUDA_DOUBLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    storeGaugeField((double*)cpu.gauge, (short2*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  } else {
	    storeGaugeField((double*)cpu.gauge, (short4*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  }
	} else if (cpu.Precision() == QUDA_SINGLE_PRECISION) {
	  if (reconstruct == QUDA_RECONSTRUCT_NO) {
	    storeGaugeField((float*)cpu.gauge, (short2*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  } else {
	    storeGaugeField((float*)cpu.gauge, (short4*)(gauge),
			    cpu.order, reconstruct, bytes, volumeCB, pad);
	  }
	}
      }
    } else { // momentum field (QUDA_RECONSTRUCT_10)

      if (cpu.Precision() != precision)
	errorQuda("cpu and gpu precison has to be the same at this moment");

      if (precision == QUDA_HALF_PRECISION)
	errorQuda("half precision is not supported at this moment");

      if (cpu.order != QUDA_MILC_GAUGE_ORDER)
	errorQuda("Only MILC gauge order supported in momentum unpack, not %d", cpu.order);

      if (precision == QUDA_DOUBLE_PRECISION) {
	storeMomToCPUArray( (double*)cpu.gauge, (double2*)even, (double2*)odd, bytes, volume, pad);
      } else { //SINGLE PRECISIONS
	storeMomToCPUArray( (float*)cpu.gauge, (float2*)even, (float2*)odd, bytes, volume, pad);
      }
    } // reconstruct 10
  } else {
    errorQuda("Invalid pack location %d", pack_location);
  }
}