QUDA v0.4.0
A library for QCD on GPUs
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 #include <math.h> 00005 #include <string.h> 00006 #include <sys/time.h> 00007 00008 #include <quda.h> 00009 #include <quda_internal.h> 00010 #include <comm_quda.h> 00011 #include <tune_quda.h> 00012 #include <blas_quda.h> 00013 #include <gauge_field.h> 00014 #include <dirac_quda.h> 00015 #include <dslash_quda.h> 00016 #include <invert_quda.h> 00017 #include <color_spinor_field.h> 00018 #include <clover_field.h> 00019 #include <llfat_quda.h> 00020 #include <fat_force_quda.h> 00021 #include <hisq_links_quda.h> 00022 00023 #ifdef QUDA_NUMA_SUPPORT 00024 #include <numa_affinity.h> 00025 #endif 00026 00027 #include <cuda.h> 00028 #ifdef MULTI_GPU 00029 #ifdef MPI_COMMS 00030 #include <mpi.h> 00031 #endif 00032 00033 #ifdef QMP_COMMS 00034 #include <qmp.h> 00035 #endif 00036 #endif 00037 00038 #ifdef GPU_GAUGE_FORCE 00039 #include <gauge_force_quda.h> 00040 #endif 00041 00042 #define MAX(a,b) ((a)>(b)? (a):(b)) 00043 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 00044 00045 #define spinorSiteSize 24 // real numbers per spinor 00046 00047 #define MAX_GPU_NUM_PER_NODE 16 00048 00049 // define newQudaGaugeParam() and newQudaInvertParam() 00050 #define INIT_PARAM 00051 #include "check_params.h" 00052 #undef INIT_PARAM 00053 00054 // define (static) checkGaugeParam() and checkInvertParam() 00055 #define CHECK_PARAM 00056 #include "check_params.h" 00057 #undef CHECK_PARAM 00058 00059 // define printQudaGaugeParam() and printQudaInvertParam() 00060 #define PRINT_PARAM 00061 #include "check_params.h" 00062 #undef PRINT_PARAM 00063 00064 #ifdef QMP_COMMS 00065 int rank_QMP; 00066 int num_QMP; 00067 extern bool qudaPt0; 00068 extern bool qudaPtNm1; 00069 #endif 00070 00071 #include "face_quda.h" 00072 00073 static QudaVerbosity verbosity; 00074 int numa_affinity_enabled = 1; 00075 00076 cudaGaugeField *gaugePrecise = NULL; 00077 cudaGaugeField *gaugeSloppy = NULL; 00078 cudaGaugeField *gaugePrecondition = NULL; 00079 00080 cudaGaugeField *gaugeFatPrecise = NULL; 00081 cudaGaugeField *gaugeFatSloppy = NULL; 00082 cudaGaugeField *gaugeFatPrecondition = NULL; 00083 00084 cudaGaugeField *gaugeLongPrecise = NULL; 00085 cudaGaugeField *gaugeLongSloppy = NULL; 00086 cudaGaugeField *gaugeLongPrecondition = NULL; 00087 00088 cudaCloverField *cloverPrecise = NULL; 00089 cudaCloverField *cloverSloppy = NULL; 00090 cudaCloverField *cloverPrecondition = NULL; 00091 00092 cudaDeviceProp deviceProp; 00093 cudaStream_t *streams; 00094 00095 int getGpuCount() 00096 { 00097 int count; 00098 cudaGetDeviceCount(&count); 00099 if (count <= 0){ 00100 errorQuda("No devices supporting CUDA"); 00101 } 00102 if(count > MAX_GPU_NUM_PER_NODE){ 00103 errorQuda("gpu count(%d) is larger than limit\n", count); 00104 } 00105 return count; 00106 } 00107 00108 void initQuda(int dev) 00109 { 00110 static int initialized = 0; 00111 if (initialized) { 00112 return; 00113 } 00114 initialized = 1; 00115 00116 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000) 00117 //check if CUDA_NIC_INTEROP is set to 1 in the enviroment 00118 // not needed for CUDA >= 4.1 00119 char* cni_str = getenv("CUDA_NIC_INTEROP"); 00120 if(cni_str == NULL){ 00121 errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n"); 00122 } 00123 int cni_int = atoi(cni_str); 00124 if (cni_int != 1){ 00125 errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n"); 00126 } 00127 #endif 00128 00129 int deviceCount; 00130 
cudaGetDeviceCount(&deviceCount); 00131 if (deviceCount == 0) { 00132 errorQuda("No devices supporting CUDA"); 00133 } 00134 00135 for(int i=0; i<deviceCount; i++) { 00136 cudaDeviceProp deviceProp; 00137 cudaGetDeviceProperties(&deviceProp, i); 00138 printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name); 00139 } 00140 00141 #ifdef QMP_COMMS 00142 int ndim; 00143 const int *dim; 00144 00145 if ( QMP_is_initialized() != QMP_TRUE ) { 00146 errorQuda("QMP is not initialized"); 00147 } 00148 num_QMP=QMP_get_number_of_nodes(); 00149 rank_QMP=QMP_get_node_number(); 00150 00151 if (dev < 0) { 00152 dev = rank_QMP % deviceCount; 00153 } 00154 ndim = QMP_get_logical_number_of_dimensions(); 00155 dim = QMP_get_logical_dimensions(); 00156 #elif defined(MPI_COMMS) 00157 comm_init(); 00158 if (dev < 0) { 00159 dev=comm_gpuid(); 00160 } 00161 #else 00162 if (dev < 0) errorQuda("Invalid device number"); 00163 #endif 00164 00165 // Used for applying the gauge field boundary condition 00166 if( commCoords(3) == 0 ) qudaPt0=true; 00167 else qudaPt0=false; 00168 00169 if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true; 00170 else qudaPtNm1=false; 00171 00172 cudaGetDeviceProperties(&deviceProp, dev); 00173 if (deviceProp.major < 1) { 00174 errorQuda("Device %d does not support CUDA", dev); 00175 } 00176 00177 printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name); 00178 00179 cudaSetDevice(dev); 00180 00181 #ifdef QUDA_NUMA_SUPPORT 00182 if(numa_affinity_enabled){ 00183 setNumaAffinity(dev); 00184 } 00185 #endif 00186 // if the device supports host-mapped memory, then enable this 00187 if(deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost); 00188 00189 initCache(); 00190 //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); 00191 cudaGetDeviceProperties(&deviceProp, dev); 00192 00193 streams = new cudaStream_t[Nstream]; 00194 for (int i=0; i<Nstream; i++) { 00195 cudaStreamCreate(&streams[i]); 00196 } 00197 00198 quda::initBlas(); 00199 00200 loadTuneCache(QUDA_VERBOSE); 00201 } 00202 00203 00204 00205 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) 00206 { 00207 checkGaugeParam(param); 00208 00209 // Set the specific cpu parameters and create the cpu gauge field 00210 GaugeFieldParam gauge_param(h_gauge, *param); 00211 00212 cpuGaugeField cpu(gauge_param); 00213 00214 // switch the parameters for creating the mirror precise cuda gauge field 00215 gauge_param.create = QUDA_NULL_FIELD_CREATE; 00216 gauge_param.precision = param->cuda_prec; 00217 gauge_param.reconstruct = param->reconstruct; 00218 gauge_param.pad = param->ga_pad; 00219 gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION || 00220 gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ? 00221 QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER; 00222 00223 cudaGaugeField *precise = new cudaGaugeField(gauge_param); 00224 precise->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION); 00225 00226 param->gaugeGiB += precise->GBytes(); 00227 00228 // switch the parameters for creating the mirror sloppy cuda gauge field 00229 gauge_param.precision = param->cuda_prec_sloppy; 00230 gauge_param.reconstruct = param->reconstruct_sloppy; 00231 gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION || 00232 gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ? 
00233 QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER; 00234 cudaGaugeField *sloppy = NULL; 00235 if (param->cuda_prec != param->cuda_prec_sloppy) { 00236 sloppy = new cudaGaugeField(gauge_param); 00237 sloppy->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION); 00238 param->gaugeGiB += sloppy->GBytes(); 00239 } else { 00240 sloppy = precise; 00241 } 00242 00243 // switch the parameters for creating the mirror preconditioner cuda gauge field 00244 gauge_param.precision = param->cuda_prec_precondition; 00245 gauge_param.reconstruct = param->reconstruct_precondition; 00246 gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION || 00247 gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ? 00248 QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER; 00249 cudaGaugeField *precondition = NULL; 00250 if (param->cuda_prec_sloppy != param->cuda_prec_precondition) { 00251 precondition = new cudaGaugeField(gauge_param); 00252 precondition->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION); 00253 param->gaugeGiB += precondition->GBytes(); 00254 } else { 00255 precondition = sloppy; 00256 } 00257 00258 switch (param->type) { 00259 case QUDA_WILSON_LINKS: 00260 if (gaugePrecise) errorQuda("Precise gauge field already allocated"); 00261 gaugePrecise = precise; 00262 if (gaugeSloppy) errorQuda("Sloppy gauge field already allocated"); 00263 gaugeSloppy = sloppy; 00264 if (gaugePrecondition) errorQuda("Precondition gauge field already allocated"); 00265 gaugePrecondition = precondition; 00266 break; 00267 case QUDA_ASQTAD_FAT_LINKS: 00268 if (gaugeFatPrecise) errorQuda("Precise gauge fat field already allocated"); 00269 gaugeFatPrecise = precise; 00270 if (gaugeFatSloppy) errorQuda("Sloppy gauge fat field already allocated"); 00271 gaugeFatSloppy = sloppy; 00272 if (gaugeFatPrecondition) errorQuda("Precondition gauge fat field already allocated"); 00273 gaugeFatPrecondition = precondition; 00274 break; 00275 case QUDA_ASQTAD_LONG_LINKS: 00276 if (gaugeLongPrecise) errorQuda("Precise gauge long field already allocated"); 00277 gaugeLongPrecise = precise; 00278 if (gaugeLongSloppy) errorQuda("Sloppy gauge long field already allocated"); 00279 gaugeLongSloppy = sloppy; 00280 if (gaugeLongPrecondition) errorQuda("Precondition gauge long field already allocated"); 00281 gaugeLongPrecondition = precondition; 00282 break; 00283 default: 00284 errorQuda("Invalid gauge type"); 00285 } 00286 00287 } 00288 00289 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) 00290 { 00291 checkGaugeParam(param); 00292 00293 // Set the specific cpu parameters and create the cpu gauge field 00294 GaugeFieldParam gauge_param(h_gauge, *param); 00295 cpuGaugeField cpuGauge(gauge_param); 00296 cudaGaugeField *cudaGauge = NULL; 00297 switch (param->type) { 00298 case QUDA_WILSON_LINKS: 00299 cudaGauge = gaugePrecise; 00300 break; 00301 case QUDA_ASQTAD_FAT_LINKS: 00302 cudaGauge = gaugeFatPrecise; 00303 break; 00304 case QUDA_ASQTAD_LONG_LINKS: 00305 cudaGauge = gaugeLongPrecise; 00306 break; 00307 default: 00308 errorQuda("Invalid gauge type"); 00309 } 00310 00311 cudaGauge->saveCPUField(cpuGauge, QUDA_CPU_FIELD_LOCATION); 00312 } 00313 00314 00315 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) 00316 { 00317 00318 if (!h_clover && !h_clovinv) { 00319 errorQuda("loadCloverQuda() called with neither clover term nor inverse"); 00320 } 00321 if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) { 00322 errorQuda("Half precision not supported on CPU"); 00323 } 00324 if (gaugePrecise == NULL) { 00325 
errorQuda("Gauge field must be loaded before clover"); 00326 } 00327 if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) { 00328 errorQuda("Wrong dslash_type in loadCloverQuda()"); 00329 } 00330 00331 // determines whether operator is preconditioned when calling invertQuda() 00332 bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE || 00333 inv_param->solve_type == QUDA_NORMEQ_PC_SOLVE); 00334 00335 // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda() 00336 bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION || 00337 inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00338 00339 bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || 00340 inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); 00341 00342 // We issue a warning only when it seems likely that the user is screwing up: 00343 00344 // inverted clover term is required when applying preconditioned operator 00345 if (!h_clovinv && pc_solve && pc_solution) { 00346 warningQuda("Inverted clover term not loaded"); 00347 } 00348 00349 // uninverted clover term is required when applying unpreconditioned operator, 00350 // but note that dslashQuda() is always preconditioned 00351 if (!h_clover && !pc_solve && !pc_solution) { 00352 //warningQuda("Uninverted clover term not loaded"); 00353 } 00354 00355 // uninverted clover term is also required for "asymmetric" preconditioning 00356 if (!h_clover && pc_solve && pc_solution && asymmetric) { 00357 warningQuda("Uninverted clover term not loaded"); 00358 } 00359 00360 CloverFieldParam clover_param; 00361 clover_param.nDim = 4; 00362 for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; 00363 clover_param.precision = inv_param->clover_cuda_prec; 00364 clover_param.pad = inv_param->cl_pad; 00365 00366 cloverPrecise = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 00367 inv_param->clover_order, clover_param); 00368 inv_param->cloverGiB = cloverPrecise->GBytes(); 00369 00370 // create the mirror sloppy clover field 00371 if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) { 00372 clover_param.precision = inv_param->clover_cuda_prec_sloppy; 00373 cloverSloppy = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 00374 inv_param->clover_order, clover_param); 00375 inv_param->cloverGiB += cloverSloppy->GBytes(); 00376 } else { 00377 cloverSloppy = cloverPrecise; 00378 } 00379 00380 // create the mirror preconditioner clover field 00381 if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition && 00382 inv_param->clover_cuda_prec_precondition != QUDA_INVALID_PRECISION) { 00383 clover_param.precision = inv_param->clover_cuda_prec_precondition; 00384 cloverPrecondition = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 00385 inv_param->clover_order, clover_param); 00386 inv_param->cloverGiB += cloverPrecondition->GBytes(); 00387 } else { 00388 cloverPrecondition = cloverSloppy; 00389 } 00390 00391 } 00392 00393 void freeGaugeQuda(void) 00394 { 00395 if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition; 00396 if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy; 00397 if (gaugePrecise) delete gaugePrecise; 00398 00399 gaugePrecondition = NULL; 00400 gaugeSloppy = NULL; 00401 gaugePrecise = NULL; 00402 00403 if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition; 00404 if (gaugeLongPrecise != 
gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy; 00405 if (gaugeLongPrecise) delete gaugeLongPrecise; 00406 00407 gaugeLongPrecondition = NULL; 00408 gaugeLongSloppy = NULL; 00409 gaugeLongPrecise = NULL; 00410 00411 if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition; 00412 if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy; 00413 if (gaugeFatPrecise) delete gaugeFatPrecise; 00414 00415 gaugeFatPrecondition = NULL; 00416 gaugeFatSloppy = NULL; 00417 gaugeFatPrecise = NULL; 00418 } 00419 00420 void freeCloverQuda(void) 00421 { 00422 if (cloverPrecondition != cloverSloppy && cloverPrecondition) delete cloverPrecondition; 00423 if (cloverSloppy != cloverPrecise && cloverSloppy) delete cloverSloppy; 00424 if (cloverPrecise) delete cloverPrecise; 00425 00426 cloverPrecondition = NULL; 00427 cloverSloppy = NULL; 00428 cloverPrecise = NULL; 00429 } 00430 00431 void endQuda(void) 00432 { 00433 cudaColorSpinorField::freeBuffer(); 00434 cudaColorSpinorField::freeGhostBuffer(); 00435 cpuColorSpinorField::freeGhostBuffer(); 00436 freeGaugeQuda(); 00437 freeCloverQuda(); 00438 00439 quda::endBlas(); 00440 00441 if (streams) { 00442 for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]); 00443 delete []streams; 00444 streams = NULL; 00445 } 00446 00447 saveTuneCache(QUDA_VERBOSE); 00448 } 00449 00450 00451 void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) 00452 { 00453 double kappa = inv_param->kappa; 00454 if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { 00455 kappa *= gaugePrecise->Anisotropy(); 00456 } 00457 00458 switch (inv_param->dslash_type) { 00459 case QUDA_WILSON_DSLASH: 00460 diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC; 00461 break; 00462 case QUDA_CLOVER_WILSON_DSLASH: 00463 diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC; 00464 break; 00465 case QUDA_DOMAIN_WALL_DSLASH: 00466 diracParam.type = pc ? QUDA_DOMAIN_WALLPC_DIRAC : QUDA_DOMAIN_WALL_DIRAC; 00467 break; 00468 case QUDA_ASQTAD_DSLASH: 00469 diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC; 00470 break; 00471 case QUDA_TWISTED_MASS_DSLASH: 00472 diracParam.type = pc ? 
QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC; 00473 break; 00474 default: 00475 errorQuda("Unsupported dslash_type"); 00476 } 00477 00478 diracParam.matpcType = inv_param->matpc_type; 00479 diracParam.dagger = inv_param->dagger; 00480 diracParam.gauge = gaugePrecise; 00481 diracParam.fatGauge = gaugeFatPrecise; 00482 diracParam.longGauge = gaugeLongPrecise; 00483 diracParam.clover = cloverPrecise; 00484 diracParam.kappa = kappa; 00485 diracParam.mass = inv_param->mass; 00486 diracParam.m5 = inv_param->m5; 00487 diracParam.mu = inv_param->mu; 00488 diracParam.verbose = inv_param->verbosity; 00489 00490 for (int i=0; i<4; i++) { 00491 diracParam.commDim[i] = 1; // comms are always on 00492 } 00493 } 00494 00495 00496 void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) 00497 { 00498 setDiracParam(diracParam, inv_param, pc); 00499 00500 diracParam.gauge = gaugeSloppy; 00501 diracParam.fatGauge = gaugeFatSloppy; 00502 diracParam.longGauge = gaugeLongSloppy; 00503 diracParam.clover = cloverSloppy; 00504 00505 for (int i=0; i<4; i++) { 00506 diracParam.commDim[i] = 1; // comms are always on 00507 } 00508 00509 } 00510 00511 // The preconditioner currently mimicks the sloppy operator with no comms 00512 void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) 00513 { 00514 setDiracParam(diracParam, inv_param, pc); 00515 00516 diracParam.gauge = gaugePrecondition; 00517 diracParam.fatGauge = gaugeFatPrecondition; 00518 diracParam.longGauge = gaugeLongPrecondition; 00519 diracParam.clover = cloverPrecondition; 00520 00521 for (int i=0; i<4; i++) { 00522 diracParam.commDim[i] = 0; // comms are always off 00523 } 00524 00525 } 00526 00527 00528 static void massRescale(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type, 00529 QudaMassNormalization mass_normalization, cudaColorSpinorField &b) 00530 { 00531 if (verbosity >= QUDA_VERBOSE) { 00532 printfQuda("Mass rescale: Kappa is: %f\n", kappa); 00533 printfQuda("Mass rescale: mass normalization: %d\n", mass_normalization); 00534 double nin = norm2(b); 00535 printfQuda("Mass rescale: norm of source in = %f\n", nin); 00536 } 00537 00538 if (dslash_type == QUDA_ASQTAD_DSLASH) { 00539 if (mass_normalization != QUDA_MASS_NORMALIZATION) { 00540 errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION"); 00541 } 00542 return; 00543 } 00544 00545 // multiply the source to compensate for normalization of the Dirac operator, if necessary 00546 switch (solution_type) { 00547 case QUDA_MAT_SOLUTION: 00548 if (mass_normalization == QUDA_MASS_NORMALIZATION || 00549 mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00550 axCuda(2.0*kappa, b); 00551 } 00552 break; 00553 case QUDA_MATDAG_MAT_SOLUTION: 00554 if (mass_normalization == QUDA_MASS_NORMALIZATION || 00555 mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00556 axCuda(4.0*kappa*kappa, b); 00557 } 00558 break; 00559 case QUDA_MATPC_SOLUTION: 00560 if (mass_normalization == QUDA_MASS_NORMALIZATION) { 00561 axCuda(4.0*kappa*kappa, b); 00562 } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00563 axCuda(2.0*kappa, b); 00564 } 00565 break; 00566 case QUDA_MATPCDAG_MATPC_SOLUTION: 00567 if (mass_normalization == QUDA_MASS_NORMALIZATION) { 00568 axCuda(16.0*pow(kappa,4), b); 00569 } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00570 axCuda(4.0*kappa*kappa, b); 00571 } 00572 break; 00573 default: 00574 errorQuda("Solution type %d not supported", 
solution_type); 00575 } 00576 00577 if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n"); 00578 if (verbosity >= QUDA_VERBOSE) { 00579 printfQuda("Mass rescale: Kappa is: %f\n", kappa); 00580 printfQuda("Mass rescale: mass normalization: %d\n", mass_normalization); 00581 double nin = norm2(b); 00582 printfQuda("Mass rescale: norm of source out = %f\n", nin); 00583 } 00584 00585 } 00586 00587 static void massRescaleCoeff(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type, 00588 QudaMassNormalization mass_normalization, double &coeff) 00589 { 00590 if (dslash_type == QUDA_ASQTAD_DSLASH) { 00591 if (mass_normalization != QUDA_MASS_NORMALIZATION) { 00592 errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION"); 00593 } 00594 return; 00595 } 00596 00597 // multiply the source to compensate for normalization of the Dirac operator, if necessary 00598 switch (solution_type) { 00599 case QUDA_MAT_SOLUTION: 00600 if (mass_normalization == QUDA_MASS_NORMALIZATION || 00601 mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00602 coeff *= 2.0*kappa; 00603 } 00604 break; 00605 case QUDA_MATDAG_MAT_SOLUTION: 00606 if (mass_normalization == QUDA_MASS_NORMALIZATION || 00607 mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00608 coeff *= 4.0*kappa*kappa; 00609 } 00610 break; 00611 case QUDA_MATPC_SOLUTION: 00612 if (mass_normalization == QUDA_MASS_NORMALIZATION) { 00613 coeff *= 4.0*kappa*kappa; 00614 } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00615 coeff *= 2.0*kappa; 00616 } 00617 break; 00618 case QUDA_MATPCDAG_MATPC_SOLUTION: 00619 if (mass_normalization == QUDA_MASS_NORMALIZATION) { 00620 coeff*=16.0*pow(kappa,4); 00621 } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00622 coeff*=4.0*kappa*kappa; 00623 } 00624 break; 00625 default: 00626 errorQuda("Solution type %d not supported", solution_type); 00627 } 00628 00629 if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n"); 00630 } 00631 00632 00633 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) 00634 { 00635 ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1); 00636 ColorSpinorParam cudaParam(cpuParam, *inv_param); 00637 00638 cpuColorSpinorField hIn(cpuParam); 00639 00640 cudaColorSpinorField in(hIn, cudaParam); 00641 00642 cudaParam.create = QUDA_NULL_FIELD_CREATE; 00643 cudaColorSpinorField out(in, cudaParam); 00644 00645 if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { 00646 if (parity == QUDA_EVEN_PARITY) { 00647 parity = QUDA_ODD_PARITY; 00648 } else { 00649 parity = QUDA_EVEN_PARITY; 00650 } 00651 axCuda(gaugePrecise->Anisotropy(), in); 00652 } 00653 bool pc = true; 00654 00655 DiracParam diracParam; 00656 setDiracParam(diracParam, inv_param, pc); 00657 00658 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00659 dirac->Dslash(out, in, parity); // apply the operator 00660 delete dirac; // clean up 00661 00662 cpuParam.v = h_out; 00663 cpuColorSpinorField hOut(cpuParam); 00664 out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out; 00665 } 00666 00667 00668 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) 00669 { 00670 bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || 00671 inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00672 00673 ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc); 00674 00675 ColorSpinorParam cudaParam(cpuParam, 
*inv_param); 00676 00677 cpuColorSpinorField hIn(cpuParam); 00678 cudaColorSpinorField in(hIn, cudaParam); 00679 cudaParam.create = QUDA_NULL_FIELD_CREATE; 00680 cudaColorSpinorField out(in, cudaParam); 00681 00682 DiracParam diracParam; 00683 setDiracParam(diracParam, inv_param, pc); 00684 00685 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00686 dirac->M(out, in); // apply the operator 00687 delete dirac; // clean up 00688 00689 double kappa = inv_param->kappa; 00690 if (pc) { 00691 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { 00692 axCuda(0.25/(kappa*kappa), out); 00693 } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00694 axCuda(0.5/kappa, out); 00695 } 00696 } else { 00697 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || 00698 inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00699 axCuda(0.5/kappa, out); 00700 } 00701 } 00702 00703 cpuParam.v = h_out; 00704 cpuColorSpinorField hOut(cpuParam); 00705 out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out; 00706 } 00707 00708 00709 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) 00710 { 00711 bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || 00712 inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00713 00714 ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc); 00715 ColorSpinorParam cudaParam(cpuParam, *inv_param); 00716 00717 cpuColorSpinorField hIn(cpuParam); 00718 cudaColorSpinorField in(hIn, cudaParam); 00719 cudaParam.create = QUDA_NULL_FIELD_CREATE; 00720 cudaColorSpinorField out(in, cudaParam); 00721 00722 // double kappa = inv_param->kappa; 00723 // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy; 00724 00725 DiracParam diracParam; 00726 setDiracParam(diracParam, inv_param, pc); 00727 00728 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00729 dirac->MdagM(out, in); // apply the operator 00730 delete dirac; // clean up 00731 00732 double kappa = inv_param->kappa; 00733 if (pc) { 00734 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { 00735 axCuda(1.0/pow(2.0*kappa,4), out); 00736 } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00737 axCuda(0.25/(kappa*kappa), out); 00738 } 00739 } else { 00740 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || 00741 inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00742 axCuda(0.25/(kappa*kappa), out); 00743 } 00744 } 00745 00746 cpuParam.v = h_out; 00747 cpuColorSpinorField hOut(cpuParam); 00748 out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out; 00749 } 00750 00751 void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam ¶m, const bool pc_solve) 00752 { 00753 DiracParam diracParam; 00754 DiracParam diracSloppyParam; 00755 DiracParam diracPreParam; 00756 00757 setDiracParam(diracParam, ¶m, pc_solve); 00758 setDiracSloppyParam(diracSloppyParam, ¶m, pc_solve); 00759 setDiracPreParam(diracPreParam, ¶m, pc_solve); 00760 00761 d = Dirac::create(diracParam); // create the Dirac operator 00762 dSloppy = Dirac::create(diracSloppyParam); 00763 dPre = Dirac::create(diracPreParam); 00764 } 00765 00766 cudaGaugeField* checkGauge(QudaInvertParam *param) { 00767 cudaGaugeField *cudaGauge = NULL; 00768 if (param->dslash_type != QUDA_ASQTAD_DSLASH) { 00769 if (gaugePrecise == NULL) errorQuda("Precise 
gauge field doesn't exist"); 00770 if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist"); 00771 if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist"); 00772 cudaGauge = gaugePrecise; 00773 } else { 00774 if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist"); 00775 if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist"); 00776 if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist"); 00777 00778 if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist"); 00779 if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist"); 00780 if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist"); 00781 cudaGauge = gaugeFatPrecise; 00782 } 00783 return cudaGauge; 00784 } 00785 00786 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) 00787 { 00788 // check the gauge fields have been created 00789 cudaGaugeField *cudaGauge = checkGauge(param); 00790 00791 checkInvertParam(param); 00792 verbosity = param->verbosity; 00793 00794 bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE || 00795 param->solve_type == QUDA_NORMEQ_PC_SOLVE); 00796 00797 bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || 00798 param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00799 00800 param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize; 00801 if (!pc_solve) param->spinorGiB *= 2; 00802 param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float)); 00803 if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) { 00804 param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30); 00805 } else { 00806 param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 
8 : 9)/(double)(1<<30); 00807 } 00808 00809 param->secs = 0; 00810 param->gflops = 0; 00811 param->iter = 0; 00812 00813 Dirac *d = NULL; 00814 Dirac *dSloppy = NULL; 00815 Dirac *dPre = NULL; 00816 00817 // create the dirac operator 00818 createDirac(d, dSloppy, dPre, *param, pc_solve); 00819 00820 Dirac &dirac = *d; 00821 Dirac &diracSloppy = *dSloppy; 00822 Dirac &diracPre = *dPre; 00823 00824 cpuColorSpinorField *h_b = NULL; 00825 cpuColorSpinorField *h_x = NULL; 00826 cudaColorSpinorField *b = NULL; 00827 cudaColorSpinorField *x = NULL; 00828 cudaColorSpinorField *in = NULL; 00829 cudaColorSpinorField *out = NULL; 00830 00831 const int *X = cudaGauge->X(); 00832 00833 // wrap CPU host side pointers 00834 ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution); 00835 h_b = new cpuColorSpinorField(cpuParam); 00836 cpuParam.v = hp_x; 00837 h_x = new cpuColorSpinorField(cpuParam); 00838 00839 // download source 00840 ColorSpinorParam cudaParam(cpuParam, *param); 00841 cudaParam.create = QUDA_COPY_FIELD_CREATE; 00842 b = new cudaColorSpinorField(*h_b, cudaParam); 00843 00844 if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess 00845 x = new cudaColorSpinorField(*h_x, cudaParam); // solution 00846 } else { // zero initial guess 00847 cudaParam.create = QUDA_ZERO_FIELD_CREATE; 00848 x = new cudaColorSpinorField(cudaParam); // solution 00849 } 00850 00851 if (param->verbosity >= QUDA_VERBOSE) { 00852 double nh_b = norm2(*h_b); 00853 double nb = norm2(*b); 00854 printfQuda("Source: CPU = %f, CUDA copy = %f\n", nh_b, nb); 00855 } 00856 00857 setDslashTuning(param->tune, param->verbosity); 00858 quda::setBlasTuning(param->tune, param->verbosity); 00859 00860 dirac.prepare(in, out, *x, *b, param->solution_type); 00861 if (param->verbosity >= QUDA_VERBOSE) { 00862 double nin = norm2(*in); 00863 printfQuda("Prepared source = %f\n", nin); 00864 } 00865 00866 massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *in); 00867 00868 if (param->verbosity >= QUDA_VERBOSE) { 00869 double nin = norm2(*in); 00870 printfQuda("Prepared source post mass rescale = %f\n", nin); 00871 } 00872 00873 00874 switch (param->inv_type) { 00875 case QUDA_CG_INVERTER: 00876 if (param->solution_type != QUDA_MATDAG_MAT_SOLUTION && param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) { 00877 copyCuda(*out, *in); 00878 dirac.Mdag(*in, *out); 00879 } 00880 { 00881 DiracMdagM m(dirac), mSloppy(diracSloppy); 00882 CG cg(m, mSloppy, *param); 00883 cg(*out, *in); 00884 } 00885 break; 00886 case QUDA_BICGSTAB_INVERTER: 00887 if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { 00888 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre); 00889 BiCGstab bicg(m, mSloppy, mPre, *param); 00890 bicg(*out, *in); 00891 copyCuda(*in, *out); 00892 } 00893 { 00894 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre); 00895 BiCGstab bicg(m, mSloppy, mPre, *param); 00896 bicg(*out, *in); 00897 } 00898 break; 00899 case QUDA_GCR_INVERTER: 00900 if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { 00901 DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre); 00902 GCR gcr(m, mSloppy, mPre, *param); 00903 gcr(*out, *in); 00904 copyCuda(*in, *out); 00905 } 00906 { 00907 DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre); 00908 GCR gcr(m, mSloppy, mPre, *param); 00909 gcr(*out, *in); 00910 } 00911 break; 00912 default: 00913 errorQuda("Inverter type %d not 
implemented", param->inv_type); 00914 } 00915 00916 if (param->verbosity >= QUDA_VERBOSE){ 00917 double nx = norm2(*x); 00918 printfQuda("Solution = %f\n",nx); 00919 } 00920 dirac.reconstruct(*x, *b, param->solution_type); 00921 00922 x->saveCPUSpinorField(*h_x); // since this is a reference, this won't work: h_x = x; 00923 00924 if (param->verbosity >= QUDA_VERBOSE){ 00925 double nx = norm2(*x); 00926 double nh_x = norm2(*h_x); 00927 printfQuda("Reconstructed: CUDA solution = %f, CPU copy = %f\n", nx, nh_x); 00928 } 00929 00930 delete h_b; 00931 delete h_x; 00932 delete b; 00933 delete x; 00934 00935 return; 00936 } 00937 00938 00944 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param, 00945 double* offsets, int num_offsets, double* residue_sq) 00946 { 00947 // check the gauge fields have been created 00948 cudaGaugeField *cudaGauge = checkGauge(param); 00949 checkInvertParam(param); 00950 00951 param->num_offset = num_offsets; 00952 if (param->num_offset > QUDA_MAX_MULTI_SHIFT) 00953 errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", 00954 param->num_offset, QUDA_MAX_MULTI_SHIFT); 00955 for (int i=0; i<param->num_offset; i++) { 00956 param->offset[i] = offsets[i]; 00957 param->tol_offset[i] = residue_sq[i]; 00958 } 00959 00960 verbosity = param->verbosity; 00961 00962 // Are we doing a preconditioned solve */ 00963 /* What does NormEq solve mean in the shifted case? 00964 */ 00965 if (param->solve_type != QUDA_NORMEQ_PC_SOLVE && 00966 param->solve_type != QUDA_NORMEQ_SOLVE) { 00967 errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n"); 00968 } 00969 00970 bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE); 00971 00972 // In principle one can do a MATPC Solution for a hermitian M_pc 00973 // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution. 00974 bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || 00975 param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION ); 00976 00977 // No of GiB in a checkerboard of a spinor 00978 param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize; 00979 if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve 00980 00981 // **** WARNING *** this may not match implementation... 00982 if( param->inv_type == QUDA_CG_INVERTER ) { 00983 // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift 00984 param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30); 00985 } 00986 else { 00987 // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary 00988 // (Jegerlehner hep-lat/9612014 eq (3.13) 00989 param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30); 00990 } 00991 00992 // Timing and FLOP counters 00993 param->secs = 0; 00994 param->gflops = 0; 00995 param->iter = 0; 00996 00997 // Find the smallest shift and its offset. 
00998 double low_offset = param->offset[0]; 00999 int low_index = 0; 01000 for (int i=1;i < param->num_offset;i++){ 01001 if (param->offset[i] < low_offset){ 01002 low_offset = param->offset[i]; 01003 low_index = i; 01004 } 01005 } 01006 01007 // Host pointers for x, take a copy of the input host pointers 01008 void** hp_x; 01009 hp_x = new void* [ param->num_offset ]; 01010 01011 void* hp_b = _hp_b; 01012 for(int i=0;i < param->num_offset;i++){ 01013 hp_x[i] = _hp_x[i]; 01014 } 01015 01016 // Now shift things so that the vector with the smallest shift 01017 // is in the first position of the array 01018 if (low_index != 0){ 01019 void* tmp = hp_x[0]; 01020 hp_x[0] = hp_x[low_index] ; 01021 hp_x[low_index] = tmp; 01022 01023 double tmp1 = param->offset[0]; 01024 param->offset[0]= param->offset[low_index]; 01025 param->offset[low_index] =tmp1; 01026 } 01027 01028 // Create the matrix. 01029 // The way this works is that createDirac will create 'd' and 'dSloppy' 01030 // which are global. We then grab these with references... 01031 // 01032 // Balint: Isn't there a nice construction pattern we could use here? This is 01033 // expedient but yucky. 01034 // DiracParam diracParam; 01035 if (param->dslash_type == QUDA_ASQTAD_DSLASH){ 01036 param->mass = sqrt(param->offset[0]/4); 01037 } 01038 01039 Dirac *d = NULL; 01040 Dirac *dSloppy = NULL; 01041 Dirac *dPre = NULL; 01042 01043 // create the dirac operator 01044 createDirac(d, dSloppy, dPre, *param, pc_solve); 01045 Dirac &dirac = *d; 01046 Dirac &diracSloppy = *dSloppy; 01047 01048 cpuColorSpinorField *h_b = NULL; // Host RHS 01049 cpuColorSpinorField **h_x = NULL; 01050 cudaColorSpinorField *b = NULL; // Cuda RHS 01051 cudaColorSpinorField **x = NULL; // Cuda Solutions 01052 01053 // Grab the dimension array of the input gauge field. 01054 const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ? 01055 gaugeFatPrecise->X() : gaugePrecise->X(); 01056 01057 // Wrap CPU host side pointers 01058 // 01059 // Balint: This creates a ColorSpinorParam struct, from the host data pointer, 01060 // the definitions in param, the dimensions X, and whether the solution is on 01061 // a checkerboard instruction or not. 
These can then be used as 'instructions' 01062 // to create the actual colorSpinorField 01063 ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution); 01064 h_b = new cpuColorSpinorField(cpuParam); 01065 01066 h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION 01067 for(int i=0; i < param->num_offset; i++) { 01068 cpuParam.v = hp_x[i]; 01069 h_x[i] = new cpuColorSpinorField(cpuParam); 01070 } 01071 01072 // Now I need a colorSpinorParam for the device 01073 ColorSpinorParam cudaParam(cpuParam, *param); 01074 // This setting will download a host vector 01075 cudaParam.create = QUDA_COPY_FIELD_CREATE; 01076 b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it 01077 01078 // Create the solution fields filled with zero 01079 x = new cudaColorSpinorField* [ param->num_offset ]; 01080 cudaParam.create = QUDA_ZERO_FIELD_CREATE; 01081 for(int i=0; i < param->num_offset; i++) { 01082 x[i] = new cudaColorSpinorField(cudaParam); 01083 } 01084 01085 // Check source norms 01086 if( param->verbosity >= QUDA_VERBOSE ) { 01087 double nh_b = norm2(*h_b); 01088 double nb = norm2(*b); 01089 printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb); 01090 } 01091 01092 setDslashTuning(param->tune, param->verbosity); 01093 quda::setBlasTuning(param->tune, param->verbosity); 01094 01095 massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *b); 01096 double *unscaled_shifts = new double [param->num_offset]; 01097 for(int i=0; i < param->num_offset; i++){ 01098 unscaled_shifts[i] = param->offset[i]; 01099 massRescaleCoeff(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, param->offset[i]); 01100 } 01101 01102 { 01103 DiracMdagM m(dirac), mSloppy(diracSloppy); 01104 MultiShiftCG cg_m(m, mSloppy, *param); 01105 cg_m(x, *b); 01106 } 01107 01108 // restore shifts -- avoid side effects 01109 for(int i=0; i < param->num_offset; i++) { 01110 param->offset[i] = unscaled_shifts[i]; 01111 } 01112 01113 delete [] unscaled_shifts; 01114 01115 for(int i=0; i < param->num_offset; i++) { 01116 x[i]->saveCPUSpinorField(*h_x[i]); 01117 } 01118 01119 for(int i=0; i < param->num_offset; i++){ 01120 delete h_x[i]; 01121 delete x[i]; 01122 } 01123 01124 delete h_b; 01125 delete b; 01126 01127 delete [] h_x; 01128 delete [] x; 01129 01130 delete [] hp_x; 01131 01132 return; 01133 } 01134 01135 /************************************** Ugly Mixed precision multishift CG solver ****************************/ 01136 01137 static void* fatlink; 01138 static int fatlink_pad; 01139 static void* longlink; 01140 static int longlink_pad; 01141 static QudaReconstructType longlink_recon; 01142 static QudaReconstructType longlink_recon_sloppy; 01143 static QudaGaugeParam* gauge_param; 01144 01145 void 01146 record_gauge(int* X, void *_fatlink, int _fatlink_pad, void* _longlink, int _longlink_pad, 01147 QudaReconstructType _longlink_recon, QudaReconstructType _longlink_recon_sloppy, 01148 QudaGaugeParam *_param) 01149 { 01150 01151 01152 //the X and precision in fatlink must be set because we use them in dirac creation 01153 //See dirac_staggered.cpp 01154 /* 01155 for(int i =0;i < 4;i++){ 01156 cudaFatLinkPrecise.X[i] = cudaFatLinkSloppy.X[i] = X[i]; 01157 } 01158 cudaFatLinkPrecise.precision = _param->cuda_prec; 01159 cudaFatLinkSloppy.precision = _param->cuda_prec_sloppy; 01160 */ 01161 01162 fatlink = _fatlink; 01163 fatlink_pad = _fatlink_pad; 01164 01165 longlink = _longlink; 01166 longlink_pad = _longlink_pad; 01167 
longlink_recon = _longlink_recon; 01168 longlink_recon_sloppy = _longlink_recon_sloppy; 01169 01170 gauge_param = _param; 01171 01172 return; 01173 01174 } 01175 01176 01177 void 01178 do_create_precise_cuda_gauge(void) 01179 { 01180 QudaPrecision prec = gauge_param->cuda_prec; 01181 QudaPrecision prec_sloppy = gauge_param->cuda_prec_sloppy; 01182 QudaPrecision prec_precondition = gauge_param->cuda_prec_precondition; 01183 01184 //the sloppy gauge field will be filled, and needs to backup 01185 cudaGaugeField *tmp_fat = gaugeFatSloppy; 01186 cudaGaugeField *tmp_long= gaugeLongSloppy; 01187 cudaGaugeField *tmp_fat_precondition = gaugeFatPrecondition; 01188 cudaGaugeField *tmp_long_precondition = gaugeLongPrecondition; 01189 01190 gaugeFatPrecise = gaugeFatSloppy = gaugeFatPrecondition = NULL; 01191 gaugeLongPrecise = gaugeLongSloppy = gaugeLongPrecondition = NULL; 01192 01193 //create precise links 01194 gauge_param->cuda_prec = gauge_param->cuda_prec_sloppy = gauge_param->cuda_prec_precondition = prec; 01195 gauge_param->type = QUDA_ASQTAD_FAT_LINKS; 01196 gauge_param->ga_pad = fatlink_pad; 01197 gauge_param->reconstruct = gauge_param->reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 01198 loadGaugeQuda(fatlink, gauge_param); 01199 01200 01201 gauge_param->type = QUDA_ASQTAD_LONG_LINKS; 01202 gauge_param->ga_pad = longlink_pad; 01203 gauge_param->reconstruct = longlink_recon; 01204 gauge_param->reconstruct_sloppy = longlink_recon_sloppy; 01205 loadGaugeQuda(longlink, gauge_param); 01206 01207 //set prec/prec_sloppy it back 01208 gauge_param->cuda_prec = prec; 01209 gauge_param->cuda_prec_sloppy =prec_sloppy; 01210 gauge_param->cuda_prec_precondition = prec_precondition; 01211 01212 //set the sloopy gauge filed back 01213 gaugeFatSloppy = tmp_fat; 01214 gaugeLongSloppy = tmp_long; 01215 01216 gaugeFatPrecondition = tmp_fat_precondition; 01217 gaugeLongPrecondition = tmp_long_precondition; 01218 return; 01219 } 01220 01221 void 01222 do_create_sloppy_cuda_gauge(void) 01223 { 01224 01225 QudaPrecision prec = gauge_param->cuda_prec; 01226 QudaPrecision prec_sloppy = gauge_param->cuda_prec_sloppy; 01227 QudaPrecision prec_precondition = gauge_param->cuda_prec_precondition; 01228 01229 //the precise gauge field will be filled, and needs to backup 01230 cudaGaugeField *tmp_fat = gaugeFatPrecise; 01231 cudaGaugeField *tmp_long = gaugeLongPrecise; 01232 cudaGaugeField *tmp_fat_precondition = gaugeFatPrecondition; 01233 cudaGaugeField *tmp_long_precondition = gaugeLongPrecondition; 01234 01235 gaugeFatPrecise = gaugeFatSloppy = gaugeFatPrecondition = NULL; 01236 gaugeLongPrecise = gaugeLongSloppy = gaugeLongPrecondition= NULL; 01237 01238 //create sloppy links 01239 gauge_param->cuda_prec = gauge_param->cuda_prec_sloppy = 01240 gauge_param->cuda_prec_precondition = prec_sloppy; 01241 gauge_param->type = QUDA_ASQTAD_FAT_LINKS; 01242 gauge_param->ga_pad = fatlink_pad; 01243 gauge_param->reconstruct = gauge_param->reconstruct_sloppy = QUDA_RECONSTRUCT_NO; 01244 loadGaugeQuda(fatlink, gauge_param); 01245 01246 gauge_param->type = QUDA_ASQTAD_LONG_LINKS; 01247 gauge_param->ga_pad = longlink_pad; 01248 gauge_param->reconstruct = longlink_recon; 01249 gauge_param->reconstruct_sloppy = longlink_recon_sloppy; 01250 loadGaugeQuda(longlink, gauge_param); 01251 01252 //set prec/prec_sloppy it back 01253 gauge_param->cuda_prec = prec; 01254 gauge_param->cuda_prec_sloppy =prec_sloppy; 01255 gauge_param->cuda_prec_precondition = prec_precondition; 01256 01257 //set the sloopy gauge filed back 01258 gaugeFatPrecise = 
tmp_fat; 01259 gaugeLongPrecise = tmp_long; 01260 01261 gaugeFatPrecondition = tmp_fat_precondition; 01262 gaugeLongPrecondition = tmp_long_precondition; 01263 01264 return; 01265 } 01266 01267 01268 void 01269 invertMultiShiftQudaMixed(void **_hp_x, void *_hp_b, QudaInvertParam *param, 01270 double* offsets, int num_offsets, double* residue_sq) 01271 { 01272 01273 01274 QudaPrecision high_prec = param->cuda_prec; 01275 param->cuda_prec = param->cuda_prec_sloppy; 01276 01277 param->num_offset = num_offsets; 01278 if (param->num_offset > QUDA_MAX_MULTI_SHIFT) 01279 errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", 01280 param->num_offset, QUDA_MAX_MULTI_SHIFT); 01281 for (int i=0; i<param->num_offset; i++) { 01282 param->offset[i] = offsets[i]; 01283 param->tol_offset[i] = residue_sq[i]; 01284 } 01285 01286 do_create_sloppy_cuda_gauge(); 01287 01288 // check the gauge fields have been created 01289 //cudaGaugeField *cudaGauge = checkGauge(param); 01290 cudaGaugeField *cudaGauge = gaugeFatSloppy; 01291 01292 01293 checkInvertParam(param); 01294 verbosity = param->verbosity; 01295 01296 // Are we doing a preconditioned solve */ 01297 /* What does NormEq solve mean in the shifted case? 01298 */ 01299 if (param->solve_type != QUDA_NORMEQ_PC_SOLVE && 01300 param->solve_type != QUDA_NORMEQ_SOLVE) { 01301 errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n"); 01302 } 01303 01304 bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE); 01305 01306 // In principle one can do a MATPC Solution for a hermitian M_pc 01307 // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution. 01308 bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || 01309 param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION ); 01310 01311 // No of GiB in a checkerboard of a spinor 01312 param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize; 01313 if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve 01314 01315 // **** WARNING *** this may not match implementation... 01316 if( param->inv_type == QUDA_CG_INVERTER ) { 01317 // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift 01318 param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30); 01319 } 01320 else { 01321 // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary 01322 // (Jegerlehner hep-lat/9612014 eq (3.13) 01323 param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30); 01324 } 01325 01326 // Timing and FLOP counters 01327 param->secs = 0; 01328 param->gflops = 0; 01329 param->iter = 0; 01330 01331 // Find the smallest shift and its offset. 
01332 double low_offset = param->offset[0]; 01333 int low_index = 0; 01334 for (int i=1;i < param->num_offset;i++){ 01335 if (param->offset[i] < low_offset){ 01336 low_offset = param->offset[i]; 01337 low_index = i; 01338 } 01339 } 01340 01341 // Host pointers for x, take a copy of the input host pointers 01342 void** hp_x; 01343 hp_x = new void* [ param->num_offset ]; 01344 01345 void* hp_b = _hp_b; 01346 for(int i=0;i < param->num_offset;i++){ 01347 hp_x[i] = _hp_x[i]; 01348 } 01349 01350 // Now shift things so that the vector with the smallest shift 01351 // is in the first position of the array 01352 if (low_index != 0){ 01353 void* tmp = hp_x[0]; 01354 hp_x[0] = hp_x[low_index] ; 01355 hp_x[low_index] = tmp; 01356 01357 double tmp1 = param->offset[0]; 01358 param->offset[0]= param->offset[low_index]; 01359 param->offset[low_index] =tmp1; 01360 } 01361 01362 // Create the matrix. 01363 // The way this works is that createDirac will create 'd' and 'dSloppy' 01364 // which are global. We then grab these with references... 01365 // 01366 // Balint: Isn't there a nice construction pattern we could use here? This is 01367 // expedient but yucky. 01368 if (param->dslash_type == QUDA_ASQTAD_DSLASH){ 01369 param->mass = sqrt(param->offset[0]/4); 01370 } 01371 //FIXME: Dirty dirty hack 01372 // At this moment, the precise fat gauge is not created (NULL) 01373 // but we set it to be the same as sloppy to avoid segfault 01374 // in creating the dirac since it is needed 01375 gaugeFatPrecondition = gaugeFatPrecise = gaugeFatSloppy; 01376 01377 Dirac *d = NULL; 01378 Dirac *dSloppy = NULL; 01379 Dirac *dPre = NULL; 01380 01381 // create the dirac operator 01382 createDirac(d, dSloppy, dPre, *param, pc_solve); 01383 // resetting to NULL 01384 gaugeFatPrecise = NULL; gaugeFatPrecondition = NULL; 01385 01386 Dirac &diracSloppy = *dSloppy; 01387 01388 cpuColorSpinorField *h_b = NULL; // Host RHS 01389 cpuColorSpinorField **h_x = NULL; 01390 cudaColorSpinorField *b = NULL; // Cuda RHS 01391 cudaColorSpinorField **x = NULL; // Cuda Solutions 01392 01393 // Grab the dimension array of the input gauge field. 01394 const int *X = cudaGauge->X(); 01395 01396 // Wrap CPU host side pointers 01397 // 01398 // Balint: This creates a ColorSpinorParam struct, from the host data pointer, 01399 // the definitions in param, the dimensions X, and whether the solution is on 01400 // a checkerboard instruction or not. 
These can then be used as 'instructions' 01401 // to create the actual colorSpinorField 01402 ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution); 01403 h_b = new cpuColorSpinorField(cpuParam); 01404 01405 h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION 01406 for(int i=0; i < param->num_offset; i++) { 01407 cpuParam.v = hp_x[i]; 01408 h_x[i] = new cpuColorSpinorField(cpuParam); 01409 } 01410 01411 // Now I need a colorSpinorParam for the device 01412 ColorSpinorParam cudaParam(cpuParam, *param); 01413 // This setting will download a host vector 01414 cudaParam.create = QUDA_COPY_FIELD_CREATE; 01415 b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it 01416 01417 // Create the solution fields filled with zero 01418 x = new cudaColorSpinorField* [ param->num_offset ]; 01419 cudaParam.create = QUDA_ZERO_FIELD_CREATE; 01420 for(int i=0; i < param->num_offset; i++) { 01421 x[i] = new cudaColorSpinorField(cudaParam); 01422 } 01423 01424 // Check source norms 01425 if( param->verbosity >= QUDA_VERBOSE ) { 01426 double nh_b = norm2(*h_b); 01427 double nb = norm2(*b); 01428 printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb); 01429 } 01430 01431 // tune the Dirac Kernel 01432 setDslashTuning(param->tune, param->verbosity); 01433 quda::setBlasTuning(param->tune, param->verbosity); 01434 // if set, tuning will happen in the first multishift call 01435 01436 massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *b); 01437 double *rescaled_shifts = new double [param->num_offset]; 01438 for(int i=0; i < param->num_offset; i++){ 01439 rescaled_shifts[i] = param->offset[i]; 01440 massRescaleCoeff(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, rescaled_shifts[i]); 01441 } 01442 01443 { 01444 DiracMdagM m(diracSloppy); 01445 MultiShiftCG cg_m(m, m, *param); 01446 cg_m(x, *b); 01447 } 01448 01449 delete [] rescaled_shifts; 01450 01451 delete b; 01452 01453 int total_iters = 0; 01454 double total_secs = 0; 01455 double total_gflops = 0; 01456 total_iters += param->iter; 01457 total_secs += param->secs; 01458 total_gflops += param->gflops; 01459 01460 cudaParam.precision = high_prec; 01461 param->cuda_prec = high_prec; 01462 do_create_precise_cuda_gauge(); 01463 01464 b = new cudaColorSpinorField(cudaParam); 01465 *b = *h_b; 01466 01467 /*FIXME: to avoid setfault*/ 01468 gaugeFatPrecondition =gaugeFatSloppy; 01469 01470 if (dPre && dPre != dSloppy) delete dPre; 01471 if (dSloppy && dSloppy != d) delete dSloppy; 01472 if (d) delete d; 01473 01474 // create the dirac operator 01475 createDirac(d, dSloppy, dPre, *param, pc_solve); 01476 01477 gaugeFatPrecondition = NULL; 01478 { 01479 Dirac& dirac2 = *d; 01480 Dirac& diracSloppy2 = *dSloppy; 01481 01482 cudaColorSpinorField* high_x; 01483 high_x = new cudaColorSpinorField(cudaParam); 01484 for(int i=0;i < param->num_offset; i++){ 01485 *high_x = *x[i]; 01486 delete x[i]; 01487 double mass = sqrt(param->offset[i]/4); 01488 dirac2.setMass(mass); 01489 diracSloppy2.setMass(mass); 01490 DiracMdagM m(dirac2), mSloppy(diracSloppy2); 01491 CG cg(m, mSloppy, *param); 01492 cg(*high_x, *b); 01493 total_iters += param->iter; 01494 total_secs += param->secs; 01495 total_gflops += param->gflops; 01496 high_x->saveCPUSpinorField(*h_x[i]); 01497 } 01498 01499 param->iter = total_iters; 01500 01501 delete high_x; 01502 } 01503 01504 01505 for(int i=0; i < param->num_offset; i++){ 01506 delete h_x[i]; 01507 } 01508 01509 delete h_b; 
01510 delete b; 01511 01512 delete [] h_x; 01513 delete [] x; 01514 01515 delete [] hp_x; 01516 01517 return; 01518 } 01519 01520 01521 #ifdef GPU_FATLINK 01522 /* @method 01523 * QUDA_COMPUTE_FAT_STANDARD: standard method (default) 01524 * QUDA_COMPUTE_FAT_EXTENDED_VOLUME, extended volume method 01525 * 01526 */ 01527 #include <sys/time.h> 01528 01529 void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam* param) 01530 { 01531 int* X = param->X; 01532 int Vsh_x = X[1]*X[2]*X[3]/2; 01533 int Vsh_y = X[0]*X[2]*X[3]/2; 01534 int Vsh_z = X[0]*X[1]*X[3]/2; 01535 int Vsh_t = X[0]*X[1]*X[2]/2; 01536 01537 int E1 = X[0] + 4; 01538 int E2 = X[1] + 4; 01539 int E3 = X[2] + 4; 01540 int E4 = X[3] + 4; 01541 01542 // fat-link padding 01543 param->llfat_ga_pad = Vsh_t; 01544 01545 // site-link padding 01546 if(method == QUDA_COMPUTE_FAT_STANDARD){ 01547 #ifdef MULTI_GPU 01548 int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2); 01549 Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2); 01550 Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2); 01551 Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2); 01552 Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2); 01553 param->site_ga_pad = 3*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max; 01554 #else 01555 param->site_ga_pad = Vsh_t; 01556 #endif 01557 }else{ 01558 param->site_ga_pad = E1*E2*E3/2*3; 01559 } 01560 param->ga_pad = param->site_ga_pad; 01561 01562 // staple padding 01563 if(method == QUDA_COMPUTE_FAT_STANDARD){ 01564 #ifdef MULTI_GPU 01565 param->staple_pad = 3*(Vsh_x + Vsh_y + Vsh_z+ Vsh_t); 01566 #else 01567 param->staple_pad = 3*Vsh_t; 01568 #endif 01569 }else{ 01570 param->staple_pad = E1*E2*E3/2*3; 01571 } 01572 01573 return; 01574 } 01575 01576 01577 void computeFatLinkCore(cudaGaugeField* cudaSiteLink, double* act_path_coeff, 01578 QudaGaugeParam* qudaGaugeParam, QudaComputeFatMethod method, 01579 cudaGaugeField* cudaFatLink, struct timeval time_array[]) 01580 { 01581 01582 gettimeofday(&time_array[0], NULL); 01583 01584 const int flag = qudaGaugeParam->preserve_gauge; 01585 GaugeFieldParam gParam(0,*qudaGaugeParam); 01586 01587 if(method == QUDA_COMPUTE_FAT_STANDARD){ 01588 for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir]; 01589 }else{ 01590 for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir] + 4; 01591 } 01592 01593 01594 static cudaGaugeField* cudaStapleField=NULL, *cudaStapleField1=NULL; 01595 if(cudaStapleField == NULL || cudaStapleField1 == NULL){ 01596 gParam.pad = qudaGaugeParam->staple_pad; 01597 gParam.create = QUDA_NULL_FIELD_CREATE; 01598 gParam.reconstruct = QUDA_RECONSTRUCT_NO; 01599 gParam.is_staple = 1; //these two condition means it is a staple instead of a normal gauge field 01600 cudaStapleField = new cudaGaugeField(gParam); 01601 cudaStapleField1 = new cudaGaugeField(gParam); 01602 } 01603 01604 gettimeofday(&time_array[1], NULL); 01605 01606 if(method == QUDA_COMPUTE_FAT_STANDARD){ 01607 llfat_cuda(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff); 01608 }else{ //method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME 01609 llfat_cuda_ex(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff); 01610 } 01611 gettimeofday(&time_array[2], NULL); 01612 01613 if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){ 01614 delete cudaStapleField; cudaStapleField = NULL; 01615 delete cudaStapleField1; cudaStapleField1 = NULL; 01616 } 01617 gettimeofday(&time_array[3], NULL); 01618 01619 return; 01620 01621 } 01622 01623 01624 01625 int 01626 computeFatLinkQuda(void* 
fatlink, void** sitelink, double* act_path_coeff, 01627 QudaGaugeParam* qudaGaugeParam, 01628 QudaComputeFatMethod method) 01629 { 01630 #define TDIFF_MS(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))*1000 01631 01632 struct timeval t0; 01633 struct timeval t7, t8, t9, t10, t11; 01634 01635 gettimeofday(&t0, NULL); 01636 01637 static cpuGaugeField* cpuFatLink=NULL, *cpuSiteLink=NULL; 01638 static cudaGaugeField* cudaFatLink=NULL, *cudaSiteLink=NULL; 01639 int flag = qudaGaugeParam->preserve_gauge; 01640 01641 QudaGaugeParam qudaGaugeParam_ex_buf; 01642 QudaGaugeParam* qudaGaugeParam_ex = &qudaGaugeParam_ex_buf; 01643 memcpy(qudaGaugeParam_ex, qudaGaugeParam, sizeof(QudaGaugeParam)); 01644 01645 qudaGaugeParam_ex->X[0] = qudaGaugeParam->X[0]+4; 01646 qudaGaugeParam_ex->X[1] = qudaGaugeParam->X[1]+4; 01647 qudaGaugeParam_ex->X[2] = qudaGaugeParam->X[2]+4; 01648 qudaGaugeParam_ex->X[3] = qudaGaugeParam->X[3]+4; 01649 01650 GaugeFieldParam gParam_ex(0, *qudaGaugeParam_ex); 01651 01652 // fat-link padding 01653 setFatLinkPadding(method, qudaGaugeParam); 01654 qudaGaugeParam_ex->llfat_ga_pad = qudaGaugeParam->llfat_ga_pad; 01655 qudaGaugeParam_ex->staple_pad = qudaGaugeParam->staple_pad; 01656 qudaGaugeParam_ex->site_ga_pad = qudaGaugeParam->site_ga_pad; 01657 01658 GaugeFieldParam gParam(0, *qudaGaugeParam); 01659 01660 // create the host fatlink 01661 if(cpuFatLink == NULL){ 01662 gParam.create = QUDA_REFERENCE_FIELD_CREATE; 01663 gParam.link_type = QUDA_ASQTAD_FAT_LINKS; 01664 gParam.order = QUDA_MILC_GAUGE_ORDER; 01665 gParam.gauge= fatlink; 01666 cpuFatLink = new cpuGaugeField(gParam); 01667 if(cpuFatLink == NULL){ 01668 errorQuda("ERROR: Creating cpuFatLink failed\n"); 01669 } 01670 }else{ 01671 cpuFatLink->setGauge((void**)fatlink); 01672 } 01673 01674 // create the device fatlink 01675 if(cudaFatLink == NULL){ 01676 gParam.pad = qudaGaugeParam->llfat_ga_pad; 01677 gParam.create = QUDA_ZERO_FIELD_CREATE; 01678 gParam.link_type = QUDA_ASQTAD_FAT_LINKS; 01679 gParam.order = QUDA_QDP_GAUGE_ORDER; 01680 gParam.reconstruct = QUDA_RECONSTRUCT_NO; 01681 cudaFatLink = new cudaGaugeField(gParam); 01682 } 01683 01684 01685 01686 // create the host sitelink 01687 if(cpuSiteLink == NULL){ 01688 gParam.pad = 0; 01689 gParam.create = QUDA_REFERENCE_FIELD_CREATE; 01690 gParam.link_type = qudaGaugeParam->type; 01691 gParam.order = qudaGaugeParam->gauge_order; 01692 gParam.gauge = sitelink; 01693 if(method != QUDA_COMPUTE_FAT_STANDARD){ 01694 for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam_ex->X[dir]; 01695 } 01696 cpuSiteLink = new cpuGaugeField(gParam); 01697 if(cpuSiteLink == NULL){ 01698 errorQuda("ERROR: Creating cpuSiteLink failed\n"); 01699 } 01700 }else{ 01701 cpuSiteLink->setGauge(sitelink); 01702 } 01703 01704 if(cudaSiteLink == NULL){ 01705 gParam.pad = qudaGaugeParam->site_ga_pad; 01706 gParam.create = QUDA_NULL_FIELD_CREATE; 01707 gParam.link_type = qudaGaugeParam->type; 01708 gParam.reconstruct = qudaGaugeParam->reconstruct; 01709 cudaSiteLink = new cudaGaugeField(gParam); 01710 } 01711 01712 initCommonConstants(*cudaFatLink); 01713 01714 01715 if(method == QUDA_COMPUTE_FAT_STANDARD){ 01716 llfat_init_cuda(qudaGaugeParam); 01717 gettimeofday(&t7, NULL); 01718 01719 #ifdef MULTI_GPU 01720 if(qudaGaugeParam->gauge_order == QUDA_MILC_GAUGE_ORDER){ 01721 errorQuda("Only QDP-ordered site links are supported in the multi-gpu standard fattening code\n"); 01722 } 01723 #endif 01724 loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam); 01725 01726 gettimeofday(&t8, 
  if(method == QUDA_COMPUTE_FAT_STANDARD){
    llfat_init_cuda(qudaGaugeParam);
    gettimeofday(&t7, NULL);

#ifdef MULTI_GPU
    if(qudaGaugeParam->gauge_order == QUDA_MILC_GAUGE_ORDER){
      errorQuda("Only QDP-ordered site links are supported in the multi-GPU standard fattening code\n");
    }
#endif
    loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam);

    gettimeofday(&t8, NULL);
  }else{
    llfat_init_cuda_ex(qudaGaugeParam_ex);
#ifdef MULTI_GPU
    exchange_cpu_sitelink_ex(qudaGaugeParam->X, (void**)cpuSiteLink->Gauge_p(),
                             cpuSiteLink->Order(), qudaGaugeParam->cpu_prec, 0);
#endif
    gettimeofday(&t7, NULL);
    loadLinkToGPU_ex(cudaSiteLink, cpuSiteLink);
    gettimeofday(&t8, NULL);
  }

  // time the subroutines in computeFatLinkCore
  struct timeval time_array[4];

  // actually do the fattening
  computeFatLinkCore(cudaSiteLink, act_path_coeff, qudaGaugeParam, method, cudaFatLink, time_array);

  gettimeofday(&t9, NULL);

  storeLinkToCPU(cpuFatLink, cudaFatLink, qudaGaugeParam);

  gettimeofday(&t10, NULL);

  if (!(flag & QUDA_FAT_PRESERVE_CPU_GAUGE)){
    delete cpuFatLink;  cpuFatLink = NULL;
    delete cpuSiteLink; cpuSiteLink = NULL;
  }
  if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE)){
    delete cudaFatLink;  cudaFatLink = NULL;
    delete cudaSiteLink; cudaSiteLink = NULL;
  }

  gettimeofday(&t11, NULL);
#ifdef DSLASH_PROFILING
  printfQuda("total time: %f ms, init (cuda/cpu gauge field creation, etc.)=%f ms,"
             " sitelink cpu->gpu=%f ms, computation on gpu=%f ms, fatlink gpu->cpu=%f ms\n",
             TDIFF_MS(t0, t11), TDIFF_MS(t0, t7) + TDIFF_MS(time_array[0], time_array[1]),
             TDIFF_MS(t7, t8), TDIFF_MS(time_array[1], time_array[2]), TDIFF_MS(t9, t10));
  printfQuda("final cleanup=%f ms\n", TDIFF_MS(t10, t11) + TDIFF_MS(time_array[2], time_array[3]));
#endif

  return 0;
}
#endif // GPU_FATLINK
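
/*
 * Usage sketch (illustration only, not part of the library, disabled with
 * #if 0): a caller fills in a QudaGaugeParam describing the local lattice,
 * precisions and link layout, supplies the asqtad path coefficients, and
 * passes host pointers for the site links and the fat-link output.  All
 * dimensions and coefficient values below are hypothetical placeholders;
 * the remaining QudaGaugeParam fields would be set as required by the
 * application.
 */
#if 0
static void computeFatLinkQuda_example(void* fatlink, void** sitelink)
{
  QudaGaugeParam param = newQudaGaugeParam();
  param.X[0] = param.X[1] = param.X[2] = 8;   // local lattice (placeholder)
  param.X[3] = 16;
  param.cpu_prec = param.cuda_prec = QUDA_DOUBLE_PRECISION;
  param.reconstruct = QUDA_RECONSTRUCT_NO;
  param.gauge_order = QUDA_QDP_GAUGE_ORDER;   // required by the multi-GPU standard method
  param.preserve_gauge = 0;                   // free CPU and GPU buffers on exit

  // six placeholder asqtad path coefficients, in the order expected by the fattening kernels
  double act_path_coeff[6] = { 0.625, -0.058, 0.030, -0.008, -0.123, 0.0 };

  computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &param, QUDA_COMPUTE_FAT_STANDARD);
}
#endif
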

#ifdef GPU_GAUGE_FORCE
int
computeGaugeForceQuda(void* mom, void* sitelink, int*** input_path_buf, int* path_length,
                      void* loop_coeff, int num_paths, int max_length, double eb3,
                      QudaGaugeParam* qudaGaugeParam, double* timeinfo)
{
  struct timeval t0, t1, t2, t3, t4;

  gettimeofday(&t0, NULL);

#ifdef MULTI_GPU
  int E[4];
  QudaGaugeParam qudaGaugeParam_ex_buf;
  QudaGaugeParam* qudaGaugeParam_ex = &qudaGaugeParam_ex_buf;
  memcpy(qudaGaugeParam_ex, qudaGaugeParam, sizeof(QudaGaugeParam));
  E[0] = qudaGaugeParam_ex->X[0] = qudaGaugeParam->X[0] + 4;
  E[1] = qudaGaugeParam_ex->X[1] = qudaGaugeParam->X[1] + 4;
  E[2] = qudaGaugeParam_ex->X[2] = qudaGaugeParam->X[2] + 4;
  E[3] = qudaGaugeParam_ex->X[3] = qudaGaugeParam->X[3] + 4;
#endif

  int* X = qudaGaugeParam->X;
  GaugeFieldParam gParam(0, *qudaGaugeParam);
#ifdef MULTI_GPU
  GaugeFieldParam gParam_ex(0, *qudaGaugeParam_ex);
  GaugeFieldParam& gParamSL = gParam_ex;
  int pad = E[2]*E[1]*E[0]/2;
#else
  GaugeFieldParam& gParamSL = gParam;
  int pad = X[2]*X[1]*X[0]/2;
#endif

  GaugeFieldParam& gParamMom = gParam;

  // host site links (reference the caller's buffer)
  gParamSL.create = QUDA_REFERENCE_FIELD_CREATE;
  gParamSL.gauge = sitelink;
  gParamSL.pad = 0;
  cpuGaugeField* cpuSiteLink = new cpuGaugeField(gParamSL);

  // device site links
  gParamSL.create = QUDA_NULL_FIELD_CREATE;
  gParamSL.pad = pad;
  gParamSL.precision = qudaGaugeParam->cuda_prec;
  gParamSL.reconstruct = qudaGaugeParam->reconstruct;
  cudaGaugeField* cudaSiteLink = new cudaGaugeField(gParamSL);
  qudaGaugeParam->site_ga_pad = gParamSL.pad; // record this value

  // host momentum field (references the caller's buffer)
  gParamMom.pad = 0;
  gParamMom.order = QUDA_MILC_GAUGE_ORDER;
  gParamMom.precision = qudaGaugeParam->cpu_prec;
  gParamMom.create = QUDA_REFERENCE_FIELD_CREATE;
  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
  gParamMom.gauge = mom;
  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
  cpuGaugeField* cpuMom = new cpuGaugeField(gParamMom);

  // device momentum field
  gParamMom.pad = pad;
  gParamMom.create = QUDA_NULL_FIELD_CREATE;
  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
  gParamMom.precision = qudaGaugeParam->cuda_prec;
  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
  cudaGaugeField* cudaMom = new cudaGaugeField(gParamMom);
  qudaGaugeParam->mom_ga_pad = gParamMom.pad; // record this value

  initCommonConstants(*cudaMom);
  gauge_force_init_cuda(qudaGaugeParam, max_length);

#ifdef MULTI_GPU
  exchange_cpu_sitelink_ex(qudaGaugeParam->X, (void**)cpuSiteLink->Gauge_p(),
                           cpuSiteLink->Order(), qudaGaugeParam->cpu_prec, 1);
  loadLinkToGPU_ex(cudaSiteLink, cpuSiteLink);
#else
  loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam);
#endif

  cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

  gettimeofday(&t1, NULL);

  gauge_force_cuda(*cudaMom, eb3, *cudaSiteLink, qudaGaugeParam, input_path_buf,
                   path_length, loop_coeff, num_paths, max_length);

  gettimeofday(&t2, NULL);

  cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);

  delete cpuSiteLink;
  delete cpuMom;

  delete cudaSiteLink;
  delete cudaMom;

  gettimeofday(&t3, NULL);

  if(timeinfo){
    timeinfo[0] = TDIFF(t0, t1);
    timeinfo[1] = TDIFF(t1, t2);
    timeinfo[2] = TDIFF(t2, t3);
  }

  return 0;
}

#endif // GPU_GAUGE_FORCE


void initCommsQuda(int argc, char **argv, const int *X, const int nDim) {

  if (nDim != 4) errorQuda("Comms dimensions %d != 4", nDim);

#ifdef MULTI_GPU

#ifdef QMP_COMMS
  QMP_thread_level_t tl;
  QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);

  QMP_declare_logical_topology(X, nDim);
#elif defined(MPI_COMMS)
  MPI_Init(&argc, &argv);

  int volume = 1;
  for (int d=0; d<nDim; d++) volume *= X[d];
  int size = -1;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  if (volume != size)
    errorQuda("Number of processes %d must match requested MPI volume %d",
              size, volume);

  comm_set_gridsize(X[0], X[1], X[2], X[3]);
  comm_init();
#endif

#endif
}

void endCommsQuda() {
#ifdef MULTI_GPU

#ifdef QMP_COMMS
  QMP_finalize_msg_passing();
#elif defined MPI_COMMS
  comm_cleanup();
#endif

#endif
}
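
/*
 * Usage sketch (illustration only, not part of the library, disabled with
 * #if 0): in a multi-GPU build the communications layer is brought up before
 * initQuda() and torn down after endQuda().  The X argument of
 * initCommsQuda() is the process grid (its product must equal the number of
 * MPI/QMP processes), not the local lattice; the 2x1x1x2 grid and device
 * number below are hypothetical placeholders.
 */
#if 0
int main(int argc, char **argv)
{
  const int gridsize[4] = {2, 1, 1, 2};   // process grid, not the lattice size
  initCommsQuda(argc, argv, gridsize, 4);

  initQuda(0);                            // device selection policy is application-dependent
  // ... solves, link fattening, gauge-force computation ...
  endQuda();

  endCommsQuda();
  return 0;
}
#endif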