|
QUDA v0.3.2
A library for QCD on GPUs
|
00001 #include <iostream> 00002 #include <stdio.h> 00003 #include <stdlib.h> 00004 #include <math.h> 00005 00006 #include <quda.h> 00007 #include <quda_internal.h> 00008 #include <blas_quda.h> 00009 #include <gauge_quda.h> 00010 #include <dirac_quda.h> 00011 #include <dslash_quda.h> 00012 #include <clover_quda.h> 00013 #include <invert_quda.h> 00014 00015 #include <color_spinor_field.h> 00016 00017 #define spinorSiteSize 24 // real numbers per spinor 00018 00019 FullGauge cudaGaugePrecise; // Wilson links 00020 FullGauge cudaGaugeSloppy; 00021 00022 FullGauge cudaFatLinkPrecise; // asqtad fat links 00023 FullGauge cudaFatLinkSloppy; 00024 00025 FullGauge cudaLongLinkPrecise; // asqtad long links 00026 FullGauge cudaLongLinkSloppy; 00027 00028 FullClover cudaCloverPrecise; // clover term 00029 FullClover cudaCloverSloppy; 00030 00031 FullClover cudaCloverInvPrecise; // inverted clover term 00032 FullClover cudaCloverInvSloppy; 00033 00034 // define newQudaGaugeParam() and newQudaInvertParam() 00035 #define INIT_PARAM 00036 #include "check_params.h" 00037 #undef INIT_PARAM 00038 00039 // define (static) checkGaugeParam() and checkInvertParam() 00040 #define CHECK_PARAM 00041 #include "check_params.h" 00042 #undef CHECK_PARAM 00043 00044 // define printQudaGaugeParam() and printQudaInvertParam() 00045 #define PRINT_PARAM 00046 #include "check_params.h" 00047 #undef PRINT_PARAM 00048 00049 00050 void initQuda(int dev) 00051 { 00052 static int initialized = 0; 00053 if (initialized) { 00054 return; 00055 } 00056 initialized = 1; 00057 00058 int deviceCount; 00059 cudaGetDeviceCount(&deviceCount); 00060 if (deviceCount == 0) { 00061 errorQuda("No devices supporting CUDA"); 00062 } 00063 00064 for(int i=0; i<deviceCount; i++) { 00065 cudaDeviceProp deviceProp; 00066 cudaGetDeviceProperties(&deviceProp, i); 00067 fprintf(stderr, "QUDA: Found device %d: %s\n", i, deviceProp.name); 00068 } 00069 00070 if (dev < 0) { 00071 dev = deviceCount - 1; 00072 } 00073 00074 cudaDeviceProp deviceProp; 00075 cudaGetDeviceProperties(&deviceProp, dev); 00076 if (deviceProp.major < 1) { 00077 errorQuda("Device %d does not support CUDA", dev); 00078 } 00079 00080 fprintf(stderr, "QUDA: Using device %d: %s\n", dev, deviceProp.name); 00081 cudaSetDevice(dev); 00082 00083 cudaGaugePrecise.even = NULL; 00084 cudaGaugePrecise.odd = NULL; 00085 cudaGaugeSloppy.even = NULL; 00086 cudaGaugeSloppy.odd = NULL; 00087 00088 cudaFatLinkPrecise.even = NULL; 00089 cudaFatLinkPrecise.odd = NULL; 00090 cudaFatLinkSloppy.even = NULL; 00091 cudaFatLinkSloppy.odd = NULL; 00092 00093 cudaLongLinkPrecise.even = NULL; 00094 cudaLongLinkPrecise.odd = NULL; 00095 cudaLongLinkSloppy.even = NULL; 00096 cudaLongLinkSloppy.odd = NULL; 00097 00098 cudaCloverPrecise.even.clover = NULL; 00099 cudaCloverPrecise.odd.clover = NULL; 00100 cudaCloverSloppy.even.clover = NULL; 00101 cudaCloverSloppy.odd.clover = NULL; 00102 00103 cudaCloverInvPrecise.even.clover = NULL; 00104 cudaCloverInvPrecise.odd.clover = NULL; 00105 cudaCloverInvSloppy.even.clover = NULL; 00106 cudaCloverInvSloppy.odd.clover = NULL; 00107 00108 //initCache(); 00109 initBlas(); 00110 } 00111 00112 00113 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) 00114 { 00115 int packed_size; 00116 double anisotropy; 00117 FullGauge *precise, *sloppy; 00118 00119 checkGaugeParam(param); 00120 00121 switch (param->reconstruct) { 00122 case QUDA_RECONSTRUCT_8: 00123 packed_size = 8; 00124 break; 00125 case QUDA_RECONSTRUCT_12: 00126 packed_size = 12; 00127 break; 00128 case QUDA_RECONSTRUCT_NO: 00129 packed_size = 18; 00130 break; 00131 default: 00132 errorQuda("Invalid reconstruct type"); 00133 } 00134 param->packed_size = packed_size; 00135 00136 switch (param->type) { 00137 case QUDA_WILSON_LINKS: 00138 precise = &cudaGaugePrecise; 00139 sloppy = &cudaGaugeSloppy; 00140 break; 00141 case QUDA_ASQTAD_FAT_LINKS: 00142 precise = &cudaFatLinkPrecise; 00143 sloppy = &cudaFatLinkSloppy; 00144 break; 00145 case QUDA_ASQTAD_LONG_LINKS: 00146 precise = &cudaLongLinkPrecise; 00147 sloppy = &cudaLongLinkSloppy; 00148 break; 00149 default: 00150 errorQuda("Invalid gauge type"); 00151 } 00152 00153 if (param->type == QUDA_WILSON_LINKS) { 00154 anisotropy = param->anisotropy; 00155 } else { 00156 anisotropy = 1.0; 00157 } 00158 00159 if (param->type != QUDA_WILSON_LINKS && 00160 param->gauge_fix == QUDA_GAUGE_FIXED_YES) { 00161 errorQuda("Temporal gauge fixing not supported for staggered"); 00162 } 00163 00164 if ((param->cuda_prec == QUDA_HALF_PRECISION && param->reconstruct == QUDA_RECONSTRUCT_NO) || 00165 (param->cuda_prec_sloppy == QUDA_HALF_PRECISION && param->reconstruct_sloppy == QUDA_RECONSTRUCT_NO)) { 00166 warningQuda("Loading gauge field in half precision may give wrong results " 00167 "unless all elements have magnitude bounded by 1"); 00168 } 00169 00170 createGaugeField(precise, h_gauge, param->cuda_prec, param->cpu_prec, param->gauge_order, param->reconstruct, param->gauge_fix, 00171 param->t_boundary, param->X, anisotropy, param->tadpole_coeff, param->ga_pad); 00172 param->gaugeGiB += 2.0 * precise->bytes / (1 << 30); 00173 00174 if (param->cuda_prec_sloppy != param->cuda_prec || 00175 param->reconstruct_sloppy != param->reconstruct) { 00176 createGaugeField(sloppy, h_gauge, param->cuda_prec_sloppy, param->cpu_prec, param->gauge_order, 00177 param->reconstruct_sloppy, param->gauge_fix, param->t_boundary, 00178 param->X, anisotropy, param->tadpole_coeff, param->ga_pad); 00179 param->gaugeGiB += 2.0 * sloppy->bytes / (1 << 30); 00180 } else { 00181 *sloppy = *precise; 00182 } 00183 } 00184 00185 00186 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) 00187 { 00188 FullGauge *gauge; 00189 00190 switch (param->type) { 00191 case QUDA_WILSON_LINKS: 00192 gauge = &cudaGaugePrecise; 00193 break; 00194 case QUDA_ASQTAD_FAT_LINKS: 00195 gauge = &cudaFatLinkPrecise; 00196 break; 00197 case QUDA_ASQTAD_LONG_LINKS: 00198 gauge = &cudaLongLinkPrecise; 00199 break; 00200 default: 00201 errorQuda("Invalid gauge type"); 00202 } 00203 00204 restoreGaugeField(h_gauge, gauge, param->cpu_prec, param->gauge_order); 00205 } 00206 00207 00208 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) 00209 { 00210 00211 if (!h_clover && !h_clovinv) { 00212 errorQuda("loadCloverQuda() called with neither clover term nor inverse"); 00213 } 00214 if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) { 00215 errorQuda("Half precision not supported on CPU"); 00216 } 00217 if (cudaGaugePrecise.even == NULL) { 00218 errorQuda("Gauge field must be loaded before clover"); 00219 } 00220 if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) { 00221 errorQuda("Wrong dslash_type in loadCloverQuda()"); 00222 } 00223 00224 // determines whether operator is preconditioned when calling invertQuda() 00225 bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE || 00226 inv_param->solve_type == QUDA_NORMEQ_PC_SOLVE); 00227 00228 // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda() 00229 bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION || 00230 inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00231 00232 bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || 00233 inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); 00234 00235 // We issue a warning only when it seems likely that the user is screwing up: 00236 00237 // inverted clover term is required when applying preconditioned operator 00238 if (!h_clovinv && pc_solve && pc_solution) { 00239 warningQuda("Inverted clover term not loaded"); 00240 } 00241 00242 // uninverted clover term is required when applying unpreconditioned operator, 00243 // but note that dslashQuda() is always preconditioned 00244 if (!h_clover && !pc_solve && !pc_solution) { 00245 //warningQuda("Uninverted clover term not loaded"); 00246 } 00247 00248 // uninverted clover term is also required for "asymmetric" preconditioning 00249 if (!h_clover && pc_solve && pc_solution && asymmetric) { 00250 warningQuda("Uninverted clover term not loaded"); 00251 } 00252 00253 int X[4]; 00254 for (int i=0; i<4; i++) { 00255 X[i] = cudaGaugePrecise.X[i]; 00256 } 00257 00258 inv_param->cloverGiB = 0; 00259 00260 if (h_clover) { 00261 allocateCloverField(&cudaCloverPrecise, X, inv_param->cl_pad, inv_param->clover_cuda_prec); 00262 loadCloverField(cudaCloverPrecise, h_clover, inv_param->clover_cpu_prec, inv_param->clover_order); 00263 inv_param->cloverGiB += 2.0*cudaCloverPrecise.even.bytes / (1<<30); 00264 00265 if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) { 00266 allocateCloverField(&cudaCloverSloppy, X, inv_param->cl_pad, inv_param->clover_cuda_prec_sloppy); 00267 loadCloverField(cudaCloverSloppy, h_clover, inv_param->clover_cpu_prec, inv_param->clover_order); 00268 inv_param->cloverGiB += 2.0*cudaCloverInvSloppy.even.bytes / (1<<30); 00269 } else { 00270 cudaCloverSloppy = cudaCloverPrecise; 00271 } 00272 } 00273 00274 if (h_clovinv) { 00275 allocateCloverField(&cudaCloverInvPrecise, X, inv_param->cl_pad, inv_param->clover_cuda_prec); 00276 loadCloverField(cudaCloverInvPrecise, h_clovinv, inv_param->clover_cpu_prec, inv_param->clover_order); 00277 inv_param->cloverGiB += 2.0*cudaCloverInvPrecise.even.bytes / (1<<30); 00278 00279 if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) { 00280 allocateCloverField(&cudaCloverInvSloppy, X, inv_param->cl_pad, inv_param->clover_cuda_prec_sloppy); 00281 loadCloverField(cudaCloverInvSloppy, h_clovinv, inv_param->clover_cpu_prec, inv_param->clover_order); 00282 inv_param->cloverGiB += 2.0*cudaCloverInvSloppy.even.bytes / (1<<30); 00283 } else { 00284 cudaCloverInvSloppy = cudaCloverInvPrecise; 00285 } 00286 } 00287 00288 } 00289 00290 00291 #if 0 00292 // discard clover term but keep the inverse 00293 void discardCloverQuda(QudaInvertParam *inv_param) 00294 { 00295 inv_param->cloverGiB -= 2.0*cudaCloverPrecise.even.bytes / (1<<30); 00296 freeCloverField(&cudaCloverPrecise); 00297 if (cudaCloverSloppy.even.clover) { 00298 inv_param->cloverGiB -= 2.0*cudaCloverSloppy.even.bytes / (1<<30); 00299 freeCloverField(&cudaCloverSloppy); 00300 } 00301 } 00302 #endif 00303 00304 00305 void endQuda(void) 00306 { 00307 cudaColorSpinorField::freeBuffer(); 00308 freeGaugeField(&cudaGaugePrecise); 00309 freeGaugeField(&cudaGaugeSloppy); 00310 freeGaugeField(&cudaFatLinkPrecise); 00311 freeGaugeField(&cudaFatLinkSloppy); 00312 freeGaugeField(&cudaLongLinkPrecise); 00313 freeGaugeField(&cudaLongLinkSloppy); 00314 if (cudaCloverPrecise.even.clover) freeCloverField(&cudaCloverPrecise); 00315 if (cudaCloverSloppy.even.clover) freeCloverField(&cudaCloverSloppy); 00316 if (cudaCloverInvPrecise.even.clover) freeCloverField(&cudaCloverInvPrecise); 00317 if (cudaCloverInvSloppy.even.clover) freeCloverField(&cudaCloverInvSloppy); 00318 endBlas(); 00319 } 00320 00321 00322 void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) 00323 { 00324 double kappa = inv_param->kappa; 00325 if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { 00326 kappa *= cudaGaugePrecise.anisotropy; 00327 } 00328 00329 switch (inv_param->dslash_type) { 00330 case QUDA_WILSON_DSLASH: 00331 diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC; 00332 break; 00333 case QUDA_CLOVER_WILSON_DSLASH: 00334 diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC; 00335 break; 00336 case QUDA_DOMAIN_WALL_DSLASH: 00337 diracParam.type = pc ? QUDA_DOMAIN_WALLPC_DIRAC : QUDA_DOMAIN_WALL_DIRAC; 00338 break; 00339 case QUDA_ASQTAD_DSLASH: 00340 diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC; 00341 break; 00342 case QUDA_TWISTED_MASS_DSLASH: 00343 diracParam.type = pc ? QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC; 00344 break; 00345 default: 00346 errorQuda("Unsupported dslash_type"); 00347 } 00348 00349 diracParam.matpcType = inv_param->matpc_type; 00350 diracParam.dagger = inv_param->dagger; 00351 diracParam.gauge = &cudaGaugePrecise; 00352 diracParam.clover = &cudaCloverPrecise; 00353 diracParam.cloverInv = &cudaCloverInvPrecise; 00354 diracParam.kappa = kappa; 00355 diracParam.mass = inv_param->mass; 00356 diracParam.m5 = inv_param->m5; 00357 diracParam.mu = inv_param->mu; 00358 diracParam.verbose = inv_param->verbosity; 00359 } 00360 00361 00362 void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, bool pc) 00363 { 00364 setDiracParam(diracParam, inv_param, pc); 00365 00366 diracParam.gauge = &cudaGaugeSloppy; 00367 diracParam.clover = &cudaCloverSloppy; 00368 diracParam.cloverInv = &cudaCloverInvSloppy; 00369 } 00370 00371 00372 static void massRescale(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type, 00373 QudaMassNormalization mass_normalization, cudaColorSpinorField &b) 00374 { 00375 if (dslash_type == QUDA_ASQTAD_DSLASH) { 00376 if (mass_normalization != QUDA_MASS_NORMALIZATION) { 00377 errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION"); 00378 } 00379 return; 00380 } 00381 00382 // multiply the source to compensate for normalization of the Dirac operator, if necessary 00383 switch (solution_type) { 00384 case QUDA_MAT_SOLUTION: 00385 if (mass_normalization == QUDA_MASS_NORMALIZATION || 00386 mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00387 axCuda(2.0*kappa, b); 00388 } 00389 break; 00390 case QUDA_MATDAG_MAT_SOLUTION: 00391 if (mass_normalization == QUDA_MASS_NORMALIZATION || 00392 mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00393 axCuda(4.0*kappa*kappa, b); 00394 } 00395 break; 00396 case QUDA_MATPC_SOLUTION: 00397 if (mass_normalization == QUDA_MASS_NORMALIZATION) { 00398 axCuda(4.0*kappa*kappa, b); 00399 } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00400 axCuda(2.0*kappa, b); 00401 } 00402 break; 00403 case QUDA_MATPCDAG_MATPC_SOLUTION: 00404 if (mass_normalization == QUDA_MASS_NORMALIZATION) { 00405 axCuda(16.0*pow(kappa,4), b); 00406 } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00407 axCuda(4.0*kappa*kappa, b); 00408 } 00409 break; 00410 default: 00411 errorQuda("Solution type %d not supported", solution_type); 00412 } 00413 } 00414 00415 00416 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) 00417 { 00418 ColorSpinorParam cpuParam(h_in, *inv_param, cudaGaugePrecise.X); 00419 cpuParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00420 ColorSpinorParam cudaParam(cpuParam, *inv_param); 00421 00422 cpuColorSpinorField hIn(cpuParam); 00423 00424 cudaColorSpinorField in(hIn, cudaParam); 00425 00426 cudaParam.create = QUDA_NULL_FIELD_CREATE; 00427 cudaColorSpinorField out(in, cudaParam); 00428 00429 if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { 00430 if (parity == QUDA_EVEN_PARITY) { 00431 parity = QUDA_ODD_PARITY; 00432 } else { 00433 parity = QUDA_EVEN_PARITY; 00434 } 00435 axCuda(cudaGaugePrecise.anisotropy, in); 00436 } 00437 bool pc = true; 00438 00439 DiracParam diracParam; 00440 setDiracParam(diracParam, inv_param, pc); 00441 00442 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00443 dirac->Dslash(out, in, parity); // apply the operator 00444 delete dirac; // clean up 00445 00446 cpuParam.v = h_out; 00447 cpuColorSpinorField hOut(cpuParam); 00448 out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out; 00449 } 00450 00451 00452 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) 00453 { 00454 bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || 00455 inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00456 00457 if (!pc) cudaGaugePrecise.X[0] *= 2; 00458 ColorSpinorParam cpuParam(h_in, *inv_param, cudaGaugePrecise.X); 00459 if (!pc) { 00460 cudaGaugePrecise.X[0] /= 2; 00461 cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET;; 00462 } else { 00463 cpuParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00464 } 00465 ColorSpinorParam cudaParam(cpuParam, *inv_param); 00466 00467 cpuColorSpinorField hIn(cpuParam); 00468 cudaColorSpinorField in(hIn, cudaParam); 00469 cudaParam.create = QUDA_NULL_FIELD_CREATE; 00470 cudaColorSpinorField out(in, cudaParam); 00471 00472 DiracParam diracParam; 00473 setDiracParam(diracParam, inv_param, pc); 00474 00475 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00476 dirac->M(out, in); // apply the operator 00477 delete dirac; // clean up 00478 00479 double kappa = inv_param->kappa; 00480 if (pc) { 00481 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { 00482 axCuda(0.25/(kappa*kappa), out); 00483 } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00484 axCuda(0.5/kappa, out); 00485 } 00486 } else { 00487 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || 00488 inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00489 axCuda(0.5/kappa, out); 00490 } 00491 } 00492 00493 cpuParam.v = h_out; 00494 cpuColorSpinorField hOut(cpuParam); 00495 out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out; 00496 } 00497 00498 00499 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) 00500 { 00501 bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || 00502 inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00503 00504 if (!pc) cudaGaugePrecise.X[0] *= 2; 00505 ColorSpinorParam cpuParam(h_in, *inv_param, cudaGaugePrecise.X); 00506 if (!pc) { 00507 cudaGaugePrecise.X[0] /= 2; 00508 cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET;; 00509 } else { 00510 cpuParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00511 } 00512 ColorSpinorParam cudaParam(cpuParam, *inv_param); 00513 00514 cpuColorSpinorField hIn(cpuParam); 00515 cudaColorSpinorField in(hIn, cudaParam); 00516 cudaParam.create = QUDA_NULL_FIELD_CREATE; 00517 cudaColorSpinorField out(in, cudaParam); 00518 00519 // double kappa = inv_param->kappa; 00520 // if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= cudaGaugePrecise.anisotropy; 00521 00522 DiracParam diracParam; 00523 setDiracParam(diracParam, inv_param, pc); 00524 00525 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00526 dirac->MdagM(out, in); // apply the operator 00527 delete dirac; // clean up 00528 00529 double kappa = inv_param->kappa; 00530 if (pc) { 00531 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { 00532 axCuda(1.0/pow(2.0*kappa,4), out); 00533 } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00534 axCuda(0.25/(kappa*kappa), out); 00535 } 00536 } else { 00537 if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || 00538 inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { 00539 axCuda(0.25/(kappa*kappa), out); 00540 } 00541 } 00542 00543 cpuParam.v = h_out; 00544 cpuColorSpinorField hOut(cpuParam); 00545 out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out; 00546 } 00547 00548 00549 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) 00550 { 00551 checkInvertParam(param); 00552 00553 bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE || 00554 param->solve_type == QUDA_NORMEQ_PC_SOLVE); 00555 00556 bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || 00557 param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); 00558 00559 param->spinorGiB = cudaGaugePrecise.volume * spinorSiteSize; 00560 if (!pc_solve) param->spinorGiB *= 2; 00561 param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float)); 00562 if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) { 00563 param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30); 00564 } else { 00565 param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30); 00566 } 00567 00568 param->secs = 0; 00569 param->gflops = 0; 00570 param->iter = 0; 00571 00572 Dirac *dirac; 00573 Dirac *diracSloppy; 00574 00575 // set the Dirac operator parameters 00576 DiracParam diracParam; 00577 setDiracParam(diracParam, param, pc_solve); 00578 00579 cpuColorSpinorField* h_b; 00580 cpuColorSpinorField* h_x; 00581 cudaColorSpinorField* b; 00582 cudaColorSpinorField* x; 00583 cudaColorSpinorField *in, *out; 00584 00585 if (param->dslash_type == QUDA_ASQTAD_DSLASH) { 00586 00587 if(param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION 00588 && param->solution_type != QUDA_MATDAG_MAT_SOLUTION){ 00589 errorQuda("Your solution type not supported for staggered. " 00590 "Only QUDA_MATPCDAG_MATPC_SOLUTION and QUDA_MATDAG_MAT_SOLUTION is supported."); 00591 } 00592 00593 ColorSpinorParam csParam; 00594 csParam.precision = param->cpu_prec; 00595 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00596 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00597 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00598 csParam.nColor=3; 00599 csParam.nSpin=1; 00600 csParam.nDim=4; 00601 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00602 00603 if (param->solve_type == QUDA_NORMEQ_SOLVE) { 00604 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00605 csParam.x[0] = 2*cudaFatLinkPrecise.X[0]; 00606 } else if (param->solve_type == QUDA_NORMEQ_PC_SOLVE) { 00607 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00608 csParam.x[0] = cudaFatLinkPrecise.X[0]; 00609 } else { 00610 errorQuda("Direct solve_type not supported for staggered"); 00611 } 00612 csParam.x[1] = cudaFatLinkPrecise.X[1]; 00613 csParam.x[2] = cudaFatLinkPrecise.X[2]; 00614 csParam.x[3] = cudaFatLinkPrecise.X[3]; 00615 csParam.create = QUDA_REFERENCE_FIELD_CREATE; 00616 csParam.v = hp_b; 00617 h_b = new cpuColorSpinorField(csParam); 00618 00619 csParam.v = hp_x; 00620 h_x = new cpuColorSpinorField(csParam); 00621 00622 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00623 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00624 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00625 00626 csParam.pad = param->sp_pad; 00627 csParam.precision = param->cuda_prec; 00628 csParam.create = QUDA_ZERO_FIELD_CREATE; 00629 00630 b = new cudaColorSpinorField(csParam); 00631 00632 diracParam.fatGauge = &cudaFatLinkPrecise; 00633 diracParam.longGauge = &cudaLongLinkPrecise; 00634 00635 dirac = Dirac::create(diracParam); // create the Dirac operator 00636 00637 diracParam.fatGauge = &cudaFatLinkSloppy; 00638 diracParam.longGauge = &cudaLongLinkSloppy; 00639 00640 setDiracSloppyParam(diracParam, param, pc_solve); 00641 diracSloppy = Dirac::create(diracParam); 00642 00643 *b = *h_b; // send data from CPU to GPU 00644 00645 csParam.create = QUDA_COPY_FIELD_CREATE; 00646 x = new cudaColorSpinorField(*h_x, csParam); // solution 00647 csParam.create = QUDA_ZERO_FIELD_CREATE; 00648 00649 } else if (param->dslash_type == QUDA_WILSON_DSLASH || 00650 param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || 00651 param->dslash_type == QUDA_DOMAIN_WALL_DSLASH || 00652 param->dslash_type == QUDA_TWISTED_MASS_DSLASH) { 00653 00654 // temporary hack 00655 if (!pc_solution) cudaGaugePrecise.X[0] *= 2; 00656 ColorSpinorParam cpuParam(hp_b, *param, cudaGaugePrecise.X); 00657 if (!pc_solution) { 00658 cudaGaugePrecise.X[0] /= 2; 00659 cpuParam.siteSubset = QUDA_FULL_SITE_SUBSET;; 00660 } else { 00661 cpuParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00662 } 00663 00664 ColorSpinorParam cudaParam(cpuParam, *param); 00665 00666 h_b = new cpuColorSpinorField(cpuParam); 00667 cpuParam.v = hp_x; 00668 h_x = new cpuColorSpinorField(cpuParam); 00669 00670 b = new cudaColorSpinorField(*h_b, cudaParam); // download source 00671 00672 if (param->verbosity >= QUDA_VERBOSE) { 00673 printfQuda("Source: CPU = %f, CUDA copy = %f\n", norm2(*h_b),norm2(*b)); 00674 } 00675 cudaParam.create = QUDA_ZERO_FIELD_CREATE; 00676 x = new cudaColorSpinorField(cudaParam); // solution 00677 00678 // if using preconditioning but solving the full system 00679 if (pc_solve && !pc_solution) { 00680 cudaParam.x[0] /= 2; 00681 cudaParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00682 } 00683 00684 dirac = Dirac::create(diracParam); // create the Dirac operator 00685 00686 setDiracSloppyParam(diracParam, param, pc_solve); 00687 diracSloppy = Dirac::create(diracParam); 00688 00689 } else { 00690 errorQuda("Invalid dslash_type"); 00691 } 00692 00693 dirac->prepare(in, out, *x, *b, param->solution_type); 00694 if (param->verbosity >= QUDA_VERBOSE) printfQuda("Prepared source = %f\n", norm2(*in)); 00695 00696 massRescale(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, *in); 00697 if (param->verbosity >= QUDA_VERBOSE) printfQuda("Mass rescale done\n"); 00698 00699 switch (param->inv_type) { 00700 case QUDA_CG_INVERTER: 00701 if (param->solution_type != QUDA_MATDAG_MAT_SOLUTION && param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) { 00702 copyCuda(*out, *in); 00703 dirac->Mdag(*in, *out); 00704 } 00705 invertCgCuda(DiracMdagM(dirac), DiracMdagM(diracSloppy), *out, *in, param); 00706 break; 00707 case QUDA_BICGSTAB_INVERTER: 00708 if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { 00709 invertBiCGstabCuda(DiracMdag(dirac), DiracMdag(diracSloppy), *out, *in, param); 00710 copyCuda(*in, *out); 00711 } 00712 invertBiCGstabCuda(DiracM(dirac), DiracM(diracSloppy), *out, *in, param); 00713 break; 00714 default: 00715 errorQuda("Inverter type %d not implemented", param->inv_type); 00716 } 00717 00718 if (param->verbosity >= QUDA_VERBOSE){ 00719 printfQuda("Solution = %f\n",norm2(*x)); 00720 } 00721 dirac->reconstruct(*x, *b, param->solution_type); 00722 00723 x->saveCPUSpinorField(*h_x); // since this is a reference, this won't work: h_x = x; 00724 00725 if (param->verbosity >= QUDA_VERBOSE){ 00726 printfQuda("Reconstructed: CUDA solution = %f, CPU copy = %f\n", norm2(*x), norm2(*h_x)); 00727 } 00728 00729 delete diracSloppy; 00730 delete dirac; 00731 00732 delete h_b; 00733 delete h_x; 00734 delete b; 00735 delete x; 00736 00737 return; 00738 } 00739 00740 00741 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param, 00742 double* offsets, int num_offsets, double* residue_sq) 00743 { 00744 checkInvertParam(param); 00745 00746 if (param->dslash_type != QUDA_ASQTAD_DSLASH) { 00747 errorQuda("Multi-shift solver only supports staggered"); 00748 } 00749 if (param->solve_type != QUDA_NORMEQ_SOLVE && 00750 param->solve_type != QUDA_NORMEQ_PC_SOLVE) { 00751 errorQuda("Direct solve_type not supported for staggered"); 00752 } 00753 if (num_offsets <= 0){ 00754 warningQuda("invertMultiShiftQuda() called with no offsets"); 00755 return; 00756 } 00757 00758 if(param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION 00759 && param->solution_type != QUDA_MATDAG_MAT_SOLUTION){ 00760 errorQuda("Your solution type not supported for staggered. " 00761 "Only QUDA_MATPCDAG_MATPC_SOLUTION and QUDA_MATDAG_MAT_SOLUTION is supported."); 00762 } 00763 00764 00765 00766 bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE); 00767 00768 param->secs = 0; 00769 param->gflops = 0; 00770 param->iter = 0; 00771 00772 double low_offset = offsets[0]; 00773 int low_index = 0; 00774 for (int i=1;i < num_offsets;i++){ 00775 if (offsets[i] < low_offset){ 00776 low_offset = offsets[i]; 00777 low_index = i; 00778 } 00779 } 00780 00781 void* hp_x[num_offsets]; 00782 void* hp_b = _hp_b; 00783 for(int i=0;i < num_offsets;i++){ 00784 hp_x[i] = _hp_x[i]; 00785 } 00786 00787 if (low_index != 0){ 00788 void* tmp = hp_x[0]; 00789 hp_x[0] = hp_x[low_index] ; 00790 hp_x[low_index] = tmp; 00791 00792 double tmp1 = offsets[0]; 00793 offsets[0]= offsets[low_index]; 00794 offsets[low_index] =tmp1; 00795 } 00796 00797 ColorSpinorParam csParam; 00798 csParam.precision = param->cpu_prec; 00799 csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; 00800 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00801 csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; 00802 csParam.nColor=3; 00803 csParam.nSpin=1; 00804 csParam.nDim=4; 00805 csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; 00806 00807 if (param->solve_type == QUDA_NORMEQ_SOLVE) { 00808 csParam.siteSubset = QUDA_FULL_SITE_SUBSET; 00809 csParam.x[0] = 2*cudaFatLinkPrecise.X[0]; 00810 } else if (param->solve_type == QUDA_NORMEQ_PC_SOLVE) { 00811 csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; 00812 csParam.x[0] = cudaFatLinkPrecise.X[0]; 00813 } else { 00814 errorQuda("Direct solve_type not supported for staggered"); 00815 } 00816 csParam.x[1] = cudaFatLinkPrecise.X[1]; 00817 csParam.x[2] = cudaFatLinkPrecise.X[2]; 00818 csParam.x[3] = cudaFatLinkPrecise.X[3]; 00819 csParam.create = QUDA_REFERENCE_FIELD_CREATE; 00820 csParam.v = hp_b; 00821 cpuColorSpinorField h_b(csParam); 00822 00823 cpuColorSpinorField* h_x[num_offsets]; 00824 00825 for (int i=0; i<num_offsets; i++) { 00826 csParam.v = hp_x[i]; 00827 h_x[i] = new cpuColorSpinorField(csParam); 00828 } 00829 00830 csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; 00831 csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; 00832 csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; 00833 00834 csParam.pad = param->sp_pad; 00835 csParam.precision = param->cuda_prec; 00836 csParam.create = QUDA_ZERO_FIELD_CREATE; 00837 00838 cudaColorSpinorField b(csParam); 00839 00840 // set the mass in the invert_param 00841 param->mass = sqrt(offsets[0]/4); 00842 00843 // set the Dirac operator parameters 00844 DiracParam diracParam; 00845 setDiracParam(diracParam, param, pc_solve); 00846 diracParam.fatGauge = &cudaFatLinkPrecise; 00847 diracParam.longGauge = &cudaLongLinkPrecise; 00848 00849 Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator 00850 00851 diracParam.fatGauge = &cudaFatLinkSloppy; 00852 diracParam.longGauge = &cudaLongLinkSloppy; 00853 00854 setDiracSloppyParam(diracParam, param, pc_solve); 00855 Dirac *diracSloppy = Dirac::create(diracParam); 00856 00857 b = h_b; //send data from CPU to GPU 00858 00859 csParam.create = QUDA_ZERO_FIELD_CREATE; 00860 cudaColorSpinorField* x[num_offsets]; // solution 00861 for (int i=0; i < num_offsets; i++) { 00862 x[i] = new cudaColorSpinorField(csParam); 00863 } 00864 cudaColorSpinorField tmp(csParam); // temporary 00865 invertMultiShiftCgCuda(DiracMdagM(dirac), DiracMdagM(diracSloppy), x, b, param, offsets, num_offsets, residue_sq); 00866 00867 for (int i=0; i < num_offsets; i++) { 00868 x[i]->saveCPUSpinorField(*h_x[i]); 00869 } 00870 00871 for(int i=0; i<num_offsets; i++) { 00872 delete h_x[i]; 00873 delete x[i]; 00874 } 00875 delete diracSloppy; 00876 delete dirac; 00877 00878 return; 00879 }
1.7.3