QUDA: quda/lib/interface_quda.cpp Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #include <iostream>
00002 #include <stdio.h>
00003 #include <stdlib.h>
00004 #include <math.h>
00005 #include <string.h>
00006 #include <sys/time.h>
00007 
00008 #include <quda.h>
00009 #include <quda_internal.h>
00010 #include <comm_quda.h>
00011 #include <tune_quda.h>
00012 #include <blas_quda.h>
00013 #include <gauge_field.h>
00014 #include <dirac_quda.h>
00015 #include <dslash_quda.h>
00016 #include <invert_quda.h>
00017 #include <color_spinor_field.h>
00018 #include <clover_field.h>
00019 #include <llfat_quda.h>
00020 #include <fat_force_quda.h>
00021 #include <hisq_links_quda.h>
00022 
00023 #ifdef QUDA_NUMA_SUPPORT
00024 #include <numa_affinity.h>
00025 #endif
00026 
00027 #include <cuda.h>
00028 #ifdef MULTI_GPU
00029 #ifdef MPI_COMMS
00030 #include <mpi.h>
00031 #endif
00032 
00033 #ifdef QMP_COMMS
00034 #include <qmp.h>
00035 #endif
00036 #endif
00037 
00038 #ifdef GPU_GAUGE_FORCE
00039 #include <gauge_force_quda.h>
00040 #endif
00041 
00042 #define MAX(a,b) ((a)>(b)? (a):(b))
00043 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
00044 
00045 #define spinorSiteSize 24 // real numbers per spinor
00046 
00047 #define MAX_GPU_NUM_PER_NODE 16
00048 
00049 // define newQudaGaugeParam() and newQudaInvertParam()
00050 #define INIT_PARAM
00051 #include "check_params.h"
00052 #undef INIT_PARAM
00053 
00054 // define (static) checkGaugeParam() and checkInvertParam()
00055 #define CHECK_PARAM
00056 #include "check_params.h"
00057 #undef CHECK_PARAM
00058 
00059 // define printQudaGaugeParam() and printQudaInvertParam()
00060 #define PRINT_PARAM
00061 #include "check_params.h"
00062 #undef PRINT_PARAM
00063 
00064 #ifdef QMP_COMMS
00065 int rank_QMP;
00066 int num_QMP;
00067 extern bool qudaPt0;
00068 extern bool qudaPtNm1;
00069 #endif
00070 
00071 #include "face_quda.h"
00072 
00073 static QudaVerbosity verbosity;
00074 int numa_affinity_enabled = 1;
00075 
00076 cudaGaugeField *gaugePrecise = NULL;
00077 cudaGaugeField *gaugeSloppy = NULL;
00078 cudaGaugeField *gaugePrecondition = NULL;
00079 
00080 cudaGaugeField *gaugeFatPrecise = NULL;
00081 cudaGaugeField *gaugeFatSloppy = NULL;
00082 cudaGaugeField *gaugeFatPrecondition = NULL;
00083 
00084 cudaGaugeField *gaugeLongPrecise = NULL;
00085 cudaGaugeField *gaugeLongSloppy = NULL;
00086 cudaGaugeField *gaugeLongPrecondition = NULL;
00087 
00088 cudaCloverField *cloverPrecise = NULL;
00089 cudaCloverField *cloverSloppy = NULL;
00090 cudaCloverField *cloverPrecondition = NULL;
00091 
00092 cudaDeviceProp deviceProp;
00093 cudaStream_t *streams;
00094 
00095 int getGpuCount()
00096 {
00097   int count;
00098   cudaGetDeviceCount(&count);
00099   if (count <= 0){
00100     errorQuda("No devices supporting CUDA");
00101   }
00102   if(count > MAX_GPU_NUM_PER_NODE){
00103     errorQuda("gpu count(%d) is larger than limit\n", count);
00104   }
00105   return count;
00106 }
00107 
00108 void initQuda(int dev)
00109 {
00110   static int initialized = 0;
00111   if (initialized) {
00112     return;
00113   }
00114   initialized = 1;
00115 
00116 #if defined(GPU_DIRECT) && defined(MULTI_GPU) && (CUDA_VERSION == 4000)
00117   //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
00118   // not needed for CUDA >= 4.1
00119   char* cni_str = getenv("CUDA_NIC_INTEROP");
00120   if(cni_str == NULL){
00121     errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
00122   }
00123   int cni_int = atoi(cni_str);
00124   if (cni_int != 1){
00125     errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");    
00126   }
00127 #endif
00128 
00129   int deviceCount;
00130   cudaGetDeviceCount(&deviceCount);
00131   if (deviceCount == 0) {
00132     errorQuda("No devices supporting CUDA");
00133   }
00134 
00135   for(int i=0; i<deviceCount; i++) {
00136     cudaDeviceProp deviceProp;
00137     cudaGetDeviceProperties(&deviceProp, i);
00138     printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);
00139   }
00140 
00141 #ifdef QMP_COMMS
00142   int ndim;
00143   const int *dim;
00144 
00145   if ( QMP_is_initialized() != QMP_TRUE ) {
00146     errorQuda("QMP is not initialized");
00147   }
00148   num_QMP=QMP_get_number_of_nodes();
00149   rank_QMP=QMP_get_node_number();
00150   
00151   if (dev < 0) {
00152     dev = rank_QMP % deviceCount;
00153   }
00154   ndim = QMP_get_logical_number_of_dimensions();
00155   dim = QMP_get_logical_dimensions();
00156 #elif defined(MPI_COMMS)
00157   comm_init();
00158   if (dev < 0) {
00159     dev=comm_gpuid();
00160   }
00161 #else
00162   if (dev < 0) errorQuda("Invalid device number");
00163 #endif
00164   
00165   // Used for applying the gauge field boundary condition
00166   if( commCoords(3) == 0 ) qudaPt0=true;
00167   else qudaPt0=false;
00168 
00169   if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true;
00170   else qudaPtNm1=false;
00171 
00172   cudaGetDeviceProperties(&deviceProp, dev);
00173   if (deviceProp.major < 1) {
00174     errorQuda("Device %d does not support CUDA", dev);
00175   }
00176   
00177   printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);
00178 
00179   cudaSetDevice(dev);
00180 
00181 #ifdef QUDA_NUMA_SUPPORT
00182   if(numa_affinity_enabled){
00183     setNumaAffinity(dev);
00184   }
00185 #endif
00186   // if the device supports host-mapped memory, then enable this
00187   if(deviceProp.canMapHostMemory) cudaSetDeviceFlags(cudaDeviceMapHost);
00188 
00189   initCache();
00190   //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
00191   cudaGetDeviceProperties(&deviceProp, dev);
00192 
00193   streams = new cudaStream_t[Nstream];
00194   for (int i=0; i<Nstream; i++) {
00195     cudaStreamCreate(&streams[i]);
00196   }
00197 
00198   quda::initBlas();
00199 
00200   loadTuneCache(QUDA_VERBOSE);
00201 }
00202 
00203 
00204 
00205 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
00206 {
00207   checkGaugeParam(param);
00208 
00209   // Set the specific cpu parameters and create the cpu gauge field
00210   GaugeFieldParam gauge_param(h_gauge, *param);
00211 
00212   cpuGaugeField cpu(gauge_param);
00213 
00214   // switch the parameters for creating the mirror precise cuda gauge field
00215   gauge_param.create = QUDA_NULL_FIELD_CREATE;
00216   gauge_param.precision = param->cuda_prec;
00217   gauge_param.reconstruct = param->reconstruct;
00218   gauge_param.pad = param->ga_pad;
00219   gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION || 
00220                        gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
00221     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
00222 
00223   cudaGaugeField *precise = new cudaGaugeField(gauge_param);
00224   precise->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION);
00225 
00226   param->gaugeGiB += precise->GBytes();
00227 
00228   // switch the parameters for creating the mirror sloppy cuda gauge field
00229   gauge_param.precision = param->cuda_prec_sloppy;
00230   gauge_param.reconstruct = param->reconstruct_sloppy;
00231   gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION || 
00232                        gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
00233     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
00234   cudaGaugeField *sloppy = NULL;
00235   if (param->cuda_prec != param->cuda_prec_sloppy) {
00236     sloppy = new cudaGaugeField(gauge_param);
00237     sloppy->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION);
00238     param->gaugeGiB += sloppy->GBytes();
00239   } else {
00240     sloppy = precise;
00241   }
00242 
00243   // switch the parameters for creating the mirror preconditioner cuda gauge field
00244   gauge_param.precision = param->cuda_prec_precondition;
00245   gauge_param.reconstruct = param->reconstruct_precondition;
00246   gauge_param.order = (gauge_param.precision == QUDA_DOUBLE_PRECISION || 
00247                        gauge_param.reconstruct == QUDA_RECONSTRUCT_NO ) ?
00248     QUDA_FLOAT2_GAUGE_ORDER : QUDA_FLOAT4_GAUGE_ORDER;
00249   cudaGaugeField *precondition = NULL;
00250   if (param->cuda_prec_sloppy != param->cuda_prec_precondition) {
00251     precondition = new cudaGaugeField(gauge_param);
00252     precondition->loadCPUField(cpu, QUDA_CPU_FIELD_LOCATION);
00253     param->gaugeGiB += precondition->GBytes();
00254   } else {
00255     precondition = sloppy;
00256   }
00257   
00258   switch (param->type) {
00259   case QUDA_WILSON_LINKS:
00260     if (gaugePrecise) errorQuda("Precise gauge field already allocated");
00261     gaugePrecise = precise;
00262     if (gaugeSloppy) errorQuda("Sloppy gauge field already allocated");
00263     gaugeSloppy = sloppy;
00264     if (gaugePrecondition) errorQuda("Precondition gauge field already allocated");
00265     gaugePrecondition = precondition;
00266     break;
00267   case QUDA_ASQTAD_FAT_LINKS:
00268     if (gaugeFatPrecise) errorQuda("Precise gauge fat field already allocated");
00269     gaugeFatPrecise = precise;
00270     if (gaugeFatSloppy) errorQuda("Sloppy gauge fat field already allocated");
00271     gaugeFatSloppy = sloppy;
00272     if (gaugeFatPrecondition) errorQuda("Precondition gauge fat field already allocated");
00273     gaugeFatPrecondition = precondition;
00274     break;
00275   case QUDA_ASQTAD_LONG_LINKS:
00276     if (gaugeLongPrecise) errorQuda("Precise gauge long field already allocated");
00277     gaugeLongPrecise = precise;
00278     if (gaugeLongSloppy) errorQuda("Sloppy gauge long field already allocated");
00279     gaugeLongSloppy = sloppy;
00280     if (gaugeLongPrecondition) errorQuda("Precondition gauge long field already allocated");
00281     gaugeLongPrecondition = precondition;
00282     break;
00283   default:
00284     errorQuda("Invalid gauge type");   
00285   }
00286 
00287 }
00288 
00289 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
00290 {
00291   checkGaugeParam(param);
00292 
00293   // Set the specific cpu parameters and create the cpu gauge field
00294   GaugeFieldParam gauge_param(h_gauge, *param);
00295   cpuGaugeField cpuGauge(gauge_param);
00296   cudaGaugeField *cudaGauge = NULL;
00297   switch (param->type) {
00298   case QUDA_WILSON_LINKS:
00299     cudaGauge = gaugePrecise;
00300     break;
00301   case QUDA_ASQTAD_FAT_LINKS:
00302     cudaGauge = gaugeFatPrecise;
00303     break;
00304   case QUDA_ASQTAD_LONG_LINKS:
00305     cudaGauge = gaugeLongPrecise;
00306     break;
00307   default:
00308     errorQuda("Invalid gauge type");   
00309   }
00310 
00311   cudaGauge->saveCPUField(cpuGauge, QUDA_CPU_FIELD_LOCATION);
00312 }
00313 
00314 
00315 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
00316 {
00317 
00318   if (!h_clover && !h_clovinv) {
00319     errorQuda("loadCloverQuda() called with neither clover term nor inverse");
00320   }
00321   if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
00322     errorQuda("Half precision not supported on CPU");
00323   }
00324   if (gaugePrecise == NULL) {
00325     errorQuda("Gauge field must be loaded before clover");
00326   }
00327   if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) {
00328     errorQuda("Wrong dslash_type in loadCloverQuda()");
00329   }
00330 
00331   // determines whether operator is preconditioned when calling invertQuda()
00332   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
00333                    inv_param->solve_type == QUDA_NORMEQ_PC_SOLVE);
00334 
00335   // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
00336   bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
00337                       inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
00338 
00339   bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
00340                      inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);
00341 
00342   // We issue a warning only when it seems likely that the user is screwing up:
00343 
00344   // inverted clover term is required when applying preconditioned operator
00345   if (!h_clovinv && pc_solve && pc_solution) {
00346     warningQuda("Inverted clover term not loaded");
00347   }
00348 
00349   // uninverted clover term is required when applying unpreconditioned operator,
00350   // but note that dslashQuda() is always preconditioned
00351   if (!h_clover && !pc_solve && !pc_solution) {
00352     //warningQuda("Uninverted clover term not loaded");
00353   }
00354 
00355   // uninverted clover term is also required for "asymmetric" preconditioning
00356   if (!h_clover && pc_solve && pc_solution && asymmetric) {
00357     warningQuda("Uninverted clover term not loaded");
00358   }
00359 
00360   CloverFieldParam clover_param;
00361   clover_param.nDim = 4;
00362   for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
00363   clover_param.precision = inv_param->clover_cuda_prec;
00364   clover_param.pad = inv_param->cl_pad;
00365 
00366   cloverPrecise = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 
00367                                       inv_param->clover_order, clover_param);
00368   inv_param->cloverGiB = cloverPrecise->GBytes();
00369 
00370   // create the mirror sloppy clover field
00371   if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
00372     clover_param.precision = inv_param->clover_cuda_prec_sloppy;
00373     cloverSloppy = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 
00374                                        inv_param->clover_order, clover_param); 
00375     inv_param->cloverGiB += cloverSloppy->GBytes();
00376   } else {
00377     cloverSloppy = cloverPrecise;
00378   }
00379 
00380   // create the mirror preconditioner clover field
00381   if (inv_param->clover_cuda_prec_sloppy != inv_param->clover_cuda_prec_precondition &&
00382       inv_param->clover_cuda_prec_precondition != QUDA_INVALID_PRECISION) {
00383     clover_param.precision = inv_param->clover_cuda_prec_precondition;
00384     cloverPrecondition = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 
00385                                              inv_param->clover_order, clover_param); 
00386     inv_param->cloverGiB += cloverPrecondition->GBytes();
00387   } else {
00388     cloverPrecondition = cloverSloppy;
00389   }
00390 
00391 }
00392 
00393 void freeGaugeQuda(void) 
00394 {  
00395   if (gaugeSloppy != gaugePrecondition && gaugePrecondition) delete gaugePrecondition;
00396   if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy;
00397   if (gaugePrecise) delete gaugePrecise;
00398 
00399   gaugePrecondition = NULL;
00400   gaugeSloppy = NULL;
00401   gaugePrecise = NULL;
00402 
00403   if (gaugeLongSloppy != gaugeLongPrecondition && gaugeLongPrecondition) delete gaugeLongPrecondition;
00404   if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy;
00405   if (gaugeLongPrecise) delete gaugeLongPrecise;
00406 
00407   gaugeLongPrecondition = NULL;
00408   gaugeLongSloppy = NULL;
00409   gaugeLongPrecise = NULL;
00410 
00411   if (gaugeFatSloppy != gaugeFatPrecondition && gaugeFatPrecondition) delete gaugeFatPrecondition;
00412   if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy;
00413   if (gaugeFatPrecise) delete gaugeFatPrecise;
00414   
00415   gaugeFatPrecondition = NULL;
00416   gaugeFatSloppy = NULL;
00417   gaugeFatPrecise = NULL;
00418 }
00419 
00420 void freeCloverQuda(void)
00421 {
00422   if (cloverPrecondition != cloverSloppy && cloverPrecondition) delete cloverPrecondition;
00423   if (cloverSloppy != cloverPrecise && cloverSloppy) delete cloverSloppy;
00424   if (cloverPrecise) delete cloverPrecise;
00425 
00426   cloverPrecondition = NULL;
00427   cloverSloppy = NULL;
00428   cloverPrecise = NULL;
00429 }
00430 
00431 void endQuda(void)
00432 {
00433   cudaColorSpinorField::freeBuffer();
00434   cudaColorSpinorField::freeGhostBuffer();
00435   cpuColorSpinorField::freeGhostBuffer();
00436   freeGaugeQuda();
00437   freeCloverQuda();
00438 
00439   quda::endBlas();
00440 
00441   if (streams) {
00442     for (int i=0; i<Nstream; i++) cudaStreamDestroy(streams[i]);
00443     delete []streams;
00444     streams = NULL;
00445   }
00446 
00447   saveTuneCache(QUDA_VERBOSE);
00448 }
00449 
00450 
00451 void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
00452 {
00453   double kappa = inv_param->kappa;
00454   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
00455     kappa *= gaugePrecise->Anisotropy();
00456   }
00457 
00458   switch (inv_param->dslash_type) {
00459   case QUDA_WILSON_DSLASH:
00460     diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC;
00461     break;
00462   case QUDA_CLOVER_WILSON_DSLASH:
00463     diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC;
00464     break;
00465   case QUDA_DOMAIN_WALL_DSLASH:
00466     diracParam.type = pc ? QUDA_DOMAIN_WALLPC_DIRAC : QUDA_DOMAIN_WALL_DIRAC;
00467     break;
00468   case QUDA_ASQTAD_DSLASH:
00469     diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC;
00470     break;
00471   case QUDA_TWISTED_MASS_DSLASH:
00472     diracParam.type = pc ? QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC;
00473     break;
00474   default:
00475     errorQuda("Unsupported dslash_type");
00476   }
00477 
00478   diracParam.matpcType = inv_param->matpc_type;
00479   diracParam.dagger = inv_param->dagger;
00480   diracParam.gauge = gaugePrecise;
00481   diracParam.fatGauge = gaugeFatPrecise;
00482   diracParam.longGauge = gaugeLongPrecise;    
00483   diracParam.clover = cloverPrecise;
00484   diracParam.kappa = kappa;
00485   diracParam.mass = inv_param->mass;
00486   diracParam.m5 = inv_param->m5;
00487   diracParam.mu = inv_param->mu;
00488   diracParam.verbose = inv_param->verbosity;
00489 
00490   for (int i=0; i<4; i++) {
00491     diracParam.commDim[i] = 1;   // comms are always on
00492   }
00493 }
00494 
00495 
00496 void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
00497 {
00498   setDiracParam(diracParam, inv_param, pc);
00499 
00500   diracParam.gauge = gaugeSloppy;
00501   diracParam.fatGauge = gaugeFatSloppy;
00502   diracParam.longGauge = gaugeLongSloppy;    
00503   diracParam.clover = cloverSloppy;
00504 
00505   for (int i=0; i<4; i++) {
00506     diracParam.commDim[i] = 1;   // comms are always on
00507   }
00508 
00509 }
00510 
00511 // The preconditioner currently mimicks the sloppy operator with no comms
00512 void setDiracPreParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc)
00513 {
00514   setDiracParam(diracParam, inv_param, pc);
00515 
00516   diracParam.gauge = gaugePrecondition;
00517   diracParam.fatGauge = gaugeFatPrecondition;
00518   diracParam.longGauge = gaugeLongPrecondition;    
00519   diracParam.clover = cloverPrecondition;
00520 
00521   for (int i=0; i<4; i++) {
00522     diracParam.commDim[i] = 0; // comms are always off
00523   }
00524 
00525 }
00526 
00527 
00528 static void massRescale(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type, 
00529                         QudaMassNormalization mass_normalization, cudaColorSpinorField &b)
00530 {   
00531  if (verbosity >= QUDA_VERBOSE) {
00532     printfQuda("Mass rescale: Kappa is: %f\n", kappa);
00533     printfQuda("Mass rescale: mass normalization: %d\n", mass_normalization);
00534     double nin = norm2(b);
00535     printfQuda("Mass rescale: norm of source in = %f\n", nin);
00536   }
00537  
00538   if (dslash_type == QUDA_ASQTAD_DSLASH) {
00539     if (mass_normalization != QUDA_MASS_NORMALIZATION) {
00540       errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION");
00541     }
00542     return;
00543   }
00544 
00545   // multiply the source to compensate for normalization of the Dirac operator, if necessary
00546   switch (solution_type) {
00547   case QUDA_MAT_SOLUTION:
00548     if (mass_normalization == QUDA_MASS_NORMALIZATION ||
00549         mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00550       axCuda(2.0*kappa, b);
00551     }
00552     break;
00553   case QUDA_MATDAG_MAT_SOLUTION:
00554     if (mass_normalization == QUDA_MASS_NORMALIZATION ||
00555         mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00556       axCuda(4.0*kappa*kappa, b);
00557     }
00558     break;
00559   case QUDA_MATPC_SOLUTION:
00560     if (mass_normalization == QUDA_MASS_NORMALIZATION) {
00561         axCuda(4.0*kappa*kappa, b);
00562     } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00563         axCuda(2.0*kappa, b);
00564     }
00565     break;
00566   case QUDA_MATPCDAG_MATPC_SOLUTION:
00567     if (mass_normalization == QUDA_MASS_NORMALIZATION) {
00568         axCuda(16.0*pow(kappa,4), b);
00569     } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00570         axCuda(4.0*kappa*kappa, b);
00571     }
00572     break;
00573   default:
00574     errorQuda("Solution type %d not supported", solution_type);
00575   }
00576 
00577   if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");   
00578  if (verbosity >= QUDA_VERBOSE) {
00579     printfQuda("Mass rescale: Kappa is: %f\n", kappa);
00580     printfQuda("Mass rescale: mass normalization: %d\n", mass_normalization);
00581     double nin = norm2(b);
00582     printfQuda("Mass rescale: norm of source out = %f\n", nin);
00583   }
00584 
00585 }
00586 
00587 static void massRescaleCoeff(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type, 
00588                              QudaMassNormalization mass_normalization, double &coeff)
00589 {    
00590   if (dslash_type == QUDA_ASQTAD_DSLASH) {
00591     if (mass_normalization != QUDA_MASS_NORMALIZATION) {
00592       errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION");
00593     }
00594     return;
00595   }
00596 
00597   // multiply the source to compensate for normalization of the Dirac operator, if necessary
00598   switch (solution_type) {
00599   case QUDA_MAT_SOLUTION:
00600     if (mass_normalization == QUDA_MASS_NORMALIZATION ||
00601         mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00602       coeff *= 2.0*kappa;
00603     }
00604     break;
00605   case QUDA_MATDAG_MAT_SOLUTION:
00606     if (mass_normalization == QUDA_MASS_NORMALIZATION ||
00607         mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00608       coeff *= 4.0*kappa*kappa;
00609     }
00610     break;
00611   case QUDA_MATPC_SOLUTION:
00612     if (mass_normalization == QUDA_MASS_NORMALIZATION) {
00613       coeff *= 4.0*kappa*kappa;
00614     } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00615       coeff *= 2.0*kappa;
00616     }
00617     break;
00618   case QUDA_MATPCDAG_MATPC_SOLUTION:
00619     if (mass_normalization == QUDA_MASS_NORMALIZATION) {
00620         coeff*=16.0*pow(kappa,4);
00621     } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00622         coeff*=4.0*kappa*kappa;
00623     }
00624     break;
00625   default:
00626     errorQuda("Solution type %d not supported", solution_type);
00627   }
00628 
00629   if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");   
00630 }
00631 
00632 
00633 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
00634 {
00635   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), 1);
00636   ColorSpinorParam cudaParam(cpuParam, *inv_param);
00637 
00638   cpuColorSpinorField hIn(cpuParam);
00639 
00640   cudaColorSpinorField in(hIn, cudaParam);
00641 
00642   cudaParam.create = QUDA_NULL_FIELD_CREATE;
00643   cudaColorSpinorField out(in, cudaParam);
00644 
00645   if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) {
00646     if (parity == QUDA_EVEN_PARITY) {
00647       parity = QUDA_ODD_PARITY;
00648     } else {
00649       parity = QUDA_EVEN_PARITY;
00650     }
00651     axCuda(gaugePrecise->Anisotropy(), in);
00652   }
00653   bool pc = true;
00654 
00655   DiracParam diracParam;
00656   setDiracParam(diracParam, inv_param, pc);
00657 
00658   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
00659   dirac->Dslash(out, in, parity); // apply the operator
00660   delete dirac; // clean up
00661 
00662   cpuParam.v = h_out;
00663   cpuColorSpinorField hOut(cpuParam);
00664   out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out;
00665 }
00666 
00667 
00668 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
00669 {
00670   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
00671              inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
00672 
00673   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc);
00674 
00675   ColorSpinorParam cudaParam(cpuParam, *inv_param);
00676 
00677   cpuColorSpinorField hIn(cpuParam);
00678   cudaColorSpinorField in(hIn, cudaParam);
00679   cudaParam.create = QUDA_NULL_FIELD_CREATE;
00680   cudaColorSpinorField out(in, cudaParam);
00681 
00682   DiracParam diracParam;
00683   setDiracParam(diracParam, inv_param, pc);
00684 
00685   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
00686   dirac->M(out, in); // apply the operator
00687   delete dirac; // clean up
00688 
00689   double kappa = inv_param->kappa;
00690   if (pc) {
00691     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
00692       axCuda(0.25/(kappa*kappa), out);
00693     } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00694       axCuda(0.5/kappa, out);
00695     }
00696   } else {
00697     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
00698         inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00699       axCuda(0.5/kappa, out);
00700     }
00701   }
00702 
00703   cpuParam.v = h_out;
00704   cpuColorSpinorField hOut(cpuParam);
00705   out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out;
00706 }
00707 
00708 
00709 void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
00710 {
00711   bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
00712              inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
00713 
00714   ColorSpinorParam cpuParam(h_in, *inv_param, gaugePrecise->X(), pc);
00715   ColorSpinorParam cudaParam(cpuParam, *inv_param);
00716 
00717   cpuColorSpinorField hIn(cpuParam);
00718   cudaColorSpinorField in(hIn, cudaParam);
00719   cudaParam.create = QUDA_NULL_FIELD_CREATE;
00720   cudaColorSpinorField out(in, cudaParam);
00721 
00722   //  double kappa = inv_param->kappa;
00723   //  if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) kappa *= gaugePrecise->anisotropy;
00724 
00725   DiracParam diracParam;
00726   setDiracParam(diracParam, inv_param, pc);
00727 
00728   Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator
00729   dirac->MdagM(out, in); // apply the operator
00730   delete dirac; // clean up
00731 
00732   double kappa = inv_param->kappa;
00733   if (pc) {
00734     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) {
00735       axCuda(1.0/pow(2.0*kappa,4), out);
00736     } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00737       axCuda(0.25/(kappa*kappa), out);
00738     }
00739   } else {
00740     if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION ||
00741         inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
00742       axCuda(0.25/(kappa*kappa), out);
00743     }
00744   }
00745 
00746   cpuParam.v = h_out;
00747   cpuColorSpinorField hOut(cpuParam);
00748   out.saveCPUSpinorField(hOut); // since this is a reference, this won't work: hOut = out;
00749 }
00750 
00751 void createDirac(Dirac *&d, Dirac *&dSloppy, Dirac *&dPre, QudaInvertParam &param, const bool pc_solve)
00752 {
00753   DiracParam diracParam;
00754   DiracParam diracSloppyParam;
00755   DiracParam diracPreParam;
00756 
00757   setDiracParam(diracParam, &param, pc_solve);
00758   setDiracSloppyParam(diracSloppyParam, &param, pc_solve);
00759   setDiracPreParam(diracPreParam, &param, pc_solve);
00760     
00761   d = Dirac::create(diracParam); // create the Dirac operator   
00762   dSloppy = Dirac::create(diracSloppyParam);
00763   dPre = Dirac::create(diracPreParam);
00764 }
00765 
00766 cudaGaugeField* checkGauge(QudaInvertParam *param) {
00767   cudaGaugeField *cudaGauge = NULL;
00768   if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
00769     if (gaugePrecise == NULL) errorQuda("Precise gauge field doesn't exist");
00770     if (gaugeSloppy == NULL) errorQuda("Sloppy gauge field doesn't exist");
00771     if (gaugePrecondition == NULL) errorQuda("Precondition gauge field doesn't exist");
00772     cudaGauge = gaugePrecise;
00773   } else {
00774     if (gaugeFatPrecise == NULL) errorQuda("Precise gauge fat field doesn't exist");
00775     if (gaugeFatSloppy == NULL) errorQuda("Sloppy gauge fat field doesn't exist");
00776     if (gaugeFatPrecondition == NULL) errorQuda("Precondition gauge fat field doesn't exist");
00777 
00778     if (gaugeLongPrecise == NULL) errorQuda("Precise gauge long field doesn't exist");
00779     if (gaugeLongSloppy == NULL) errorQuda("Sloppy gauge long field doesn't exist");
00780     if (gaugeLongPrecondition == NULL) errorQuda("Precondition gauge long field doesn't exist");
00781     cudaGauge = gaugeFatPrecise;
00782   }
00783   return cudaGauge;
00784 }
00785 
00786 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
00787 {
00788   // check the gauge fields have been created
00789   cudaGaugeField *cudaGauge = checkGauge(param);
00790 
00791   checkInvertParam(param);
00792   verbosity = param->verbosity;
00793 
00794   bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE ||
00795                    param->solve_type == QUDA_NORMEQ_PC_SOLVE);
00796 
00797   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION ||
00798                       param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
00799 
00800   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
00801   if (!pc_solve) param->spinorGiB *= 2;
00802   param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float));
00803   if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) {
00804     param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30);
00805   } else {
00806     param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 8 : 9)/(double)(1<<30);
00807   }
00808 
00809   param->secs = 0;
00810   param->gflops = 0;
00811   param->iter = 0;
00812 
00813   Dirac *d = NULL;
00814   Dirac *dSloppy = NULL;
00815   Dirac *dPre = NULL;
00816 
00817   // create the dirac operator
00818   createDirac(d, dSloppy, dPre, *param, pc_solve);
00819 
00820   Dirac &dirac = *d;
00821   Dirac &diracSloppy = *dSloppy;
00822   Dirac &diracPre = *dPre;
00823 
00824   cpuColorSpinorField *h_b = NULL;
00825   cpuColorSpinorField *h_x = NULL;
00826   cudaColorSpinorField *b = NULL;
00827   cudaColorSpinorField *x = NULL;
00828   cudaColorSpinorField *in = NULL;
00829   cudaColorSpinorField *out = NULL;
00830 
00831   const int *X = cudaGauge->X();
00832 
00833   // wrap CPU host side pointers
00834   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
00835   h_b = new cpuColorSpinorField(cpuParam);
00836   cpuParam.v = hp_x;
00837   h_x = new cpuColorSpinorField(cpuParam);
00838     
00839   // download source
00840   ColorSpinorParam cudaParam(cpuParam, *param);     
00841   cudaParam.create = QUDA_COPY_FIELD_CREATE;
00842   b = new cudaColorSpinorField(*h_b, cudaParam); 
00843 
00844   if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess
00845     x = new cudaColorSpinorField(*h_x, cudaParam); // solution  
00846   } else { // zero initial guess
00847     cudaParam.create = QUDA_ZERO_FIELD_CREATE;
00848     x = new cudaColorSpinorField(cudaParam); // solution
00849   }
00850     
00851   if (param->verbosity >= QUDA_VERBOSE) {
00852     double nh_b = norm2(*h_b);
00853     double nb = norm2(*b);
00854     printfQuda("Source: CPU = %f, CUDA copy = %f\n", nh_b, nb);
00855   }
00856 
00857   setDslashTuning(param->tune, param->verbosity);
00858   quda::setBlasTuning(param->tune, param->verbosity);
00859 
00860   dirac.prepare(in, out, *x, *b, param->solution_type);
00861   if (param->verbosity >= QUDA_VERBOSE) {
00862     double nin = norm2(*in);
00863     printfQuda("Prepared source = %f\n", nin);   
00864   }
00865 
00866   massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *in);
00867 
00868   if (param->verbosity >= QUDA_VERBOSE) {
00869     double nin = norm2(*in);
00870     printfQuda("Prepared source post mass rescale = %f\n", nin);   
00871   }
00872   
00873 
00874   switch (param->inv_type) {
00875   case QUDA_CG_INVERTER:
00876     if (param->solution_type != QUDA_MATDAG_MAT_SOLUTION && param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) {
00877       copyCuda(*out, *in);
00878       dirac.Mdag(*in, *out);
00879     }
00880     {
00881       DiracMdagM m(dirac), mSloppy(diracSloppy);
00882       CG cg(m, mSloppy, *param);
00883       cg(*out, *in);
00884     }
00885     break;
00886   case QUDA_BICGSTAB_INVERTER:
00887     if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
00888       DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
00889       BiCGstab bicg(m, mSloppy, mPre, *param);
00890       bicg(*out, *in);
00891       copyCuda(*in, *out);
00892     }
00893     {
00894       DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
00895       BiCGstab bicg(m, mSloppy, mPre, *param);
00896       bicg(*out, *in);
00897     }
00898     break;
00899   case QUDA_GCR_INVERTER:
00900     if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {
00901       DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre);
00902       GCR gcr(m, mSloppy, mPre, *param);
00903       gcr(*out, *in);
00904       copyCuda(*in, *out);
00905     }
00906     {
00907       DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre);
00908       GCR gcr(m, mSloppy, mPre, *param);
00909       gcr(*out, *in);
00910     }
00911     break;
00912   default:
00913     errorQuda("Inverter type %d not implemented", param->inv_type);
00914   }
00915   
00916   if (param->verbosity >= QUDA_VERBOSE){
00917    double nx = norm2(*x);
00918    printfQuda("Solution = %f\n",nx);
00919   }
00920   dirac.reconstruct(*x, *b, param->solution_type);
00921   
00922   x->saveCPUSpinorField(*h_x); // since this is a reference, this won't work: h_x = x;
00923   
00924   if (param->verbosity >= QUDA_VERBOSE){
00925     double nx = norm2(*x);
00926     double nh_x = norm2(*h_x);
00927     printfQuda("Reconstructed: CUDA solution = %f, CPU copy = %f\n", nx, nh_x);
00928   }
00929   
00930   delete h_b;
00931   delete h_x;
00932   delete b;
00933   delete x;
00934   
00935   return;
00936 }
00937 
00938 
00944 void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param,
00945                           double* offsets, int num_offsets, double* residue_sq)
00946 {
00947   // check the gauge fields have been created
00948   cudaGaugeField *cudaGauge = checkGauge(param);
00949   checkInvertParam(param);
00950 
00951   param->num_offset = num_offsets;
00952   if (param->num_offset > QUDA_MAX_MULTI_SHIFT) 
00953     errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", 
00954               param->num_offset, QUDA_MAX_MULTI_SHIFT);
00955   for (int i=0; i<param->num_offset; i++) {
00956     param->offset[i] = offsets[i];
00957     param->tol_offset[i] = residue_sq[i];
00958   }
00959 
00960   verbosity = param->verbosity;
00961 
00962   // Are we doing a preconditioned solve */
00963   /* What does NormEq solve mean in the shifted case? 
00964    */
00965   if (param->solve_type != QUDA_NORMEQ_PC_SOLVE &&
00966       param->solve_type != QUDA_NORMEQ_SOLVE) { 
00967     errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n");
00968   }
00969 
00970   bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE);
00971 
00972   // In principle one can do a MATPC Solution for a hermitian M_pc
00973   // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution.
00974   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION ||
00975                       param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION );
00976 
00977   // No of GiB in a checkerboard of a spinor
00978   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
00979   if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
00980   
00981   // **** WARNING *** this may not match implementation... 
00982   if( param->inv_type == QUDA_CG_INVERTER ) { 
00983     // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
00984     param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
00985   }
00986   else {
00987     // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
00988     // (Jegerlehner hep-lat/9612014 eq (3.13)
00989     param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
00990   }
00991 
00992   // Timing and FLOP counters
00993   param->secs = 0;
00994   param->gflops = 0;
00995   param->iter = 0;
00996   
00997   // Find the smallest shift and its offset.
00998   double low_offset = param->offset[0];
00999   int low_index = 0;
01000   for (int i=1;i < param->num_offset;i++){
01001     if (param->offset[i] < low_offset){
01002       low_offset = param->offset[i];
01003       low_index = i;
01004     }
01005   }
01006   
01007   // Host pointers for x, take a copy of the input host pointers
01008   void** hp_x;
01009   hp_x = new void* [ param->num_offset ];
01010 
01011   void* hp_b = _hp_b;
01012   for(int i=0;i < param->num_offset;i++){
01013     hp_x[i] = _hp_x[i];
01014   }
01015   
01016   // Now shift things so that the vector with the smallest shift 
01017   // is in the first position of the array
01018   if (low_index != 0){
01019     void* tmp = hp_x[0];
01020     hp_x[0] = hp_x[low_index] ;
01021     hp_x[low_index] = tmp;
01022     
01023     double tmp1 = param->offset[0];
01024     param->offset[0]= param->offset[low_index];
01025     param->offset[low_index] =tmp1;
01026   }
01027     
01028   // Create the matrix.
01029   // The way this works is that createDirac will create 'd' and 'dSloppy'
01030   // which are global. We then grab these with references...
01031   //
01032   // Balint: Isn't there a  nice construction pattern we could use here? This is 
01033   // expedient but yucky.
01034   //  DiracParam diracParam; 
01035   if (param->dslash_type == QUDA_ASQTAD_DSLASH){
01036     param->mass = sqrt(param->offset[0]/4);  
01037   }
01038 
01039   Dirac *d = NULL;
01040   Dirac *dSloppy = NULL;
01041   Dirac *dPre = NULL;
01042 
01043   // create the dirac operator
01044   createDirac(d, dSloppy, dPre, *param, pc_solve);
01045   Dirac &dirac = *d;
01046   Dirac &diracSloppy = *dSloppy;
01047 
01048   cpuColorSpinorField *h_b = NULL; // Host RHS
01049   cpuColorSpinorField **h_x = NULL;
01050   cudaColorSpinorField *b = NULL;   // Cuda RHS
01051   cudaColorSpinorField **x = NULL;  // Cuda Solutions
01052 
01053   // Grab the dimension array of the input gauge field.
01054   const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ? 
01055     gaugeFatPrecise->X() : gaugePrecise->X();
01056 
01057   // Wrap CPU host side pointers
01058   // 
01059   // Balint: This creates a ColorSpinorParam struct, from the host data pointer, 
01060   // the definitions in param, the dimensions X, and whether the solution is on 
01061   // a checkerboard instruction or not. These can then be used as 'instructions' 
01062   // to create the actual colorSpinorField
01063   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
01064   h_b = new cpuColorSpinorField(cpuParam);
01065 
01066   h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
01067   for(int i=0; i < param->num_offset; i++) { 
01068     cpuParam.v = hp_x[i];
01069     h_x[i] = new cpuColorSpinorField(cpuParam);
01070   }
01071 
01072   // Now I need a colorSpinorParam for the device
01073   ColorSpinorParam cudaParam(cpuParam, *param);
01074   // This setting will download a host vector
01075   cudaParam.create = QUDA_COPY_FIELD_CREATE;
01076   b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
01077 
01078   // Create the solution fields filled with zero
01079   x = new cudaColorSpinorField* [ param->num_offset ];
01080   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
01081   for(int i=0; i < param->num_offset; i++) { 
01082     x[i] = new cudaColorSpinorField(cudaParam);
01083   }
01084 
01085   // Check source norms
01086   if( param->verbosity >= QUDA_VERBOSE ) {
01087     double nh_b = norm2(*h_b);
01088     double nb = norm2(*b);
01089     printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb);
01090   }
01091 
01092   setDslashTuning(param->tune, param->verbosity);
01093   quda::setBlasTuning(param->tune, param->verbosity);
01094   
01095   massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *b);
01096   double *unscaled_shifts = new double [param->num_offset];
01097   for(int i=0; i < param->num_offset; i++){ 
01098     unscaled_shifts[i] = param->offset[i];
01099     massRescaleCoeff(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, param->offset[i]);
01100   }
01101 
01102   {
01103     DiracMdagM m(dirac), mSloppy(diracSloppy);
01104     MultiShiftCG cg_m(m, mSloppy, *param);
01105     cg_m(x, *b);  
01106   }
01107 
01108   // restore shifts -- avoid side effects
01109   for(int i=0; i < param->num_offset; i++) { 
01110     param->offset[i] = unscaled_shifts[i];
01111   }
01112 
01113   delete [] unscaled_shifts;
01114 
01115   for(int i=0; i < param->num_offset; i++) { 
01116     x[i]->saveCPUSpinorField(*h_x[i]);
01117   }
01118 
01119   for(int i=0; i < param->num_offset; i++){ 
01120     delete h_x[i];
01121     delete x[i];
01122   }
01123 
01124   delete h_b;
01125   delete b;
01126 
01127   delete [] h_x;
01128   delete [] x;
01129 
01130   delete [] hp_x;
01131 
01132   return;
01133 }
01134 
01135 /************************************** Ugly Mixed precision multishift CG solver ****************************/
01136 
01137 static void* fatlink;
01138 static int fatlink_pad;
01139 static void* longlink;
01140 static int longlink_pad;
01141 static QudaReconstructType longlink_recon;
01142 static QudaReconstructType longlink_recon_sloppy;
01143 static QudaGaugeParam* gauge_param;
01144 
01145 void 
01146 record_gauge(int* X, void *_fatlink, int _fatlink_pad, void* _longlink, int _longlink_pad, 
01147              QudaReconstructType _longlink_recon, QudaReconstructType _longlink_recon_sloppy,
01148              QudaGaugeParam *_param)
01149 {
01150   
01151 
01152   //the X and precision in fatlink must be set because we use them in dirac creation
01153   //See dirac_staggered.cpp
01154   /*
01155   for(int i =0;i < 4;i++){
01156     cudaFatLinkPrecise.X[i] = cudaFatLinkSloppy.X[i] = X[i];
01157   }
01158   cudaFatLinkPrecise.precision = _param->cuda_prec;
01159   cudaFatLinkSloppy.precision = _param->cuda_prec_sloppy;
01160   */
01161 
01162   fatlink = _fatlink;
01163   fatlink_pad = _fatlink_pad;
01164   
01165   longlink = _longlink;
01166   longlink_pad = _longlink_pad;
01167   longlink_recon = _longlink_recon;
01168   longlink_recon_sloppy = _longlink_recon_sloppy;
01169   
01170   gauge_param = _param;
01171 
01172   return;
01173 
01174 }
01175 
01176 
01177 void 
01178 do_create_precise_cuda_gauge(void)
01179 {
01180   QudaPrecision prec = gauge_param->cuda_prec;
01181   QudaPrecision prec_sloppy = gauge_param->cuda_prec_sloppy;
01182   QudaPrecision prec_precondition = gauge_param->cuda_prec_precondition;
01183   
01184   //the sloppy gauge field will be filled, and needs to backup
01185   cudaGaugeField *tmp_fat = gaugeFatSloppy;
01186   cudaGaugeField *tmp_long= gaugeLongSloppy;
01187   cudaGaugeField *tmp_fat_precondition = gaugeFatPrecondition;
01188   cudaGaugeField *tmp_long_precondition = gaugeLongPrecondition;
01189 
01190   gaugeFatPrecise = gaugeFatSloppy = gaugeFatPrecondition = NULL;
01191   gaugeLongPrecise = gaugeLongSloppy = gaugeLongPrecondition = NULL;
01192   
01193   //create precise links
01194   gauge_param->cuda_prec = gauge_param->cuda_prec_sloppy = gauge_param->cuda_prec_precondition = prec;
01195   gauge_param->type = QUDA_ASQTAD_FAT_LINKS;
01196   gauge_param->ga_pad = fatlink_pad;
01197   gauge_param->reconstruct = gauge_param->reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
01198   loadGaugeQuda(fatlink, gauge_param);
01199 
01200   
01201   gauge_param->type = QUDA_ASQTAD_LONG_LINKS;
01202   gauge_param->ga_pad = longlink_pad;
01203   gauge_param->reconstruct = longlink_recon;
01204   gauge_param->reconstruct_sloppy = longlink_recon_sloppy;
01205   loadGaugeQuda(longlink, gauge_param);
01206 
01207   //set prec/prec_sloppy it back
01208   gauge_param->cuda_prec = prec;
01209   gauge_param->cuda_prec_sloppy =prec_sloppy;
01210   gauge_param->cuda_prec_precondition = prec_precondition;
01211 
01212   //set the sloopy gauge filed back
01213   gaugeFatSloppy = tmp_fat;
01214   gaugeLongSloppy = tmp_long;
01215   
01216   gaugeFatPrecondition = tmp_fat_precondition;
01217   gaugeLongPrecondition = tmp_long_precondition;
01218   return;
01219 }
01220 
01221 void 
01222 do_create_sloppy_cuda_gauge(void)
01223 {
01224 
01225   QudaPrecision prec = gauge_param->cuda_prec;
01226   QudaPrecision prec_sloppy = gauge_param->cuda_prec_sloppy;  
01227   QudaPrecision prec_precondition = gauge_param->cuda_prec_precondition;
01228 
01229   //the precise gauge field will be filled, and needs to backup
01230   cudaGaugeField *tmp_fat = gaugeFatPrecise;
01231   cudaGaugeField *tmp_long = gaugeLongPrecise;
01232   cudaGaugeField *tmp_fat_precondition = gaugeFatPrecondition;
01233   cudaGaugeField *tmp_long_precondition = gaugeLongPrecondition;
01234   
01235   gaugeFatPrecise = gaugeFatSloppy = gaugeFatPrecondition = NULL;
01236   gaugeLongPrecise = gaugeLongSloppy = gaugeLongPrecondition= NULL;
01237 
01238   //create sloppy links
01239   gauge_param->cuda_prec = gauge_param->cuda_prec_sloppy = 
01240     gauge_param->cuda_prec_precondition = prec_sloppy; 
01241   gauge_param->type = QUDA_ASQTAD_FAT_LINKS;
01242   gauge_param->ga_pad = fatlink_pad;
01243   gauge_param->reconstruct = gauge_param->reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
01244   loadGaugeQuda(fatlink, gauge_param);
01245   
01246   gauge_param->type = QUDA_ASQTAD_LONG_LINKS;
01247   gauge_param->ga_pad = longlink_pad;
01248   gauge_param->reconstruct = longlink_recon;
01249   gauge_param->reconstruct_sloppy = longlink_recon_sloppy;
01250   loadGaugeQuda(longlink, gauge_param);
01251   
01252   //set prec/prec_sloppy it back
01253   gauge_param->cuda_prec = prec;
01254   gauge_param->cuda_prec_sloppy =prec_sloppy;
01255   gauge_param->cuda_prec_precondition = prec_precondition;
01256   
01257   //set the sloopy gauge filed back
01258   gaugeFatPrecise = tmp_fat;
01259   gaugeLongPrecise = tmp_long;  
01260   
01261   gaugeFatPrecondition = tmp_fat_precondition;
01262   gaugeLongPrecondition = tmp_long_precondition;
01263 
01264   return;
01265 }
01266 
01267 
01268 void 
01269 invertMultiShiftQudaMixed(void **_hp_x, void *_hp_b, QudaInvertParam *param,
01270                           double* offsets, int num_offsets, double* residue_sq)
01271 {
01272 
01273 
01274   QudaPrecision high_prec = param->cuda_prec;
01275   param->cuda_prec = param->cuda_prec_sloppy;
01276   
01277   param->num_offset = num_offsets;
01278   if (param->num_offset > QUDA_MAX_MULTI_SHIFT) 
01279     errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", 
01280               param->num_offset, QUDA_MAX_MULTI_SHIFT);
01281   for (int i=0; i<param->num_offset; i++) {
01282     param->offset[i] = offsets[i];
01283     param->tol_offset[i] = residue_sq[i];
01284   }
01285 
01286   do_create_sloppy_cuda_gauge();
01287 
01288   // check the gauge fields have been created
01289   //cudaGaugeField *cudaGauge = checkGauge(param);
01290   cudaGaugeField *cudaGauge = gaugeFatSloppy;
01291 
01292 
01293   checkInvertParam(param);
01294   verbosity = param->verbosity;
01295 
01296   // Are we doing a preconditioned solve */
01297   /* What does NormEq solve mean in the shifted case? 
01298    */
01299   if (param->solve_type != QUDA_NORMEQ_PC_SOLVE &&
01300       param->solve_type != QUDA_NORMEQ_SOLVE) { 
01301     errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n");
01302   }
01303 
01304   bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE);
01305 
01306   // In principle one can do a MATPC Solution for a hermitian M_pc
01307   // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution.
01308   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION ||
01309                       param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION );
01310 
01311   // No of GiB in a checkerboard of a spinor
01312   param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
01313   if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
01314   
01315   // **** WARNING *** this may not match implementation... 
01316   if( param->inv_type == QUDA_CG_INVERTER ) { 
01317     // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
01318     param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
01319   }
01320   else {
01321     // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
01322     // (Jegerlehner hep-lat/9612014 eq (3.13)
01323     param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
01324   }
01325 
01326   // Timing and FLOP counters
01327   param->secs = 0;
01328   param->gflops = 0;
01329   param->iter = 0;
01330   
01331   // Find the smallest shift and its offset.
01332   double low_offset = param->offset[0];
01333   int low_index = 0;
01334   for (int i=1;i < param->num_offset;i++){
01335     if (param->offset[i] < low_offset){
01336       low_offset = param->offset[i];
01337       low_index = i;
01338     }
01339   }
01340   
01341   // Host pointers for x, take a copy of the input host pointers
01342   void** hp_x;
01343   hp_x = new void* [ param->num_offset ];
01344 
01345   void* hp_b = _hp_b;
01346   for(int i=0;i < param->num_offset;i++){
01347     hp_x[i] = _hp_x[i];
01348   }
01349   
01350   // Now shift things so that the vector with the smallest shift 
01351   // is in the first position of the array
01352   if (low_index != 0){
01353     void* tmp = hp_x[0];
01354     hp_x[0] = hp_x[low_index] ;
01355     hp_x[low_index] = tmp;
01356     
01357     double tmp1 = param->offset[0];
01358     param->offset[0]= param->offset[low_index];
01359     param->offset[low_index] =tmp1;
01360   }
01361     
01362   // Create the matrix.
01363   // The way this works is that createDirac will create 'd' and 'dSloppy'
01364   // which are global. We then grab these with references...
01365   //
01366   // Balint: Isn't there a  nice construction pattern we could use here? This is 
01367   // expedient but yucky.
01368   if (param->dslash_type == QUDA_ASQTAD_DSLASH){
01369     param->mass = sqrt(param->offset[0]/4);  
01370   }
01371   //FIXME: Dirty dirty hack
01372   // At this moment, the precise fat gauge is not created (NULL)
01373   // but we set it to be the same as sloppy to avoid segfault 
01374   // in creating the dirac since it is needed 
01375   gaugeFatPrecondition = gaugeFatPrecise = gaugeFatSloppy;
01376 
01377   Dirac *d = NULL;
01378   Dirac *dSloppy = NULL;
01379   Dirac *dPre = NULL;
01380 
01381   // create the dirac operator
01382   createDirac(d, dSloppy, dPre, *param, pc_solve);
01383   // resetting to NULL
01384   gaugeFatPrecise = NULL; gaugeFatPrecondition = NULL;
01385 
01386   Dirac &diracSloppy = *dSloppy;
01387 
01388   cpuColorSpinorField *h_b = NULL; // Host RHS
01389   cpuColorSpinorField **h_x = NULL;
01390   cudaColorSpinorField *b = NULL;   // Cuda RHS
01391   cudaColorSpinorField **x = NULL;  // Cuda Solutions
01392 
01393   // Grab the dimension array of the input gauge field.
01394   const int *X = cudaGauge->X();
01395 
01396   // Wrap CPU host side pointers
01397   // 
01398   // Balint: This creates a ColorSpinorParam struct, from the host data pointer, 
01399   // the definitions in param, the dimensions X, and whether the solution is on 
01400   // a checkerboard instruction or not. These can then be used as 'instructions' 
01401   // to create the actual colorSpinorField
01402   ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
01403   h_b = new cpuColorSpinorField(cpuParam);
01404 
01405   h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
01406   for(int i=0; i < param->num_offset; i++) { 
01407     cpuParam.v = hp_x[i];
01408     h_x[i] = new cpuColorSpinorField(cpuParam);
01409   }
01410 
01411   // Now I need a colorSpinorParam for the device
01412   ColorSpinorParam cudaParam(cpuParam, *param);
01413   // This setting will download a host vector
01414   cudaParam.create = QUDA_COPY_FIELD_CREATE;
01415   b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it
01416 
01417   // Create the solution fields filled with zero
01418   x = new cudaColorSpinorField* [ param->num_offset ];
01419   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
01420   for(int i=0; i < param->num_offset; i++) { 
01421     x[i] = new cudaColorSpinorField(cudaParam);
01422   }
01423 
01424   // Check source norms
01425   if( param->verbosity >= QUDA_VERBOSE ) {
01426     double nh_b = norm2(*h_b);
01427     double nb = norm2(*b);
01428     printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb);
01429   }
01430 
01431   // tune the Dirac Kernel
01432   setDslashTuning(param->tune, param->verbosity);
01433   quda::setBlasTuning(param->tune, param->verbosity);
01434   // if set, tuning will happen in the first multishift call
01435   
01436   massRescale(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, *b);
01437   double *rescaled_shifts = new double [param->num_offset];
01438   for(int i=0; i < param->num_offset; i++){ 
01439     rescaled_shifts[i] = param->offset[i];
01440     massRescaleCoeff(param->dslash_type, param->kappa, param->solution_type, param->mass_normalization, rescaled_shifts[i]);
01441   }
01442 
01443   {
01444     DiracMdagM m(diracSloppy);
01445     MultiShiftCG cg_m(m, m, *param);
01446     cg_m(x, *b);  
01447   }
01448     
01449   delete [] rescaled_shifts;
01450   
01451   delete b;
01452   
01453   int total_iters = 0;
01454   double total_secs = 0;
01455   double total_gflops = 0;
01456   total_iters += param->iter;
01457   total_secs  += param->secs;
01458   total_gflops += param->gflops;
01459   
01460   cudaParam.precision = high_prec;
01461   param->cuda_prec = high_prec;
01462   do_create_precise_cuda_gauge();
01463   
01464   b = new cudaColorSpinorField(cudaParam);
01465   *b = *h_b;
01466   
01467   /*FIXME: to avoid setfault*/
01468   gaugeFatPrecondition =gaugeFatSloppy;
01469 
01470   if (dPre && dPre != dSloppy) delete dPre;
01471   if (dSloppy && dSloppy != d) delete dSloppy;
01472   if (d) delete d;
01473 
01474   // create the dirac operator
01475   createDirac(d, dSloppy, dPre, *param, pc_solve);
01476 
01477   gaugeFatPrecondition = NULL;
01478   {
01479     Dirac& dirac2 = *d;
01480     Dirac& diracSloppy2 = *dSloppy;
01481     
01482     cudaColorSpinorField* high_x;
01483     high_x = new cudaColorSpinorField(cudaParam);
01484     for(int i=0;i < param->num_offset; i++){
01485       *high_x  = *x[i];
01486       delete x[i];
01487       double mass = sqrt(param->offset[i]/4);
01488       dirac2.setMass(mass);
01489       diracSloppy2.setMass(mass);
01490       DiracMdagM m(dirac2), mSloppy(diracSloppy2);
01491       CG cg(m, mSloppy, *param);
01492       cg(*high_x, *b);      
01493       total_iters += param->iter;
01494       total_secs  += param->secs;
01495       total_gflops += param->gflops;      
01496       high_x->saveCPUSpinorField(*h_x[i]);      
01497     }
01498     
01499     param->iter = total_iters;
01500     
01501     delete high_x;
01502   }
01503   
01504   
01505   for(int i=0; i < param->num_offset; i++){ 
01506     delete h_x[i];
01507   }
01508 
01509   delete h_b;
01510   delete b;
01511 
01512   delete [] h_x;
01513   delete [] x;
01514 
01515   delete [] hp_x;
01516 
01517   return;
01518 }
01519 
01520 
01521 #ifdef GPU_FATLINK 
01522 /*   @method  
01523  *   QUDA_COMPUTE_FAT_STANDARD: standard method (default)
01524  *   QUDA_COMPUTE_FAT_EXTENDED_VOLUME, extended volume method
01525  *
01526  */
01527 #include <sys/time.h>
01528 
01529 void setFatLinkPadding(QudaComputeFatMethod method, QudaGaugeParam* param)
01530 {
01531   int* X    = param->X;
01532   int Vsh_x = X[1]*X[2]*X[3]/2;
01533   int Vsh_y = X[0]*X[2]*X[3]/2;
01534   int Vsh_z = X[0]*X[1]*X[3]/2;
01535   int Vsh_t = X[0]*X[1]*X[2]/2;
01536 
01537   int E1 = X[0] + 4;
01538   int E2 = X[1] + 4;
01539   int E3 = X[2] + 4;
01540   int E4 = X[3] + 4;
01541 
01542   // fat-link padding 
01543   param->llfat_ga_pad = Vsh_t;
01544 
01545   // site-link padding
01546   if(method ==  QUDA_COMPUTE_FAT_STANDARD){
01547 #ifdef MULTI_GPU
01548     int Vh_2d_max = MAX(X[0]*X[1]/2, X[0]*X[2]/2);
01549     Vh_2d_max = MAX(Vh_2d_max, X[0]*X[3]/2);
01550     Vh_2d_max = MAX(Vh_2d_max, X[1]*X[2]/2);
01551     Vh_2d_max = MAX(Vh_2d_max, X[1]*X[3]/2);
01552     Vh_2d_max = MAX(Vh_2d_max, X[2]*X[3]/2);
01553     param->site_ga_pad = 3*(Vsh_x+Vsh_y+Vsh_z+Vsh_t) + 4*Vh_2d_max;
01554 #else
01555     param->site_ga_pad = Vsh_t;
01556 #endif
01557   }else{
01558     param->site_ga_pad = E1*E2*E3/2*3;
01559   }
01560   param->ga_pad = param->site_ga_pad;
01561 
01562  // staple padding
01563   if(method == QUDA_COMPUTE_FAT_STANDARD){
01564 #ifdef MULTI_GPU
01565     param->staple_pad = 3*(Vsh_x + Vsh_y + Vsh_z+ Vsh_t);
01566 #else
01567     param->staple_pad = 3*Vsh_t;
01568 #endif
01569   }else{
01570     param->staple_pad  = E1*E2*E3/2*3;
01571   }
01572 
01573   return;
01574 }
01575 
01576 
01577 void computeFatLinkCore(cudaGaugeField* cudaSiteLink, double* act_path_coeff,
01578                         QudaGaugeParam* qudaGaugeParam, QudaComputeFatMethod method,
01579                         cudaGaugeField* cudaFatLink, struct timeval time_array[])
01580 {
01581  
01582   gettimeofday(&time_array[0], NULL);
01583   
01584   const int flag = qudaGaugeParam->preserve_gauge;
01585   GaugeFieldParam gParam(0,*qudaGaugeParam);
01586 
01587   if(method == QUDA_COMPUTE_FAT_STANDARD){
01588     for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir];
01589   }else{
01590     for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam->X[dir] + 4;
01591   }
01592 
01593 
01594   static cudaGaugeField* cudaStapleField=NULL, *cudaStapleField1=NULL;
01595   if(cudaStapleField == NULL || cudaStapleField1 == NULL){
01596     gParam.pad    = qudaGaugeParam->staple_pad;
01597     gParam.create = QUDA_NULL_FIELD_CREATE;
01598     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
01599     gParam.is_staple = 1; //these two condition means it is a staple instead of a normal gauge field      
01600     cudaStapleField  = new cudaGaugeField(gParam);
01601     cudaStapleField1 = new cudaGaugeField(gParam);
01602   }
01603 
01604   gettimeofday(&time_array[1], NULL);
01605 
01606   if(method == QUDA_COMPUTE_FAT_STANDARD){
01607     llfat_cuda(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
01608   }else{ //method == QUDA_COMPUTE_FAT_EXTENDED_VOLUME
01609     llfat_cuda_ex(*cudaFatLink, *cudaSiteLink, *cudaStapleField, *cudaStapleField1, qudaGaugeParam, act_path_coeff);
01610   }
01611   gettimeofday(&time_array[2], NULL);
01612 
01613   if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){
01614     delete cudaStapleField; cudaStapleField = NULL;
01615     delete cudaStapleField1; cudaStapleField1 = NULL;
01616   }
01617   gettimeofday(&time_array[3], NULL);
01618 
01619   return;
01620 
01621 }
01622 
01623 
01624 
01625 int
01626 computeFatLinkQuda(void* fatlink, void** sitelink, double* act_path_coeff, 
01627                    QudaGaugeParam* qudaGaugeParam, 
01628                    QudaComputeFatMethod method)
01629 {
01630 #define TDIFF_MS(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))*1000
01631 
01632   struct timeval t0;
01633   struct timeval t7, t8, t9, t10, t11;
01634 
01635   gettimeofday(&t0, NULL);
01636 
01637   static cpuGaugeField* cpuFatLink=NULL, *cpuSiteLink=NULL;
01638   static cudaGaugeField* cudaFatLink=NULL, *cudaSiteLink=NULL;
01639   int flag = qudaGaugeParam->preserve_gauge;
01640 
01641   QudaGaugeParam qudaGaugeParam_ex_buf;
01642   QudaGaugeParam* qudaGaugeParam_ex = &qudaGaugeParam_ex_buf;
01643   memcpy(qudaGaugeParam_ex, qudaGaugeParam, sizeof(QudaGaugeParam));
01644 
01645   qudaGaugeParam_ex->X[0] = qudaGaugeParam->X[0]+4;
01646   qudaGaugeParam_ex->X[1] = qudaGaugeParam->X[1]+4;
01647   qudaGaugeParam_ex->X[2] = qudaGaugeParam->X[2]+4;
01648   qudaGaugeParam_ex->X[3] = qudaGaugeParam->X[3]+4;
01649 
01650   GaugeFieldParam gParam_ex(0, *qudaGaugeParam_ex);
01651   
01652   // fat-link padding
01653   setFatLinkPadding(method, qudaGaugeParam);
01654   qudaGaugeParam_ex->llfat_ga_pad = qudaGaugeParam->llfat_ga_pad;
01655   qudaGaugeParam_ex->staple_pad   = qudaGaugeParam->staple_pad;
01656   qudaGaugeParam_ex->site_ga_pad  = qudaGaugeParam->site_ga_pad;
01657   
01658   GaugeFieldParam gParam(0, *qudaGaugeParam);
01659 
01660   // create the host fatlink
01661   if(cpuFatLink == NULL){
01662     gParam.create = QUDA_REFERENCE_FIELD_CREATE;
01663     gParam.link_type = QUDA_ASQTAD_FAT_LINKS;
01664     gParam.order = QUDA_MILC_GAUGE_ORDER;
01665     gParam.gauge= fatlink;
01666     cpuFatLink = new cpuGaugeField(gParam);
01667     if(cpuFatLink == NULL){
01668       errorQuda("ERROR: Creating cpuFatLink failed\n");
01669     }
01670   }else{
01671     cpuFatLink->setGauge((void**)fatlink);
01672   }
01673   
01674  // create the device fatlink
01675   if(cudaFatLink == NULL){
01676     gParam.pad    = qudaGaugeParam->llfat_ga_pad;
01677     gParam.create = QUDA_ZERO_FIELD_CREATE;
01678     gParam.link_type = QUDA_ASQTAD_FAT_LINKS;
01679     gParam.order = QUDA_QDP_GAUGE_ORDER;
01680     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
01681     cudaFatLink = new cudaGaugeField(gParam);
01682   }
01683 
01684 
01685 
01686   // create the host sitelink   
01687   if(cpuSiteLink == NULL){
01688     gParam.pad = 0; 
01689     gParam.create    = QUDA_REFERENCE_FIELD_CREATE;
01690     gParam.link_type = qudaGaugeParam->type;
01691     gParam.order     = qudaGaugeParam->gauge_order;
01692     gParam.gauge     = sitelink;
01693     if(method != QUDA_COMPUTE_FAT_STANDARD){
01694       for(int dir=0; dir<4; ++dir) gParam.x[dir] = qudaGaugeParam_ex->X[dir];   
01695     }
01696     cpuSiteLink      = new cpuGaugeField(gParam);
01697     if(cpuSiteLink == NULL){
01698       errorQuda("ERROR: Creating cpuSiteLink failed\n");
01699     }
01700   }else{
01701     cpuSiteLink->setGauge(sitelink);
01702   }
01703   
01704   if(cudaSiteLink == NULL){
01705     gParam.pad         = qudaGaugeParam->site_ga_pad;
01706     gParam.create      = QUDA_NULL_FIELD_CREATE;
01707     gParam.link_type   = qudaGaugeParam->type;
01708     gParam.reconstruct = qudaGaugeParam->reconstruct;      
01709     cudaSiteLink = new cudaGaugeField(gParam);
01710   }
01711   
01712   initCommonConstants(*cudaFatLink);
01713   
01714   
01715   if(method == QUDA_COMPUTE_FAT_STANDARD){
01716     llfat_init_cuda(qudaGaugeParam);
01717     gettimeofday(&t7, NULL);
01718     
01719 #ifdef MULTI_GPU
01720     if(qudaGaugeParam->gauge_order == QUDA_MILC_GAUGE_ORDER){
01721       errorQuda("Only QDP-ordered site links are supported in the multi-gpu standard fattening code\n");
01722     }
01723 #endif
01724     loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam);
01725     
01726     gettimeofday(&t8, NULL);
01727   }else{
01728     llfat_init_cuda_ex(qudaGaugeParam_ex);
01729 #ifdef MULTI_GPU
01730     exchange_cpu_sitelink_ex(qudaGaugeParam->X, (void**)cpuSiteLink->Gauge_p(), 
01731                              cpuSiteLink->Order(),qudaGaugeParam->cpu_prec, 0);
01732 #endif
01733     gettimeofday(&t7, NULL);
01734     loadLinkToGPU_ex(cudaSiteLink, cpuSiteLink);
01735     gettimeofday(&t8, NULL);
01736   }
01737 
01738   // time the subroutines in computeFatLinkCore
01739   struct timeval time_array[4];
01740   
01741   // Actually do the fattening
01742   computeFatLinkCore(cudaSiteLink, act_path_coeff, qudaGaugeParam, method, cudaFatLink, time_array);
01743  
01744 
01745   gettimeofday(&t9, NULL);
01746 
01747   storeLinkToCPU(cpuFatLink, cudaFatLink, qudaGaugeParam);
01748   
01749   gettimeofday(&t10, NULL);
01750   
01751   if (!(flag & QUDA_FAT_PRESERVE_CPU_GAUGE) ){
01752     delete cpuFatLink; cpuFatLink = NULL;
01753     delete cpuSiteLink; cpuSiteLink = NULL;
01754   }  
01755   if (!(flag & QUDA_FAT_PRESERVE_GPU_GAUGE) ){
01756     delete cudaFatLink; cudaFatLink = NULL;
01757     delete cudaSiteLink; cudaSiteLink = NULL;
01758   }
01759   
01760   gettimeofday(&t11, NULL);
01761 #ifdef DSLASH_PROFILING 
01762   printfQuda("total time: %f ms, init(cuda/cpu gauge field creation,etc)=%f ms,"
01763              " sitelink cpu->gpu=%f ms, computation in gpu =%f ms, fatlink gpu->cpu=%f ms\n",
01764              TDIFF_MS(t0, t11), TDIFF_MS(t0, t7) + TDIFF_MS(time_array[0],time_array[1]), TDIFF_MS(t7, t8), TDIFF_MS(time_array[1], time_array[2]), TDIFF_MS(t9,t10));
01765   printfQuda("finally cleanup =%f ms\n", TDIFF_MS(t10, t11) + TDIFF_MS(time_array[2],time_array[3]));
01766 #endif
01767 
01768   return 0;
01769 }
01770 #endif
01771 
01772 #ifdef GPU_GAUGE_FORCE
01773 int
01774 computeGaugeForceQuda(void* mom, void* sitelink,  int*** input_path_buf, int* path_length,
01775                       void* loop_coeff, int num_paths, int max_length, double eb3,
01776                       QudaGaugeParam* qudaGaugeParam, double* timeinfo)
01777 {
01778   
01779   struct timeval t0, t1, t2, t3, t4;
01780   
01781   gettimeofday(&t0,NULL);
01782 
01783 #ifdef MULTI_GPU
01784   int E[4];
01785   QudaGaugeParam qudaGaugeParam_ex_buf;
01786   QudaGaugeParam* qudaGaugeParam_ex=&qudaGaugeParam_ex_buf;
01787   memcpy(qudaGaugeParam_ex, qudaGaugeParam, sizeof(QudaGaugeParam));
01788   E[0] = qudaGaugeParam_ex->X[0] = qudaGaugeParam->X[0] + 4;
01789   E[1] = qudaGaugeParam_ex->X[1] = qudaGaugeParam->X[1] + 4;
01790   E[2] = qudaGaugeParam_ex->X[2] = qudaGaugeParam->X[2] + 4;
01791   E[3] = qudaGaugeParam_ex->X[3] = qudaGaugeParam->X[3] + 4;
01792 #endif
01793 
01794   int* X = qudaGaugeParam->X;
01795   GaugeFieldParam gParam(0, *qudaGaugeParam);
01796 #ifdef MULTI_GPU
01797   GaugeFieldParam gParam_ex(0, *qudaGaugeParam_ex);
01798   GaugeFieldParam& gParamSL = gParam_ex;  
01799   int pad = E[2]*E[1]*E[0]/2;
01800 #else
01801   GaugeFieldParam& gParamSL = gParam;
01802   int pad = X[2]*X[1]*X[0]/2;
01803 #endif
01804   
01805   GaugeFieldParam& gParamMom = gParam;
01806   
01807   gParamSL.create = QUDA_REFERENCE_FIELD_CREATE;
01808   gParamSL.gauge = sitelink;
01809   gParamSL.pad = 0;
01810   cpuGaugeField* cpuSiteLink = new cpuGaugeField(gParamSL);
01811   
01812   gParamSL.create =QUDA_NULL_FIELD_CREATE;
01813   gParamSL.pad = pad;
01814   gParamSL.precision = qudaGaugeParam->cuda_prec;
01815   gParamSL.reconstruct = qudaGaugeParam->reconstruct;
01816   cudaGaugeField* cudaSiteLink = new cudaGaugeField(gParamSL);  
01817   qudaGaugeParam->site_ga_pad = gParamSL.pad;//need to record this value
01818 
01819   gParamMom.pad = 0;
01820   gParamMom.order =QUDA_MILC_GAUGE_ORDER;
01821   gParamMom.precision = qudaGaugeParam->cpu_prec;
01822   gParamMom.create =QUDA_REFERENCE_FIELD_CREATE;
01823   gParamMom.reconstruct =QUDA_RECONSTRUCT_10;  
01824   gParamMom.gauge=mom;
01825   gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
01826   cpuGaugeField* cpuMom = new cpuGaugeField(gParamMom);              
01827 
01828 
01829   gParamMom.pad = pad;
01830   gParamMom.create =QUDA_NULL_FIELD_CREATE;  
01831   gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
01832   gParamMom.precision = qudaGaugeParam->cuda_prec;
01833   gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
01834   cudaGaugeField* cudaMom = new cudaGaugeField(gParamMom);
01835   qudaGaugeParam->mom_ga_pad = gParamMom.pad; //need to record this value
01836   
01837   
01838   initCommonConstants(*cudaMom);
01839   gauge_force_init_cuda(qudaGaugeParam, max_length); 
01840   
01841 #ifdef MULTI_GPU
01842   exchange_cpu_sitelink_ex(qudaGaugeParam->X, (void**)cpuSiteLink->Gauge_p(), 
01843                            cpuSiteLink->Order(), qudaGaugeParam->cpu_prec, 1);
01844   loadLinkToGPU_ex(cudaSiteLink, cpuSiteLink);
01845 #else  
01846   loadLinkToGPU(cudaSiteLink, cpuSiteLink, qudaGaugeParam);    
01847 #endif
01848   
01849   cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
01850 
01851   gettimeofday(&t1,NULL);  
01852   
01853   gauge_force_cuda(*cudaMom, eb3, *cudaSiteLink, qudaGaugeParam, input_path_buf, 
01854                    path_length, loop_coeff, num_paths, max_length);
01855 
01856   gettimeofday(&t2,NULL);
01857 
01858   cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
01859   
01860   delete cpuSiteLink;
01861   delete cpuMom;
01862   
01863   delete cudaSiteLink;
01864   delete cudaMom;
01865   
01866   gettimeofday(&t3,NULL); 
01867 
01868   if(timeinfo){
01869     timeinfo[0] = TDIFF(t0, t1);
01870     timeinfo[1] = TDIFF(t1, t2);
01871     timeinfo[2] = TDIFF(t2, t3);
01872   }
01873 
01874   return 0;  
01875 }
01876 
01877 #endif
01878 
01879 
01880 void initCommsQuda(int argc, char **argv, const int *X, const int nDim) {
01881 
01882   if (nDim != 4) errorQuda("Comms dimensions %d != 4", nDim);
01883 
01884 #ifdef MULTI_GPU
01885 
01886 #ifdef QMP_COMMS
01887   QMP_thread_level_t tl;
01888   QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
01889 
01890   QMP_declare_logical_topology(X, nDim);
01891 #elif defined(MPI_COMMS)
01892   MPI_Init (&argc, &argv);  
01893 
01894   int volume = 1;
01895   for (int d=0; d<nDim; d++) volume *= X[d];
01896   int size = -1;
01897   MPI_Comm_size(MPI_COMM_WORLD, &size);
01898   if (volume != size)
01899     errorQuda("Number of processes %d must match requested MPI volume %d",
01900               size, volume);
01901 
01902   comm_set_gridsize(X[0], X[1], X[2], X[3]);  
01903   comm_init();
01904 #endif
01905 
01906 #endif
01907 }
01908 
01909 void endCommsQuda() {
01910 #ifdef MULTI_GPU
01911 
01912 #ifdef QMP_COMMS
01913   QMP_finalize_msg_passing();
01914 #elif defined MPI_COMMS
01915   comm_cleanup();
01916 #endif 
01917 
01918 #endif
01919 }