QUDA v0.4.0
A library for QCD on GPUs
quda/lib/cuda_color_spinor_field.cpp
#include <stdlib.h>
#include <stdio.h>
#include <color_spinor_field.h>
#include <blas_quda.h>

#include <string.h>
#include <iostream>
#include "misc_helpers.h"
#include <face_quda.h>
#include <dslash_quda.h>

// Easy to switch between overlapping communication or not
#ifdef OVERLAP_COMMS
#define CUDAMEMCPY(dst, src, size, type, stream) cudaMemcpyAsync(dst, src, size, type, stream)
#else
#define CUDAMEMCPY(dst, src, size, type, stream) cudaMemcpy(dst, src, size, type)
#endif
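
// Note: when OVERLAP_COMMS is not defined, the stream argument to CUDAMEMCPY is
// dropped and every transfer is a synchronous cudaMemcpy, so communication no
// longer overlaps with kernel execution.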

void* cudaColorSpinorField::buffer = 0;
bool cudaColorSpinorField::bufferInit = false;
size_t cudaColorSpinorField::bufferBytes = 0;

int cudaColorSpinorField::initGhostFaceBuffer = 0;
void* cudaColorSpinorField::fwdGhostFaceBuffer[QUDA_MAX_DIM]; // gpu memory
void* cudaColorSpinorField::backGhostFaceBuffer[QUDA_MAX_DIM]; // gpu memory
QudaPrecision cudaColorSpinorField::facePrecision;

extern bool kernelPackT;

/*cudaColorSpinorField::cudaColorSpinorField() : 
  ColorSpinorField(), v(0), norm(0), alloc(false), init(false) {

  }*/

cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorParam &param) : 
  ColorSpinorField(param), v(0), norm(0), alloc(false), init(true) {
  create(param.create);
  if (param.create == QUDA_NULL_FIELD_CREATE) {
    // do nothing
  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
    zero();
  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
    v = param.v;
    norm = param.norm;
  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
    errorQuda("not implemented");
  }
}
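
// Usage sketch (illustrative only, not part of this file): a caller typically
// fills in a ColorSpinorParam and lets this constructor allocate and initialize
// the device field, e.g.
//
//   ColorSpinorParam param;
//   param.create = QUDA_ZERO_FIELD_CREATE;  // allocate and zero
//   // ... set nColor, nSpin, precision, x[], pad, siteSubset, etc. ...
//   cudaColorSpinorField field(param);      // "field" is a hypothetical name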

cudaColorSpinorField::cudaColorSpinorField(const cudaColorSpinorField &src) : 
  ColorSpinorField(src), v(0), norm(0), alloc(false), init(true) {
  create(QUDA_COPY_FIELD_CREATE);
  copy(src);
}

// creates a copy of src, with any differences defined in param
cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src, const ColorSpinorParam &param) :
  ColorSpinorField(src), v(0), norm(0), alloc(false), init(true) {
  // can only override if we are not using a reference or parity special case

  if (param.create != QUDA_REFERENCE_FIELD_CREATE || 
      (param.create == QUDA_REFERENCE_FIELD_CREATE && 
       src.SiteSubset() == QUDA_FULL_SITE_SUBSET && 
       param.siteSubset == QUDA_PARITY_SITE_SUBSET && 
       src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) ) {
    reset(param);
  } else {
    errorQuda("Undefined behaviour"); // else silent bug possible?
  }

  fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  create(param.create);

  if (param.create == QUDA_NULL_FIELD_CREATE) {
    // do nothing
  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
    zero();
  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
    if (src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) {
      copy(dynamic_cast<const cudaColorSpinorField&>(src));
    } else if (src.FieldLocation() == QUDA_CPU_FIELD_LOCATION) {
      loadCPUSpinorField(dynamic_cast<const cpuColorSpinorField&>(src));
    } else {
      errorQuda("FieldLocation %d not supported", src.FieldLocation());
    }
  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
    if (src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) {
      v = (dynamic_cast<const cudaColorSpinorField&>(src)).v;
      norm = (dynamic_cast<const cudaColorSpinorField&>(src)).norm;
    } else {
      errorQuda("Cannot reference a non-cuda field");
    }
  } else {
    errorQuda("CreateType %d not implemented", param.create);
  }

}

cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src) 
  : ColorSpinorField(src), v(0), norm(0), alloc(false), init(true) {
  fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  create(QUDA_COPY_FIELD_CREATE);
  if (src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) {
    copy(dynamic_cast<const cudaColorSpinorField&>(src));
  } else if (src.FieldLocation() == QUDA_CPU_FIELD_LOCATION) {
    loadCPUSpinorField(dynamic_cast<const cpuColorSpinorField&>(src));
  } else {
    errorQuda("FieldLocation not supported");
  }
}

cudaColorSpinorField& cudaColorSpinorField::operator=(const cudaColorSpinorField &src) {
  if (&src != this) {
    destroy();
    // keep current attributes unless unset
    if (!ColorSpinorField::init) ColorSpinorField::operator=(src);
    fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    create(QUDA_COPY_FIELD_CREATE);
    copy(src);
  }
  return *this;
}

cudaColorSpinorField& cudaColorSpinorField::operator=(const cpuColorSpinorField &src) {
  destroy();
  // keep current attributes unless unset
  if (!ColorSpinorField::init) ColorSpinorField::operator=(src);
  fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  create(QUDA_COPY_FIELD_CREATE);
  loadCPUSpinorField(src);
  return *this;
}

cudaColorSpinorField::~cudaColorSpinorField() {
  destroy();
}


void cudaColorSpinorField::create(const QudaFieldCreate create) {

  if (siteSubset == QUDA_FULL_SITE_SUBSET && siteOrder != QUDA_EVEN_ODD_SITE_ORDER) {
    errorQuda("Subset not implemented");
  }

  // FIXME: this override is temporary, to ensure we have the correct
  // field order for a given precision
  if (precision == QUDA_DOUBLE_PRECISION) fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
  else fieldOrder = (nSpin == 4) ? QUDA_FLOAT4_FIELD_ORDER : QUDA_FLOAT2_FIELD_ORDER;

  if (create != QUDA_REFERENCE_FIELD_CREATE) {
    // Overallocate to hold tface bytes extra
    if (cudaMalloc((void**)&v, bytes) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating spinor: bytes=%lu", (unsigned long)bytes);
    }

    if (precision == QUDA_HALF_PRECISION) {
      if (cudaMalloc((void**)&norm, norm_bytes) == cudaErrorMemoryAllocation) {
        errorQuda("Error allocating norm");
      }
    }
    alloc = true;
  }

  // free the pinned host buffer if it is too small to hold this field
  if (bytes > bufferBytes && bufferInit) {
    cudaFreeHost(buffer);
    bufferInit = false;
  }

  if (!bufferInit) {
    bufferBytes = bytes;
    cudaMallocHost(&buffer, bufferBytes);
    bufferInit = true;
  }

  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
    // create the associated even and odd subsets
    ColorSpinorParam param;
    param.siteSubset = QUDA_PARITY_SITE_SUBSET;
    param.nDim = nDim;
    memcpy(param.x, x, nDim*sizeof(int));
    param.x[0] /= 2; // set single parity dimensions
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    param.v = v;
    param.norm = norm;
    even = new cudaColorSpinorField(*this, param);
    odd = new cudaColorSpinorField(*this, param);
    // need this hackery for the moment (the odd pointer must be located half way into the full field)
    (dynamic_cast<cudaColorSpinorField*>(odd))->v = (void*)((unsigned long)v + bytes/2);
    if (precision == QUDA_HALF_PRECISION) 
      (dynamic_cast<cudaColorSpinorField*>(odd))->norm = (void*)((unsigned long)norm + norm_bytes/2);
  }
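
  // The pointer arithmetic above relies on the even-odd storage convention:
  // all even-parity sites occupy the first bytes/2 of v and all odd-parity
  // sites the second half, so the odd subset is simply an aliased pointer into
  // the same allocation (likewise for the half-precision norm array).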

  if (siteSubset != QUDA_FULL_SITE_SUBSET) {
    zeroPad();
  } else {
    (dynamic_cast<cudaColorSpinorField*>(even))->zeroPad();
    (dynamic_cast<cudaColorSpinorField*>(odd))->zeroPad();
  }

}

void cudaColorSpinorField::freeBuffer() {
  if (bufferInit) cudaFreeHost(buffer);
}

void cudaColorSpinorField::destroy() {
  if (alloc) {
    cudaFree(v);
    if (precision == QUDA_HALF_PRECISION) cudaFree(norm);
    if (siteSubset == QUDA_FULL_SITE_SUBSET) {
      delete even;
      delete odd;
    }
    alloc = false;
  }
}


cudaColorSpinorField& cudaColorSpinorField::Even() const { 
  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
    return *(dynamic_cast<cudaColorSpinorField*>(even)); 
  }

  errorQuda("Cannot return even subset of %d subset", siteSubset);
  exit(-1);
}

cudaColorSpinorField& cudaColorSpinorField::Odd() const {
  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
    return *(dynamic_cast<cudaColorSpinorField*>(odd)); 
  }

  errorQuda("Cannot return odd subset of %d subset", siteSubset);
  exit(-1);
}

// CUDA uses IEEE-754 floating point, which represents zero as all-zero
// bytes, so a cudaMemset suffices here
void cudaColorSpinorField::zero() {
  cudaMemset(v, 0, bytes);
  if (precision == QUDA_HALF_PRECISION) cudaMemset(norm, 0, norm_bytes);
}


void cudaColorSpinorField::zeroPad() {
  size_t pad_bytes = (stride - volume) * precision * fieldOrder;
  int Npad = nColor * nSpin * 2 / fieldOrder;
  for (int i=0; i<Npad; i++) {
    if (pad_bytes) cudaMemset((char*)v + (volume + i*stride)*fieldOrder*precision, 0, pad_bytes);
  }
}
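
// Our reading of the padding arithmetic above: the field is stored as Npad
// consecutive FloatN blocks, each spanning stride*fieldOrder real numbers, of
// which only the first volume*fieldOrder belong to live sites; each cudaMemset
// clears the (stride - volume)-site tail of one block. For example, with
// Nc = 3, Ns = 4 in single precision (float4): Npad = 3*4*2/4 = 6.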

void cudaColorSpinorField::copy(const cudaColorSpinorField &src) {
  checkField(*this, src);
  copyCuda(*this, src);
}

#include <pack_spinor.h>

void cudaColorSpinorField::loadCPUSpinorField(const cpuColorSpinorField &src) {

  if (nDim != src.Ndim()) {
    errorQuda("Number of dimensions %d %d don't match", nDim, src.Ndim());
  }

  if (volume != src.volume) {
    errorQuda("Volumes %d %d don't match", volume, src.volume);
  }

  if (SiteOrder() != src.SiteOrder()) {
    errorQuda("Subset orders don't match");
  }

  if (nColor != 3) {
    errorQuda("Nc != 3 not yet supported");
  }

  if (siteSubset != src.siteSubset) {
    errorQuda("Subset types do not match %d %d", siteSubset, src.siteSubset);
  }

  if (precision == QUDA_HALF_PRECISION) {
    ColorSpinorParam param(*this); // acquire all attributes of this
    param.precision = QUDA_SINGLE_PRECISION; // change precision
    param.create = QUDA_COPY_FIELD_CREATE;
    cudaColorSpinorField tmp(src, param);
    copy(tmp);
    return;
  }

  // no native support for this order yet - copy through a natively supported order
  if (src.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) {
    ColorSpinorParam param(src); // acquire all attributes of src
    param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
    param.create = QUDA_NULL_FIELD_CREATE;
    cpuColorSpinorField tmp(param);
    tmp.copy(src);
    loadCPUSpinorField(tmp);
    return;
  }

  // (temporary?) bug fix for padding
  memset(buffer, 0, bufferBytes);

#define LOAD_SPINOR_CPU_TO_GPU(myNs)                                    \
  if (precision == QUDA_DOUBLE_PRECISION) {                             \
      if (src.precision == QUDA_DOUBLE_PRECISION) {                     \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              packSpinor<3,myNs,1>((double*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              packSpinor<3,myNs,2>((double*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              errorQuda("double4 not supported");                       \
          }                                                             \
      } else {                                                          \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              packSpinor<3,myNs,1>((double*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              packSpinor<3,myNs,2>((double*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              errorQuda("double4 not supported");                       \
          }                                                             \
      }                                                                 \
  } else {                                                              \
      if (src.precision == QUDA_DOUBLE_PRECISION) {                     \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              packSpinor<3,myNs,1>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              packSpinor<3,myNs,2>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              packSpinor<3,myNs,4>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          }                                                             \
      } else {                                                          \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              packSpinor<3,myNs,1>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              packSpinor<3,myNs,2>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              packSpinor<3,myNs,4>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                                src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
          }                                                             \
      }                                                                 \
  }
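
  // Template parameters of packSpinor<Nc, Ns, N> as used above: Nc = 3 colors,
  // Ns = number of spin components (instantiated below via myNs), and N = the
  // FloatN vector length of the destination field order (1, 2, or 4 reals).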

  switch(nSpin){
  case 1:
      LOAD_SPINOR_CPU_TO_GPU(1);
      break;
  case 4:
      LOAD_SPINOR_CPU_TO_GPU(4);
      break;
  default:
      errorQuda("invalid number of spin components in function %s\n", __FUNCTION__);
  }

#undef LOAD_SPINOR_CPU_TO_GPU

  /*  for (int i=0; i<length; i++) {
    std::cout << i << " " << ((float*)src.v)[i] << " " << ((float*)buffer)[i] << std::endl;
    }*/

  cudaMemcpy(v, buffer, bytes, cudaMemcpyHostToDevice);
  return;
}


void cudaColorSpinorField::saveCPUSpinorField(cpuColorSpinorField &dest) const {

  if (nDim != dest.Ndim()) {
    errorQuda("Number of dimensions %d %d don't match", nDim, dest.Ndim());
  }

  if (volume != dest.volume) {
    errorQuda("Volumes %d %d don't match", volume, dest.volume);
  }

  if (SiteOrder() != dest.SiteOrder()) {
    errorQuda("Subset orders don't match");
  }

  if (nColor != 3) {
    errorQuda("Nc != 3 not yet supported");
  }

  if (siteSubset != dest.siteSubset) {
    errorQuda("Subset types do not match %d %d", siteSubset, dest.siteSubset);
  }

  if (precision == QUDA_HALF_PRECISION) {
    ColorSpinorParam param(*this); // acquire all attributes of this
    param.precision = QUDA_SINGLE_PRECISION; // change precision
    param.create = QUDA_COPY_FIELD_CREATE; 
    cudaColorSpinorField tmp(*this, param);
    tmp.saveCPUSpinorField(dest);
    return;
  }

  // no native support for this order yet - copy through a natively supported order
  if (dest.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) {
    ColorSpinorParam param(dest); // acquire all attributes of dest
    param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
    param.create = QUDA_NULL_FIELD_CREATE;
    cpuColorSpinorField tmp(param);
    saveCPUSpinorField(tmp);
    dest.copy(tmp);
    return;
  }

  // (temporary?) bug fix for padding
  memset(buffer, 0, bufferBytes);

  cudaMemcpy(buffer, v, bytes, cudaMemcpyDeviceToHost);

#define SAVE_SPINOR_GPU_TO_CPU(myNs)                                    \
  if (precision == QUDA_DOUBLE_PRECISION) {                             \
      if (dest.precision == QUDA_DOUBLE_PRECISION) {                    \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              unpackSpinor<3,myNs,1>((double*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              unpackSpinor<3,myNs,2>((double*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              errorQuda("double4 not supported");                       \
          }                                                             \
      } else {                                                          \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              unpackSpinor<3,myNs,1>((float*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              unpackSpinor<3,myNs,2>((float*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              errorQuda("double4 not supported");                       \
          }                                                             \
      }                                                                 \
  } else {                                                              \
      if (dest.precision == QUDA_DOUBLE_PRECISION) {                    \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              unpackSpinor<3,myNs,1>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              unpackSpinor<3,myNs,2>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              unpackSpinor<3,myNs,4>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          }                                                             \
      } else {                                                          \
          if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) {                   \
              unpackSpinor<3,myNs,1>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) {           \
              unpackSpinor<3,myNs,2>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) {           \
              unpackSpinor<3,myNs,4>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                                  dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
          }                                                             \
      }                                                                 \
  }

  switch(nSpin){
  case 1:
      SAVE_SPINOR_GPU_TO_CPU(1);
      break;
  case 4:
      SAVE_SPINOR_GPU_TO_CPU(4);
      break;
  default:
      errorQuda("invalid number of spin components in function %s\n", __FUNCTION__);
  }
#undef SAVE_SPINOR_GPU_TO_CPU
  return;
}
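
// Round-trip sketch (illustrative only): loadCPUSpinorField() and
// saveCPUSpinorField() should be inverses up to precision truncation, e.g.
//
//   cpuColorSpinorField cpu(cpuParam);        // host field; params assumed set elsewhere
//   cudaColorSpinorField gpu(cpu, gpuParam);  // with QUDA_COPY_FIELD_CREATE this calls loadCPUSpinorField()
//   gpu.saveCPUSpinorField(cpu);              // reorders and copies the data back to the host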

void cudaColorSpinorField::allocateGhostBuffer(void) {
  int nFace = (nSpin == 1) ? 3 : 1; // 3 faces for asqtad
  int Nint = nColor * nSpin * 2; // number of internal degrees of freedom
  if (nSpin == 4) Nint /= 2; // spin projection for Wilson

  if (this->initGhostFaceBuffer == 0 || precision > facePrecision) {
    for (int i=0; i<4; i++) {
      if (!commDimPartitioned(i)) {
        continue;
      }
      size_t faceBytes = nFace*ghostFace[i]*Nint*precision;
      // add extra space for the norms for half precision
      if (precision == QUDA_HALF_PRECISION) faceBytes += nFace*ghostFace[i]*sizeof(float);

      if (this->initGhostFaceBuffer) { // only freed if precision is higher than the previous allocation
        //cudaFree(this->fwdGhostFaceBuffer[i]); 
        cudaFree(this->backGhostFaceBuffer[i]); this->backGhostFaceBuffer[i] = NULL;
        this->fwdGhostFaceBuffer[i] = NULL;
      }
      //cudaMalloc((void**)&this->fwdGhostFaceBuffer[i], faceBytes);
      if (cudaMalloc((void**)&this->backGhostFaceBuffer[i], 2*faceBytes) == cudaErrorMemoryAllocation){
        errorQuda("cudaMalloc() failed for backGhostFaceBuffer\n");
      }
      fwdGhostFaceBuffer[i] = (void*)(((char*)backGhostFaceBuffer[i]) + faceBytes);
    }

    this->facePrecision = precision;
    this->initGhostFaceBuffer = 1;
  }

  for (int i=0; i<4; i++) {
    if (!commDimPartitioned(i)) {
      continue;
    }
    size_t faceBytes = nFace*ghostFace[i]*Nint*precision;
    // add extra space for the norms for half precision
    if (precision == QUDA_HALF_PRECISION) faceBytes += nFace*ghostFace[i]*sizeof(float);
    fwdGhostFaceBuffer[i] = (void*)(((char*)backGhostFaceBuffer[i]) + faceBytes);
  }

}
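
// Note on the allocation scheme above: the backward and forward ghost buffers
// for each partitioned dimension share a single device allocation of
// 2*faceBytes; fwdGhostFaceBuffer[i] is just an alias at offset faceBytes,
// which is why it is never freed separately.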

void cudaColorSpinorField::freeGhostBuffer(void) {
  if (!initGhostFaceBuffer) return;

  for (int i=0; i<4; i++) {
    if (!commDimPartitioned(i)) {
      continue;
    }
    //cudaFree(fwdGhostFaceBuffer[i]); 
    cudaFree(backGhostFaceBuffer[i]); backGhostFaceBuffer[i] = NULL;
    fwdGhostFaceBuffer[i] = NULL;
  } 

  initGhostFaceBuffer = 0;
}

// pack the ghost zone into a contiguous buffer for communications
void cudaColorSpinorField::packGhost(const int dim, const QudaParity parity, const int dagger, cudaStream_t *stream) 
{
#ifdef MULTI_GPU
  if (dim != 3 || kernelPackT) { // use kernels to pack into contiguous buffers then a single cudaMemcpy
    void* gpu_buf = this->backGhostFaceBuffer[dim];
    packFace(gpu_buf, *this, dim, dagger, parity, *stream); 
  }
#else
  errorQuda("packGhost not supported in a single-GPU build");
#endif

}
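
// When dim == 3 and kernelPackT is false, no packing kernel runs here: the
// time face is laid out regularly enough in memory that sendGhost() below can
// extract it directly with strided copies.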

// send the ghost zone to the host
void cudaColorSpinorField::sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir,
                                     const int dagger, cudaStream_t *stream) {

#ifdef MULTI_GPU
  int Nvec = (nSpin == 1 || precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
  int nFace = (nSpin == 1) ? 3 : 1; // 3 faces for asqtad
  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1);  // (spin-projected) degrees of freedom

  if (dim != 3 || kernelPackT) { // use kernels to pack into contiguous buffers then a single cudaMemcpy

    size_t bytes = nFace*Nint*ghostFace[dim]*precision;
    if (precision == QUDA_HALF_PRECISION) bytes += nFace*ghostFace[dim]*sizeof(float);
    void* gpu_buf = 
      (dir == QUDA_BACKWARDS) ? this->backGhostFaceBuffer[dim] : this->fwdGhostFaceBuffer[dim];

    CUDAMEMCPY(ghost_spinor, gpu_buf, bytes, cudaMemcpyDeviceToHost, *stream); 
  } else { // do multiple cudaMemcpys 

    int Npad = Nint / Nvec; // number of Nvec buffers we have
    int Nt_minus1_offset = (volume - nFace*ghostFace[3]); // offset of the t = N_t - 1 face: Vh - Vsh

    int offset = 0;
    if (nSpin == 1) {
      offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset;
    } else if (nSpin == 4) {    
      // !dagger: send lower components backwards, send upper components forwards
      // dagger: send upper components backwards, send lower components forwards
      bool upper = dagger ? true : false; // Fwd is !Back  
      if (dir == QUDA_FORWARDS) upper = !upper;
      int lower_spin_offset = Npad*stride;
      if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
      else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
    }

    // Copy Npad pads' worth of data:
    //  -- src points to the start of the face within the first pad;
    //  -- each pad contributes Nvec*Vsh reals of face data;
    //  -- consecutive pads start Nvec*stride reals apart in the field.

    void *dst = (char*)ghost_spinor;
    void *src = (char*)v + offset*Nvec*precision;
    size_t len = nFace*ghostFace[3]*Nvec*precision;
    size_t spitch = stride*Nvec*precision;
    cudaMemcpy2DAsync(dst, len, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
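
    // The single cudaMemcpy2DAsync gathers Npad rows of len bytes each: row i
    // begins at src + i*spitch on the device, and rows are packed back to back
    // in the host buffer (dpitch == len). This replaces the per-pad copy loop
    // kept below for reference.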

    /*for(int i=0; i < Npad; i++) {
      int len = nFace*ghostFace[3]*Nvec*precision;     
      void *dst = (char*)ghost_spinor + i*len;
      void *src = (char*)v + (offset + i*stride)* Nvec*precision;
      CUDAMEMCPY(dst, src, len, cudaMemcpyDeviceToHost, *stream); 
    }*/

    if (precision == QUDA_HALF_PRECISION) {
      int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset*sizeof(float);
      void *dst = (char*)ghost_spinor + nFace*Nint*ghostFace[3]*precision;
      void *src = (char*)norm + norm_offset;
      CUDAMEMCPY(dst, src, nFace*ghostFace[3]*sizeof(float), cudaMemcpyDeviceToHost, *stream); 
    }
  }
#else
  errorQuda("sendGhost not supported in a single-GPU build");
#endif

}

void cudaColorSpinorField::unpackGhost(void* ghost_spinor, const int dim, 
                                       const QudaDirection dir, 
                                       const int dagger, cudaStream_t* stream) 
{
  int nFace = (nSpin == 1) ? 3 : 1; // 3 faces for asqtad
  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1);  // (spin-projected) degrees of freedom

  int len = nFace*ghostFace[dim]*Nint;
  int offset = length + ghostOffset[dim]*nColor*nSpin*2;
  offset += (dir == QUDA_BACKWARDS) ? 0 : len;
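
  // As read from the offsets above: the ghost zone is appended after the body
  // of the field (hence the base offset of length reals), each dimension's
  // region starts at site offset ghostOffset[dim], and the forward face is
  // stored immediately after the backward face.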

  void *dst = (char*)v + precision*offset;
  void *src = ghost_spinor;

  CUDAMEMCPY(dst, src, len*precision, cudaMemcpyHostToDevice, *stream);

  if (precision == QUDA_HALF_PRECISION) {
    int normlen = nFace*ghostFace[dim];
    int norm_offset = stride + ghostNormOffset[dim];
    norm_offset += (dir == QUDA_BACKWARDS) ? 0 : normlen;

    void *dst = (char*)norm + norm_offset*sizeof(float);
    void *src = (char*)ghost_spinor + nFace*Nint*ghostFace[dim]*precision; // norm region of host ghost zone
    CUDAMEMCPY(dst, src, normlen*sizeof(float), cudaMemcpyHostToDevice, *stream);
  }

}

std::ostream& operator<<(std::ostream &out, const cudaColorSpinorField &a) {
  out << (const ColorSpinorField&)a; // print the base-class fields without slicing
  out << "v = " << a.v << std::endl;
  out << "norm = " << a.norm << std::endl;
  out << "alloc = " << a.alloc << std::endl;
  out << "init = " << a.init << std::endl;
  return out;
}