QUDA v0.4.0
A library for QCD on GPUs
#include <stdlib.h>
#include <stdio.h>
#include <color_spinor_field.h>
#include <blas_quda.h>

#include <string.h>
#include <iostream>
#include "misc_helpers.h"
#include <face_quda.h>
#include <dslash_quda.h>

// Easy switch between overlapping communication with computation or not
#ifdef OVERLAP_COMMS
#define CUDAMEMCPY(dst, src, size, type, stream) cudaMemcpyAsync(dst, src, size, type, stream)
#else
#define CUDAMEMCPY(dst, src, size, type, stream) cudaMemcpy(dst, src, size, type)
#endif

void* cudaColorSpinorField::buffer = 0;
bool cudaColorSpinorField::bufferInit = false;
size_t cudaColorSpinorField::bufferBytes = 0;

int cudaColorSpinorField::initGhostFaceBuffer = 0;
void* cudaColorSpinorField::fwdGhostFaceBuffer[QUDA_MAX_DIM];  // gpu memory
void* cudaColorSpinorField::backGhostFaceBuffer[QUDA_MAX_DIM]; // gpu memory
QudaPrecision cudaColorSpinorField::facePrecision;

extern bool kernelPackT;

/*cudaColorSpinorField::cudaColorSpinorField() :
  ColorSpinorField(), v(0), norm(0), alloc(false), init(false) {

  }*/

cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorParam &param) :
  ColorSpinorField(param), v(0), norm(0), alloc(false), init(true) {
  create(param.create);
  if (param.create == QUDA_NULL_FIELD_CREATE) {
    // do nothing
  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
    zero();
  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
    v = param.v;
    norm = param.norm;
  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
    errorQuda("not implemented");
  }
}

cudaColorSpinorField::cudaColorSpinorField(const cudaColorSpinorField &src) :
  ColorSpinorField(src), v(0), norm(0), alloc(false), init(true) {
  create(QUDA_COPY_FIELD_CREATE);
  copy(src);
}

// creates a copy of src, with any differences defined in param
cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src, const ColorSpinorParam &param) :
  ColorSpinorField(src), v(0), norm(0), alloc(false), init(true) {
  // can only override if we are not using a reference or parity special case

  if (param.create != QUDA_REFERENCE_FIELD_CREATE ||
      (param.create == QUDA_REFERENCE_FIELD_CREATE &&
       src.SiteSubset() == QUDA_FULL_SITE_SUBSET &&
       param.siteSubset == QUDA_PARITY_SITE_SUBSET &&
       src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) ) {
    reset(param);
  } else {
    errorQuda("Undefined behaviour"); // else silent bug possible?
  }

  fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  create(param.create);

  if (param.create == QUDA_NULL_FIELD_CREATE) {
    // do nothing
  } else if (param.create == QUDA_ZERO_FIELD_CREATE) {
    zero();
  } else if (param.create == QUDA_COPY_FIELD_CREATE) {
    if (src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) {
      copy(dynamic_cast<const cudaColorSpinorField&>(src));
    } else if (src.FieldLocation() == QUDA_CPU_FIELD_LOCATION) {
      loadCPUSpinorField(dynamic_cast<const cpuColorSpinorField&>(src));
    } else {
      errorQuda("FieldLocation %d not supported", src.FieldLocation());
    }
  } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
    if (src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) {
      v = (dynamic_cast<const cudaColorSpinorField&>(src)).v;
      norm = (dynamic_cast<const cudaColorSpinorField&>(src)).norm;
    } else {
      errorQuda("Cannot reference a non-cuda field");
    }
  } else {
    errorQuda("CreateType %d not implemented", param.create);
  }

}
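#if 0
// Illustrative sketch, not part of the original file: how a caller might
// drive the constructors above. The lattice size and precision are
// assumptions for illustration, and remaining ColorSpinorParam members
// (siteOrder, gammaBasis, pad, ...) would also need to be set.
void exampleCreateFields() {
  ColorSpinorParam param;
  param.nColor = 3;
  param.nSpin = 4;
  param.nDim = 4;
  for (int d = 0; d < 4; d++) param.x[d] = 8;   // hypothetical 8^4 local lattice
  param.precision = QUDA_SINGLE_PRECISION;
  param.siteSubset = QUDA_FULL_SITE_SUBSET;
  param.create = QUDA_ZERO_FIELD_CREATE;        // allocate and zero

  cudaColorSpinorField a(param);  // zero-initialized device field
  cudaColorSpinorField b(a);      // deep copy via QUDA_COPY_FIELD_CREATE
}
#endif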
cudaColorSpinorField::cudaColorSpinorField(const ColorSpinorField &src)
  : ColorSpinorField(src), alloc(false), init(true) {
  fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  create(QUDA_COPY_FIELD_CREATE);
  if (src.FieldLocation() == QUDA_CUDA_FIELD_LOCATION) {
    copy(dynamic_cast<const cudaColorSpinorField&>(src));
  } else if (src.FieldLocation() == QUDA_CPU_FIELD_LOCATION) {
    loadCPUSpinorField(dynamic_cast<const cpuColorSpinorField&>(src));
  } else {
    errorQuda("FieldLocation not supported");
  }
}

cudaColorSpinorField& cudaColorSpinorField::operator=(const cudaColorSpinorField &src) {
  if (&src != this) {
    destroy();
    // keep current attributes unless unset
    if (!ColorSpinorField::init) ColorSpinorField::operator=(src);
    fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    create(QUDA_COPY_FIELD_CREATE);
    copy(src);
  }
  return *this;
}

cudaColorSpinorField& cudaColorSpinorField::operator=(const cpuColorSpinorField &src) {
  destroy();
  // keep current attributes unless unset
  if (!ColorSpinorField::init) ColorSpinorField::operator=(src);
  fieldLocation = QUDA_CUDA_FIELD_LOCATION;
  create(QUDA_COPY_FIELD_CREATE);
  loadCPUSpinorField(src);
  return *this;
}

cudaColorSpinorField::~cudaColorSpinorField() {
  destroy();
}

void cudaColorSpinorField::create(const QudaFieldCreate create) {

  if (siteSubset == QUDA_FULL_SITE_SUBSET && siteOrder != QUDA_EVEN_ODD_SITE_ORDER) {
    errorQuda("Subset not implemented");
  }

  // FIXME: This addition is temporary, to ensure we have the correct
  // field order for a given precision
  if (precision == QUDA_DOUBLE_PRECISION) fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
  else fieldOrder = (nSpin == 4) ? QUDA_FLOAT4_FIELD_ORDER : QUDA_FLOAT2_FIELD_ORDER;

  if (create != QUDA_REFERENCE_FIELD_CREATE) {
    // Overallocate to hold tface bytes extra
    if (cudaMalloc((void**)&v, bytes) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating spinor: bytes=%lu", (unsigned long)bytes);
    }

    if (precision == QUDA_HALF_PRECISION) {
      if (cudaMalloc((void**)&norm, norm_bytes) == cudaErrorMemoryAllocation) {
        errorQuda("Error allocating norm");
      }
    }
    alloc = true;
  }

  // check whether the host staging buffer is big enough
  if (bytes > bufferBytes && bufferInit) {
    cudaFreeHost(buffer);
    bufferInit = false;
  }

  if (!bufferInit) {
    bufferBytes = bytes;
    cudaMallocHost(&buffer, bufferBytes);
    bufferInit = true;
  }

  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
    // create the associated even and odd subsets
    ColorSpinorParam param;
    param.siteSubset = QUDA_PARITY_SITE_SUBSET;
    param.nDim = nDim;
    memcpy(param.x, x, nDim*sizeof(int));
    param.x[0] /= 2; // set single-parity dimensions
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    param.v = v;
    param.norm = norm;
    even = new cudaColorSpinorField(*this, param);
    odd = new cudaColorSpinorField(*this, param);
    // need this hackery for the moment (need to locate the odd pointer half way into the full field)
    (dynamic_cast<cudaColorSpinorField*>(odd))->v = (void*)((unsigned long)v + bytes/2);
    if (precision == QUDA_HALF_PRECISION)
      (dynamic_cast<cudaColorSpinorField*>(odd))->norm = (void*)((unsigned long)norm + norm_bytes/2);
  }

  if (siteSubset != QUDA_FULL_SITE_SUBSET) {
    zeroPad();
  } else {
    (dynamic_cast<cudaColorSpinorField*>(even))->zeroPad();
    (dynamic_cast<cudaColorSpinorField*>(odd))->zeroPad();
  }

}

void cudaColorSpinorField::freeBuffer() {
  if (bufferInit) cudaFreeHost(buffer);
}

void cudaColorSpinorField::destroy() {
  if (alloc) {
    cudaFree(v);
    if (precision == QUDA_HALF_PRECISION) cudaFree(norm);
    if (siteSubset == QUDA_FULL_SITE_SUBSET) {
      delete even;
      delete odd;
    }
    alloc = false;
  }
}
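#if 0
// Sketch only, an addition rather than original code: checking the even/odd
// aliasing set up in create(). The even and odd subsets reference the parent
// allocation rather than owning memory, with the odd data starting half way
// into the full field. Assumes accessors V() and Bytes() on ColorSpinorField;
// the names may differ in this version of QUDA.
void exampleCheckParityAliasing(cudaColorSpinorField &full) {
  cudaColorSpinorField &e = full.Even();
  cudaColorSpinorField &o = full.Odd();
  if ((char*)o.V() != (char*)e.V() + full.Bytes()/2)
    printf("unexpected parity layout\n");
}
#endif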
cudaColorSpinorField& cudaColorSpinorField::Even() const {
  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
    return *(dynamic_cast<cudaColorSpinorField*>(even));
  }

  errorQuda("Cannot return even subset of %d subset", siteSubset);
  exit(-1);
}

cudaColorSpinorField& cudaColorSpinorField::Odd() const {
  if (siteSubset == QUDA_FULL_SITE_SUBSET) {
    return *(dynamic_cast<cudaColorSpinorField*>(odd));
  }

  errorQuda("Cannot return odd subset of %d subset", siteSubset);
  exit(-1);
}

// CUDA's floating-point formats follow IEEE 754, which represents zero
// as all-zero bytes, so cudaMemset with 0 yields a zero field
void cudaColorSpinorField::zero() {
  cudaMemset(v, 0, bytes);
  if (precision == QUDA_HALF_PRECISION) cudaMemset(norm, 0, norm_bytes);
}

void cudaColorSpinorField::zeroPad() {
  size_t pad_bytes = (stride - volume) * precision * fieldOrder;
  int Npad = nColor * nSpin * 2 / fieldOrder;
  for (int i=0; i<Npad; i++) {
    if (pad_bytes) cudaMemset((char*)v + (volume + i*stride)*fieldOrder*precision, 0, pad_bytes);
  }
}

void cudaColorSpinorField::copy(const cudaColorSpinorField &src) {
  checkField(*this, src);
  copyCuda(*this, src);
}
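#if 0
// Sketch only, not original code: the padding arithmetic of zeroPad() for a
// hypothetical single-precision Wilson field (nColor=3, nSpin=4, float4
// order, so fieldOrder == 4). The field is stored as Npad blocks of `stride`
// float4's; only the first `volume` entries of each block hold data, so the
// tail of each block is what gets cleared. All values are assumptions.
void examplePadGeometry() {
  const int volume = 4096, stride = 4352;    // assumed site counts
  const int nColor = 3, nSpin = 4, fieldOrder = 4;
  const size_t precision = sizeof(float);
  const int Npad = nColor * nSpin * 2 / fieldOrder;                    // = 6 float4 blocks
  const size_t pad_bytes = (stride - volume) * precision * fieldOrder; // tail bytes per block
  printf("Npad = %d, pad bytes per block = %zu\n", Npad, pad_bytes);
}
#endif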
#include <pack_spinor.h>

void cudaColorSpinorField::loadCPUSpinorField(const cpuColorSpinorField &src) {

  if (nDim != src.Ndim()) {
    errorQuda("Number of dimensions %d %d don't match", nDim, src.Ndim());
  }

  if (volume != src.volume) {
    errorQuda("Volumes %d %d don't match", volume, src.volume);
  }

  if (SiteOrder() != src.SiteOrder()) {
    errorQuda("Subset orders don't match");
  }

  if (nColor != 3) {
    errorQuda("Nc != 3 not yet supported");
  }

  if (siteSubset != src.siteSubset) {
    errorQuda("Subset types do not match %d %d", siteSubset, src.siteSubset);
  }

  if (precision == QUDA_HALF_PRECISION) {
    ColorSpinorParam param(*this); // acquire all attributes of this
    param.precision = QUDA_SINGLE_PRECISION; // change precision
    param.create = QUDA_COPY_FIELD_CREATE;
    cudaColorSpinorField tmp(src, param);
    copy(tmp);
    return;
  }

  // no native support for this order yet - copy through a natively supported order
  if (src.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) {
    ColorSpinorParam param(src); // acquire all attributes of src
    param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
    param.create = QUDA_NULL_FIELD_CREATE;
    cpuColorSpinorField tmp(param);
    tmp.copy(src);
    loadCPUSpinorField(tmp);
    return;
  }

  // (temporary?) bug fix for padding
  memset(buffer, 0, bufferBytes);

  // dispatch on (destination precision) x (source precision) x (field order);
  // the packSpinor template arguments are <Nc, Ns, vector length>
#define LOAD_SPINOR_CPU_TO_GPU(myNs) \
  if (precision == QUDA_DOUBLE_PRECISION) { \
    if (src.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((double*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((double*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((double*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((double*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } \
  } else { \
    if (src.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        packSpinor<3,myNs,4>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        packSpinor<3,myNs,4>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } \
    } \
  }

  switch (nSpin) {
  case 1:
    LOAD_SPINOR_CPU_TO_GPU(1);
    break;
  case 4:
    LOAD_SPINOR_CPU_TO_GPU(4);
    break;
  default:
    errorQuda("invalid number of spin components in function %s\n", __FUNCTION__);
  }

#undef LOAD_SPINOR_CPU_TO_GPU

  /* for (int i=0; i<length; i++) {
     std::cout << i << " " << ((float*)src.v)[i] << " " << ((float*)buffer)[i] << std::endl;
     }*/

  cudaMemcpy(v, buffer, bytes, cudaMemcpyHostToDevice);
  return;
}
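#if 0
// Sketch only, not part of the original file: a host -> device -> host round
// trip through the two transfer routines in this file. It relies on the
// ColorSpinorParam(field) constructor used above; any members not copied by
// that constructor would need to be set explicitly.
void exampleRoundTrip(cpuColorSpinorField &host) {
  ColorSpinorParam param(host);
  param.create = QUDA_COPY_FIELD_CREATE;     // copy on construction
  cudaColorSpinorField device(host, param);  // calls loadCPUSpinorField()
  device.saveCPUSpinorField(host);           // reorder on the host and copy back
}
#endif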
void cudaColorSpinorField::saveCPUSpinorField(cpuColorSpinorField &dest) const {

  if (nDim != dest.Ndim()) {
    errorQuda("Number of dimensions %d %d don't match", nDim, dest.Ndim());
  }

  if (volume != dest.volume) {
    errorQuda("Volumes %d %d don't match", volume, dest.volume);
  }

  if (SiteOrder() != dest.SiteOrder()) {
    errorQuda("Subset orders don't match");
  }

  if (nColor != 3) {
    errorQuda("Nc != 3 not yet supported");
  }

  if (siteSubset != dest.siteSubset) {
    errorQuda("Subset types do not match %d %d", siteSubset, dest.siteSubset);
  }

  if (precision == QUDA_HALF_PRECISION) {
    ColorSpinorParam param(*this); // acquire all attributes of this
    param.precision = QUDA_SINGLE_PRECISION; // change precision
    param.create = QUDA_COPY_FIELD_CREATE;
    cudaColorSpinorField tmp(*this, param);
    tmp.saveCPUSpinorField(dest);
    return;
  }

  // no native support for this order yet - copy through a natively supported order
  if (dest.FieldOrder() == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) {
    ColorSpinorParam param(dest); // acquire all attributes of dest
    param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
    param.create = QUDA_NULL_FIELD_CREATE;
    cpuColorSpinorField tmp(param);
    saveCPUSpinorField(tmp);
    dest.copy(tmp);
    return;
  }

  // (temporary?) bug fix for padding
  memset(buffer, 0, bufferBytes);

  cudaMemcpy(buffer, v, bytes, cudaMemcpyDeviceToHost);

  // dispatch on (source precision) x (destination precision) x (field order);
  // the unpackSpinor template arguments are <Nc, Ns, vector length>
#define SAVE_SPINOR_GPU_TO_CPU(myNs) \
  if (precision == QUDA_DOUBLE_PRECISION) { \
    if (dest.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((double*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((double*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((float*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((float*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } \
  } else { \
    if (dest.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        unpackSpinor<3,myNs,4>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        unpackSpinor<3,myNs,4>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } \
    } \
  }

  switch (nSpin) {
  case 1:
    SAVE_SPINOR_GPU_TO_CPU(1);
    break;
  case 4:
    SAVE_SPINOR_GPU_TO_CPU(4);
    break;
  default:
    errorQuda("invalid number of spin components in function %s\n", __FUNCTION__);
  }
#undef SAVE_SPINOR_GPU_TO_CPU
  return;
}
void cudaColorSpinorField::allocateGhostBuffer(void) {
  int nFace = (nSpin == 1) ? 3 : 1; // 3 faces for asqtad
  int Nint = nColor * nSpin * 2;    // number of internal degrees of freedom
  if (nSpin == 4) Nint /= 2;        // spin projection for Wilson

  if (this->initGhostFaceBuffer == 0 || precision > facePrecision) {
    for (int i=0; i<4; i++) {
      if (!commDimPartitioned(i)) {
        continue;
      }
      size_t faceBytes = nFace*ghostFace[i]*Nint*precision;
      // add extra space for the norms for half precision
      if (precision == QUDA_HALF_PRECISION) faceBytes += nFace*ghostFace[i]*sizeof(float);

      if (this->initGhostFaceBuffer) { // only freed if precision is higher than the previous allocation
        //cudaFree(this->fwdGhostFaceBuffer[i]);
        cudaFree(this->backGhostFaceBuffer[i]); this->backGhostFaceBuffer[i] = NULL;
        this->fwdGhostFaceBuffer[i] = NULL;
      }
      //cudaMalloc((void**)&this->fwdGhostFaceBuffer[i], faceBytes);
      // one allocation holds both faces: back face first, forward face after it
      if (cudaMalloc((void**)&this->backGhostFaceBuffer[i], 2*faceBytes) == cudaErrorMemoryAllocation) {
        errorQuda("cudaMalloc() failed for backGhostFaceBuffer\n");
      }
      fwdGhostFaceBuffer[i] = (void*)(((char*)backGhostFaceBuffer[i]) + faceBytes);
    }

    this->facePrecision = precision;
    this->initGhostFaceBuffer = 1;
  }

  for (int i=0; i<4; i++) {
    if (!commDimPartitioned(i)) {
      continue;
    }
    size_t faceBytes = nFace*ghostFace[i]*Nint*precision;
    // add extra space for the norms for half precision
    if (precision == QUDA_HALF_PRECISION) faceBytes += nFace*ghostFace[i]*sizeof(float);
    fwdGhostFaceBuffer[i] = (void*)(((char*)backGhostFaceBuffer[i]) + faceBytes);
  }

}
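#if 0
// Sketch only, not original code: the face-buffer sizing computed above, for
// a hypothetical half-precision Wilson field. nFace is 1, spin projection
// halves Nint from 24 to 12, and half precision appends one float norm per
// face site. The face volume is an assumption for illustration.
void exampleFaceBytes() {
  const int nColor = 3, nSpin = 4, nFace = 1;
  const int faceVolume = 16*16*16;  // assumed ghostFace[i]
  const size_t precision = 2;       // half precision = 2 bytes
  int Nint = nColor * nSpin * 2;    // 24 real dof per site
  if (nSpin == 4) Nint /= 2;        // spin projection -> 12
  size_t faceBytes = nFace * faceVolume * Nint * precision;
  faceBytes += nFace * faceVolume * sizeof(float);     // half-precision norms
  printf("bytes per face = %zu (allocation holds 2x)\n", faceBytes);
}
#endif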
void cudaColorSpinorField::freeGhostBuffer(void) {
  if (!initGhostFaceBuffer) return;

  for (int i=0; i<4; i++) {
    if (!commDimPartitioned(i)) {
      continue;
    }
    //cudaFree(fwdGhostFaceBuffer[i]);
    cudaFree(backGhostFaceBuffer[i]); backGhostFaceBuffer[i] = NULL;
    fwdGhostFaceBuffer[i] = NULL;
  }

  initGhostFaceBuffer = 0;
}

// pack the ghost zone into a contiguous buffer for communications
void cudaColorSpinorField::packGhost(const int dim, const QudaParity parity, const int dagger, cudaStream_t *stream)
{
#ifdef MULTI_GPU
  if (dim != 3 || kernelPackT) { // use kernels to pack into contiguous buffers then a single cudaMemcpy
    void* gpu_buf = this->backGhostFaceBuffer[dim];
    packFace(gpu_buf, *this, dim, dagger, parity, *stream);
  }
#else
  errorQuda("packGhost not built on single-GPU build");
#endif
}
// send the ghost zone to the host
void cudaColorSpinorField::sendGhost(void *ghost_spinor, const int dim, const QudaDirection dir,
                                     const int dagger, cudaStream_t *stream) {

#ifdef MULTI_GPU
  int Nvec = (nSpin == 1 || precision == QUDA_DOUBLE_PRECISION) ? 2 : 4;
  int nFace = (nSpin == 1) ? 3 : 1;                       // 3 faces for asqtad
  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin-projected) degrees of freedom

  if (dim != 3 || kernelPackT) { // use kernels to pack into contiguous buffers then a single cudaMemcpy

    size_t bytes = nFace*Nint*ghostFace[dim]*precision;
    if (precision == QUDA_HALF_PRECISION) bytes += nFace*ghostFace[dim]*sizeof(float);
    void* gpu_buf =
      (dir == QUDA_BACKWARDS) ? this->backGhostFaceBuffer[dim] : this->fwdGhostFaceBuffer[dim];

    CUDAMEMCPY(ghost_spinor, gpu_buf, bytes, cudaMemcpyDeviceToHost, *stream);
  } else { // do multiple cudaMemcpys

    int Npad = Nint / Nvec;                               // number of Nvec buffers we have
    int Nt_minus1_offset = (volume - nFace*ghostFace[3]); // N_t-1 = Vh-Vsh

    int offset = 0;
    if (nSpin == 1) {
      offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset;
    } else if (nSpin == 4) {
      // !dagger: send lower components backwards, send upper components forwards
      // dagger: send upper components backwards, send lower components forwards
      bool upper = dagger ? true : false;
      if (dir == QUDA_FORWARDS) upper = !upper; // Fwd is !Back
      int lower_spin_offset = Npad*stride;
      if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
      else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
    }

    // Memcpy Npad's worth of data:
    // -- dst points to the beginning of the correct pad;
    // -- each pad has size Nvec*Vsh floats;
    // -- there are Nvec*stride floats from the start of one pad to the start of the next.

    void *dst = (char*)ghost_spinor;
    void *src = (char*)v + offset*Nvec*precision;
    size_t len = nFace*ghostFace[3]*Nvec*precision;
    size_t spitch = stride*Nvec*precision;
    cudaMemcpy2DAsync(dst, len, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);

    /*for (int i=0; i<Npad; i++) {
      int len = nFace*ghostFace[3]*Nvec*precision;
      void *dst = (char*)ghost_spinor + i*len;
      void *src = (char*)v + (offset + i*stride)*Nvec*precision;
      CUDAMEMCPY(dst, src, len, cudaMemcpyDeviceToHost, *stream);
      }*/

    if (precision == QUDA_HALF_PRECISION) {
      int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset*sizeof(float);
      void *dst = (char*)ghost_spinor + nFace*Nint*ghostFace[3]*precision;
      void *src = (char*)norm + norm_offset;
      CUDAMEMCPY(dst, src, nFace*ghostFace[3]*sizeof(float), cudaMemcpyDeviceToHost, *stream);
    }
  }
#else
  errorQuda("sendGhost not built on single-GPU build");
#endif
}
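#if 0
// Sketch only, an addition rather than original code: the strided gather that
// the single cudaMemcpy2DAsync call above performs, written as the equivalent
// (but slower) per-pad loop, mirroring the commented-out loop in sendGhost().
// Each of the Npad vector blocks is `stride` sites long and contributes only
// its trailing nFace*ghostFace[3] sites to the face.
void exampleStridedGather(void *dst, void *src, size_t len, size_t spitch,
                          int Npad, cudaStream_t stream) {
  // len    = nFace*ghostFace[3]*Nvec*precision  (face bytes per pad)
  // spitch = stride*Nvec*precision              (bytes between pad starts)
  for (int i = 0; i < Npad; i++) {
    cudaMemcpyAsync((char*)dst + i*len, (char*)src + i*spitch, len,
                    cudaMemcpyDeviceToHost, stream);
  }
}
#endif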
void cudaColorSpinorField::unpackGhost(void* ghost_spinor, const int dim,
                                       const QudaDirection dir,
                                       const int dagger, cudaStream_t* stream)
{
  int nFace = (nSpin == 1) ? 3 : 1;                       // 3 faces for asqtad
  int Nint = (nColor * nSpin * 2) / (nSpin == 4 ? 2 : 1); // (spin-projected) degrees of freedom

  int len = nFace*ghostFace[dim]*Nint;
  int offset = length + ghostOffset[dim]*nColor*nSpin*2;
  offset += (dir == QUDA_BACKWARDS) ? 0 : len;

  void *dst = (char*)v + precision*offset;
  void *src = ghost_spinor;

  CUDAMEMCPY(dst, src, len*precision, cudaMemcpyHostToDevice, *stream);

  if (precision == QUDA_HALF_PRECISION) {
    int normlen = nFace*ghostFace[dim];
    int norm_offset = stride + ghostNormOffset[dim];
    norm_offset += (dir == QUDA_BACKWARDS) ? 0 : normlen;

    void *dst = (char*)norm + norm_offset*sizeof(float);
    void *src = (char*)ghost_spinor + nFace*Nint*ghostFace[dim]*precision; // norm region of the host ghost zone
    CUDAMEMCPY(dst, src, normlen*sizeof(float), cudaMemcpyHostToDevice, *stream);
  }

}

std::ostream& operator<<(std::ostream &out, const cudaColorSpinorField &a) {
  out << (const ColorSpinorField&)a;
  out << "v = " << a.v << std::endl;
  out << "norm = " << a.norm << std::endl;
  out << "alloc = " << a.alloc << std::endl;
  out << "init = " << a.init << std::endl;
  return out;
}
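#if 0
// Sketch only, not original code: the destination offset arithmetic used in
// unpackGhost() for the first partitioned dimension (ghostOffset == 0) of a
// hypothetical Wilson field. The back face lands just past the local body;
// the forward face follows it. All values are assumptions for illustration.
void exampleGhostOffsets() {
  const int nColor = 3, nSpin = 4, nFace = 1;
  const int length = 98304;       // assumed body length in reals (4096 sites * 24)
  const int faceVolume = 2048;    // assumed ghostFace[dim] in sites
  const int ghostOffsetDim = 0;   // first partitioned dimension
  int Nint = (nColor * nSpin * 2) / 2;                      // 12 after spin projection
  int len = nFace * faceVolume * Nint;                      // reals per face
  int back = length + ghostOffsetDim * nColor * nSpin * 2;  // back-face offset
  int fwd = back + len;                                     // forward-face offset
  printf("back face at %d, forward face at %d (in reals)\n", back, fwd);
}
#endif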