15 #define REORDER_LOCATION QUDA_CUDA_FIELD_LOCATION
17 #define REORDER_LOCATION QUDA_CPU_FIELD_LOCATION
23 void* cudaColorSpinorField::buffer_h = 0;
24 void* cudaColorSpinorField::buffer_d = 0;
25 bool cudaColorSpinorField::bufferInit =
false;
26 size_t cudaColorSpinorField::bufferBytes = 0;
28 int cudaColorSpinorField::initGhostFaceBuffer = 0;
29 void* cudaColorSpinorField::ghostFaceBuffer;
30 void* cudaColorSpinorField::fwdGhostFaceBuffer[
QUDA_MAX_DIM];
31 void* cudaColorSpinorField::backGhostFaceBuffer[
QUDA_MAX_DIM];
65 if (isNative() && src.isNative()) copy(src);
66 else errorQuda(
"Cannot copy using non-native fields");
90 errorQuda(
"Cannot reference a non-cuda field");
102 isNative() && dynamic_cast<const cudaColorSpinorField &>(src).isNative()) {
103 copy(dynamic_cast<const cudaColorSpinorField&>(src));
105 loadSpinorField(src);
120 isNative() && dynamic_cast<const cudaColorSpinorField &>(src).isNative()) {
121 copy(dynamic_cast<const cudaColorSpinorField&>(src));
123 loadSpinorField(src);
125 errorQuda(
"Unknown input ColorSpinorField %s",
typeid(src).name());
137 errorQuda(
"Unknown input ColorSpinorField %s",
typeid(src).name());
150 if (isNative() && src.isNative()) copy(src);
151 else errorQuda(
"Cannot copy using non-native fields");
163 loadSpinorField(src);
172 bool cudaColorSpinorField::isNative()
const {
179 }
else if (
nSpin == 1) {
185 }
else if (
nSpin == 1) {
213 if ((
bytes > bufferBytes) && bufferInit) {
237 memcpy(param.x,
x,
nDim*
sizeof(
int));
250 #ifdef USE_TEXTURE_OBJECTS
267 #ifdef USE_TEXTURE_OBJECTS
274 #ifdef USE_TEXTURE_OBJECTS
275 void cudaColorSpinorField::createTexObject() {
277 if (texInit)
errorQuda(
"Already bound textures");
281 cudaChannelFormatDesc desc;
282 memset(&desc, 0,
sizeof(cudaChannelFormatDesc));
284 else desc.f = cudaChannelFormatKindSigned;
299 cudaResourceDesc resDesc;
300 memset(&resDesc, 0,
sizeof(resDesc));
301 resDesc.resType = cudaResourceTypeLinear;
302 resDesc.res.linear.devPtr =
v;
303 resDesc.res.linear.desc = desc;
304 resDesc.res.linear.sizeInBytes =
bytes;
306 cudaTextureDesc texDesc;
307 memset(&texDesc, 0,
sizeof(texDesc));
309 else texDesc.readMode = cudaReadModeElementType;
311 cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
316 cudaChannelFormatDesc desc;
317 memset(&desc, 0,
sizeof(cudaChannelFormatDesc));
318 desc.f = cudaChannelFormatKindFloat;
321 cudaResourceDesc resDesc;
322 memset(&resDesc, 0,
sizeof(resDesc));
323 resDesc.resType = cudaResourceTypeLinear;
324 resDesc.res.linear.devPtr =
norm;
325 resDesc.res.linear.desc = desc;
328 cudaTextureDesc texDesc;
329 memset(&texDesc, 0,
sizeof(texDesc));
330 texDesc.readMode = cudaReadModeElementType;
332 cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
339 void cudaColorSpinorField::destroyTexObject() {
341 cudaDestroyTextureObject(tex);
361 void cudaColorSpinorField::destroy() {
372 #ifdef USE_TEXTURE_OBJECTS
405 void cudaColorSpinorField::zeroPad() {
408 for (
int i=0; i<Npad; i++) {
413 void cudaColorSpinorField::copy(
const cudaColorSpinorField &src) {
420 #define REORDER_SPINOR_FIELD(DST, SRC, dst, src, myNs, loc) \
421 if ((dst).Precision() == QUDA_DOUBLE_PRECISION) { \
422 if ((src).Precision() == QUDA_DOUBLE_PRECISION) { \
423 if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
424 packSpinor<3,myNs,1>((double*)DST, (double*)SRC, dst, src, loc); \
425 } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
426 packSpinor<3,myNs,2>((double*)DST, (double*)SRC, dst, src, loc); \
427 } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
428 packSpinor<3,myNs,4>((double*)DST, (double*)SRC, dst, src, loc); \
431 if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
432 packSpinor<3,myNs,1>((double*)DST, (float*)SRC, dst, src, loc); \
433 } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
434 packSpinor<3,myNs,2>((double*)DST, (float*)SRC, dst, src, loc); \
435 } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
436 packSpinor<3,myNs,4>((double*)DST, (float*)SRC, dst, src, loc); \
440 if ((src).Precision() == QUDA_DOUBLE_PRECISION) { \
441 if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
442 packSpinor<3,myNs,1>((float*)DST, (double*)SRC, dst, src, loc); \
443 } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
444 packSpinor<3,myNs,2>((float*)DST, (double*)SRC, dst, src, loc); \
445 } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
446 packSpinor<3,myNs,4>((float*)DST, (double*)SRC, dst, src, loc); \
449 if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
450 packSpinor<3,myNs,1>((float*)DST, (float*)SRC, dst, src, loc); \
451 } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
452 packSpinor<3,myNs,2>((float*)DST, (float*)SRC, dst, src, loc); \
453 } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
454 packSpinor<3,myNs,4>((float*)DST, (float*)SRC, dst, src, loc); \
460 void cudaColorSpinorField::resizeBuffer(
size_t bytes)
const {
461 if (bytes > bufferBytes) {
469 void cudaColorSpinorField::loadSpinorField(
const ColorSpinorField &src) {
489 loadSpinorField(
tmp);
509 cudaMemcpy(
v, buffer_h,
bytes, cudaMemcpyHostToDevice);
516 resizeBuffer(src.Bytes());
517 cudaMemcpy(buffer_d, dynamic_cast<const cpuColorSpinorField&>(src).
V(), src.Bytes(), cudaMemcpyHostToDevice);
541 void cudaColorSpinorField::saveSpinorField(ColorSpinorField &dest)
const {
550 tmp.saveSpinorField(dest);
561 saveSpinorField(
tmp);
567 cudaMemcpy(buffer_h,
v,
bytes, cudaMemcpyDeviceToHost);
579 errorQuda(
"invalid number of spinors in function");
585 void *dst = (
typeid(dest)==
typeid(
cudaColorSpinorField)) ? dynamic_cast<cudaColorSpinorField&>(dest).V() : buffer_d;
595 errorQuda(
"invalid number of spinors in function");
599 cudaMemcpy(dynamic_cast<cpuColorSpinorField&>(dest).
V(), buffer_d, dest.Bytes(), cudaMemcpyDeviceToHost);
608 int nFace = (
nSpin == 1) ? 3 : 1;
610 if (nSpin == 4) Nint /= 2;
613 if(initGhostFaceBuffer == 0 ||
precision > facePrecision){
615 if (initGhostFaceBuffer)
device_free(ghostFaceBuffer);
618 size_t faceBytes = 0;
619 for (
int i=0; i<4; i++) {
628 initGhostFaceBuffer = 1;
635 for (
int i=0; i<4; i++) {
638 backGhostFaceBuffer[i] = (
void*)(((
char*)ghostFaceBuffer) + offset);
642 fwdGhostFaceBuffer[i] = (
void*)(((
char*)ghostFaceBuffer) + offset);
652 if (!initGhostFaceBuffer)
return;
656 for(
int i=0;i < 4; i++){
658 backGhostFaceBuffer[i] = NULL;
659 fwdGhostFaceBuffer[i] = NULL;
661 initGhostFaceBuffer = 0;
668 packFace(ghostFaceBuffer, *
this, dagger, parity, *stream);
670 errorQuda(
"packGhost not built on single-GPU build");
681 int nFace = (
nSpin == 1) ? 3 : 1;
689 (dir ==
QUDA_BACKWARDS) ? this->backGhostFaceBuffer[dim] : this->fwdGhostFaceBuffer[dim];
691 cudaMemcpyAsync(ghost_spinor, gpu_buf, bytes, cudaMemcpyDeviceToHost, *stream);
694 int Npad = Nint / Nvec;
700 }
else if (
nSpin == 4) {
703 bool upper = dagger ?
true :
false;
705 int lower_spin_offset = Npad*
stride;
706 if (upper) offset = (dir ==
QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
707 else offset = lower_spin_offset + (dir ==
QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
715 void *dst = (
char*)ghost_spinor;
719 cudaMemcpy2DAsync(dst, len, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
722 int norm_offset = (dir ==
QUDA_BACKWARDS) ? 0 : Nt_minus1_offset*
sizeof(
float);
723 void *dst = (
char*)ghost_spinor + nFace*Nint*
ghostFace[3]*precision;
724 void *src = (
char*)
norm + norm_offset;
725 cudaMemcpyAsync(dst, src, nFace*
ghostFace[3]*
sizeof(
float), cudaMemcpyDeviceToHost, *stream);
729 errorQuda(
"sendGhost not built on single-GPU build");
738 int nFace = (
nSpin == 1) ? 3 : 1;
746 void *src = ghost_spinor;
748 cudaMemcpyAsync(dst, src, len*
precision, cudaMemcpyHostToDevice, *stream);
755 void *dst = (
char*)
norm + norm_offset*
sizeof(
float);
756 void *src = (
char*)ghost_spinor+nFace*Nint*ghostFace[dim]*precision;
757 cudaMemcpyAsync(dst, src, normlen*
sizeof(
float), cudaMemcpyHostToDevice, *stream);
769 out <<
"v = " << a.
v << std::endl;
770 out <<
"norm = " << a.
norm << std::endl;
771 out <<
"alloc = " << a.alloc << std::endl;
772 out <<
"init = " << a.init << std::endl;