    ghost_field_tex {nullptr, nullptr, nullptr, nullptr}
    norm = (void*)src.Norm();
    // ...
    errorQuda("Cannot reference a non-cuda field");
    // ...
    errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
    cudaHostGetDevicePointer(&v, v_h, 0);
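For context, cudaHostGetDevicePointer only returns a usable device alias for host memory that was allocated (or registered) as mapped pinned memory; QUDA's mapped_malloc macro, listed in the references below, presumably wraps an allocation of that kind. A minimal standalone sketch of the raw runtime calls, with illustrative names not taken from QUDA:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      const size_t bytes = 1 << 20;
      void *h_buf = nullptr;

      // Allocate pinned host memory that is mapped into the device address space.
      cudaHostAlloc(&h_buf, bytes, cudaHostAllocMapped);

      // Obtain the device-side alias of the same allocation (flags must be 0).
      void *d_alias = nullptr;
      cudaError_t err = cudaHostGetDevicePointer(&d_alias, h_buf, 0);
      if (err != cudaSuccess) {
        fprintf(stderr, "cudaHostGetDevicePointer failed: %s\n", cudaGetErrorString(err));
        return 1;
      }

      // d_alias can now be passed to kernels; accesses travel over PCIe/NVLink.
      printf("host %p maps to device %p\n", h_buf, d_alias);

      cudaFreeHost(h_buf);
      return 0;
    }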
    memcpy(param.x, x, nDim*sizeof(int));
    // ...
    memcpy(param.x, x, nDim*sizeof(int));
    // ...
#ifdef USE_TEXTURE_OBJECTS
    // ...
    memcpy(param.x, x, nDim*sizeof(int));
#ifdef USE_TEXTURE_OBJECTS // (a lot of texture objects...)
    // ...
#ifdef USE_TEXTURE_OBJECTS
    // ...
#ifdef USE_TEXTURE_OBJECTS
  void cudaColorSpinorField::createTexObject() {
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      // ...
      else desc.f = cudaChannelFormatKindSigned;
      // ...
      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = v;
      resDesc.res.linear.desc = desc;
      resDesc.res.linear.sizeInBytes = bytes;

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      // ...
      else texDesc.readMode = cudaReadModeElementType;

      if (resDesc.res.linear.sizeInBytes % deviceProp.textureAlignment != 0 /* ... */)
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);

      unsigned long texels = resDesc.res.linear.sizeInBytes / texel_size;
      if (texels > (unsigned)deviceProp.maxTexture1DLinear) {
        errorQuda("Attempting to bind too large a texture %lu > %d", texels,
                  deviceProp.maxTexture1DLinear);
      }

      cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
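The cudaResourceTypeLinear / cudaCreateTextureObject pattern above can be exercised in isolation. A self-contained sketch, assuming a plain float field; the channel descriptor, kernel, and helper name are illustrative and not QUDA's:

    #include <cuda_runtime.h>
    #include <cstring>

    // Read the field through the texture path and scale it.
    __global__ void scale(float *out, cudaTextureObject_t tex, float a, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = a * tex1Dfetch<float>(tex, i);
    }

    cudaTextureObject_t bindLinear(float *d_ptr, size_t bytes)
    {
      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = d_ptr;
      resDesc.res.linear.desc = cudaCreateChannelDesc<float>(); // 32-bit float channel
      resDesc.res.linear.sizeInBytes = bytes;

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType; // return the stored type unchanged

      cudaTextureObject_t tex = 0;
      cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
      return tex;
    }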
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      desc.f = cudaChannelFormatKindFloat;
      // ...
      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = norm;
      resDesc.res.linear.desc = desc;
      // ...
      if (resDesc.res.linear.sizeInBytes % deviceProp.textureAlignment != 0 /* ... */)
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType;

      cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
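The alignment guards above check the allocation size against deviceProp.textureAlignment; the is_aligned helper listed in the references below plausibly performs the matching check on the base pointer. One possible implementation, shown only as an assumption:

    #include <cstddef>
    #include <cstdint>

    // Assumed semantics: true when ptr is a multiple of alignment.
    static inline bool is_aligned(const void *ptr, size_t alignment)
    {
      return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
    }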
  void cudaColorSpinorField::createGhostTexObject() const {
    // ...
    for (int b=0; b<2; b++) {
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      // ...
      else desc.f = cudaChannelFormatKindSigned;
      // ...
      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      // ...
      resDesc.res.linear.desc = desc;
      // ...
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      // ...
      else texDesc.readMode = cudaReadModeElementType;

      cudaCreateTextureObject(&ghostTex[b], &resDesc, &texDesc, NULL);
      // ...
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);

      cudaCreateTextureObject(&ghostTex[2 + b], &resDesc, &texDesc, NULL);

      // ... (norm ghost textures, half precision only)
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      desc.f = cudaChannelFormatKindFloat;
      // ...
      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      // ...
      resDesc.res.linear.desc = desc;
      // ...
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType;

      cudaCreateTextureObject(&ghostTexNorm[b], &resDesc, &texDesc, NULL);
      // ...
        errorQuda("Allocation size %lu does not have correct alignment for textures (%lu)",
                  resDesc.res.linear.sizeInBytes, deviceProp.textureAlignment);

      cudaCreateTextureObject(&ghostTexNorm[2 + b], &resDesc, &texDesc, NULL);
  void cudaColorSpinorField::destroyTexObject() {
    // ...
    cudaDestroyTextureObject(tex);
    // ...
  }

  void cudaColorSpinorField::destroyGhostTexObject() const {
    // ...
    for (int i=0; i<4; i++) cudaDestroyTextureObject(ghostTex[i]);
    // ...
    for (int i=0; i<4; i++) cudaDestroyTextureObject(ghostTexNorm[i]);
    CompositeColorSpinorField::iterator vec;
    // ...
#ifdef USE_TEXTURE_OBJECTS
    // ...
    destroyGhostTexObject();
    cudaMemsetAsync(v, 0, bytes);
    // ...
    for (int subset=0; subset<siteSubset; subset++) {
      cudaMemset2DAsync(dst + subset*bytes/siteSubset, pitch, 0, pad_bytes, Npad);
    }
    // ...
    for (int subset=0; subset<siteSubset; subset++) {
      // ...
    }
    // ...
    for (int subset=0; subset < siteSubset; subset++) {
      cudaMemsetAsync((char*)v + subset_length*precision + subset_bytes*subset, 0,
                      subset_bytes - subset_length*precision);
    }
    // ...
    for (int subset=0; subset < siteSubset; subset++) {
      cudaMemsetAsync((char*)norm + (size_t)stride*sizeof(float) + subset_bytes*subset, 0,
                      subset_bytes - (size_t)stride*sizeof(float));
    }
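The cudaMemset2DAsync call above clears the pad bytes at the end of every padded row in one strided operation instead of a loop of small memsets. A standalone sketch of that idiom, with illustrative names:

    #include <cuda_runtime.h>

    // Zero the trailing pad of each row in a pitched layout: rows of `row_bytes`
    // useful data followed by `pad_bytes` of padding, repeated `nrows` times.
    void zero_row_padding(char *base, size_t row_bytes, size_t pad_bytes,
                          size_t nrows, cudaStream_t stream)
    {
      const size_t pitch = row_bytes + pad_bytes;     // distance between pad regions
      cudaMemset2DAsync(base + row_bytes,             // first pad region
                        pitch,                        // stride to the next pad region
                        0,                            // byte value to write
                        pad_bytes,                    // width of each memset
                        nrows,                        // number of rows
                        stream);
    }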
      copy(dynamic_cast<const cudaColorSpinorField&>(src));
      // ...
      errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
      cudaError_t error = cudaHostGetDevicePointer(&src_d, const_cast<void*>(src.V()), 0);
      if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField field");
      // ...
      void *Src = nullptr, *srcNorm = nullptr, *buffer = nullptr;
      // ...
      srcNorm = static_cast<char*>(Src) + src.Bytes();
      // ...
      memcpy(buffer, src.V(), src.Bytes());
      // ...
      cudaError_t error = cudaHostGetDevicePointer(&Src, buffer, 0);
      if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField field");
      srcNorm = static_cast<char*>(Src) + src.Bytes();
      cudaError_t error = cudaHostGetDevicePointer(&dest_d, const_cast<void*>(dest.V()), 0);
      if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField field");
      // ...
      void *dst = nullptr, *dstNorm = nullptr, *buffer = nullptr;
      // ...
      dstNorm = static_cast<char*>(dst) + dest.Bytes();
      // ...
      cudaError_t error = cudaHostGetDevicePointer(&dst, buffer, 0);
      if (error != cudaSuccess) errorQuda("Failed to get device pointer for ColorSpinorField");
      dstNorm = static_cast<char*>(dst) + dest.Bytes();
      // ...
      memcpy(dest.V(), buffer, dest.Bytes());
#ifdef USE_TEXTURE_OBJECTS
    // ...
    destroyGhostTexObject();
                                       MemoryLocation location_label, bool spin_project,
                                       double a, double b, double c)
    // ...
    for (int dim=0; dim<4; dim++) {
      for (int dir=0; dir<2; dir++) {
        switch (location[2*dim+dir]) {
        // ...
        default: errorQuda("Undefined location %d", location[2*dim+dir]);
        }
      }
    }
    // ...
    PackGhost(packBuffer, *this, location_label, nFace, dagger, parity, spin_project, a, b, c, *stream);
    // ...
    errorQuda("packGhost not built on single-GPU build");
    const int Npad = Nint / Nvec;
    // ...
    const int x4 = nDim==5 ? x[4] : 1;
    // ...
    } else if (nSpin == 4) {
      // ...
      bool upper = dagger ? true : false;
      // ...
      int lower_spin_offset = Npad*stride;
      if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
      else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
      // ...
      size_t dpitch = x4*len;
      // ...
      for (int s = 0; s < x4; s++) {
        void *dst = (char *)ghost_spinor + s * len + parity * nFace * Nint * ghostFaceCB[3] * ghost_precision;
        void *src = (char *)v + (offset + s * (volumeCB / x4)) * Nvec * ghost_precision + parity * bytes / 2;
        qudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToHost, *stream);
      }
      // ...
      size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
      int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset * sizeof(float);
      void *dst = (char *)ghost_spinor + nParity * nFace * Nint * ghostFaceCB[3] * ghost_precision + s * len /* ... */;
      // ...
    errorQuda("sendGhost not built on single-GPU build");
    const void *src = ghost_spinor;
    // ...
    for (int b=0; b<2; b++) {
                                  bool spin_project, double a, double b, double c)
    // ...
              spin_project, a, b, c);
    // ...
                                          const int dagger, const int dim,
                                          cudaStream_t *stream_p, const bool zero_copy)
    // ...
    cudaStream_t *pack_stream = (stream_p) ? stream_p : stream+dir;
    int Npad = Nint/Nvec;
    // ...
    cudaStream_t *copy_stream = (stream_p) ? stream_p : stream + d;
    // ...
    if (!remote_write) {
      // ...
      cudaMemcpyAsync(ghost_dst,
                      // ...
                      cudaMemcpyDeviceToDevice,
                      // ...
      // ...
      const int x4 = nDim==5 ? x[4] : 1;
      // ...
        offset = (dir == 0) ? 0 : Nt_minus_offset;
      } else if (nSpin == 4) {
        // ...
        bool upper = dagger ? true : false;
        if (dir == 1) upper = !upper;
        int lower_spin_offset = Npad*stride;
        // ...
          offset = (dir == 0 ? 0 : Nt_minus_offset);
        // ...
          offset = lower_spin_offset + (dir == 0 ? 0 : Nt_minus_offset);
      // ...
      size_t dpitch = x4*len;
      // ...
      for (int s = 0; s < x4; s++) {
        void *dst = (char *)ghost_dst + s * len + parity * nFace * Nint * ghostFaceCB[3] * ghost_precision;
        void *src = (char *)v + (offset + s * (volumeCB / x4)) * Nvec * ghost_precision + parity * bytes / 2;
        cudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToDevice, *copy_stream);
      }
      // ...
      size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
      int norm_offset = (dir == 0) ? 0 : Nt_minus_offset * sizeof(float);
      void *dst = (char *)ghost_dst + nParity * nFace * Nint * ghostFaceCB[3] * ghost_precision + s * len /* ... */;
      // ...
      cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToDevice, *copy_stream);
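When peer-to-peer is available, sendStart writes directly into the neighbour's receive buffer with a device-to-device cudaMemcpyAsync. Between two GPUs in one process the same effect can be had with the explicit peer-copy API; a hedged sketch (not the exact calls QUDA makes):

    #include <cuda_runtime.h>

    // Copy `bytes` from a buffer on srcDev directly into a buffer on dstDev,
    // using peer-to-peer access when the hardware supports it.
    void p2p_send(void *dst, int dstDev, const void *src, int srcDev,
                  size_t bytes, cudaStream_t stream)
    {
      int can_access = 0;
      cudaDeviceCanAccessPeer(&can_access, srcDev, dstDev);
      if (can_access) {
        cudaSetDevice(srcDev);
        // Returns an "already enabled" error on repeat calls, which can be ignored.
        cudaDeviceEnablePeerAccess(dstDev, 0);
      }
      // Falls back to a staged copy through host memory when P2P is unavailable.
      cudaMemcpyPeerAsync(dst, dstDev, src, srcDev, bytes, stream);
    }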
    recvStart(nFace, dir, dagger, stream_p, gdr_recv);
    sendStart(nFace, dir, dagger, stream_p, gdr_send);
    } else if (gdr_send) {
    // ...
    } else if (gdr_recv) {
    // ...
    if (complete_recv_fwd[dim] && complete_send_back[dim]) {
      complete_send_back[dim] = false;
      complete_recv_fwd[dim] = false;
    // ...
    } else if (gdr_send) {
    // ...
    } else if (gdr_recv) {
    // ...
    if (complete_recv_back[dim] && complete_send_fwd[dim]) {
      complete_send_fwd[dim] = false;
      complete_recv_back[dim] = false;
    } else if (gdr_send) {
    // ...
    } else if (gdr_recv) {
    // ...
    } else if (gdr_send) {
    // ...
    } else if (gdr_recv) {
    int dim = dim_dir/2;
    int dir = (dim_dir+1)%2;
    // ...
    int dim = dim_dir/2;
    int dir = (dim_dir+1)%2;
    // ...
    bool zero_copy = false;
    int dim = dim_dir/2;
    int dir = (dim_dir+1)%2;
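The recurring pair of index computations above maps a flattened dimension-direction counter onto (dim, dir); the +1 flips the low bit, so even dim_dir values address dir = 1 and odd values dir = 0. A tiny illustration of the mapping:

    #include <cstdio>

    int main()
    {
      for (int dim_dir = 0; dim_dir < 8; dim_dir++) {
        int dim = dim_dir / 2;        // communication dimension
        int dir = (dim_dir + 1) % 2;  // direction, with the low bit flipped
        printf("dim_dir=%d -> dim=%d dir=%d\n", dim_dir, dim, dir);
      }
      return 0;
    }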
                                           const MemoryLocation *halo_location_,
                                           bool gdr_send, bool gdr_recv,
    // ...
      pack_destination[i] = pack_destination_ ? pack_destination_[i] : Device;
      halo_location[i] = halo_location_ ? halo_location_[i] : Device;
    // ...
    bool fused_pack_memcpy = true;
    // ...
    bool fused_halo_memcpy = true;

    bool pack_host = false;
    bool halo_host = false;
    // ...
    for (int dir=0; dir<2; dir++) {
      // ...
    for (int dir=0; dir<2; dir++) {
      // ...
      if (pack_destination[2*d+0] == Host || pack_destination[2*d+1] == Host) pack_host = true;
      if (halo_location[2*d+0] == Host || halo_location[2*d+1] == Host) halo_host = true;
    // ...
    if (!fused_pack_memcpy) {
      // ...
    } else if (total_bytes && !pack_host) {
      // ...
    }

    for (int i=0; i<2*nDimComms; i++) const_cast<cudaColorSpinorField*>(this)->recvStart(nFace, i, dagger, 0, gdr_recv);

    bool sync = pack_host ? true : false;
    // ...
    for (int p2p=0; p2p<2; p2p++) {
      // ...
      for (int dir=0; dir<2; dir++) {
        // ...
    while (comms_done < 2*nDimComms) {
      // ...
      for (int dir=0; dir<2; dir++) {
        if (!comms_complete[dim*2+dir]) {
          // ...
          if (comms_complete[2*dim+dir]) {
            // ...
    if (!fused_halo_memcpy) {
      // ...
    } else if (total_bytes && !halo_host) {
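The while loop above is a standard non-blocking polling pattern: every outstanding direction is tested on each iteration and its halo is consumed as soon as its communication completes, so unpacking overlaps with the transfers that are still in flight. A generic sketch of the pattern; query and unpack are stand-ins (for something like commsQuery and the halo copy), not QUDA calls:

    #include <vector>

    // Issue all receives/sends first, then repeatedly test each outstanding
    // channel and process it the moment it completes.
    template <typename Query, typename Unpack>
    void poll_comms(int nchannels, Query query, Unpack unpack)
    {
      std::vector<bool> complete(nchannels, false);
      int done = 0;
      while (done < nchannels) {
        for (int i = 0; i < nchannels; i++) {
          if (!complete[i] && query(i)) {   // non-blocking completion test
            complete[i] = true;
            unpack(i);                      // overlap unpacking with remaining comms
            done++;
          }
        }
      }
    }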
    out << "v = " << a.v << std::endl;
    out << "norm = " << a.norm << std::endl;
    out << "alloc = " << a.alloc << std::endl;
    out << "init = " << a.init << std::endl;
      errorQuda("Incorrect component index...");
    // ...
      errorQuda("Cannot get requested component");
    // ...
    if (first_element < 0) errorQuda("\nError: trying to set negative first element.\n");
    // ...
    if (first_element == 0 && range == this->EigvDim())
    // ...
      if (range != dst.EigvDim()) errorQuda("\nError: eigenvector range too big.\n");
      // ...
      copyCuda(dst, *this);
    // ...
    else if ((first_element+range) < this->EigvDim())
    // ...
      memcpy(param.x, x, nDim*sizeof(int));
      // ...
      param.eigv_dim = range;
      // ...
      param.v = (void*)((char*)v + first_element*eigv_bytes);
      param.norm = (void*)((char*)norm + first_element*eigv_norm_bytes);
      // ...
      eigv_subset->eigenvectors.reserve(param.eigv_dim);
      for (int id = first_element; id < (first_element+range); id++)
        // ...
      copyCuda(dst, *eigv_subset);
    // ...
      errorQuda("Incorrect eigenvector dimension...");
    // ...
      errorQuda("Eigenvector must be a parity spinor");
#ifdef USE_TEXTURE_OBJECTS
    // ...
    printfQuda("\nPrint texture info for the field:\n");
    // ...
    cudaResourceDesc resDesc;
    // ...
    cudaGetTextureObjectResourceDesc(&resDesc, this->Tex());
    printfQuda("\nDevice pointer: %p\n", resDesc.res.linear.devPtr);
    printfQuda("\nVolume (in bytes): %lu\n", resDesc.res.linear.sizeInBytes);
    if (resDesc.resType == cudaResourceTypeLinear) printfQuda("\nResource type: linear \n");
    // ...
    tmp.Source(sourceType, st, s, c);
#define qudaMemcpy(dst, src, count, kind)
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the env...
CompositeColorSpinorField components
void setPrecision(QudaPrecision precision, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION, bool force_native=false)
QudaFieldOrder fieldOrder
void allocateGhostBuffer(size_t ghost_bytes) const
Allocate the static ghost buffers.
int commDimPartitioned(int dir)
void commsStart(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Initiate halo communication.
#define pool_pinned_free(ptr)
int ghostNormOffset[QUDA_MAX_DIM][2]
enum QudaPrecision_s QudaPrecision
void streamInit(cudaStream_t *stream_p)
void * my_face_dim_dir_d[2][QUDA_MAX_DIM][2]
static bool complete_recv_back[QUDA_MAX_DIM]
cudaDeviceProp deviceProp
void allocateGhostBuffer(int nFace, bool spin_project=true) const
Allocate the ghost buffers.
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent.
void * ghostNorm[2][QUDA_MAX_DIM]
QudaVerbosity getVerbosity()
void unpackGhost(const void *ghost_spinor, const int nFace, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream)
void gather(int nFace, int dagger, int dir, cudaStream_t *stream_p=NULL)
void copySpinorField(const ColorSpinorField &src)
void sendGhost(void *ghost_spinor, const int nFace, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream)
CompositeColorSpinorFieldDescriptor composite_descr
used for deflation eigenvector sets etc.:
static void * ghost_pinned_recv_buffer_hd[2]
static std::map< void *, MemAlloc > alloc[N_ALLOC_TYPE]
void commsWait(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Wait on halo communication to complete.
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
cudaColorSpinorField * tmp
void loadSpinorField(const ColorSpinorField &src)
QudaGammaBasis gammaBasis
QudaGammaBasis GammaBasis() const
static bool complete_send_fwd[QUDA_MAX_DIM]
void copy(ColorSpinorField &dst, const ColorSpinorField &src)
static void * ghost_pinned_send_buffer_h[2]
void copyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, void *Dst=0, void *Src=0, void *dstNorm=0, void *srcNorm=0)
static MsgHandle * mh_send_p2p_back[2][QUDA_MAX_DIM]
void * from_face_dim_dir_hd[2][QUDA_MAX_DIM][2]
void scatterExtended(int nFace, int parity, int dagger, int dir)
MsgHandle * mh_send_rdma_fwd[2][QUDA_MAX_DIM]
void CopySubset(cudaColorSpinorField &dst, const int range, const int first_element=0) const
bool ghost_precision_reset
void * ghost_field_tex[4]
void copy(const cudaColorSpinorField &)
QudaSiteSubset siteSubset
int commsQuery(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Non-blocking query if the halo communication has completed.
void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION) const
This is a unified ghost exchange function for doing a complete halo exchange regardless of the type o...
bool is_aligned(const void *ptr, size_t alignment)
enum QudaSourceType_s QudaSourceType
virtual ~cudaColorSpinorField()
void * my_face_dim_dir_hd[2][QUDA_MAX_DIM][2]
friend class cpuColorSpinorField
bool is_composite
for deflation solvers:
int ghostFaceCB[QUDA_MAX_DIM]
enum QudaDirection_s QudaDirection
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity, int nFace, int dagger, MemoryLocation *destination=nullptr)
Generic ghost packing routine.
MsgHandle * mh_send_rdma_back[2][QUDA_MAX_DIM]
static MsgHandle * mh_recv_p2p_fwd[2][QUDA_MAX_DIM]
static bool complete_send_back[QUDA_MAX_DIM]
#define qudaDeviceSynchronize()
static bool ghost_field_reset
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
QudaFieldLocation location
static void checkField(const ColorSpinorField &, const ColorSpinorField &)
void PrintVector(unsigned int x) const
void createComms(int nFace, bool spin_project=true)
Create the communication handlers and buffers.
void reset(const ColorSpinorParam &)
QudaFieldOrder fieldOrder
MsgHandle * mh_recv_back[2][QUDA_MAX_DIM]
friend std::ostream & operator<<(std::ostream &out, const cudaColorSpinorField &)
QudaSiteSubset SiteSubset() const
void saveSpinorField(ColorSpinorField &src) const
QudaGammaBasis gammaBasis
void comm_start(MsgHandle *mh)
#define pool_device_malloc(size)
MsgHandle * mh_recv_rdma_fwd[2][QUDA_MAX_DIM]
QudaPrecision ghost_precision_tex
void packExtended(const int nFace, const int R[], const int parity, const int dagger, const int dim, cudaStream_t *stream_p, const bool zeroCopyPack=false)
void getTexObjectInfo() const
void * ghost[2][QUDA_MAX_DIM]
ColorSpinorField & operator=(const ColorSpinorField &)
static void * ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2]
enum QudaParity_s QudaParity
void * from_face_dim_dir_d[2][QUDA_MAX_DIM][2]
void init()
Create the CUBLAS context.
size_t ghost_face_bytes[QUDA_MAX_DIM]
void sendStart(int nFace, int d, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr=false, bool remote_write=false)
Initiate halo communication sending.
cudaColorSpinorField & Component(const int idx) const
for composite fields:
void PackGhost(void *ghost[2 *QUDA_MAX_DIM], const ColorSpinorField &field, MemoryLocation location, int nFace, bool dagger, int parity, bool spin_project, double a, double b, double c, const cudaStream_t &stream)
Dslash face packing routine.
QudaTwistFlavorType twistFlavor
static void destroyIPCComms()
void zeroPad()
Zero the padded regions added on to the field. Ensures correct reductions and silences false positive...
MsgHandle * mh_send_fwd[2][QUDA_MAX_DIM]
void create(const QudaFieldCreate)
void * memset(void *s, int c, size_t n)
virtual ColorSpinorField & operator=(const ColorSpinorField &)
void packGhost(const int nFace, const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream, MemoryLocation location[2 *QUDA_MAX_DIM], MemoryLocation location_label, bool spin_project, double a=0, double b=0, double c=0)
Packs the cudaColorSpinorField's ghost zone.
bool comm_peer2peer_enabled(int dir, int dim)
void unpackGhostExtended(const void *ghost_spinor, const int nFace, const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream, bool zero_copy)
static void * ghost_send_buffer_d[2]
#define pool_pinned_malloc(size)
void restore() const
Restores the cudaColorSpinorField.
#define qudaMemcpyAsync(dst, src, count, kind, stream)
cpuColorSpinorField * out
int ghostOffset[QUDA_MAX_DIM][2]
void createGhostZone(int nFace, bool spin_project=true) const
QudaPrecision ghost_precision
void pack(int nFace, int parity, int dagger, int stream_idx, MemoryLocation location[], MemoryLocation location_label, bool spin_project=true, double a=0, double b=0, double c=0)
void * from_face_dim_dir_h[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_fwd[2][QUDA_MAX_DIM]
int comm_query(MsgHandle *mh)
static void * ghost_recv_buffer_d[2]
MsgHandle * mh_recv_rdma_back[2][QUDA_MAX_DIM]
static cudaEvent_t ipcCopyEvent[2][2][QUDA_MAX_DIM]
void backup() const
Backs up the cudaColorSpinorField.
QudaTwistFlavorType twistFlavor
static bool complete_recv_fwd[QUDA_MAX_DIM]
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
enum QudaFieldCreate_s QudaFieldCreate
static cudaEvent_t ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM]
void packGhostExtended(const int nFace, const int R[], const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, cudaStream_t *stream, bool zero_copy=false)
#define pool_device_free(ptr)
cudaColorSpinorField(const cudaColorSpinorField &)
void pushKernelPackT(bool pack)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
MsgHandle * mh_recv_fwd[2][QUDA_MAX_DIM]
void createComms(bool no_comms_fill=false, bool bidir=true)
#define mapped_malloc(size)
void comm_wait(MsgHandle *mh)
static MsgHandle * mh_recv_p2p_back[2][QUDA_MAX_DIM]
void * ghost_buf[2 *QUDA_MAX_DIM]
void scatter(int nFace, int dagger, int dir, cudaStream_t *stream_p)
void * my_face_dim_dir_h[2][QUDA_MAX_DIM][2]
QudaSiteSubset siteSubset
void recvStart(int nFace, int dir, int dagger=0, cudaStream_t *stream_p=nullptr, bool gdr=false)
Initiate halo communication receive.
int comm_peer2peer_enabled_global()
MsgHandle * mh_send_back[2][QUDA_MAX_DIM]
QudaFieldOrder FieldOrder() const
static void * ghost_pinned_recv_buffer_h[2]
int comm_dim_partitioned(int dim)
void genericCudaPrintVector(const cudaColorSpinorField &a, unsigned x)
#define qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream)