static bool zeroCopy = false;
switch (param.create) {
errorQuda("Cannot reference a non-cuda field");
copySpinorField(src);
errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
copySpinorField(src);
loadSpinorField(src);
param.is_composite = false;
param.composite_dim = 0;
param.is_component = true;
param.component_id = cid;
param.is_composite = false;
param.composite_dim = 0;
if (!((uint64_t)v == (uint64_t)(void *)std::numeric_limits<uint64_t>::max()
      && (uint64_t)norm == (uint64_t)(void *)std::numeric_limits<uint64_t>::max())) {
param.is_composite = false;
param.composite_dim = 0;
param.is_component = true;
param.component_id = cid;
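The fragments above show how a composite field hands out per-component views: the parameter block is switched from "composite" to "component" and tagged with the component index before the child field is constructed. A minimal self-contained sketch of that flip (FieldParam and makeComponent are illustrative stand-ins, not QUDA's types):

// Sketch: carve a component view out of a composite field's parameter block.
struct FieldParam {
  bool is_composite = false;
  int composite_dim = 0;
  bool is_component = false;
  int component_id = -1;
};

FieldParam makeComponent(FieldParam param, int cid) {
  param.is_composite = false; // the view is a single field...
  param.composite_dim = 0;
  param.is_component = true;  // ...that records it aliases a composite parent
  param.component_id = cid;
  return param;
}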
void cudaColorSpinorField::destroy()
CompositeColorSpinorField::iterator vec;
void cudaColorSpinorField::zeroPad() {
for (int subset = 0; subset < siteSubset; subset++) {
for (int subset = 0; subset < siteSubset; subset++) {
for (int subset = 0; subset < siteSubset; subset++) {
subset_bytes - subset_length * precision, 0);
for (int subset = 0; subset < siteSubset; subset++) {
subset_bytes - (size_t)stride * sizeof(float), 0);
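The zeroPad fragments clear the allocation tail past the active data in each parity subset: the body pad is subset_bytes - subset_length * precision bytes, and the norm pad is subset_bytes - stride * sizeof(float) bytes. A compilable sketch of the same pattern using plain CUDA runtime calls; the names and layout are assumptions, not QUDA's exact code:

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: zero the unused tail of each parity subset of a device allocation.
void zeroPadSubsets(char *v, int siteSubset, size_t subset_bytes,
                    size_t subset_length, size_t precision)
{
  for (int subset = 0; subset < siteSubset; subset++) {
    char *tail = v + subset * subset_bytes + subset_length * precision;
    cudaMemset(tail, 0, subset_bytes - subset_length * precision);
  }
}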
loadSpinorField(src);
errorQuda("Unknown input ColorSpinorField %s", typeid(src).name());
void cudaColorSpinorField::loadSpinorField(const ColorSpinorField &src) {
void *Src = nullptr, *srcNorm = nullptr, *buffer = nullptr;
srcNorm = static_cast<char*>(Src) + src.Bytes();
qudaMemcpy(Src, src.V(), src.Bytes(), cudaMemcpyDefault);
qudaMemcpy(srcNorm, src.Norm(), src.NormBytes(), cudaMemcpyDefault);
memcpy(buffer, src.V(), src.Bytes());
memcpy(static_cast<char*>(buffer) + src.Bytes(), src.Norm(), src.NormBytes());
srcNorm = static_cast<char*>(Src) + src.Bytes();
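In these loadSpinorField fragments the incoming field is staged into a single allocation, with the norm array placed immediately after the body (srcNorm = Src + src.Bytes()), before the reordering kernel runs. A minimal sketch of that staging step with plain CUDA runtime calls; the helper name and arguments are assumptions:

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: stage a host field's body and norm into one device buffer,
// norm packed directly after the body, ready for an in-kernel reorder.
void stageToDevice(const void *host_v, const void *host_norm, size_t bytes,
                   size_t norm_bytes, void **Src, void **srcNorm)
{
  cudaMalloc(Src, bytes + norm_bytes);          // one staging allocation
  *srcNorm = static_cast<char *>(*Src) + bytes; // norm lives after the body
  cudaMemcpy(*Src, host_v, bytes, cudaMemcpyDefault);
  cudaMemcpy(*srcNorm, host_norm, norm_bytes, cudaMemcpyDefault);
}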
void cudaColorSpinorField::saveSpinorField(ColorSpinorField &dest) const {
void *dst = nullptr, *dstNorm = nullptr, *buffer = nullptr;
dstNorm = static_cast<char*>(dst) + dest.Bytes();
dstNorm = static_cast<char*>(dst) + dest.Bytes();
qudaMemcpy(dest.V(), dst, dest.Bytes(), cudaMemcpyDefault);
qudaMemcpy(dest.Norm(), dstNorm, dest.NormBytes(), cudaMemcpyDefault);
memcpy(dest.V(), buffer, dest.Bytes());
memcpy(dest.Norm(), static_cast<char*>(buffer) + dest.Bytes(), dest.NormBytes());
bool spin_project, double a, double b, double c, int shmem)
for (int dir = 0; dir < 2; dir++) {
switch (location[2 * dim + dir]) {
packBuffer[2 * dim + dir]
default: errorQuda("Undefined location %d", location[2 * dim + dir]);
PackGhost(packBuffer, *this, location_label, nFace, dagger, parity, spin_project, a, b, c, shmem, *stream);
errorQuda("packGhost not built on single-GPU build");
const int Npad = Nint / Nvec;
const int x4 = nDim == 5 ? x[4] : 1;
} else if (nSpin == 4) {
bool upper = dagger ? true : false;
int lower_spin_offset = Npad * stride;
if (upper) offset = (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
else offset = lower_spin_offset + (dir == QUDA_BACKWARDS ? 0 : Nt_minus1_offset);
size_t dpitch = x4 * len;
for (int s = 0; s < x4; s++) {
size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
int norm_offset = (dir == QUDA_BACKWARDS) ? 0 : Nt_minus1_offset * sizeof(float);
errorQuda("sendGhost not built on single-GPU build");
const void *src = ghost_spinor;
for (int b = 0; b < 2; b++) {
bool spin_project, double a, double b, double c, int shmem)
spin_project, a, b, c, shmem);
int Npad = Nint / Nvec;
cudaMemcpyAsync(ghost_dst, /* ... */, cudaMemcpyDeviceToDevice, /* ... */);
const int x4 = nDim == 5 ? x[4] : 1;
offset = (dir == 0) ? 0 : Nt_minus_offset;
} else if (nSpin == 4) {
bool upper = dagger ? true : false;
if (dir == 1) upper = !upper;
int lower_spin_offset = Npad * stride;
offset = (dir == 0 ? 0 : Nt_minus_offset);
offset = lower_spin_offset + (dir == 0 ? 0 : Nt_minus_offset);
size_t dpitch = x4 * len;
for (int s = 0; s < x4; s++) {
cudaMemcpy2DAsync(dst, dpitch, src, spitch, len, Npad, cudaMemcpyDeviceToDevice, *copy_stream);
size_t len = nFace * (ghostFaceCB[3] / x4) * sizeof(float);
int norm_offset = (dir == 0) ? 0 : Nt_minus_offset * sizeof(float);
cudaMemcpyAsync(dst, src, len, cudaMemcpyDeviceToDevice, *copy_stream);
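In the sendStart fragments the spin-half selection works as follows: for nSpin == 4, dagger picks the upper or lower half, the forward face (dir == 1) flips the choice, and the lower half adds a base offset of Npad * stride. A self-contained sketch of that offset computation (names mirror the fragments; the function itself is illustrative):

// Sketch: compute the source offset for one face, following the
// upper/lower spin-half selection visible in the fragments above.
int faceOffset(int nSpin, bool dagger, int dir, int Npad, int stride,
               int Nt_minus_offset)
{
  if (nSpin != 4) return (dir == 0) ? 0 : Nt_minus_offset;
  bool upper = dagger;          // daggered operator sends the other half
  if (dir == 1) upper = !upper; // forward face flips the choice
  int lower_spin_offset = Npad * stride;
  int t_off = (dir == 0) ? 0 : Nt_minus_offset;
  return upper ? t_off : lower_spin_offset + t_off;
}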
} else if (gdr_send) {
} else if (gdr_recv) {
if (complete_recv_fwd[dim] && complete_send_back[dim]) {
complete_send_back[dim] = false;
complete_recv_fwd[dim] = false;
} else if (gdr_send) {
} else if (gdr_recv) {
if (complete_recv_back[dim] && complete_send_fwd[dim]) {
complete_send_fwd[dim] = false;
complete_recv_back[dim] = false;
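The commsQuery fragments report a dimension as complete only once both the matched send and receive have finished, then clear both flags so the next exchange starts fresh. A minimal sketch of that bookkeeping (the flag arrays are stand-ins; the real code first polls the message handles to set them):

// Sketch: a direction's comms are done only when both matched halves
// (e.g. forward receive and backward send) have completed.
bool queryDim(int dim, bool complete_recv_fwd[], bool complete_send_back[])
{
  if (complete_recv_fwd[dim] && complete_send_back[dim]) {
    complete_send_back[dim] = false; // reset for the next exchange
    complete_recv_fwd[dim] = false;
    return true;
  }
  return false;
}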
} else if (gdr_send) {
} else if (gdr_recv) {
} else if (gdr_send) {
} else if (gdr_recv) {
int dim = dim_dir / 2;
int dir = (dim_dir + 1) % 2;
int dim = dim_dir / 2;
int dir = (dim_dir + 1) % 2;
bool zero_copy = false;
int dim = dim_dir / 2;
int dir = (dim_dir + 1) % 2;
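Note the direction decode used by these gather/scatter fragments: each dimension contributes two dim_dir entries, and the (dim_dir + 1) % 2 form flips the parity of the index rather than taking dim_dir % 2 directly. A one-liner sketch:

// Sketch: decode a fused dimension/direction index as the fragments do.
inline void decodeDimDir(int dim_dir, int &dim, int &dir)
{
  dim = dim_dir / 2;       // which lattice dimension
  dir = (dim_dir + 1) % 2; // note the 0 <-> 1 flip
}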
bool gdr_send, bool gdr_recv, QudaPrecision ghost_precision_) const
pack_destination[i] = pack_destination_ ? pack_destination_[i] : Device;
halo_location[i] = halo_location_ ? halo_location_[i] : Device;
bool fused_pack_memcpy = true;
bool fused_halo_memcpy = true;
bool pack_host = false;
bool halo_host = false;
for (int dir = 0; dir < 2; dir++) {
for (int dir = 0; dir < 2; dir++) {
if (pack_destination[2*d+0] == Host || pack_destination[2*d+1] == Host) pack_host = true;
if (halo_location[2*d+0] == Host || halo_location[2*d+1] == Host) halo_host = true;
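These exchangeGhost fragments default every face to Device, then scan all dimensions: if any face is packed to or received into host memory, the corresponding host flag is raised and (in the full code) the single fused device-side copy gives way to per-face copies. A sketch of the scan under those assumptions, reusing the MemoryLocation enum from the earlier sketch; the real code has further conditions (e.g. peer-to-peer):

// Sketch: decide whether fused halo copies are possible (host check only).
void decideFusion(const MemoryLocation pack_destination[],
                  const MemoryLocation halo_location[], int nDimComms,
                  bool &fused_pack_memcpy, bool &fused_halo_memcpy)
{
  bool pack_host = false, halo_host = false;
  for (int d = 0; d < nDimComms; d++) {
    if (pack_destination[2*d+0] == Host || pack_destination[2*d+1] == Host) pack_host = true;
    if (halo_location[2*d+0] == Host || halo_location[2*d+1] == Host) halo_host = true;
  }
  fused_pack_memcpy = !pack_host; // host staging rules out one fused copy
  fused_halo_memcpy = !halo_host;
}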
if (!fused_pack_memcpy) {
for (int i = 0; i < 2*nDimComms; i++) const_cast<cudaColorSpinorField*>(this)->recvStart(nFace, i, dagger, 0, gdr_recv);
bool sync = pack_host ? true : false;
for (int p2p = 0; p2p < 2; p2p++) {
for (int dir = 0; dir < 2; dir++) {
for (int dir = 0; dir < 2; dir++) {
if (!comms_complete[dim*2+dir]) {
if (comms_complete[2*dim+dir]) {
if (!fused_halo_memcpy) {
for (int dir = 0; dir < 2; dir++) {
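Taken together, the exchangeGhost fragments trace a standard flow: post every receive first, pack the faces (synchronizing when packing to host memory), start the sends in two passes (peer-to-peer first, then the rest), poll all 2 * nDimComms directions until complete, and finally copy the halos into the ghost zone. A compilable outline with stub stand-ins for QUDA's member functions:

constexpr int QUDA_MAX_DIM = 6;
// Stubs so the outline compiles; QUDA's versions take many more arguments.
void recvStart(int) {}
void sendStart(int, int) {}
bool commsQuery(int) { return true; }
void packGhost() {}
void unpackHalos() {}

void exchangeOutline(int nDimComms)
{
  for (int i = 0; i < 2 * nDimComms; i++) recvStart(i); // post all receives
  packGhost();                                          // pack the faces
  for (int p2p = 0; p2p < 2; p2p++)                     // p2p pass, then rest
    for (int i = 0; i < 2 * nDimComms; i++) sendStart(i, p2p);
  bool done[2 * QUDA_MAX_DIM] = {};
  int completed = 0;
  while (completed < 2 * nDimComms)                     // poll to completion
    for (int i = 0; i < 2 * nDimComms; i++)
      if (!done[i] && commsQuery(i)) { done[i] = true; completed++; }
  unpackHalos();                                        // copy halos into place
}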
out << "v = " << a.v << std::endl;
out << "norm = " << a.norm << std::endl;
out << "alloc = " << a.alloc << std::endl;
out << "init = " << a.init << std::endl;
errorQuda("Incorrect component index...");
errorQuda("Cannot get requested component");
if (first_element < 0) errorQuda("\nError: trying to set negative first element.\n");
if (first_element == 0 && range == this->EigvDim())
if (range != dst.EigvDim()) errorQuda("\nError: eigenvector range too big.\n");
copyCuda(dst, *this);
else if ((first_element+range) < this->EigvDim())
param.eigv_dim = range;
param.v = (void*)((char*)v + first_element * eigv_bytes);
param.norm = (void*)((char*)norm + first_element * eigv_norm_bytes);
eigv_subset->eigenvectors.reserve(param.eigv_dim);
for (int id = first_element; id < (first_element + range); id++)
copyCuda(dst, *eigv_subset);
errorQuda("Incorrect eigenvector dimension...");
errorQuda("Eigenvector must be a parity spinor");
tmp.Source(sourceType, st, s, c);
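The CopySubset fragments build an aliasing parameter block over a contiguous eigenvector range: eigv_dim is narrowed to the range and the v/norm pointers are advanced by first_element vectors. A minimal sketch (EigParam and the byte sizes are illustrative):

#include <cstddef>

struct EigParam { int eigv_dim; void *v; void *norm; }; // illustrative

// Sketch: alias eigenvectors [first_element, first_element + range)
// without copying, by offsetting the base pointers as the fragments do.
EigParam aliasRange(void *v, void *norm, int first_element, int range,
                    size_t eigv_bytes, size_t eigv_norm_bytes)
{
  EigParam param;
  param.eigv_dim = range;
  param.v = static_cast<char *>(v) + first_element * eigv_bytes;
  param.norm = static_cast<char *>(norm) + first_element * eigv_norm_bytes;
  return param;
}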
virtual ColorSpinorField & operator=(const ColorSpinorField &)
void *ghost_buf[2 * QUDA_MAX_DIM]
CompositeColorSpinorFieldDescriptor composite_descr
used for deflation eigenvector sets etc.:
QudaTwistFlavorType twistFlavor
CompositeColorSpinorField components
QudaGammaBasis gammaBasis
QudaSiteSubset SiteSubset() const
void * ghost[2][QUDA_MAX_DIM]
void reset(const ColorSpinorParam &)
void createGhostZone(int nFace, bool spin_project=true) const
static void checkField(const ColorSpinorField &, const ColorSpinorField &)
QudaFieldOrder fieldOrder
friend class ColorSpinorParam
QudaSiteSubset siteSubset
int ghostFaceCB[QUDA_MAX_DIM]
void * ghostNorm[2][QUDA_MAX_DIM]
MsgHandle * mh_send_fwd[2][QUDA_MAX_DIM]
MsgHandle * mh_recv_rdma_back[2][QUDA_MAX_DIM]
bool ghost_precision_reset
MsgHandle * mh_send_rdma_fwd[2][QUDA_MAX_DIM]
void * from_face_dim_dir_d[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_recv_p2p_back[2][QUDA_MAX_DIM]
MsgHandle * mh_send_rdma_back[2][QUDA_MAX_DIM]
void * my_face_dim_dir_h[2][QUDA_MAX_DIM][2]
void * from_face_dim_dir_h[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_fwd[2][QUDA_MAX_DIM]
static MsgHandle * mh_recv_p2p_fwd[2][QUDA_MAX_DIM]
static void * ghost_pinned_recv_buffer_h[2]
size_t ghost_offset[QUDA_MAX_DIM][2]
QudaPrecision ghost_precision
void * my_face_dim_dir_d[2][QUDA_MAX_DIM][2]
static void destroyIPCComms()
size_t ghost_face_bytes[QUDA_MAX_DIM]
static void * ghost_pinned_send_buffer_h[2]
static void * ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2]
void * from_face_dim_dir_hd[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_back[2][QUDA_MAX_DIM]
static bool ghost_field_reset
int surface[QUDA_MAX_DIM]
MsgHandle * mh_send_back[2][QUDA_MAX_DIM]
void * my_face_dim_dir_hd[2][QUDA_MAX_DIM][2]
static cudaEvent_t ipcCopyEvent[2][2][QUDA_MAX_DIM]
static cudaEvent_t ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM]
MsgHandle * mh_recv_fwd[2][QUDA_MAX_DIM]
void allocateGhostBuffer(size_t ghost_bytes) const
Allocate the static ghost buffers.
MsgHandle * mh_recv_rdma_fwd[2][QUDA_MAX_DIM]
size_t ghost_face_bytes_aligned[QUDA_MAX_DIM]
static void * ghost_recv_buffer_d[2]
MsgHandle * mh_recv_back[2][QUDA_MAX_DIM]
void createComms(bool no_comms_fill=false, bool bidir=true)
static void * ghost_send_buffer_d[2]
void createComms(int nFace, bool spin_project=true)
Create the communication handlers and buffers.
void prefetch(QudaFieldLocation mem_space, qudaStream_t stream=0) const
If managed memory and prefetch is enabled, prefetch the spinor, the norm field (as appropriate),...
virtual void copy_from_buffer(void *buffer)
Copy all contents of the field from a host buffer to this field.
cudaColorSpinorField(const cudaColorSpinorField &)
void recvStart(int nFace, int dir, int dagger=0, qudaStream_t *stream_p=nullptr, bool gdr=false)
Initiate halo communication receive.
virtual ~cudaColorSpinorField()
void commsWait(int nFace, int d, int dagger=0, qudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Wait on halo communication to complete.
void pack(int nFace, int parity, int dagger, int stream_idx, MemoryLocation location[], MemoryLocation location_label, bool spin_project=true, double a=0, double b=0, double c=0, int shmem=0)
void CopySubset(cudaColorSpinorField &dst, const int range, const int first_element=0) const
void streamInit(qudaStream_t *stream_p)
void sendGhost(void *ghost_spinor, const int nFace, const int dim, const QudaDirection dir, const int dagger, qudaStream_t *stream)
void packGhostExtended(const int nFace, const int R[], const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, qudaStream_t *stream, bool zero_copy=false)
ColorSpinorField & operator=(const ColorSpinorField &)
void sendStart(int nFace, int d, int dagger=0, qudaStream_t *stream_p=nullptr, bool gdr=false, bool remote_write=false)
Initiate halo communication sending.
cudaColorSpinorField & Component(const int idx) const
for composite fields:
void scatterExtended(int nFace, int parity, int dagger, int dir)
void unpackGhostExtended(const void *ghost_spinor, const int nFace, const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, qudaStream_t *stream, bool zero_copy)
friend class cpuColorSpinorField
void backup() const
Backs up the cudaColorSpinorField.
void commsStart(int nFace, int d, int dagger=0, qudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Initiate halo communication.
virtual void copy_to_buffer(void *buffer) const
Copy all contents of the field to a host buffer.
void gather(int nFace, int dagger, int dir, qudaStream_t *stream_p=NULL)
int commsQuery(int nFace, int d, int dagger=0, qudaStream_t *stream_p=nullptr, bool gdr_send=false, bool gdr_recv=false)
Non-blocking query if the halo communication has completed.
void Source(const QudaSourceType sourceType, const int st=0, const int s=0, const int c=0)
void exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *pack_destination=nullptr, const MemoryLocation *halo_location=nullptr, bool gdr_send=false, bool gdr_recv=false, QudaPrecision ghost_precision=QUDA_INVALID_PRECISION) const
This is a unified ghost exchange function for doing a complete halo exchange regardless of the type of field (a usage sketch follows at the end of this list).
void copy(const cudaColorSpinorField &)
void scatter(int nFace, int dagger, int dir, qudaStream_t *stream_p)
void restore() const
Restores the cudaColorSpinorField.
void packGhost(const int nFace, const QudaParity parity, const int dim, const QudaDirection dir, const int dagger, qudaStream_t *stream, MemoryLocation location[2 * QUDA_MAX_DIM], MemoryLocation location_label, bool spin_project, double a=0, double b=0, double c=0, int shmem=0)
Packs the cudaColorSpinorField's ghost zone.
void allocateGhostBuffer(int nFace, bool spin_project=true) const
Allocate the ghost buffers.
void packExtended(const int nFace, const int R[], const int parity, const int dagger, const int dim, qudaStream_t *stream_p, const bool zeroCopyPack=false)
void PrintVector(unsigned int x) const
void unpackGhost(const void *ghost_spinor, const int nFace, const int dim, const QudaDirection dir, const int dagger, qudaStream_t *stream)
void comm_start(MsgHandle *mh)
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
int comm_query(MsgHandle *mh)
bool comm_peer2peer_enabled(int dir, int dim)
int comm_dim_partitioned(int dim)
void comm_wait(MsgHandle *mh)
int commDimPartitioned(int dir)
int comm_peer2peer_enabled_global()
void * memset(void *s, int c, size_t n)
cudaColorSpinorField * tmp
enum QudaPrecision_s QudaPrecision
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
enum QudaDirection_s QudaDirection
@ QUDA_PARITY_SITE_SUBSET
enum QudaFieldLocation_s QudaFieldLocation
enum QudaFieldCreate_s QudaFieldCreate
@ QUDA_EVEN_ODD_SITE_ORDER
enum QudaSourceType_s QudaSourceType
@ QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER
@ QUDA_SPACE_SPIN_COLOR_FIELD_ORDER
@ QUDA_REFERENCE_FIELD_CREATE
enum QudaParity_s QudaParity
#define pool_pinned_malloc(size)
#define pool_device_malloc(size)
#define pool_pinned_free(ptr)
#define pool_device_free(ptr)
#define get_mapped_device_pointer(ptr)
#define mapped_malloc(size)
void init()
Create the BLAS context.
void genericPackGhost(void **ghost, const ColorSpinorField &a, QudaParity parity, int nFace, int dagger, MemoryLocation *destination=nullptr)
Generic ghost packing routine.
void PackGhost(void *ghost[2 * QUDA_MAX_DIM], const ColorSpinorField &field, MemoryLocation location, int nFace, bool dagger, int parity, bool spin_project, double a, double b, double c, int shmem, const qudaStream_t &stream)
Dslash face packing routine.
void copyGenericColorSpinor(ColorSpinorField &dst, const ColorSpinorField &src, QudaFieldLocation location, void *Dst=0, void *Src=0, void *dstNorm=0, void *srcNorm=0)
void genericCudaPrintVector(const cudaColorSpinorField &a, unsigned x)
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the env...
void pushKernelPackT(bool pack)
bool is_prefetch_enabled()
std::ostream & operator<<(std::ostream &output, const CloverFieldParam &param)
#define qudaMemset2DAsync(ptr, pitch, value, width, height, stream)
#define qudaMemsetAsync(ptr, value, count, stream)
#define qudaStreamWaitEvent(stream, event, flags)
#define qudaMemPrefetchAsync(ptr, count, mem_space, stream)
#define qudaMemcpy(dst, src, count, kind)
#define qudaEventRecord(event, stream)
#define qudaMemcpyAsync(dst, src, count, kind, stream)
cudaStream_t qudaStream_t
#define qudaDeviceSynchronize()
#define qudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream)
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.
QudaFieldLocation location
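A hypothetical call pattern for the unified exchangeGhost declared above, with all optional arguments left at their defaults (device packing, device halos, no GDR). The header path is an assumption; consult QUDA's own tests for authoritative usage.

#include <color_spinor_field.h> // QUDA header (path assumed)

// Sketch: exchange a one-deep halo for the even parity subset.
void exchangeExample(quda::cudaColorSpinorField &field)
{
  const int nFace = 1, dagger = 0;
  field.exchangeGhost(QUDA_EVEN_PARITY, nFace, dagger);
  // defaults: pack_destination/halo_location = Device, no GDR,
  // ghost_precision = QUDA_INVALID_PRECISION (i.e. the field's precision)
}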