errorQuda("QDP ordering only supported for reference fields");
errorQuda("Field ordering %d presently disabled for this type", order);
bool pad_check = true;
for (int i = 0; i < nDim; i++) {
if (pad < minimum_pad) pad_check = false;
errorQuda("cudaGaugeField being constructed with insufficient padding in dim %d (%d < %d)\n", i, pad, minimum_pad);
for (int i = 0; i < nDim; i++) {
odd = static_cast<char*>(gauge) + bytes/2;
void cudaGaugeField::zeroPad() {
for (int i = 0; i < nDim; i++) {
errorQuda("Cannot request exchange of forward links on non-coarse geometry");
const bool no_comms_fill = true;
const bool bidir = false;
for (int link_dir = 0; link_dir < 2; link_dir++) {
for (int d = 0; d < nDim; d++) {
const bool no_comms_fill = false;
const bool bidir = false;
for (int link_dir = 0; link_dir < 2; link_dir++) {
for (int d = 0; d < nDim; d++) {
stream_p ? *stream_p : 0);
for (int dir = 0; dir < 2; dir++) {
for (int dir = 0; dir < 2; dir++) {
errorQuda("Setting gauge pointer is only allowed when create=QUDA_REFERENCE_FIELD_CREATE type\n");
void **buffer = new void*[geometry];
return ((void*)buffer);
void **buffer = new void*[geometry];
delete []((void**)buffer);
if (this == &src) return;
errorQuda("Ghost copy not supported here");
errorQuda("Ghost copy not supported here");
for (int i = 0; i < nDim; i++) {
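The padding fragments above appear to come from the cudaGaugeField constructor; a minimal sketch of how the check plausibly fits together (deriving minimum_pad from the checkerboard surface is an assumption, not taken from the source):

  bool pad_check = true;
  for (int i = 0; i < nDim; i++) {
    int minimum_pad = surfaceCB[i];   // assumed: the smallest legal pad is the checkerboard surface in dim i
    if (pad < minimum_pad) pad_check = false;
    if (!pad_check)
      errorQuda("cudaGaugeField being constructed with insufficient padding in dim %d (%d < %d)\n",
                i, pad, minimum_pad);
  }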
QudaFieldGeometry Geometry() const
QudaStaggeredPhase StaggeredPhase() const
void * ghost[2 *QUDA_MAX_DIM]
QudaGaugeFieldOrder order
bool staggeredPhaseApplied
QudaStaggeredPhase staggeredPhaseType
QudaGaugeFieldOrder Order() const
const double & LinkMax() const
double abs_max(int dim=-1, bool fixed=false) const
Compute the absolute maximum of the field (L-infinity norm).
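A hedged usage sketch (u is an assumed existing GaugeField):

  double linf  = u.abs_max();    // L-infinity norm over the whole field
  double linf0 = u.abs_max(0);   // assumed: restricts the reduction to the links of dimension 0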
void checkField(const LatticeField &) const
QudaFieldGeometry geometry
QudaReconstructType reconstruct
const void ** Ghost() const
bool StaggeredPhaseApplied() const
void createGhostZone(const int *R, bool no_comms_fill, bool bidir=true) const
QudaReconstructType Reconstruct() const
QudaGhostExchange ghostExchange
MsgHandle * mh_send_fwd[2][QUDA_MAX_DIM]
MsgHandle * mh_recv_rdma_back[2][QUDA_MAX_DIM]
MsgHandle * mh_send_rdma_fwd[2][QUDA_MAX_DIM]
void * from_face_dim_dir_d[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_recv_p2p_back[2][QUDA_MAX_DIM]
MsgHandle * mh_send_rdma_back[2][QUDA_MAX_DIM]
void * my_face_dim_dir_h[2][QUDA_MAX_DIM][2]
void * from_face_dim_dir_h[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_fwd[2][QUDA_MAX_DIM]
static MsgHandle * mh_recv_p2p_fwd[2][QUDA_MAX_DIM]
static void * ghost_pinned_recv_buffer_h[2]
size_t ghost_offset[QUDA_MAX_DIM][2]
QudaPrecision Precision() const
void * my_face_dim_dir_d[2][QUDA_MAX_DIM][2]
static void destroyIPCComms()
size_t ghost_face_bytes[QUDA_MAX_DIM]
static void * ghost_pinned_send_buffer_h[2]
static void * ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_back[2][QUDA_MAX_DIM]
static bool ghost_field_reset
int surfaceCB[QUDA_MAX_DIM]
int surface[QUDA_MAX_DIM]
MsgHandle * mh_send_back[2][QUDA_MAX_DIM]
QudaGhostExchange GhostExchange() const
static cudaEvent_t ipcCopyEvent[2][2][QUDA_MAX_DIM]
static cudaEvent_t ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM]
MsgHandle * mh_recv_fwd[2][QUDA_MAX_DIM]
void allocateGhostBuffer(size_t ghost_bytes) const
Allocate the static ghost buffers.
MsgHandle * mh_recv_rdma_fwd[2][QUDA_MAX_DIM]
size_t ghost_face_bytes_aligned[QUDA_MAX_DIM]
static void * ghost_recv_buffer_d[2]
MsgHandle * mh_recv_back[2][QUDA_MAX_DIM]
void createComms(bool no_comms_fill=false, bool bidir=true)
static void * ghost_send_buffer_d[2]
virtual void copy_from_buffer(void *buffer)
Copy all contents of the field from a host buffer to this field.
void setGauge(void *_gauge)
void copy(const GaugeField &src)
void prefetch(QudaFieldLocation mem_space, qudaStream_t stream=0) const
If managed memory and prefetch is enabled, prefetch the gauge field and buffers to the CPU or the GPU as requested by mem_space.
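A minimal usage sketch, assuming u is a cudaGaugeField backed by managed memory:

  if (is_prefetch_enabled())
    u.prefetch(QUDA_CUDA_FIELD_LOCATION);   // hint the driver to migrate the managed allocation to the GPU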
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store in the padded region.
void createComms(const int *R, bool no_comms_fill, bool bidir=true)
Create the communication handlers and buffers.
void injectGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
The opposite of exchangeGhost: take the ghost zone on x, send it to node x-1, and inject it back into the field.
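A hedged sketch of the exchange/inject pair, assuming u was created with QUDA_GHOST_EXCHANGE_PAD:

  u.exchangeGhost();   // default QUDA_LINK_BACKWARDS: store the neighbour's boundary links in the padded region
  // ... operate on the ghost zone ...
  u.injectGhost();     // the reverse: send this ghost zone to node x-1 and inject it back into the field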
void recvStart(int dim, int dir)
Start the receive communicators.
void sendStart(int dim, int dir, qudaStream_t *stream_p=nullptr)
Start the sending communicators.
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void backup() const
Backs up the cudaGaugeField to CPU memory.
virtual ~cudaGaugeField()
void allocateGhostBuffer(const int *R, bool no_comms_fill, bool bidir=true) const
Allocate the ghost buffers.
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
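A hedged round-trip sketch between a host-resident cpuGaugeField and a device field (the GaugeFieldParam setup is omitted and assumed compatible with cpu):

  cudaGaugeField u(param);   // param: a GaugeFieldParam describing the same lattice as cpu (assumed)
  u.loadCPUField(cpu);       // download the host links into the device field
  // ... run device-side kernels on u ...
  u.saveCPUField(cpu);       // upload the (possibly modified) links back to the host field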
virtual void copy_to_buffer(void *buffer) const
Copy all contents of the field to a host buffer.
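Together with copy_from_buffer above, a minimal host-buffer round trip might look like this (Bytes() reporting the field's total size is an assumption):

  char *host = new char[u.Bytes()];   // assumed: Bytes() gives the full field size in bytes
  u.copy_to_buffer(host);             // serialise the field into the host buffer
  u.copy_from_buffer(host);           // restore the field from the same buffer
  delete[] host;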
void restore() const
Restores the cudaGaugeField to CUDA memory.
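A small usage sketch: bracket a destructive update with the backup/restore pair above.

  u.backup();    // stash a copy of the links in CPU memory
  // ... apply a trial update that may need to be undone ...
  u.restore();   // roll the device field back to the backed-up state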
cudaGaugeField(const GaugeFieldParam &)
void commsComplete(int dim, int dir)
Wait for communication to complete.
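A hedged sketch of how recvStart, sendStart and commsComplete are typically sequenced for one halo exchange (u is an assumed field on which createComms has already been called; QUDA's stream handling is omitted):

  for (int dim = 0; dim < 4; dim++) {
    if (!comm_dim_partitioned(dim)) continue;                      // nothing to exchange in a local dimension
    for (int dir = 0; dir < 2; dir++) u.recvStart(dim, dir);       // post receives first
    for (int dir = 0; dir < 2; dir++) u.sendStart(dim, dir);       // then launch packing and sends
    for (int dir = 0; dir < 2; dir++) u.commsComplete(dim, dir);   // wait on both directions
  }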
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine populates the border / halo region of a gauge field that has been created with an extended region (see copyExtendedGauge).
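A minimal sketch, assuming ext is an extended cudaGaugeField with a border of width 2 in every dimension:

  const int R[4] = {2, 2, 2, 2};   // extension radius used when the field was created (assumed)
  ext.exchangeExtendedGhost(R);    // fill the halo; no_comms_fill=true would also fill non-partitioned dims (assumed)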
void comm_start(MsgHandle *mh)
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
bool comm_peer2peer_enabled(int dir, int dim)
int comm_dim_partitioned(int dim)
void comm_wait(MsgHandle *mh)
enum QudaLinkDirection_s QudaLinkDirection
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
@ QUDA_LINK_BIDIRECTIONAL
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
enum QudaFieldGeometry_s QudaFieldGeometry
enum QudaFieldLocation_s QudaFieldLocation
@ QUDA_GHOST_EXCHANGE_EXTENDED
@ QUDA_GHOST_EXCHANGE_PAD
@ QUDA_MILC_SITE_GAUGE_ORDER
@ QUDA_CPS_WILSON_GAUGE_ORDER
@ QUDA_TIFR_PADDED_GAUGE_ORDER
@ QUDA_QDPJIT_GAUGE_ORDER
@ QUDA_REFERENCE_FIELD_CREATE
#define pool_pinned_malloc(size)
#define pool_device_malloc(size)
#define pool_pinned_free(ptr)
#define pool_device_free(ptr)
#define get_mapped_device_pointer(ptr)
#define mapped_malloc(size)
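The pool_* macros above are QUDA's pooled-allocator wrappers; a hedged pairing sketch (bytes is an assumed size):

  void *tmp_d = pool_device_malloc(bytes);   // device scratch drawn from the memory pool
  void *tmp_h = pool_pinned_malloc(bytes);   // pinned host scratch for asynchronous transfers
  // ... stage data through tmp_h / tmp_d ...
  pool_pinned_free(tmp_h);
  pool_device_free(tmp_d);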
void * create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
void ** create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
void copyGenericGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0, void **ghostOut=0, void **ghostIn=0, int type=0)
void extractGaugeGhost(const GaugeField &u, void **ghost, bool extract=true, int offset=0)
void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R, void **ghost, bool extract)
void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or GPU. This can be set at QUDA initialization via an environment variable.
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
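A hedged sketch of the staging pattern these helpers suggest, written as if inside a loadCPUField-style member of cudaGaugeField; the exact arguments and branch structure are assumptions, not the library's implementation:

  void *buffer = create_gauge_buffer(cpu.Bytes(), order, geometry);   // host staging buffer in this field's order
  if (reorder_location() == QUDA_CPU_FIELD_LOCATION)
    copyGenericGauge(*this, cpu, QUDA_CPU_FIELD_LOCATION, buffer);    // reorder on the host into the staging buffer
  qudaMemcpy(gauge, buffer, bytes, cudaMemcpyHostToDevice);           // then copy the reordered data to the device
  free_gauge_buffer(buffer, order, geometry);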
bool is_prefetch_enabled()
#define qudaMemset2D(ptr, pitch, value, width, height)
#define qudaMemsetAsync(ptr, value, count, stream)
#define qudaMemPrefetchAsync(ptr, count, mem_space, stream)
#define qudaMemcpy(dst, src, count, kind)
#define qudaEventSynchronize(event)
#define qudaMemset(ptr, value, count)
#define qudaEventRecord(event, stream)
#define qudaMemcpyAsync(dst, src, count, kind, stream)
#define qudaStreamSynchronize(stream)
cudaStream_t qudaStream_t
#define qudaDeviceSynchronize()
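These quda* macros mirror the corresponding CUDA runtime calls with QUDA's error checking added; a small asynchronous-copy sketch (dst, src, count, event and stream are assumed to exist):

  qudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToHost, stream);   // enqueue the copy on 'stream'
  qudaEventRecord(event, stream);                                     // record an event marking its completion
  qudaEventSynchronize(event);                                        // block the host until the copy has finished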
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5.