QUDA  0.9.0
cuda_gauge_field.cu
Go to the documentation of this file.
1 #include <string.h>
2 #include <gauge_field.h>
3 #include <typeinfo>
4 #include <blas_quda.h>
5 
6 namespace quda {
7 
// cudaGaugeField constructor: initializes base-class state, validates the
// requested ordering/padding, sets up device storage and per-dimension ghost
// buffers, and (optionally) creates texture objects over the allocation.
// NOTE(review): this doxygen listing elides several source lines (the
// constructor signature, parts of the order/create checks, and the allocation
// branch) — comments below describe only the visible code.
9  GaugeField(param), gauge(0), even(0), odd(0)
10  {
13  errorQuda("QDP ordering only supported for reference fields");
14  }
15 
16  if (order == QUDA_QDP_GAUGE_ORDER ||
19  errorQuda("Field ordering %d presently disabled for this type", order);
20 
21 #ifdef MULTI_GPU
// Native multi-GPU fields must carry enough pad to hold the ghost faces of
// every dimension; coarse geometry doubles the requirement (fwd + back links).
24  isNative()) {
25  bool pad_check = true;
26  for (int i=0; i<nDim; i++) {
27  // when we have coarse links we need to double the pad since we're storing forwards and backwards links
28  int minimum_pad = nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1);
29  if (pad < minimum_pad) pad_check = false;
30  if (!pad_check)
31  errorQuda("cudaGaugeField being constructed with insufficient padding (%d < %d)\n", pad, minimum_pad);
32  }
33  }
34 #endif
35 
39  errorQuda("ERROR: create type(%d) not supported yet\n", create);
40  }
41 
// Zero-initialize freshly allocated storage when requested; a reference-create
// field instead adopts the caller-supplied pointer without allocating.
44  if (create == QUDA_ZERO_FIELD_CREATE) cudaMemset(gauge, 0, bytes);
45  } else {
46  gauge = param.gauge;
47  }
48 
// Non-native layouts keep ghost zones in separate pooled device buffers, one
// per dimension, plus a second set (slots i+4) for coarse-geometry links.
49  if ( !isNative() ) {
50  for (int i=0; i<nDim; i++) {
51  size_t nbytes = nFace * surface[i] * nInternal * precision;
52  ghost[i] = nbytes ? pool_device_malloc(nbytes) : nullptr;
53  ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? pool_device_malloc(nbytes) : nullptr;
54  }
55  }
56 
59  }
60 
// Even/odd parity halves live in the two halves of the single allocation.
61  even = gauge;
62  odd = (char*)gauge + bytes/2;
63 
64 #ifdef USE_TEXTURE_OBJECTS
// Bind texture objects over the full field and each parity half; phase data
// (at phase_offset into each region) gets its own set of textures.
65  createTexObject(tex, gauge, true);
66  createTexObject(evenTex, even, false);
67  createTexObject(oddTex, odd, false);
69  { // Create texture objects for the phases
70  bool isPhase = true;
71  createTexObject(phaseTex, (char*)gauge + phase_offset, true, isPhase);
72  createTexObject(evenPhaseTex, (char*)even + phase_offset, false, isPhase);
73  createTexObject(oddPhaseTex, (char*)odd + phase_offset, false, isPhase);
74  }
75 #endif
76 
77  }
78 
79 #ifdef USE_TEXTURE_OBJECTS
// Create a 1D linear texture object over `field` (the whole field when `full`,
// otherwise one parity half; `isPhase` selects the phase region). The channel
// descriptor is chosen from precision/reconstruction so each fetch returns one
// texel of the stored element type.
// NOTE(review): lines 91 and 104 are elided in this listing (likely the
// precision branches opening these blocks) — confirm against the repository.
80  void cudaGaugeField::createTexObject(cudaTextureObject_t &tex, void *field, bool full, bool isPhase) {
81 
82  if( isNative() ){
83  // create the texture for the field components
84  cudaChannelFormatDesc desc;
85  memset(&desc, 0, sizeof(cudaChannelFormatDesc));
86  if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
87  else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
88 
89  int texel_size = 1;
90  if (isPhase) {
92  desc.x = 8*sizeof(int);
93  desc.y = 8*sizeof(int);
94  desc.z = 0;
95  desc.w = 0;
96  texel_size = 2*sizeof(int);
97  } else {
98  desc.x = 8*precision;
99  desc.y = desc.z = desc.w = 0;
100  texel_size = precision;
101  }
102  } else {
103  // always four components regardless of precision
105  desc.x = 8*sizeof(int);
106  desc.y = 8*sizeof(int);
107  desc.z = 8*sizeof(int);
108  desc.w = 8*sizeof(int);
109  texel_size = 4*sizeof(int);
110  } else {
111  desc.x = 8*precision;
112  desc.y = 8*precision;
113  desc.z = (reconstruct == 18 || reconstruct == 10) ? 0 : 8*precision; // float2 or short2 for 18 reconstruct
114  desc.w = (reconstruct == 18 || reconstruct == 10) ? 0 : 8*precision;
115  texel_size = (reconstruct == 18 || reconstruct == 10 ? 2 : 4) * precision;
116  }
117  }
118 
119  cudaResourceDesc resDesc;
120  memset(&resDesc, 0, sizeof(resDesc));
121  resDesc.resType = cudaResourceTypeLinear;
122  resDesc.res.linear.devPtr = field;
123  resDesc.res.linear.desc = desc;
// Half fields bind half the bytes; phase textures cover only the phase region.
124  resDesc.res.linear.sizeInBytes = isPhase ? phase_bytes/(!full ? 2 : 1) : (bytes-phase_bytes)/(!full ? 2 : 1);
125 
// Guard against exceeding the device's 1D linear texture limit.
126  unsigned long texels = resDesc.res.linear.sizeInBytes / texel_size;
127  if (texels > (unsigned)deviceProp.maxTexture1DLinear) {
128  errorQuda("Attempting to bind too large a texture %lu > %d", texels, deviceProp.maxTexture1DLinear);
129  }
130 
131  cudaTextureDesc texDesc;
132  memset(&texDesc, 0, sizeof(texDesc));
// Half precision is stored as short and read back as normalized float.
133  if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
134  else texDesc.readMode = cudaReadModeElementType;
135 
136  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
137  checkCudaError();
138  }
139  }
140 
// Destroy the texture objects created in the constructor (native fields only).
// NOTE(review): line 146 is elided here — presumably the conditional guarding
// the phase-texture destruction; confirm against the repository.
141  void cudaGaugeField::destroyTexObject() {
142  if( isNative() ){
143  cudaDestroyTextureObject(tex);
144  cudaDestroyTextureObject(evenTex);
145  cudaDestroyTextureObject(oddTex);
147  cudaDestroyTextureObject(phaseTex);
148  cudaDestroyTextureObject(evenPhaseTex);
149  cudaDestroyTextureObject(oddPhaseTex);
150  }
151  checkCudaError();
152  }
153  }
154 #endif
155 
// Destructor body (signature elided by this listing): tears down texture
// objects and comms state, then releases the per-dimension ghost buffers that
// the constructor allocated for non-native layouts.
// NOTE(review): lines 163-165 and 171 are elided — likely the freeing of the
// main gauge allocation and the coarse ghost slots; confirm upstream.
157  {
158 #ifdef USE_TEXTURE_OBJECTS
159  destroyTexObject();
160 #endif
161 
162  destroyComms();
163 
166  }
167 
// Free the separate ghost buffers only when this field owns them (non-native).
168  if ( !isNative() ) {
169  for (int i=0; i<nDim; i++) {
170  if (ghost[i]) pool_device_free(ghost[i]);
172  }
173  }
174 
175  }
176 
177  // This does the exchange of the forwards boundary gauge field ghost zone and places
178  // it into the ghost array of the next node
// (Signature elided by this listing — per the doxygen index this is
// exchangeGhost(QudaLinkDirection link_direction).)
// Preconditions: padded ghost exchange, vector or coarse geometry, and forward
// link exchange only on coarse geometry.
180 
181  if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
182  if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
183  if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
184  errorQuda("Cannot request exchange of forward links on non-coarse geometry");
185  if (nFace == 0) errorQuda("nFace = 0");
186 
187  const int dir = 1; // sending forwards only
188  const int R[] = {nFace, nFace, nFace, nFace};
189  const bool no_comms_fill = true; // dslash kernels presently require this
190  createComms(R, true); // always need to allocate space for non-partitioned dimension for copyGenericGauge
191 
192  // loop over backwards and forwards links
194  for (int link_dir = 0; link_dir<2; link_dir++) {
195  if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
196 
197  void *send_d[2*QUDA_MAX_DIM] = { };
198  void *recv_d[2*QUDA_MAX_DIM] = { };
199 
// Carve per-dimension send/recv staging regions out of the shared buffers.
200  size_t offset = 0;
201  for (int d=0; d<nDim; d++) {
202  // receive from backwards is first half of each ghost_recv_buffer
203  recv_d[d] = static_cast<char*>(ghost_recv_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
204  // send forwards is second half of each ghost_send_buffer
205  send_d[d] = static_cast<char*>(ghost_send_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
206  }
207 
208  extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers
209 
210  // issue receive preposts and host-to-device copies if needed
211  for (int dim=0; dim<nDim; dim++) {
212  if (!comm_dim_partitioned(dim)) continue;
213  recvStart(dim, dir); // prepost the receive
// Without peer-to-peer or GDR the face must be staged through host memory.
214  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
215  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
216  ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2*dim+dir]);
217  }
218  }
219 
220  // if gdr enabled then synchronize
// NOTE(review): the synchronization calls themselves (lines 221, 226, 255,
// 258) are elided from this listing; only the comments remain.
222 
223  // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
224  for (int dim=0; dim<nDim; dim++) {
225  if (!comm_dim_partitioned(dim)) continue;
227  sendStart(dim, dir, &streams[2*dim+dir]); // start sending
228  }
229 
230  // complete communication and issue host-to-device copies if needed
231  for (int dim=0; dim<nDim; dim++) {
232  if (!comm_dim_partitioned(dim)) continue;
233  commsComplete(dim, dir);
234  if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
235  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1-dir], from_face_dim_dir_h[bufferIndex][dim][1-dir],
236  ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2*dim+dir]);
237  }
238  }
239 
240  // fill in the halos for non-partitioned dimensions
241  for (int dim=0; dim<nDim; dim++) {
242  if (!comm_dim_partitioned(dim) && no_comms_fill) {
243  qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
244  }
245  }
246 
// Native fields scatter the received faces into the pad region; other layouts
// copy into the dedicated per-dimension ghost arrays.
247  if (isNative()) {
248  copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2*link_dir); // 1, 3
249  } else {
250  // copy from receive buffer into ghost array
251  for (int dim=0; dim<nDim; dim++)
252  qudaMemcpy(ghost[dim+link_dir*nDim], recv_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
253  }
254 
256  } // link_dir
257 
259  }
260 
261  // This does the opposite of exchangeGhost and sends back the ghost
262  // zone to the node from which it came and injects it back into the
263  // field
// (Signature elided by this listing — per the doxygen index this is
// injectGhost(QudaLinkDirection link_direction); only backwards injection is
// supported, enforced below.)
265 
266  if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
267  if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
268  if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
269  if (nFace == 0) errorQuda("nFace = 0");
270 
271  const int dir = 0; // sending backwards only
272  const int R[] = {nFace, nFace, nFace, nFace};
273  const bool no_comms_fill = false; // injection never does no_comms_fill
274  createComms(R, true); // always need to allocate space for non-partitioned dimension for copyGenericGauge
275 
276  // loop over backwards and forwards links (forwards links never sent but leave here just in case)
278  for (int link_dir = 0; link_dir<2; link_dir++) {
279  if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
280 
281  void *send_d[2*QUDA_MAX_DIM] = { };
282  void *recv_d[2*QUDA_MAX_DIM] = { };
283 
// Note the buffer halves are mirrored relative to exchangeGhost: send uses the
// first half, receive the second.
284  size_t offset = 0;
285  for (int d=0; d<nDim; d++) {
286  // send backwards is first half of each ghost_send_buffer
287  send_d[d] = static_cast<char*>(ghost_send_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
288  // receive from forwards is the second half of each ghost_recv_buffer
289  recv_d[d] = static_cast<char*>(ghost_recv_buffer_d[bufferIndex]) + offset; offset += ghost_face_bytes[d];
290  }
291 
292  if (isNative()) { // copy from padded region in gauge field into send buffer
293  copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
294  } else { // copy from receive buffer into ghost array
295  for (int dim=0; dim<nDim; dim++) qudaMemcpy(send_d[dim], ghost[dim+link_dir*nDim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
296  }
297 
298  // issue receive preposts and host-to-device copies if needed
299  for (int dim=0; dim<nDim; dim++) {
300  if (!comm_dim_partitioned(dim)) continue;
301  recvStart(dim, dir); // prepost the receive
// Without peer-to-peer or GDR the face must be staged through host memory.
302  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
303  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
304  ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[2*dim+dir]);
305  }
306  }
307 
308  // if gdr enabled then synchronize
// NOTE(review): the synchronization calls (lines 309, 314, 338, 341) are
// elided from this listing; only the comments remain.
310 
311  // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
312  for (int dim=0; dim<nDim; dim++) {
313  if (!comm_dim_partitioned(dim)) continue;
315  sendStart(dim, dir, &streams[2*dim+dir]); // start sending
316  }
317 
318  // complete communication and issue host-to-device copies if needed
319  for (int dim=0; dim<nDim; dim++) {
320  if (!comm_dim_partitioned(dim)) continue;
321  commsComplete(dim, dir);
322  if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
323  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1-dir], from_face_dim_dir_h[bufferIndex][dim][1-dir],
324  ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[2*dim+dir]);
325  }
326  }
327 
328  // fill in the halos for non-partitioned dimensions
// (Dead in practice here since no_comms_fill is hard-wired false above.)
329  for (int dim=0; dim<nDim; dim++) {
330  if (!comm_dim_partitioned(dim) && no_comms_fill) {
331  qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
332  }
333  }
334 
335  // get the links into contiguous buffers
336  extractGaugeGhost(*this, recv_d, false, link_dir*nDim);
337 
339  } // link_dir
340 
342  }
343 
// Size and allocate the static ghost buffers for halo radius R.
// NOTE(review): line 347 is elided in this listing — presumably the call into
// the base-class allocateGhostBuffer; confirm against the repository.
344  void cudaGaugeField::allocateGhostBuffer(const int *R, bool no_comms_fill) const
345  {
346  createGhostZone(R, no_comms_fill);
348  }
349 
// Create (or refresh) the communication handlers and buffers for halo radius R.
// Re-creates the base-class comms only when not yet initialized or when the
// shared ghost/pinned buffers have been reallocated since last use.
350  void cudaGaugeField::createComms(const int *R, bool no_comms_fill)
351  {
352  allocateGhostBuffer(R, no_comms_fill); // allocate the ghost buffer if not yet allocated
353 
354  // ascertain if this instance needs its comms buffers to be updated
355  bool comms_reset = ghost_field_reset || // FIXME add send buffer check
356  (my_face_h[0] != ghost_pinned_buffer_h[0]) || (my_face_h[1] != ghost_pinned_buffer_h[1]); // pinned buffers
357 
358  if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill);

// Always (re)establish the IPC peer-to-peer channels.
361  createIPCComms();
362  }
363 
// Prepost the receive for one face: dir==0 receives from the +1 neighbor
// (we are sending backwards), dir==1 receives from the -1 neighbor. The
// message handle used depends on the transport (peer-to-peer, GDR, or host).
// NOTE(review): the comm_start(...) calls inside each branch (lines 371, 373,
// 375, 380, 382, 384) are elided from this listing.
364  void cudaGaugeField::recvStart(int dim, int dir)
365  {
366  if (!comm_dim_partitioned(dim)) return;
367 
368  if (dir==0) { // sending backwards
369  // receive from the processor in the +1 direction
370  if (comm_peer2peer_enabled(1,dim)) {
372  } else if (comm_gdr_enabled()) {
374  } else {
376  }
377  } else { //sending forwards
378  // receive from the processor in the -1 direction
379  if (comm_peer2peer_enabled(0,dim)) {
381  } else if (comm_gdr_enabled()) {
383  } else {
385  }
386  }
387  }
388 
// Start sending one face in direction dir. Non-peer-to-peer transports start
// an MPI/GDR send; peer-to-peer copies directly into the neighbor's remote
// ghost buffer on stream_p (default stream when null) and records an IPC event
// that the neighbor waits on.
// NOTE(review): the comm_start(...) calls (lines 396, 398, 402, 404, 419, 423)
// are elided from this listing.
389  void cudaGaugeField::sendStart(int dim, int dir, cudaStream_t* stream_p)
390  {
391  if (!comm_dim_partitioned(dim)) return;
392 
393  if (!comm_peer2peer_enabled(dir,dim)) {
394  if (dir == 0)
395  if (comm_gdr_enabled()) {
397  } else {
399  }
400  else
401  if (comm_gdr_enabled()) {
403  } else {
405  }
406  } else { // doing peer-to-peer
407 
// Destination is the neighbor's receive slot for the opposite direction.
408  void* ghost_dst = static_cast<char*>(ghost_remote_send_buffer_d[bufferIndex][dim][dir])
409  + precision*ghostOffset[dim][(dir+1)%2];
410 
411  cudaMemcpyAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir],
412  ghost_face_bytes[dim], cudaMemcpyDeviceToDevice,
413  stream_p ? *stream_p : 0);
414 
415  if (dir == 0) {
416  // record the event
417  qudaEventRecord(ipcCopyEvent[bufferIndex][0][dim], stream_p ? *stream_p : 0);
418  // send to the processor in the -1 direction
420  } else {
421  qudaEventRecord(ipcCopyEvent[bufferIndex][1][dim], stream_p ? *stream_p : 0);
422  // send to the processor in the +1 direction
424  }
425  }
426  }
427 
// Wait for both the receive and the send of one face exchange to complete
// (signature elided by this listing — per the doxygen index this is
// commsComplete(int dim, int dir)). Each branch waits on the handle matching
// the transport that recvStart/sendStart selected.
// NOTE(review): the comm_wait(...) / event-synchronize calls (lines 434-435,
// 437, 439, 443-444, 446, 448, 452-453, 455, 457, 461-462, 464, 466) are
// elided from this listing.
429  {
430  if (!comm_dim_partitioned(dim)) return;
431 
432  if (dir==0) {
// Sent backwards: receive came from +1, send went to -1.
433  if (comm_peer2peer_enabled(1,dim)) {
436  } else if (comm_gdr_enabled()) {
438  } else {
440  }
441 
442  if (comm_peer2peer_enabled(0,dim)) {
445  } else if (comm_gdr_enabled()) {
447  } else {
449  }
450  } else {
// Sent forwards: receive came from -1, send went to +1.
451  if (comm_peer2peer_enabled(0,dim)) {
454  } else if (comm_gdr_enabled()) {
456  } else {
458  }
459 
460  if (comm_peer2peer_enabled(1,dim)) {
463  } else if (comm_gdr_enabled()) {
465  } else {
467  }
468  }
469  }
470 
// Populate the border/halo region of an extended gauge field: for each
// dimension that is partitioned (or locally filled when no_comms_fill), the
// interior face of width R[dim] is extracted, exchanged in both directions,
// and injected back into the border.
// NOTE(review): several lines are elided in this listing (the synchronization
// at 504, the local-swap qudaMemcpy sources at 519/521, and the closing
// statements 529-530); comments describe only the visible code.
471  void cudaGaugeField::exchangeExtendedGhost(const int *R, bool no_comms_fill)
472  {
473  const int b = bufferIndex;
474  void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];
475 
476  createComms(R, no_comms_fill);
477 
// Lay out per-dimension staging regions; each dimension holds both directions.
478  size_t offset = 0;
479  for (int dim=0; dim<nDim; dim++) {
480  if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
481  send_d[dim] = static_cast<char*>(ghost_send_buffer_d[b]) + offset;
482  recv_d[dim] = static_cast<char*>(ghost_recv_buffer_d[b]) + offset;
483  offset += 2*ghost_face_bytes[dim]; // factor of two from fwd/back
484  }
485 
486  for (int dim=0; dim<nDim; dim++) {
487  if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
488 
489  //extract into a contiguous buffer
490  extractExtendedGaugeGhost(*this, dim, R, send_d, true);
491 
492  if (comm_dim_partitioned(dim)) {
493  for (int dir=0; dir<2; dir++) recvStart(dim, dir);
494 
495  for (int dir=0; dir<2; dir++) {
496  // issue host-to-device copies if needed
497  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
498  cudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
499  ghost_face_bytes[dim], cudaMemcpyDeviceToHost, streams[dir]);
500  }
501  }
502 
503  // if either direction is not peer-to-peer then we need to synchronize
505 
506  // if we pass a stream to sendStart then we must ensure that stream is synchronized
507  for (int dir=0; dir<2; dir++) sendStart(dim, dir, &streams[dir]);
508  for (int dir=0; dir<2; dir++) commsComplete(dim, dir);
509 
510  for (int dir=0; dir<2; dir++) {
511  // issue host-to-device copies if needed
512  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
513  cudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
514  ghost_face_bytes[dim], cudaMemcpyHostToDevice, streams[dir]);
515  }
516  }
517 
518  } else { // if just doing a local exchange to fill halo then need to swap faces
520  ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
522  ghost_face_bytes[dim], cudaMemcpyDeviceToDevice);
523  }
524 
525  // inject back into the gauge field
526  extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
527  }
528 
531  }
532 
533  void cudaGaugeField::exchangeExtendedGhost(const int *R, TimeProfile &profile, bool no_comms_fill) {
534  profile.TPSTART(QUDA_PROFILE_COMMS);
535  exchangeExtendedGhost(R, no_comms_fill);
536  profile.TPSTOP(QUDA_PROFILE_COMMS);
537  }
538 
// Replace the device gauge pointer. Only legal for reference-create fields
// (this object must not own its storage).
// NOTE(review): line 541 — presumably the `if (create != QUDA_REFERENCE_FIELD_CREATE) {`
// guard — is elided from this listing; the error message below implies it.
539  void cudaGaugeField::setGauge(void *gauge_)
540  {
542  errorQuda("Setting gauge pointer is only allowed when create="
543  "QUDA_REFERENCE_FIELD_CREATE type\n");
544  }
545  gauge = gauge_;
546  }
547 
// Allocate a device staging buffer for a gauge field (signature elided by this
// listing — per the doxygen index: create_gauge_buffer(size_t bytes,
// QudaGaugeFieldOrder order, QudaFieldGeometry geometry)). QDP ordering gets
// one allocation per geometry component behind a heap-allocated pointer array;
// everything else is a single contiguous allocation. Pair with
// free_gauge_buffer below.
549  if (order == QUDA_QDP_GAUGE_ORDER) {
550  void **buffer = new void*[geometry];
551  for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
552  return ((void*)buffer);
553  } else {
554  return pool_device_malloc(bytes);
555  }
556 
557  }
558 
// Allocate per-dimension device ghost staging buffers (signature elided by
// this listing — per the doxygen index: create_ghost_buffer(size_t bytes[],
// QudaGaugeFieldOrder order, QudaFieldGeometry geometry)). Only orderings with
// enum value > 4 use separate ghost buffers; others get a null result.
// Pair with free_ghost_buffer below.
560 
561  if (order > 4) {
562  void **buffer = new void*[geometry];
563  for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
564  return buffer;
565  } else {
566  return 0;
567  }
568 
569  }
570 
571  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
572  if (order == QUDA_QDP_GAUGE_ORDER) {
573  for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
574  delete []((void**)buffer);
575  } else {
576  pool_device_free(buffer);
577  }
578  }
579 
580  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
581  if (order > 4) {
582  for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
583  delete []buffer;
584  }
585  }
586 
// Copy another gauge field (GPU- or CPU-resident) into this one, reordering
// into this field's layout (signature elided by this listing — per the
// doxygen index this is copy(const GaugeField &src)).
// NOTE(review): several lines are elided (the fat-link guard at 592, the
// coarse/extended branch headers at 602/606/617/621/640/665/669, and the
// ghost-exchange call near 681-682); comments describe only the visible code.
588  if (this == &src) return;
589 
590  checkField(src);
591 
// Fat links carry a normalization maximum needed for half precision.
593  fat_link_max = src.LinkMax();
594  if (precision == QUDA_HALF_PRECISION && fat_link_max == 0.0)
595  errorQuda("fat_link_max has not been computed");
596  } else {
597  fat_link_max = 1.0;
598  }
599 
// Device-to-device copy path.
600  if (typeid(src) == typeid(cudaGaugeField)) {
601 
603  // copy field and ghost zone into this field
604  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
605 
// type=3 copies the coarse-geometry (forwards) ghost links as well.
607  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge, 0, 0, 3);
608  } else {
609  copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
610  if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
611  }
612 
// Host-to-device copy path: reorder on whichever side reorder_location() says.
613  } else if (typeid(src) == typeid(cpuGaugeField)) {
614  if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
615  void *buffer = pool_pinned_malloc(bytes);
616 
618  // copy field and ghost zone into buffer
619  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
620 
622  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge, 0, 0, 3);
623  } else {
624  copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
625  if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
626  }
627 
628  // this copies over both even and odd
629  qudaMemcpy(gauge, buffer, bytes, cudaMemcpyHostToDevice);
630  pool_pinned_free(buffer);
631  } else { // else on the GPU
632 
633  if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER || src.Order() == QUDA_BQCD_GAUGE_ORDER) {
634  // special case where we use zero-copy memory to read/write directly from application's array
// Requires the application array to be host-registered for zero-copy access.
635  void *src_d;
636  cudaError_t error = cudaHostGetDevicePointer(&src_d, const_cast<void*>(src.Gauge_p()), 0);
637  if (error != cudaSuccess) errorQuda("Failed to get device pointer for MILC site / BQCD array");
638 
639  if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
641  } else {
642  errorQuda("Ghost copy not supported here");
643  }
644 
645  } else {
// General path: stage the host field (and its ghosts) into device buffers,
// then reorder on the GPU.
646  void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
647  size_t ghost_bytes[8];
648  int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
649  for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
650  void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
651 
652  if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
653  for (int d=0; d<geometry; d++) {
654  qudaMemcpy(((void**)buffer)[d], ((void**)src.Gauge_p())[d], src.Bytes()/geometry, cudaMemcpyHostToDevice);
655  }
656  } else {
657  qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), cudaMemcpyHostToDevice);
658  }
659 
660  if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD &&
661  src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
662  for (int d=0; d<geometry; d++)
663  qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], cudaMemcpyHostToDevice);
664 
666  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer);
667  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3);
668  } else {
670  if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
671  }
672  free_gauge_buffer(buffer, src.Order(), src.Geometry());
673  if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
674  }
675  } // reorder_location
676  } else {
677  errorQuda("Invalid gauge field type");
678  }
679 
680  // if we have copied from a source without a pad then we need to exchange
683 
// Propagate staggered-phase metadata from the source.
684  staggeredPhaseApplied = src.StaggeredPhaseApplied();
685  staggeredPhaseType = src.StaggeredPhase();
686 
687  checkCudaError();
688  }
689 
// Download a CPU gauge field into this device field (signature elided by this
// listing — per the doxygen index: loadCPUField(const cpuGaugeField &cpu)).
// Delegates the layout conversion and transfer to copy().
691  copy(cpu);
693  checkCudaError();
694  }
695 
// Profiled variant of loadCPUField (signature elided by this listing): times
// the host-to-device transfer under the H2D profile bucket.
697  profile.TPSTART(QUDA_PROFILE_H2D);
698  loadCPUField(cpu);
699  profile.TPSTOP(QUDA_PROFILE_H2D);
700  }
701 
// Upload this device field into a CPU gauge field, reordering into the CPU
// field's layout (signature elided by this listing — per the doxygen index:
// saveCPUField(cpuGaugeField &cpu) const).
// NOTE(review): several lines are elided (the reorder-location branch header
// at 706/727, the ghost condition continuation at 741, the extended branch at
// 753, and the trailing sync near 764-767); comments describe visible code only.
703  {
704  static_cast<LatticeField&>(cpu).checkField(*this);
705 
707 
708  if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER || cpu.Order() == QUDA_BQCD_GAUGE_ORDER) {
709  // special case where we use zero-copy memory to read/write directly from application's array
710  void *cpu_d;
711  cudaError_t error = cudaHostGetDevicePointer(&cpu_d, const_cast<void*>(cpu.Gauge_p()), 0);
712  if (error != cudaSuccess) errorQuda("Failed to get device pointer for MILC site / BQCD array");
713  if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
714  copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge);
715  } else {
716  errorQuda("Ghost copy not supported here");
717  }
718  } else {
// General path: reorder on the GPU into staging buffers, then copy to host.
719  void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry());
720 
721  // Allocate space for ghost zone if required
722  size_t ghost_bytes[8];
723  int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor;
724  for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * cpuNinternal * cpu.Precision();
725  void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;
726 
728  copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0);
729  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3);
730  } else {
731  copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge);
732  }
733 
734  if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
735  for (int d=0; d<geometry; d++) qudaMemcpy(((void**)cpu.gauge)[d], ((void**)buffer)[d], cpu.Bytes()/geometry, cudaMemcpyDeviceToHost);
736  } else {
737  qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), cudaMemcpyDeviceToHost);
738  }
739 
740  if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD &&
742  for (int d=0; d<geometry; d++)
743  qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], cudaMemcpyDeviceToHost);
744 
745  free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
746  if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
747  }
748  } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
749 
750  void *buffer = pool_pinned_malloc(bytes);
751  qudaMemcpy(buffer, gauge, bytes, cudaMemcpyDeviceToHost);
752 
754  copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
755  } else {
756  copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
757  }
758  pool_pinned_free(buffer);
759 
760  } else {
761  errorQuda("Invalid pack location %d", reorder_location());
762  }
763 
766 
768  checkCudaError();
769  }
770 
// Profiled variant of saveCPUField (signature elided by this listing): times
// the device-to-host transfer under the D2H profile bucket.
772  profile.TPSTART(QUDA_PROFILE_D2H);
773  saveCPUField(cpu);
774  profile.TPSTOP(QUDA_PROFILE_D2H);
775  }
776 
777  void cudaGaugeField::backup() const {
778  if (backed_up) errorQuda("Gauge field already backed up");
779  backup_h = new char[bytes];
780  cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
781  checkCudaError();
782  backed_up = true;
783  }
784 
// Restore the gauge field from the host snapshot taken by backup() and
// release the snapshot (signature elided by this listing — per the doxygen
// index this is restore()).
786  if (!backed_up) errorQuda("Cannot restore since not backed up");
787  cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
788  delete []backup_h;
789  checkCudaError();
790  backed_up = false;
791  }
792 
// Zero the entire device gauge allocation (signature elided by this listing).
794  cudaMemset(gauge, 0, bytes);
795  }
796 
797 
798 } // namespace quda
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
QudaFieldLocation reorder_location()
Return whether data is reordered on the CPU or GPU. This can be set at QUDA initialization using the env...
void extractGaugeGhost(const GaugeField &u, void **ghost, bool extract=true, int offset=0)
void allocateGhostBuffer(size_t ghost_bytes) const
Allocate the static ghost buffers.
#define pool_pinned_free(ptr)
Definition: malloc_quda.h:116
cudaError_t qudaEventSynchronize(cudaEvent_t &event)
Wrapper around cudaEventSynchronize or cuEventSynchronize.
void * my_face_dim_dir_d[2][QUDA_MAX_DIM][2]
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void copyGenericGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0, void **ghostOut=0, void **ghostIn=0, int type=0)
Definition: copy_gauge.cu:38
cudaDeviceProp deviceProp
void createComms(bool no_comms_fill=false)
const void * src
#define errorQuda(...)
Definition: util_quda.h:90
void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
enum QudaLinkDirection_s QudaLinkDirection
cudaStream_t * streams
void injectGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
The opposite of exchangeGhost: take the ghost zone on x, send to node x-1, and inject back into the f...
void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
QudaReconstructType reconstruct
Definition: gauge_field.h:135
bool staggeredPhaseApplied
Definition: gauge_field.h:161
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static int R[4]
static MsgHandle * mh_send_p2p_back[2][QUDA_MAX_DIM]
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:212
void restore()
Restores the cudaGaugeField to CUDA memory.
MsgHandle * mh_send_rdma_fwd[2][QUDA_MAX_DIM]
QudaStaggeredPhase staggeredPhaseType
Definition: gauge_field.h:156
cudaGaugeField(const GaugeFieldParam &)
size_t size_t offset
QudaFieldGeometry geometry
Definition: gauge_field.h:133
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
static void * ghost_pinned_buffer_h[2]
const int * R() const
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
MsgHandle * mh_send_rdma_back[2][QUDA_MAX_DIM]
static MsgHandle * mh_recv_p2p_fwd[2][QUDA_MAX_DIM]
void checkField(const LatticeField &) const
static bool ghost_field_reset
void commsComplete(int dim, int dir)
Wait for communication to complete.
void allocateGhostBuffer(const int *R, bool no_comms_fill) const
Allocate the ghost buffers.
static int bufferIndex
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
size_t Bytes() const
Definition: gauge_field.h:242
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
MsgHandle * mh_recv_back[2][QUDA_MAX_DIM]
void sendStart(int dim, int dir, cudaStream_t *stream_p=nullptr)
Start the sending communicators.
void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R, void **ghost, bool extract)
void comm_start(MsgHandle *mh)
Definition: comm_mpi.cpp:260
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
MsgHandle * mh_recv_rdma_fwd[2][QUDA_MAX_DIM]
QudaGhostExchange ghostExchange
static void * ghost_remote_send_buffer_d[2][QUDA_MAX_DIM][2]
void * create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
void exchangeGhost(QudaLinkDirection link_direction=QUDA_LINK_BACKWARDS)
Exchange the ghost and store store in the padded region.
const void ** Ghost() const
Definition: gauge_field.h:254
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void backup() const
Backs up the cudaGaugeField to CPU memory.
void * from_face_dim_dir_d[2][QUDA_MAX_DIM][2]
size_t ghost_face_bytes[QUDA_MAX_DIM]
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
static void destroyIPCComms()
MsgHandle * mh_send_fwd[2][QUDA_MAX_DIM]
bool comm_peer2peer_enabled(int dir, int dim)
static void * ghost_send_buffer_d[2]
void * memset(void *__b, int __c, size_t __len)
int surface[QUDA_MAX_DIM]
#define pool_pinned_malloc(size)
Definition: malloc_quda.h:115
void recvStart(int dim, int dir)
Start the receive communicators.
void createGhostZone(const int *R, bool no_comms_fill) const
Definition: gauge_field.cpp:90
int ghostOffset[QUDA_MAX_DIM][2]
void ** create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
QudaLinkType link_type
Definition: gauge_field.h:139
void * from_face_dim_dir_h[2][QUDA_MAX_DIM][2]
static MsgHandle * mh_send_p2p_fwd[2][QUDA_MAX_DIM]
static void * ghost_recv_buffer_d[2]
MsgHandle * mh_recv_rdma_back[2][QUDA_MAX_DIM]
static cudaEvent_t ipcCopyEvent[2][2][QUDA_MAX_DIM]
void setGauge(void *_gauge)
static __inline__ dim3 dim3 void size_t cudaStream_t int enum cudaTextureReadMode readMode static __inline__ const struct texture< T, dim, readMode > & tex
void createComms(const int *R, bool no_comms_fill)
Create the communication handlers and buffers.
QudaFieldCreate create
Definition: gauge_field.h:147
bool comm_gdr_enabled()
Query if GPU Direct RDMA communication is enabled (global setting)
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
int surfaceCB[QUDA_MAX_DIM]
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
static cudaEvent_t ipcRemoteCopyEvent[2][2][QUDA_MAX_DIM]
enum QudaFieldGeometry_s QudaFieldGeometry
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
QudaGaugeFieldOrder Order() const
Definition: gauge_field.h:204
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
#define checkCudaError()
Definition: util_quda.h:129
const struct cudaChannelFormatDesc * desc
MsgHandle * mh_recv_fwd[2][QUDA_MAX_DIM]
void comm_wait(MsgHandle *mh)
Definition: comm_mpi.cpp:266
static MsgHandle * mh_recv_p2p_back[2][QUDA_MAX_DIM]
QudaGaugeFieldOrder order
Definition: gauge_field.h:137
QudaGhostExchange GhostExchange() const
void copy(const GaugeField &src)
static __inline__ size_t size_t d
QudaPrecision Precision() const
bool isNative() const
void * my_face_dim_dir_h[2][QUDA_MAX_DIM][2]
void * ghost[2 *QUDA_MAX_DIM]
Definition: gauge_field.h:149
QudaPrecision precision
MsgHandle * mh_send_back[2][QUDA_MAX_DIM]
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)