#define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...)          \
  if ( tp.block.z == 0 ) {                                                     \
    switch ( tp.block.x ) {                                                    \
    case 256:                                                                  \
      kernel<0, 32,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 512:                                                                  \
      kernel<0, 64,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 768:                                                                  \
      kernel<0, 96,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 1024:                                                                 \
      kernel<0, 128,__VA_ARGS__>                                               \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    default:                                                                   \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x);    \
    }                                                                          \
  } else {                                                                     \
    switch ( tp.block.x ) {                                                    \
    case 256:                                                                  \
      kernel<1, 32,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 512:                                                                  \
      kernel<1, 64,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 768:                                                                  \
      kernel<1, 96,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 1024:                                                                 \
      kernel<1, 128,__VA_ARGS__>                                               \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    default:                                                                   \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x);    \
    }                                                                          \
  }
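// Illustrative only: a launcher macro of this shape is normally invoked from the
// apply() method of a tuned kernel wrapper, with the autotuner supplying tp.block
// and tp.grid. The kernel name "computeFix" and the trailing template arguments
// below are hypothetical placeholders, not taken from this file:
//
//   void apply(const cudaStream_t &stream) {
//     TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
//     LAUNCH_KERNEL_GAUGEFIX(computeFix, tp, stream, arg, parity, Float, Gauge, NCOLORS);
//   }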
template <typename Gauge>
struct GaugeFixUnPackArg {
  int X[4]; // local lattice dimensions
  Gauge dataOr;
  GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
    : dataOr(dataOr) {
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
  }
};
template<int NElems, typename Float, typename Gauge, bool pack>
__global__ void Kernel_UnPack(int size, GaugeFixUnPackArg<Gauge> arg,
                              complex<Float> *array, int parity, int face, int dir, int borderid){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if ( idx >= size ) return;
  int X[4];
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  int x[4];
  int za, xodd;
  // decode the face-thread index into 4D coordinates on the requested slice,
  // choosing the innermost coordinate so that the site has the requested parity
  switch ( face ) {
  case 0: // X face
    za = idx / ( X[1] / 2);
    x[3] = za / X[2];
    x[2] = za - x[3] * X[2];
    x[0] = borderid;
    xodd = (borderid + x[2] + x[3] + parity) & 1;
    x[1] = (2 * idx + xodd) - za * X[1];
    break;
  case 1: // Y face
    za = idx / ( X[0] / 2);
    x[3] = za / X[2];
    x[2] = za - x[3] * X[2];
    x[1] = borderid;
    xodd = (borderid + x[2] + x[3] + parity) & 1;
    x[0] = (2 * idx + xodd) - za * X[0];
    break;
  case 2: // Z face
    za = idx / ( X[0] / 2);
    x[3] = za / X[1];
    x[1] = za - x[3] * X[1];
    x[2] = borderid;
    xodd = (borderid + x[1] + x[3] + parity) & 1;
    x[0] = (2 * idx + xodd) - za * X[0];
    break;
  case 3: // T face
    za = idx / ( X[0] / 2);
    x[2] = za / X[1];
    x[1] = za - x[2] * X[1];
    x[3] = borderid;
    xodd = (borderid + x[1] + x[2] + parity) & 1;
    x[0] = (2 * idx + xodd) - za * X[0];
    break;
  }
  // checkerboard (even/odd) site index within the given parity
  int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  typedef complex<Float> Complex;
  typedef typename mapper<Float>::type RegType;
  RegType tmp[NElems];
  RegType data[18];
  if ( pack ) {
    // read the link, compress it with the reconstruction and write it to the buffer
    arg.dataOr.load(data, id, dir, parity);
    arg.dataOr.reconstruct.Pack(tmp, data, id);
    for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = Complex(tmp[2*i+0], tmp[2*i+1]);
  }
  else {
    // read the buffer, reconstruct the full link and store it back to the gauge field
    for ( int i = 0; i < NElems / 2; ++i ) {
      tmp[2*i+0] = array[idx + size * i].real();
      tmp[2*i+1] = array[idx + size * i].imag();
    }
    arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
    arg.dataOr.save(data, id, dir, parity);
  }
}
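// Illustrative sketch (not part of the original file): a host-side replica of the
// x-face decoding above (case 0 of the switch), handy for convincing oneself that
// every face thread lands on a distinct site of the requested parity on the
// x = borderid slice. The lattice dimensions below are made-up example values.
#if 0
#include <cstdio>

int main()
{
  const int X[4] = {8, 8, 8, 8};       // hypothetical local lattice dimensions
  const int parity = 0, borderid = 6;  // checkerboard parity and x-slice to walk
  const int faceCB = X[1] * X[2] * X[3] / 2;
  for (int idx = 0; idx < faceCB; idx++) {
    int x[4];
    int za   = idx / (X[1] / 2);
    x[3]     = za / X[2];
    x[2]     = za - x[3] * X[2];
    x[0]     = borderid;
    int xodd = (borderid + x[2] + x[3] + parity) & 1;
    x[1]     = (2 * idx + xodd) - za * X[1];
    int id   = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    if (((x[0] + x[1] + x[2] + x[3]) & 1) != parity) printf("parity mismatch at idx %d\n", idx);
    if (idx == 5)  // e.g. idx 5 -> site (6,3,1,0), checkerboard index 47
      printf("idx %d -> (%d,%d,%d,%d), cb index %d\n", idx, x[0], x[1], x[2], x[3], id);
  }
  return 0;
}
#endif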
// persistent state for the exchange: staging buffers, streams and launch geometry,
// allocated on first use and released by PGaugeExchangeFree()
static void *send[4];
static void *recv[4];
static void *sendg[4];
static void *recvg[4];
static void *send_d[4];
static void *recv_d[4];
static void *sendg_d[4];
static void *recvg_d[4];
static void *hostbuffer_h[4];
static cudaStream_t GFStream[2];
static size_t offset[4];
static size_t bytes[4];
static size_t faceVolume[4];
static size_t faceVolumeCB[4];
static MsgHandle *mh_recv_back[4];
static MsgHandle *mh_recv_fwd[4];
static MsgHandle *mh_send_fwd[4];
static MsgHandle *mh_send_back[4];
static int X[4];
static dim3 block[4];
static dim3 grid[4];
static bool notinitialized = true;
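// Layout note: each partitioned direction d uses one pinned host allocation of
// 4 * bytes[d], carved into recv | send | sendg | recvg slices (see the pointer
// setup in PGaugeExchange below). A minimal illustrative sketch of that carving:
//
//   void *buf = pinned_malloc(4 * bytes[d]);
//   void *recv_h  = buf;                                      // halo from the backward neighbour
//   void *send_h  = static_cast<char*>(buf) + 1 * bytes[d];   // boundary sent to the forward neighbour
//   void *sendg_h = static_cast<char*>(buf) + 2 * bytes[d];   // boundary sent to the backward neighbour
//   void *recvg_h = static_cast<char*>(buf) + 3 * bytes[d];   // halo from the forward neighbour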
void PGaugeExchangeFree() {
  if ( !notinitialized ) {
    cudaStreamDestroy(GFStream[0]);
    cudaStreamDestroy(GFStream[1]);
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d) ) continue;
      comm_free(mh_send_fwd[d]);
      comm_free(mh_send_back[d]);
      comm_free(mh_recv_back[d]);
      comm_free(mh_recv_fwd[d]);
      device_free(send_d[d]);
      device_free(recv_d[d]);
      device_free(sendg_d[d]);
      device_free(recvg_d[d]);
      host_free(hostbuffer_h[d]);
    }
    notinitialized = true;
  }
}
template<typename Float, int NElems, typename Gauge>
void PGaugeExchange( Gauge dataOr, cudaGaugeField& data, const int dir, const int parity) {

  // if the local lattice dimensions changed since the last call, rebuild all buffers
  if ( notinitialized == false ) {
    for ( int d = 0; d < 4; d++ ) {
      if ( X[d] != data.X()[d] ) {
        PGaugeExchangeFree();
        notinitialized = true;
        printfQuda("PGaugeExchange needs to be reinitialized...\n");
        break;
      }
    }
  }
  if ( notinitialized ) {
    for ( int d = 0; d < 4; d++ ) {
      X[d] = data.X()[d];
    }
    // face volumes: product of the three dimensions orthogonal to each direction
    for ( int i = 0; i < 4; i++ ) {
      faceVolume[i] = 1;
      for ( int j = 0; j < 4; j++ ) {
        if ( i == j ) continue;
        faceVolume[i] *= X[j];
      }
      faceVolumeCB[i] = faceVolume[i] / 2;
    }

    cudaStreamCreate(&GFStream[0]);
    cudaStreamCreate(&GFStream[1]);
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d) ) continue;
      offset[d] = faceVolumeCB[d] * NElems;
      bytes[d] = sizeof(Float) * offset[d];
      send_d[d]  = device_malloc(bytes[d]);
      recv_d[d]  = device_malloc(bytes[d]);
      sendg_d[d] = device_malloc(bytes[d]);
      recvg_d[d] = device_malloc(bytes[d]);
      hostbuffer_h[d] = pinned_malloc(4 * bytes[d]);
      block[d] = make_uint3(128, 1, 1);
      grid[d]  = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
    }
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d) ) continue;
#ifdef GPU_COMMS
      // GPU-aware communication: post sends and receives directly from device memory
      recv[d]  = recv_d[d];
      send[d]  = send_d[d];
      recvg[d] = recvg_d[d];
      sendg[d] = sendg_d[d];
#else
      // stage through the pinned host buffer: recv | send | sendg | recvg slices
      recv[d]  = hostbuffer_h[d];
      send[d]  = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
      recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
      sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
#endif
      // message handles for the halo exchange in direction d
      mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
      mh_recv_fwd[d]  = comm_declare_receive_relative(recvg[d], d, +1, bytes[d]);
      mh_send_back[d] = comm_declare_send_relative(sendg[d], d, -1, bytes[d]);
      mh_send_fwd[d]  = comm_declare_send_relative(send[d], d, +1, bytes[d]);
    }
    notinitialized = false;
  }
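  // Sizing example (illustrative numbers, not from this file): for a local volume of
  // X = {8, 8, 8, 16} and NElems = 12 (QUDA_RECONSTRUCT_12) in double precision,
  //   faceVolume[0]   = 8 * 8 * 16 = 1024   (sites on an x-face)
  //   faceVolumeCB[0] = 1024 / 2   = 512    (one checkerboard parity)
  //   offset[0]       = 512 * 12   = 6144 reals
  //   bytes[0]        = 6144 * sizeof(double) = 49152 bytes per face buffer,
  // and hostbuffer_h[0] is a single pinned allocation of 4 * bytes[0] = 196608 bytes.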
  GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);

  for ( int d = 0; d < 4; d++ ) {
    if ( !commDimPartitioned(d) ) continue;
    comm_start(mh_recv_back[d]);
    comm_start(mh_recv_fwd[d]);

    // pack the top boundary (sent forward) on stream 0 and the bottom boundary
    // (sent backward) on stream 1
    Kernel_UnPack<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[0] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), parity, d, dir, X[d] - data.R()[d] - 1);
    Kernel_UnPack<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[1] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), parity, d, dir, data.R()[d]);

    cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[0]);
    cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[1]);

    qudaStreamSynchronize(GFStream[0]);
    comm_start(mh_send_fwd[d]);
    qudaStreamSynchronize(GFStream[1]);
    comm_start(mh_send_back[d]);

    // unpack the halo received from the backward neighbour on stream 0
    comm_wait(mh_recv_back[d]);
    cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[0]);
    Kernel_UnPack<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[0] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), parity, d, dir, data.R()[d] - 1);

    // unpack the halo received from the forward neighbour on stream 1
    comm_wait(mh_recv_fwd[d]);
    cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[1]);
    Kernel_UnPack<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[1] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), parity, d, dir, X[d] - data.R()[d]);

    comm_wait(mh_send_back[d]);
    comm_wait(mh_send_fwd[d]);
    qudaStreamSynchronize(GFStream[0]);
    qudaStreamSynchronize(GFStream[1]);
  }
  qudaDeviceSynchronize();
}
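// Note on the structure above: each partitioned direction runs two independent
// pipelines, one per stream. GFStream[0] handles the face sent forward plus the
// halo received from behind; GFStream[1] handles the face sent backward plus the
// halo received from ahead, so packing, host staging copies, message transfer and
// unpacking of the two faces overlap with each other.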
template<typename Float>
void PGaugeExchange( cudaGaugeField& data, const int dir, const int parity) {
  if ( data.isNative() ) {
    if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type G;
      PGaugeExchange<Float, 18>(G(data), data, dir, parity);
    } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type G;
      PGaugeExchange<Float, 12>(G(data), data, dir, parity);
    } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_8>::type G;
      PGaugeExchange<Float, 8>(G(data), data, dir, parity);
    } else {
      errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
    }
  } else {
    errorQuda("Invalid gauge order");
  }
}
#endif // GPU_GAUGE_ALG

void PGaugeExchange(cudaGaugeField& data, const int dir, const int parity) {
#ifdef GPU_GAUGE_ALG
  if ( comm_dim_partitioned(0) || comm_dim_partitioned(1) || comm_dim_partitioned(2) || comm_dim_partitioned(3) ) {
    switch ( data.Precision() ) {
    case QUDA_HALF_PRECISION:
      errorQuda("Half precision not supported\n");
    case QUDA_SINGLE_PRECISION:
      PGaugeExchange<float>(data, dir, parity);
      break;
    case QUDA_DOUBLE_PRECISION:
      PGaugeExchange<double>(data, dir, parity);
      break;
    default:
      errorQuda("Precision %d not supported", data.Precision());
    }
  }
#else
  errorQuda("Pure gauge code has not been built");
#endif // GPU_GAUGE_ALG
}