#define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...)          \
  if ( tp.block.z == 0 ) {                                                     \
    switch ( tp.block.x ) {                                                    \
    case 256:                                                                  \
      kernel<0, 32,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 512:                                                                  \
      kernel<0, 64,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 768:                                                                  \
      kernel<0, 96,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 1024:                                                                 \
      kernel<0, 128,__VA_ARGS__>                                               \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    default:                                                                   \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x);    \
    }                                                                          \
  } else {                                                                     \
    switch ( tp.block.x ) {                                                    \
    case 256:                                                                  \
      kernel<1, 32,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 512:                                                                  \
      kernel<1, 64,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 768:                                                                  \
      kernel<1, 96,__VA_ARGS__>                                                \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    case 1024:                                                                 \
      kernel<1, 128,__VA_ARGS__>                                               \
      <<< tp.grid.x, tp.block.x, tp.shared_bytes, stream >>> (arg, parity);    \
      break;                                                                   \
    default:                                                                   \
      errorQuda("%s not implemented for %d threads", # kernel, tp.block.x);    \
    }                                                                          \
  }
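// Illustrative only: a launcher macro of this shape is normally invoked from the
// apply() method of a tuned kernel wrapper, with the autotuner supplying tp.block
// and tp.grid. The kernel name "computeFix" and the trailing template arguments
// below are hypothetical placeholders, not taken from this file:
//
//   void apply(const cudaStream_t &stream) {
//     TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
//     LAUNCH_KERNEL_GAUGEFIX(computeFix, tp, stream, arg, parity, Float, Gauge, NCOLORS);
//   }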
template <typename Gauge>
struct GaugeFixUnPackArg {
  int X[4]; // local lattice dimensions
  Gauge dataOr;
  GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
    : dataOr(dataOr) {
    for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
  }
};
template<int NElems, typename Float, typename Gauge, bool pack>
__global__ void Kernel_UnPack(int size, GaugeFixUnPackArg<Gauge> arg,
                              complex<Float> *array, int parity, int face, int dir, int borderid){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if ( idx >= size ) return;
  int X[4];
  for ( int dr = 0; dr < 4; ++dr ) X[dr] = arg.X[dr];
  int x[4];
  int za, xodd;
  // decode the face-thread index into 4D coordinates on the requested slice,
  // choosing the innermost coordinate so that the site has the requested parity
  switch ( face ) {
  case 0: // X face
    za = idx / ( X[1] / 2);
    x[3] = za / X[2];
    x[2] = za - x[3] * X[2];
    x[0] = borderid;
    xodd = (borderid + x[2] + x[3] + parity) & 1;
    x[1] = (2 * idx + xodd) - za * X[1];
    break;
  case 1: // Y face
    za = idx / ( X[0] / 2);
    x[3] = za / X[2];
    x[2] = za - x[3] * X[2];
    x[1] = borderid;
    xodd = (borderid + x[2] + x[3] + parity) & 1;
    x[0] = (2 * idx + xodd) - za * X[0];
    break;
  case 2: // Z face
    za = idx / ( X[0] / 2);
    x[3] = za / X[1];
    x[1] = za - x[3] * X[1];
    x[2] = borderid;
    xodd = (borderid + x[1] + x[3] + parity) & 1;
    x[0] = (2 * idx + xodd) - za * X[0];
    break;
  case 3: // T face
    za = idx / ( X[0] / 2);
    x[2] = za / X[1];
    x[1] = za - x[2] * X[1];
    x[3] = borderid;
    xodd = (borderid + x[1] + x[2] + parity) & 1;
    x[0] = (2 * idx + xodd) - za * X[0];
    break;
  }
  // checkerboard (even/odd) site index within the given parity
  int id = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
  typedef complex<Float> Complex;
  typedef typename mapper<Float>::type RegType;
  RegType tmp[NElems];
  RegType data[18];
  if ( pack ) {
    // read the link, compress it with the reconstruction and write it to the buffer
    arg.dataOr.load(data, id, dir, parity);
    arg.dataOr.reconstruct.Pack(tmp, data, id);
    for ( int i = 0; i < NElems / 2; ++i ) array[idx + size * i] = Complex(tmp[2*i+0], tmp[2*i+1]);
  }
  else {
    // read the buffer, reconstruct the full link and store it back to the gauge field
    for ( int i = 0; i < NElems / 2; ++i ) {
      tmp[2*i+0] = array[idx + size * i].real();
      tmp[2*i+1] = array[idx + size * i].imag();
    }
    arg.dataOr.reconstruct.Unpack(data, tmp, id, dir, 0, arg.dataOr.X, arg.dataOr.R);
    arg.dataOr.save(data, id, dir, parity);
  }
}
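// Illustrative sketch (not part of the original file): a host-side replica of the
// x-face decoding above (case 0 of the switch), handy for convincing oneself that
// every face thread lands on a distinct site of the requested parity on the
// x = borderid slice. The lattice dimensions below are made-up example values.
#if 0
#include <cstdio>

int main()
{
  const int X[4] = {8, 8, 8, 8};       // hypothetical local lattice dimensions
  const int parity = 0, borderid = 6;  // checkerboard parity and x-slice to walk
  const int faceCB = X[1] * X[2] * X[3] / 2;
  for (int idx = 0; idx < faceCB; idx++) {
    int x[4];
    int za   = idx / (X[1] / 2);
    x[3]     = za / X[2];
    x[2]     = za - x[3] * X[2];
    x[0]     = borderid;
    int xodd = (borderid + x[2] + x[3] + parity) & 1;
    x[1]     = (2 * idx + xodd) - za * X[1];
    int id   = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1;
    if (((x[0] + x[1] + x[2] + x[3]) & 1) != parity) printf("parity mismatch at idx %d\n", idx);
    if (idx == 5)  // e.g. idx 5 -> site (6,3,1,0), checkerboard index 47
      printf("idx %d -> (%d,%d,%d,%d), cb index %d\n", idx, x[0], x[1], x[2], x[3], id);
  }
  return 0;
}
#endif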
// persistent state for the exchange: staging buffers, streams and launch geometry,
// allocated on first use and released by PGaugeExchangeFree()
static void *send[4];
static void *recv[4];
static void *sendg[4];
static void *recvg[4];
static void *send_d[4];
static void *recv_d[4];
static void *sendg_d[4];
static void *recvg_d[4];
static void *hostbuffer_h[4];
static cudaStream_t GFStream[2];
static size_t offset[4];
static size_t bytes[4];
static size_t faceVolume[4];
static size_t faceVolumeCB[4];
static MsgHandle *mh_recv_back[4];
static MsgHandle *mh_recv_fwd[4];
static MsgHandle *mh_send_fwd[4];
static MsgHandle *mh_send_back[4];
static int X[4];
static dim3 block[4];
static dim3 grid[4];
static bool notinitialized = true;
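// Layout note: each partitioned direction d uses one pinned host allocation of
// 4 * bytes[d], carved into recv | send | sendg | recvg slices (see the pointer
// setup in PGaugeExchange below). A minimal illustrative sketch of that carving:
//
//   void *buf = pinned_malloc(4 * bytes[d]);
//   void *recv_h  = buf;                                      // halo from the backward neighbour
//   void *send_h  = static_cast<char*>(buf) + 1 * bytes[d];   // boundary sent to the forward neighbour
//   void *sendg_h = static_cast<char*>(buf) + 2 * bytes[d];   // boundary sent to the backward neighbour
//   void *recvg_h = static_cast<char*>(buf) + 3 * bytes[d];   // halo from the forward neighbour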
void PGaugeExchangeFree() {
  if ( !notinitialized ) {
    cudaStreamDestroy(GFStream[0]);
    cudaStreamDestroy(GFStream[1]);
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d) ) continue;
      comm_free(mh_send_fwd[d]);
      comm_free(mh_send_back[d]);
      comm_free(mh_recv_back[d]);
      comm_free(mh_recv_fwd[d]);
      device_free(send_d[d]);
      device_free(recv_d[d]);
      device_free(sendg_d[d]);
      device_free(recvg_d[d]);
      host_free(hostbuffer_h[d]);
    }
    notinitialized = true;
  }
}
template<typename Float, int NElems, typename Gauge>
void PGaugeExchange( Gauge dataOr, cudaGaugeField& data, const int dir, const int parity) {

  // if the local lattice dimensions changed since the last call, rebuild all buffers
  if ( notinitialized == false ) {
    for ( int d = 0; d < 4; d++ ) {
      if ( X[d] != data.X()[d] ) {
        PGaugeExchangeFree();
        notinitialized = true;
        printfQuda("PGaugeExchange needs to be reinitialized...\n");
        break;
      }
    }
  }
  if ( notinitialized ) {
    for ( int d = 0; d < 4; d++ ) {
      X[d] = data.X()[d];
    }
    // face volumes: product of the three dimensions orthogonal to each direction
    for ( int i = 0; i < 4; i++ ) {
      faceVolume[i] = 1;
      for ( int j = 0; j < 4; j++ ) {
        if ( i == j ) continue;
        faceVolume[i] *= X[j];
      }
      faceVolumeCB[i] = faceVolume[i] / 2;
    }

    cudaStreamCreate(&GFStream[0]);
    cudaStreamCreate(&GFStream[1]);
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d) ) continue;
      offset[d] = faceVolumeCB[d] * NElems;
      bytes[d] = sizeof(Float) * offset[d];
      send_d[d]  = device_malloc(bytes[d]);
      recv_d[d]  = device_malloc(bytes[d]);
      sendg_d[d] = device_malloc(bytes[d]);
      recvg_d[d] = device_malloc(bytes[d]);
      hostbuffer_h[d] = pinned_malloc(4 * bytes[d]);
      block[d] = make_uint3(128, 1, 1);
      grid[d]  = make_uint3((faceVolumeCB[d] + block[d].x - 1) / block[d].x, 1, 1);
    }
    for ( int d = 0; d < 4; d++ ) {
      if ( !commDimPartitioned(d) ) continue;
#ifdef GPU_COMMS
      // GPU-aware communication: post sends and receives directly from device memory
      recv[d]  = recv_d[d];
      send[d]  = send_d[d];
      recvg[d] = recvg_d[d];
      sendg[d] = sendg_d[d];
#else
      // stage through the pinned host buffer: recv | send | sendg | recvg slices
      recv[d]  = hostbuffer_h[d];
      send[d]  = static_cast<char*>(hostbuffer_h[d]) + bytes[d];
      recvg[d] = static_cast<char*>(hostbuffer_h[d]) + 3 * bytes[d];
      sendg[d] = static_cast<char*>(hostbuffer_h[d]) + 2 * bytes[d];
#endif
      // message handles for the halo exchange in direction d
      mh_recv_back[d] = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
      mh_recv_fwd[d]  = comm_declare_receive_relative(recvg[d], d, +1, bytes[d]);
      mh_send_back[d] = comm_declare_send_relative(sendg[d], d, -1, bytes[d]);
      mh_send_fwd[d]  = comm_declare_send_relative(send[d], d, +1, bytes[d]);
    }
    notinitialized = false;
  }
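  // Sizing example (illustrative numbers, not from this file): for a local volume of
  // X = {8, 8, 8, 16} and NElems = 12 (QUDA_RECONSTRUCT_12) in double precision,
  //   faceVolume[0]   = 8 * 8 * 16 = 1024   (sites on an x-face)
  //   faceVolumeCB[0] = 1024 / 2   = 512    (one checkerboard parity)
  //   offset[0]       = 512 * 12   = 6144 reals
  //   bytes[0]        = 6144 * sizeof(double) = 49152 bytes per face buffer,
  // and hostbuffer_h[0] is a single pinned allocation of 4 * bytes[0] = 196608 bytes.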
  GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);

  for ( int d = 0; d < 4; d++ ) {
    if ( !commDimPartitioned(d) ) continue;
    comm_start(mh_recv_back[d]);
    comm_start(mh_recv_fwd[d]);

    // pack the top boundary (sent forward) on stream 0 and the bottom boundary
    // (sent backward) on stream 1
    Kernel_UnPack<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[0] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(send_d[d]), parity, d, dir, X[d] - data.R()[d] - 1);
    Kernel_UnPack<NElems, Float, Gauge, true> <<< grid[d], block[d], 0, GFStream[1] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(sendg_d[d]), parity, d, dir, data.R()[d]);

    cudaMemcpyAsync(send[d], send_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[0]);
    cudaMemcpyAsync(sendg[d], sendg_d[d], bytes[d], cudaMemcpyDeviceToHost, GFStream[1]);

    qudaStreamSynchronize(GFStream[0]);
    comm_start(mh_send_fwd[d]);
    qudaStreamSynchronize(GFStream[1]);
    comm_start(mh_send_back[d]);

    // unpack the halo received from the backward neighbour on stream 0
    comm_wait(mh_recv_back[d]);
    cudaMemcpyAsync(recv_d[d], recv[d], bytes[d], cudaMemcpyHostToDevice, GFStream[0]);
    Kernel_UnPack<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[0] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recv_d[d]), parity, d, dir, data.R()[d] - 1);

    // unpack the halo received from the forward neighbour on stream 1
    comm_wait(mh_recv_fwd[d]);
    cudaMemcpyAsync(recvg_d[d], recvg[d], bytes[d], cudaMemcpyHostToDevice, GFStream[1]);
    Kernel_UnPack<NElems, Float, Gauge, false> <<< grid[d], block[d], 0, GFStream[1] >>>
      (faceVolumeCB[d], dataexarg, reinterpret_cast<complex<Float>*>(recvg_d[d]), parity, d, dir, X[d] - data.R()[d]);

    comm_wait(mh_send_back[d]);
    comm_wait(mh_send_fwd[d]);
    qudaStreamSynchronize(GFStream[0]);
    qudaStreamSynchronize(GFStream[1]);
  }
  qudaDeviceSynchronize();
}
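// Note on the structure above: each partitioned direction runs two independent
// pipelines, one per stream. GFStream[0] handles the face sent forward plus the
// halo received from behind; GFStream[1] handles the face sent backward plus the
// halo received from ahead, so packing, host staging copies, message transfer and
// unpacking of the two faces overlap with each other.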
template<typename Float>
void PGaugeExchange( cudaGaugeField& data, const int dir, const int parity) {
  if ( data.isNative() ) {
    if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type G;
      PGaugeExchange<Float, 18>(G(data), data, dir, parity);
    } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type G;
      PGaugeExchange<Float, 12>(G(data), data, dir, parity);
    } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_8>::type G;
      PGaugeExchange<Float, 8>(G(data), data, dir, parity);
    } else {
      errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
    }
  } else {
    errorQuda("Invalid gauge order");
  }
}
#endif // GPU_GAUGE_ALG

void PGaugeExchange(cudaGaugeField& data, const int dir, const int parity) {
#ifdef GPU_GAUGE_ALG
  if ( comm_dim_partitioned(0) || comm_dim_partitioned(1) || comm_dim_partitioned(2) || comm_dim_partitioned(3) ) {
    switch ( data.Precision() ) {
    case QUDA_HALF_PRECISION:
      errorQuda("Half precision not supported\n");
    case QUDA_SINGLE_PRECISION:
      PGaugeExchange<float>(data, dir, parity);
      break;
    case QUDA_DOUBLE_PRECISION:
      PGaugeExchange<double>(data, dir, parity);
      break;
    default:
      errorQuda("Precision %d not supported", data.Precision());
    }
  }
#else
  errorQuda("Pure gauge code has not been built");
#endif // GPU_GAUGE_ALG
}