// NOTE(review): this chunk is heavily garbled by extraction -- original source
// line numbers ("9", "18", "19", ...) are fused into the code text, and the
// switch-case labels / closing braces of the macro are missing. Recover the
// pristine file from version control before attempting to compile or edit.
//
// LAUNCH_KERNEL_GAUGEFIX: dispatch macro that instantiates `kernel` with a
// compile-time thread-block size (32/64/96/128, taken from tp.block.x) and a
// leading template argument that differs between the two switch blocks
// (0 vs. 1) -- presumably selected by the `tp.block.z == 0` test; confirm
// against the pristine source. Each arm launches on `stream` with
// (arg, parity); unsupported block sizes abort via errorQuda.
9 #include <device_functions.h> 18 #define LAUNCH_KERNEL_GAUGEFIX(kernel, tp, stream, arg, parity, ...) \ 19 if ( tp.block.z == 0 ) { \ 20 switch ( tp.block.x ) { \ 22 kernel<0, 32,__VA_ARGS__> \ 23 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 26 kernel<0, 64,__VA_ARGS__> \ 27 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 30 kernel<0, 96,__VA_ARGS__> \ 31 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 34 kernel<0, 128,__VA_ARGS__> \ 35 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 38 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 42 switch ( tp.block.x ) { \ 44 kernel<1, 32,__VA_ARGS__> \ 45 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 48 kernel<1, 64,__VA_ARGS__> \ 49 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 52 kernel<1, 96,__VA_ARGS__> \ 53 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 56 kernel<1, 128,__VA_ARGS__> \ 57 << < tp.grid.x, tp.block.x, tp.shared_bytes, stream >> > (arg, parity); \ 60 errorQuda("%s not implemented for %d threads", # kernel, tp.block.x); \ 65 template <
typename Gauge>
66 struct GaugeFixUnPackArg {
// Kernel-argument struct: wraps a gauge-field accessor plus the local
// lattice dimensions. NOTE(review): the member declarations (the accessor
// and an int X[4] array) are missing from this extracted view -- the
// constructor below writes X[dir] from data.X().
72 GaugeFixUnPackArg(Gauge & dataOr, cudaGaugeField & data)
74 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.X()[dir];
// Kernel_UnPack: per-site pack/unpack kernel for border exchange of gauge
// links. When `pack` is true it reads links from the gauge field and writes
// them into the contiguous transfer buffer `array`; when false it does the
// reverse (both calls are visible at the bottom of the body). `face` selects
// which lattice face is traversed, `borderid` the slice within that face,
// and `parity` the checkerboard sub-lattice.
//
// NOTE(review): extraction dropped the declarations of idx, x[4], X[4], za,
// xodd, tmp and data, the bounds check on `size`, and the switch(face)
// skeleton -- only the per-face coordinate arithmetic survives below.
// The stray "\" after `arg,` is extraction damage, not a real continuation.
79 template<
int NElems,
typename Float,
typename Gauge,
bool pack>
80 __global__
void Kernel_UnPack(
int size, GaugeFixUnPackArg<Gauge>
arg, \
81 complex<Float> *
array,
int parity,
int face,
int dir,
int borderid){
// Copy the lattice dimensions into registers (per-thread locals).
85 for (
int dr = 0; dr < 4; ++dr )
X[dr] =
arg.X[dr];
// Face-dependent decoding of the 1D thread index into 4D lattice
// coordinates x[0..3]. Each of the four fragments below corresponds to
// one case of a lost switch(face); `xodd` restores the checkerboard
// offset so that (x + parity) lands on the correct sub-lattice.
92 x[2] =
za -
x[3] *
X[2];
94 xodd = (borderid +
x[2] +
x[3] +
parity) & 1;
95 x[1] = (2 *
idx + xodd) -
za *
X[1];
100 x[2] =
za -
x[3] *
X[2];
102 xodd = (borderid +
x[2] +
x[3] +
parity) & 1;
103 x[0] = (2 *
idx + xodd) -
za *
X[0];
108 x[1] =
za -
x[3] *
X[1];
110 xodd = (borderid +
x[1] +
x[3] +
parity) & 1;
111 x[0] = (2 *
idx + xodd) -
za *
X[0];
116 x[1] =
za -
x[2] *
X[1];
118 xodd = (borderid +
x[1] +
x[2] +
parity) & 1;
119 x[0] = (2 *
idx + xodd) -
za *
X[0];
// Fold the 4D coordinate back into a checkerboarded linear site index
// (the >> 1 halves the full-lattice index into even/odd storage).
122 int id = (((
x[3] *
X[2] +
x[2]) *
X[1] +
x[1]) *
X[0] +
x[0]) >> 1;
123 typedef complex<Float>
Complex;
124 typedef typename mapper<Float>::type RegType;
// pack == true path: compress the link matrix `data` into `tmp`
// using the accessor's reconstruction scheme before it is staged
// into the transfer buffer.
129 arg.dataOr.reconstruct.Pack(
tmp, data,
id);
// pack == false path: expand the received `tmp` back into a full
// link matrix, honouring the extended-field offsets X and R.
134 arg.dataOr.reconstruct.Unpack(data,
tmp,
id, dir, 0,
arg.dataOr.X,
arg.dataOr.R);
// File-scope state for the border exchange, allocated once and reused across
// calls (see the `notinitialized` guard in PGaugeExchange below).
// Per-dimension host-visible staging pointers for the forward face...
140 static void *send[4];
141 static void *recv[4];
// ...and for the backward ("g"host) face.
142 static void *sendg[4];
143 static void *recvg[4];
// Device-side counterparts of the four buffers above.
144 static void *send_d[4];
145 static void *recv_d[4];
146 static void *sendg_d[4];
147 static void *recvg_d[4];
// Pinned host buffer per dimension; the four host pointers above are carved
// out of it as offsets (see the aliasing block in PGaugeExchange).
148 static void *hostbuffer_h[4];
// Two streams so the forward and backward face transfers can overlap.
149 static cudaStream_t GFStream[2];
// Per-dimension transfer size in bytes and checkerboard face volume.
151 static size_t bytes[4];
153 static size_t faceVolumeCB[4];
// Cached launch configuration per dimension.
160 static dim3
block[4];
// Lazily-initialized flag; set back to true by PGaugeExchangeFree or when
// the lattice dimensions change.
162 static bool notinitialized =
true;
// NOTE(review): body fragment of PGaugeExchangeFree (its signature and the
// per-dimension buffer frees inside the loop were lost in extraction --
// see the doxygen residue at the bottom of this chunk: "void
// PGaugeExchangeFree() -- Release all allocated memory used to exchange
// data between nodes"). Destroys both exchange streams, releases the
// per-dimension buffers, and re-arms lazy initialization.
171 cudaStreamDestroy(GFStream[0]);
172 cudaStreamDestroy(GFStream[1]);
173 for (
int d = 0;
d < 4;
d++ ) {
// Force re-allocation on the next PGaugeExchange call.
188 notinitialized =
true;
// PGaugeExchange<Float, NElems, Gauge>: worker that exchanges the border
// links of an extended gauge field between neighbouring nodes for one
// parity and one direction. NElems is the number of real elements per link
// after reconstruction (18/12/8 -- see the dispatch below).
// NOTE(review): this view is an extraction fragment -- the function
// signature, buffer allocations, comm_declare_* / comm_start / comm_wait
// calls and stream synchronization were dropped; only the control skeleton
// survives. Recover the pristine source before editing logic.
194 template<
typename Float,
int NElems,
typename Gauge>
// If already initialized, verify the cached lattice dimensions still match
// this field; a mismatch forces re-initialization.
199 if ( notinitialized ==
false ) {
200 for (
int d = 0;
d < 4;
d++ ) {
201 if (
X[
d] != data.X()[
d] ) {
203 notinitialized =
true;
204 printfQuda(
"PGaugeExchange needs to be reinitialized...\n");
// One-time setup: face volumes, buffers, streams and launch geometry.
209 if ( notinitialized ) {
210 for (
int d = 0;
d < 4;
d++ ) {
213 for (
int i = 0;
i < 4;
i++ ) {
215 for (
int j = 0; j < 4; j++ ) {
216 if (
i == j )
continue;
// Two streams so forward/backward face traffic can overlap.
222 cudaStreamCreate(&GFStream[0]);
223 cudaStreamCreate(&GFStream[1]);
224 for (
int d = 0;
d < 4;
d++ ) {
227 offset[
d] = faceVolumeCB[
d] * NElems;
// Fixed 128-thread blocks; grid sized by ceil-div over the face volume.
236 block[
d] = make_uint3(128, 1, 1);
237 grid[
d] = make_uint3((faceVolumeCB[
d] +
block[
d].
x - 1) /
block[
d].
x, 1, 1);
// Alias the host staging pointers into the pinned hostbuffer_h[d]
// (or straight onto the device buffers -- presumably the GDR /
// peer-to-peer path; the selecting condition was lost in extraction).
240 for (
int d = 0;
d < 4;
d++ ) {
245 recvg[
d] = recvg_d[
d];
246 sendg[
d] = sendg_d[
d];
248 recv[
d] = hostbuffer_h[
d];
249 send[
d] =
static_cast<char*
>(hostbuffer_h[
d]) +
bytes[
d];
250 recvg[
d] =
static_cast<char*
>(hostbuffer_h[
d]) + 3 *
bytes[
d];
251 sendg[
d] =
static_cast<char*
>(hostbuffer_h[
d]) + 2 *
bytes[
d];
259 notinitialized =
false;
261 GaugeFixUnPackArg<Gauge> dataexarg(dataOr, data);
// Exchange loop over the four dimensions (non-partitioned dims are
// presumably skipped by a lost comm_dim_partitioned(d) guard).
264 for (
int d = 0;
d < 4;
d++ ) {
// Pack the top border slice (X[d]-R[d]-1) into send_d on stream 0...
270 Kernel_UnPack<NElems, Float, Gauge, true><< < grid[
d],
block[
d], 0, GFStream[0] >> >
271 (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(send_d[
d]),
parity,
d, dir,
X[
d] - data.R()[
d] - 1);
// ...and the bottom border slice (R[d]) into sendg_d on stream 1.
273 Kernel_UnPack<NElems, Float, Gauge, true><< < grid[
d],
block[
d], 0, GFStream[1] >> >
274 (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(sendg_d[
d]),
parity,
d, dir, data.R()[
d]);
// Stage packed data to the pinned host buffers for MPI transfer.
277 cudaMemcpyAsync(send[
d], send_d[
d],
bytes[
d], cudaMemcpyDeviceToHost, GFStream[0]);
278 cudaMemcpyAsync(sendg[
d], sendg_d[
d],
bytes[
d], cudaMemcpyDeviceToHost, GFStream[1]);
// After the (elided) comm_wait, copy received halo back to device...
288 cudaMemcpyAsync(recv_d[
d], recv[
d],
bytes[
d], cudaMemcpyHostToDevice, GFStream[0]);
// ...and unpack it into the low ghost slice (R[d]-1) on stream 0.
293 Kernel_UnPack<NElems, Float, Gauge, false><< < grid[
d],
block[
d], 0, GFStream[0] >> >
294 (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(recv_d[
d]),
parity,
d, dir, data.R()[
d] - 1);
298 cudaMemcpyAsync(recvg_d[
d], recvg[
d],
bytes[
d], cudaMemcpyHostToDevice, GFStream[1]);
// Unpack the opposite halo into the high ghost slice (X[d]-R[d]).
304 Kernel_UnPack<NElems, Float, Gauge, false><< < grid[
d],
block[
d], 0, GFStream[1] >> >
305 (faceVolumeCB[
d], dataexarg,
reinterpret_cast<complex<Float>*
>(recvg_d[
d]),
parity,
d, dir,
X[
d] - data.R()[
d]);
// PGaugeExchange<Float>: dispatch on the gauge field's reconstruction type,
// instantiating the worker with the matching element count per link
// (18 = full matrix, 12 / 8 = compressed). NOTE(review): the function
// signature, the else-if chain and the non-native branch were lost in
// extraction -- only the case bodies remain.
319 template<
typename Float>
325 if ( data.isNative() ) {
327 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
328 PGaugeExchange<Float, 18>(G(data), data, dir,
parity);
330 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
331 PGaugeExchange<Float, 12>(G(data), data, dir,
parity);
333 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
334 PGaugeExchange<Float, 8>(G(data), data, dir,
parity);
// Any other reconstruction is a hard error.
336 errorQuda(
"Reconstruction type %d of gauge field not supported", data.Reconstruct());
// Public entry point fragment: dispatch on the gauge field's precision
// (see the doxygen residue below: "void PGaugeExchange(cudaGaugeField &data,
// const int dir, const int parity)"). NOTE(review): the switch skeleton was
// lost in extraction; half precision aborts, single/double forward to the
// templated worker, and the whole body is stubbed out with an error when
// the library is built without GPU_GAUGE_ALG.
343 #endif // GPU_GAUGE_ALG 351 errorQuda(
"Half precision not supported\n");
354 PGaugeExchange<float> (data, dir,
parity);
356 PGaugeExchange<double>(data, dir,
parity);
363 errorQuda(
"Pure gauge code has not been built");
int commDimPartitioned(int dir)
#define pinned_malloc(size)
std::complex< double > Complex
cudaColorSpinorField * tmp
void PGaugeExchange(cudaGaugeField &data, const int dir, const int parity)
Perform heatbath and overrelaxation. Performs nhb heatbath steps followed by nover overrelaxation ste...
void comm_free(MsgHandle *mh)
#define comm_declare_send_relative(buffer, dim, dir, nbytes)
void PGaugeExchangeFree()
Release all allocated memory used to exchange data between nodes.
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
#define comm_declare_receive_relative(buffer, dim, dir, nbytes)
void comm_start(MsgHandle *mh)
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define device_malloc(size)
struct cudaExtent unsigned int cudaArray_t array
void comm_wait(MsgHandle *mh)
static __inline__ size_t size_t d
QudaPrecision Precision() const
int comm_dim_partitioned(int dim)