#include <quda_internal.h>
#include <gauge_field_order.h>
#include <quda_matrix.h>
#include <instantiate.h>

namespace quda {
  template <typename Order, int nDim, int dim>
  struct ExtractGhostExArg {
    Order order;
    int X[nDim], R[nDim];
    int surfaceCB[nDim];
    int A0[nDim], A1[nDim];
    int B0[nDim], B1[nDim];
    int C0[nDim], C1[nDim];
    int fBody[nDim][nDim];
    int fBuf[nDim][nDim];
    int localParity[nDim];
    int threads;
    ExtractGhostExArg(const Order &order, const int *X_, const int *R_,
                      const int *surfaceCB_,
                      const int *A0_, const int *A1_, const int *B0_, const int *B1_,
                      const int *C0_, const int *C1_, const int fBody_[nDim][nDim],
                      const int fBuf_[nDim][nDim], const int *localParity_)
      : order(order), threads(0) {
      threads = R_[dim]*(A1_[dim]-A0_[dim])*(B1_[dim]-B0_[dim])*(C1_[dim]-C0_[dim])*order.geometry;
      for (int d=0; d<nDim; d++) {
        X[d] = X_[d]; R[d] = R_[d];
        surfaceCB[d] = surfaceCB_[d];
        A0[d] = A0_[d]; A1[d] = A1_[d];
        B0[d] = B0_[d]; B1[d] = B1_[d];
        C0[d] = C0_[d]; C1[d] = C1_[d];
        for (int e=0; e<nDim; e++) {
          fBody[d][e] = fBody_[d][e];
          fBuf[d][e] = fBuf_[d][e];
        }
        localParity[d] = localParity_[d];
      }
    }
  };
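  // The [A0,A1) x [B0,B1) x [C0,C1) ranges parametrize the slab perpendicular
  // to the ghost dimension, fBody/fBuf hold the linear index strides into the
  // bulk field and the ghost buffer respectively, and threads counts the sites
  // of one R-deep ghost slab times the field geometry.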
  template <typename Float, int length, int dim, typename Arg>
  __device__ __host__ void extractor(Arg &arg, int dir, int a, int b,
                                     int c, int d, int g, int parity) {
    int srcIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int dstIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-(dir?arg.X[dim]:arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    Matrix<complex<typename mapper<Float>::type>, Ncolor(length)> u;

    // load the element destined for the ghost zone from the bulk
    u = arg.order(g, srcIdx, parity);

    // need dir dependence in write
    // srcIdx is used here to determine the boundary condition
    arg.order.saveGhostEx(u.data, dstIdx, srcIdx, dir, dim, g,
                          (parity+arg.localParity[dim])&1, arg.R);
  }
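  // Index arithmetic above: the drivers below set d to run over [X,X+R) on the
  // forward face (dir=1) and [R,2R) on the backward face (dir=0) when
  // extracting, so subtracting dir ? X[dim] : R[dim] always lands the
  // ghost-buffer coordinate in [0,R); the >>1 converts a full lexicographic
  // index into a single-parity (checkerboard) index.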
  template <typename Float, int length, int dim, typename Arg>
  __device__ __host__ void injector(Arg &arg, int dir, int a, int b,
                                    int c, int d, int g, int parity) {
    int srcIdx = (a*arg.fBuf[dim][0] + b*arg.fBuf[dim][1] +
                  c*arg.fBuf[dim][2] + (d-dir*(arg.X[dim]+arg.R[dim]))*arg.fBuf[dim][3]) >> 1;

    int dstIdx = (a*arg.fBody[dim][0] + b*arg.fBody[dim][1] +
                  c*arg.fBody[dim][2] + d*arg.fBody[dim][3]) >> 1;

    int oddness = (parity+arg.localParity[dim])&1;

    Matrix<complex<typename mapper<Float>::type>, Ncolor(length)> u;

    // need dir dependence in read
    // dstIdx is used here to determine the boundary condition
    arg.order.loadGhostEx(u.data, srcIdx, dstIdx, dir, dim, g, oddness, arg.R);

    arg.order(g, dstIdx, parity) = u; // save the ghost element into the bulk
  }
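  // For injection the drivers set D0 = dir*(X[dim]+R[dim]), so the halo sites
  // being refilled lie at d in [0,R) behind the backward face and in
  // [X+R,X+2R) beyond the forward face; subtracting dir*(X[dim]+R[dim]) again
  // maps the buffer coordinate into [0,R).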
  /**
     Generic CPU gauge ghost extraction and packing
     NB This routine is specialized to four dimensions
  */
  template <typename Float, int length, int dim, bool extract, typename Arg>
  void extractGhostEx(Arg &arg)
  {
    for (int parity=0; parity<2; parity++) {

      // the following 4-way loop means this is specialized for 4 dimensions
      // dir = 0 backwards, dir = 1 forwards
      for (int dir=0; dir<2; dir++) {

        int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);

        for (int d=D0; d<D0+arg.R[dim]; d++) {
          for (int a=arg.A0[dim]; a<arg.A1[dim]; a++) { // loop over the interior surface
            for (int b=arg.B0[dim]; b<arg.B1[dim]; b++) { // loop over the interior surface
              for (int c=arg.C0[dim]; c<arg.C1[dim]; c++) { // loop over the interior surface
                for (int g=0; g<arg.order.geometry; g++) {

                  // only extract for the parity we are currently working on
                  int oddness = (a+b+c+d) & 1;
                  if (oddness == parity) {
                    if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
                    else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
                  } // oddness == parity
                } // g
              } // c
            } // b
          } // a
        } // d
      } // dir
    } // parity
  }
  /**
     Generic GPU gauge ghost extraction and packing
     NB This routine is specialized to four dimensions
     FIXME this implementation will have two-way warp divergence
  */
  template <typename Float, int length, int dim, bool extract, typename Arg>
  __global__ void extractGhostExKernel(Arg arg)
  {
    // the parity and dir loops of the CPU variant are parallelized over the
    // z and y grid dimensions
    int parity = blockIdx.z;

    // dir = 0 backwards, dir = 1 forwards
    int dir = blockIdx.y;
    // this will have two-way warp divergence since we only do work on
    // one parity but parity alternates between threads

    // linear index used for writing into the ghost buffer
    int X = blockIdx.x * blockDim.x + threadIdx.x;
    if (X >= arg.threads) return;
    int dA = arg.A1[dim]-arg.A0[dim];
    int dB = arg.B1[dim]-arg.B0[dim];
    int dC = arg.C1[dim]-arg.C0[dim];
    int D0 = extract ? dir*arg.X[dim] + (1-dir)*arg.R[dim] : dir*(arg.X[dim] + arg.R[dim]);
    // thread order is optimized to maximize coalescing
    // X = (((g*R + d) * dA + a)*dB + b)*dC + c
    int gdab = X / dC;
    int c    = arg.C0[dim] + X    - gdab*dC;
    int gda  = gdab / dB;
    int b    = arg.B0[dim] + gdab - gda*dB;
    int gd   = gda / dA;
    int a    = arg.A0[dim] + gda  - gd*dA;
    int g    = gd / arg.R[dim];
    int d    = D0 + gd - g*arg.R[dim];
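    // e.g. unflattening X = 37 with dA = dB = dC = 4 and R = 2 gives gdab = 9,
    // c = C0+1; gda = 2, b = B0+1; gd = 0, a = A0+2; g = 0, d = D0.  Since the
    // ghost-buffer stride for c is 1 in every dimension (see fBuf below),
    // consecutive threads touch consecutive buffer elements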
    // only extract for the parity we are currently working on
    int oddness = (a+b+c+d) & 1;
    if (oddness == parity) {
      if (extract) extractor<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
      else injector<Float,length,dim>(arg, dir, a, b, c, d, g, parity);
    } // oddness == parity
  }
  template <typename Float, int length, int nDim, int dim, typename Order>
  class ExtractGhostEx : Tunable {
    ExtractGhostExArg<Order,nDim,dim> arg;
    int size;
    bool extract;
    const GaugeField &meta;
    QudaFieldLocation location;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneGridDim() const { return false; } // don't tune the grid dimensions
    unsigned int minThreads() const { return size; }

  public:
    ExtractGhostEx(ExtractGhostExArg<Order,nDim,dim> &arg, bool extract,
                   const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), extract(extract), meta(meta), location(location) {
      int dA = arg.A1[dim]-arg.A0[dim];
      int dB = arg.B1[dim]-arg.B0[dim];
      int dC = arg.C1[dim]-arg.C0[dim];
      size = arg.R[dim]*dA*dB*dC*arg.order.geometry;
      writeAuxString("prec=%lu,stride=%d,extract=%d,dimension=%d,geometry=%d",
                     sizeof(Float), arg.order.stride, extract, dim, arg.order.geometry);
    }
    void apply(const qudaStream_t &stream) {
      if (extract) { // we are extracting
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,dim,true>(arg);
        } else {
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2; tp.grid.z = 2; // grid y and z span dir and parity
          qudaLaunchKernel(extractGhostExKernel<Float,length,dim,true,decltype(arg)>, tp, stream, arg);
        }
      } else { // we are injecting
        if (location==QUDA_CPU_FIELD_LOCATION) {
          extractGhostEx<Float,length,dim,false>(arg);
        } else {
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          tp.grid.y = 2; tp.grid.z = 2; // grid y and z span dir and parity
          qudaLaunchKernel(extractGhostExKernel<Float,length,dim,false,decltype(arg)>, tp, stream, arg);
        }
      }
    }
    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 0; }
    long long bytes() const { return 2 * 2 * 2 * size * arg.order.Bytes(); } // 2 for i/o
  };
  /**
     Generic gauge ghost extraction and packing (CPU or GPU according to location)
     NB This routine is specialized to four dimensions
     @param E the extended gauge dimensions
     @param R array holding the radius of the extended region
     @param extract Whether we are extracting or injecting the ghost zone
  */
  template <typename Float, int length, typename Order>
  void extractGhostEx(Order order, const int dim, const int *surfaceCB, const int *E,
                      const int *R, bool extract, const GaugeField &u, QudaFieldLocation location) {
    const int nDim = 4;
    // loop variables: a, b, c with a the most significant and c the least significant
    // A0, B0, C0 give the minimum values
    // A1, B1, C1 give the maximum values

    int X[nDim]; // compute interior dimensions
    for (int d=0; d<nDim; d++) X[d] = E[d] - 2*R[d];
    //                    x          y            z            t
    int A0[nDim] = {R[3],      R[3],        R[3],        0};
    int A1[nDim] = {X[3]+R[3], X[3]+R[3],   X[3]+R[3],   X[2]+2*R[2]};

    int B0[nDim] = {R[2],      R[2],        0,           0};
    int B1[nDim] = {X[2]+R[2], X[2]+R[2],   X[1]+2*R[1], X[1]+2*R[1]};

    int C0[nDim] = {R[1],      0,           0,           0};
    int C1[nDim] = {X[1]+R[1], X[0]+2*R[0], X[0]+2*R[0], X[0]+2*R[0]};
    int fSrc[nDim][nDim] = {
      {E[2]*E[1]*E[0], E[1]*E[0], E[0], 1},
      {E[2]*E[1]*E[0], E[1]*E[0], 1,    E[0]},
      {E[2]*E[1]*E[0], E[0],      1,    E[1]*E[0]},
      {E[1]*E[0],      E[0],      1,    E[2]*E[1]*E[0]}
    };

    int fBuf[nDim][nDim] = {
      {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]},
      {E[2]*E[0], E[0], 1, E[3]*E[2]*E[0]},
      {E[1]*E[0], E[0], 1, E[3]*E[1]*E[0]},
      {E[1]*E[0], E[0], 1, E[2]*E[1]*E[0]}
    };
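    // e.g. for dim=0 the buffer strides fBuf[0] = {E[2]*E[1], E[1], 1, E[3]*E[2]*E[1]}
    // omit the x extent: the stride for d is the full face volume, so each of
    // the R halo layers is packed contiguously, and the stride for c is 1 in
    // every row, which is what lets consecutive kernel threads coalesce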
    // set the local processor parity: switch the odd and even ghost buffers
    // when the local extent of a dimension is odd, but only if the grid size
    // in that dimension is greater than 1
    // FIXME - I don't understand this, shouldn't it be commDim(dim) == 0 ?
    int localParity[nDim];
    for (int d=0; d<nDim; d++)
      localParity[d] = ((X[d] % 2 == 1) && (commDim(d) > 1)) ? 1 : 0;
    if (dim==0) {
      ExtractGhostExArg<Order,nDim,0> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,0,Order> extractor(arg, extract, u, location);
      extractor.apply(0); // launch on the default stream
    } else if (dim==1) {
      ExtractGhostExArg<Order,nDim,1> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,1,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==2) {
      ExtractGhostExArg<Order,nDim,2> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,2,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else if (dim==3) {
      ExtractGhostExArg<Order,nDim,3> arg(order, X, R, surfaceCB, A0, A1, B0, B1,
                                          C0, C1, fSrc, fBuf, localParity);
      ExtractGhostEx<Float,length,nDim,3,Order> extractor(arg, extract, u, location);
      extractor.apply(0);
    } else {
      errorQuda("Invalid dim=%d", dim);
    }
  }
  /** This is the template driver for extractGhostEx */
  template <typename Float> struct GhostExtractEx {
    GhostExtractEx(const GaugeField &u, int dim, const int *R, void **Ghost_, bool extract)
    {
      Float **Ghost = reinterpret_cast<Float**>(Ghost_);
      const int length = 18;

      QudaFieldLocation location =
        (typeid(u)==typeid(cudaGaugeField)) ? QUDA_CUDA_FIELD_LOCATION : QUDA_CPU_FIELD_LOCATION;

      if (u.isNative()) {
        if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
#if QUDA_RECONSTRUCT & 2
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_12", QUDA_RECONSTRUCT);
#endif
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_8) {
#if QUDA_RECONSTRUCT & 1
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_8", QUDA_RECONSTRUCT);
#endif
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_13) {
#if QUDA_RECONSTRUCT & 2
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_13>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_13", QUDA_RECONSTRUCT);
#endif
        } else if (u.Reconstruct() == QUDA_RECONSTRUCT_9) {
#if QUDA_RECONSTRUCT & 1
          typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_9>::type G;
          extractGhostEx<Float,length>(G(u, 0, Ghost), dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
          errorQuda("QUDA_RECONSTRUCT = %d does not enable QUDA_RECONSTRUCT_9", QUDA_RECONSTRUCT);
#endif
        } else {
          errorQuda("Unsupported reconstruction type %d", u.Reconstruct());
        }
      } else if (u.Order() == QUDA_QDP_GAUGE_ORDER) {

#ifdef BUILD_QDP_INTERFACE
        extractGhostEx<Float,length>(QDPOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("QDP interface has not been built");
#endif

      } else if (u.Order() == QUDA_QDPJIT_GAUGE_ORDER) {

#ifdef BUILD_QDPJIT_INTERFACE
        extractGhostEx<Float,length>(QDPJITOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("QDPJIT interface has not been built");
#endif

      } else if (u.Order() == QUDA_CPS_WILSON_GAUGE_ORDER) {

#ifdef BUILD_CPS_INTERFACE
        extractGhostEx<Float,length>(CPSOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("CPS interface has not been built");
#endif

      } else if (u.Order() == QUDA_MILC_GAUGE_ORDER) {

#ifdef BUILD_MILC_INTERFACE
        extractGhostEx<Float,length>(MILCOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("MILC interface has not been built");
#endif

      } else if (u.Order() == QUDA_BQCD_GAUGE_ORDER) {

#ifdef BUILD_BQCD_INTERFACE
        extractGhostEx<Float,length>(BQCDOrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("BQCD interface has not been built");
#endif

      } else if (u.Order() == QUDA_TIFR_GAUGE_ORDER) {

#ifdef BUILD_TIFR_INTERFACE
        extractGhostEx<Float,length>(TIFROrder<Float,length>(u, 0, Ghost),
                                     dim, u.SurfaceCB(), u.X(), R, extract, u, location);
#else
        errorQuda("TIFR interface has not been built");
#endif

      } else {
        errorQuda("Gauge field order %d not supported", u.Order());
      }
    }
  };
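  /**
     Extract the extended-gauge ghost zone for the given dimension, or inject
     it back into the bulk when extract = false.  Illustrative usage sketch
     (R is the per-dimension halo width, ghost the per-dimension buffers):

       int R[4] = {2, 2, 2, 2};
       for (int d=0; d<4; d++) extractExtendedGaugeGhost(u, d, R, ghost, true);
  */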
  void extractExtendedGaugeGhost(const GaugeField &u, int dim, const int *R,
                                 void **ghost, bool extract)
  {
    instantiatePrecision<GhostExtractEx>(u, dim, R, ghost, extract);
  }

} // namespace quda