12 template <
typename OutOrder,
typename InOrder>
24 CopyGaugeExArg(
const OutOrder &out,
const InOrder &in,
const int *Xout,
const int *Xin,
25 const int *faceVolumeCB,
int nDim,
int geometry)
26 : out(out), in(in), nDim(nDim), geometry(geometry) {
27 for (
int d=0; d<nDim; d++) {
28 this->Xout[d] = Xout[d];
29 this->Xin[d] = Xin[d];
30 this->faceVolumeCB[d] = faceVolumeCB[d];
33 if (out.volumeCB > in.volumeCB) {
34 this->volume = 2*in.volumeCB;
35 this->volumeEx = 2*out.volumeCB;
36 this->regularToextended =
true;
38 this->volume = 2*out.volumeCB;
39 this->volumeEx = 2*in.volumeCB;
40 this->regularToextended =
false;
49 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder,
bool regularToextended>
58 if(regularToextended){
60 for (
int d=0; d<4; d++) R[d] = (arg.
Xout[d] - arg.
Xin[d]) >> 1;
61 int za = X/(arg.
Xin[0]/2);
62 int x0h = X - za*(arg.
Xin[0]/2);
63 int zb = za/arg.
Xin[1];
64 x[1] = za - zb*arg.
Xin[1];
65 x[3] = zb / arg.
Xin[2];
66 x[2] = zb - x[3]*arg.
Xin[2];
67 x[0] = 2*x0h + ((x[1] + x[2] + x[3] +
parity) & 1);
69 xout = ((((x[3]+R[3])*arg.
Xout[2] + (x[2]+R[2]))*arg.
Xout[1] + (x[1]+R[1]))*arg.
Xout[0]+(x[0]+R[0])) >> 1;
73 for (
int d=0; d<4; d++) R[d] = (arg.
Xin[d] - arg.
Xout[d]) >> 1;
74 int za = X/(arg.
Xout[0]/2);
75 int x0h = X - za*(arg.
Xout[0]/2);
76 int zb = za/arg.
Xout[1];
77 x[1] = za - zb*arg.
Xout[1];
78 x[3] = zb / arg.
Xout[2];
79 x[2] = zb - x[3]*arg.
Xout[2];
80 x[0] = 2*x0h + ((x[1] + x[2] + x[3] +
parity) & 1);
82 xin = ((((x[3]+R[3])*arg.
Xin[2] + (x[2]+R[2]))*arg.
Xin[1] + (x[1]+R[1]))*arg.
Xin[0]+(x[0]+R[0])) >> 1;
88 arg.
out(d, xout, parity) =
out;
92 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder,
bool regularToextended>
96 copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, regularToextended>(
arg,
X,
parity);
101 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder,
bool regularToextended>
104 int X = blockIdx.x * blockDim.x + threadIdx.x;
105 if (X >= arg.
volume/2)
return;
106 copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, regularToextended>(
arg,
X,
parity);
110 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
125 : arg(arg), meta(meta), location(location) {
126 writeAuxString(
"out_stride=%d,in_stride=%d,geometry=%d",arg.
out.stride,arg.
in.stride,arg.
geometry);
135 else copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder, false>(
arg);
137 if(arg.
regularToextended) copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder, true>
139 else copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder, false>
148 long long flops()
const {
return 0; }
150 int sites = 4*arg.
volume/2;
151 return 2 * sites * ( arg.
in.Bytes() + arg.
in.hasPhase*
sizeof(FloatIn)
152 + arg.
out.Bytes() + arg.
out.hasPhase*
sizeof(FloatOut) );
157 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
158 void copyGaugeEx(OutOrder outOrder,
const InOrder inOrder,
const int *
E,
162 arg(outOrder, inOrder, E, X, faceVolumeCB, meta.
Ndim(), meta.
Geometry());
168 template <
typename FloatOut,
typename FloatIn,
int length,
typename InOrder>
173 for (
int i=0; i<4; i++) faceVolumeCB[i] = out.
SurfaceCB(i) * out.
Nface();
178 copyGaugeEx<FloatOut, FloatIn, length>(G(out, Out), inOrder, out.
X(),
X, faceVolumeCB,
out, location);
180 #if QUDA_RECONSTRUCT & 2 182 copyGaugeEx<FloatOut,FloatIn,length>
183 (G(out, Out), inOrder, out.
X(),
X, faceVolumeCB,
out, location);
185 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
188 #if QUDA_RECONSTRUCT & 1 190 copyGaugeEx<FloatOut,FloatIn,length>
191 (G(out, Out), inOrder, out.
X(),
X, faceVolumeCB,
out, location);
193 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
195 #ifdef GPU_STAGGERED_DIRAC 197 #if QUDA_RECONSTRUCT & 2 199 copyGaugeEx<FloatOut,FloatIn,length>
200 (G(out, Out), inOrder, out.
X(),
X, faceVolumeCB,
out, location);
202 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-13", QUDA_RECONSTRUCT);
205 #if QUDA_RECONSTRUCT & 1 207 copyGaugeEx<FloatOut,FloatIn,length>
208 (G(out, Out), inOrder, out.
X(),
X, faceVolumeCB,
out, location);
210 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
212 #endif // GPU_STAGGERED_DIRAC 218 #ifdef BUILD_QDP_INTERFACE 219 copyGaugeEx<FloatOut,FloatIn,length>
222 errorQuda(
"QDP interface has not been built\n");
227 #ifdef BUILD_MILC_INTERFACE 228 copyGaugeEx<FloatOut,FloatIn,length>
231 errorQuda(
"MILC interface has not been built\n");
236 #ifdef BUILD_TIFR_INTERFACE 237 copyGaugeEx<FloatOut,FloatIn,length>
240 errorQuda(
"TIFR interface has not been built\n");
249 template <
typename FloatOut,
typename FloatIn,
int length>
251 FloatOut *Out, FloatIn *In) {
256 copyGaugeEx<FloatOut, FloatIn, length>(G(in, In), in.
X(),
out, location, Out);
258 #if QUDA_RECONSTRUCT & 2 260 copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.
X(),
out, location, Out);
262 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-12", QUDA_RECONSTRUCT);
265 #if QUDA_RECONSTRUCT & 1 267 copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.
X(),
out, location, Out);
269 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT);
271 #ifdef GPU_STAGGERED_DIRAC 273 #if QUDA_RECONSTRUCT & 2 275 copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.
X(),
out, location, Out);
277 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-13", QUDA_RECONSTRUCT);
280 #if QUDA_RECONSTRUCT & 1 282 copyGaugeEx<FloatOut,FloatIn,length> (G(in, In), in.
X(),
out, location, Out);
284 errorQuda(
"QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT);
286 #endif // GPU_STAGGERED_DIRAC 292 #ifdef BUILD_QDP_INTERFACE 294 in.X(),
out, location, Out);
296 errorQuda(
"QDP interface has not been built\n");
301 #ifdef BUILD_MILC_INTERFACE 303 in.X(),
out, location, Out);
305 errorQuda(
"MILC interface has not been built\n");
310 #ifdef BUILD_TIFR_INTERFACE 312 in.X(),
out, location, Out);
314 errorQuda(
"TIFR interface has not been built\n");
323 template <
typename FloatOut,
typename FloatIn>
325 FloatOut *Out, FloatIn *In) {
337 copyGaugeEx<FloatOut,FloatIn,18>(
out,
in, location, Out, In);
346 for (
int d=0; d<in.
Ndim(); d++) {
347 if ( (out.
X()[d] - in.
X()[d]) % 2 != 0)
348 errorQuda(
"Cannot copy into an asymmetrically extended gauge field");
353 copyGaugeEx(out, in, location, (
double*)Out, (
double*)In);
355 #if QUDA_PRECISION & 4 356 copyGaugeEx(out, in, location, (
double*)Out, (
float*)In);
358 errorQuda(
"QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
363 copyGaugeEx(out, in, location, (
float*)Out, (
double*)In);
365 #if QUDA_PRECISION & 4 366 copyGaugeEx(out, in, location, (
float*)Out, (
float*)In);
368 errorQuda(
"QUDA_PRECISION=%d does not enable single precision", QUDA_PRECISION);
struct to define TIFR ordered gauge fields: [mu][parity][volumecb][col][row]
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
QudaFieldLocation location
CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *Xout, const int *Xin, const int *faceVolumeCB, int nDim, int geometry)
__device__ __host__ void copyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, int X, int parity)
QudaVerbosity getVerbosity()
void apply(const cudaStream_t &stream)
QudaLinkType LinkType() const
const char * VolString() const
const int * SurfaceCB() const
QudaFieldGeometry Geometry() const
CopyGaugeExArg< OutOrder, InOrder > arg
unsigned int sharedBytesPerThread() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
unsigned int sharedBytesPerBlock(const TuneParam ¶m) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType Reconstruct() const
QudaGaugeFieldOrder Order() const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
unsigned int minThreads() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__global__ void copyGaugeExKernel(CopyGaugeExArg< OutOrder, InOrder > arg)
CopyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, const GaugeField &meta, QudaFieldLocation location)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)