8 template <
typename OutOrder,
typename InOrder>
21 : out(out), in(in),
volume(2*in.volumeCB),
volumeEx(2*out.volumeCB),
22 nDim(nDim), geometry(geometry) {
23 for (
int d=0; d<
nDim; d++) {
26 this->faceVolumeCB[d] = faceVolumeCB[d];
34 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
41 for (
int d=0; d<4; d++) R[d] = (arg.
E[d] - arg.
X[d]) >> 1;
43 int za = X/(arg.
X[0]/2);
44 int x0h = X - za*(arg.
X[0]/2);
46 x[1] = za - zb*arg.
X[1];
48 x[2] = zb - x[3]*arg.
X[2];
49 x[0] = 2*x0h + ((x[1] + x[2] + x[3] +
parity) & 1);
52 int Y = ((((x[3]+R[3])*arg.
E[2] + (x[2]+R[2]))*arg.
E[1] + (x[1]+R[1]))*arg.
E[0]+(x[0]+R[0])) >> 1;
57 arg.
in.load(in, X, d, parity);
58 for (
int i=0; i<
length; i++) out[i] = in[i];
59 arg.
out.save(out, Y, d, parity);
63 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
67 copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder>(
arg,
X,
parity);
72 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
75 int X = blockIdx.x * blockDim.x + threadIdx.x;
76 if (X >= arg.
volume/2)
return;
77 copyGaugeEx<FloatOut, FloatIn, length, OutOrder, InOrder>(
arg,
X,
parity);
81 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
// Always returns 0: this class requests no per-thread shared bytes.
// NOTE(review): presumably an override consumed by the autotuner when sizing
// dynamic shared memory -- the base class is not visible here, confirm.
88 unsigned int sharedBytesPerThread()
const {
return 0; }
// Always returns 0 regardless of the launch parameters; `param` is accepted
// but never read.
// NOTE(review): presumably the per-block counterpart of sharedBytesPerThread()
// for the autotuner -- confirm against the base class.
89 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
// Always returns false.
// NOTE(review): the name suggests this tells the autotuner not to vary the
// grid dimension -- confirm against the base class, which is not visible here.
91 bool tuneGridDim()
const {
return false; }
// Returns arg.volume/2. This is the same bound used by the
// `X >= arg.volume/2` early-return guard on the thread index elsewhere in
// this file, so it matches the number of thread indices that do work.
// Note: the value is converted to unsigned int on return.
92 unsigned int minThreads()
const {
return arg.volume/2; }
96 : arg(arg), meta(meta), location(location) {
105 copyGaugeEx<FloatOut, FloatIn, length>(arg);
107 #if (__COMPUTE_CAPABILITY__ >= 200)
108 copyGaugeExKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
111 errorQuda(
"Extended gauge copy not supported on pre-Fermi architecture");
121 std::stringstream ps;
122 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
// Always returns 0 as the floating-point operation count.
// NOTE(review): presumably because the work is a pure element-wise copy
// (load, assign, save) with no arithmetic worth reporting -- confirm.
127 long long flops()
const {
return 0; }
129 int sites = 4*arg.volume/2;
130 #if (__COMPUTE_CAPABILITY__ >= 200)
131 return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*
sizeof(FloatIn)
132 + arg.out.Bytes() + arg.out.hasPhase*
sizeof(FloatOut) );
134 return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
140 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
141 void copyGaugeEx(OutOrder outOrder,
const InOrder inOrder,
const int *
E,
145 arg(outOrder, inOrder, E, X, faceVolumeCB, meta.
Ndim(), meta.
Geometry());
151 template <
typename FloatOut,
typename FloatIn,
int length,
typename InOrder>
155 for (
int i=0; i<4; i++) faceVolumeCB[i] = out.
SurfaceCB(i) * out.
Nface();
160 copyGaugeEx<FloatOut,FloatIn,length>
161 (
FloatNOrder<FloatOut,length,2,19>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
163 copyGaugeEx<FloatOut,FloatIn,length>
164 (
FloatNOrder<FloatOut,length,2,18>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
167 copyGaugeEx<FloatOut,FloatIn,length>
168 (
FloatNOrder<FloatOut,length,2,12>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
170 copyGaugeEx<FloatOut,FloatIn,length>
171 (
FloatNOrder<FloatOut,length,2,8>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
172 #ifdef GPU_STAGGERED_DIRAC
174 copyGaugeEx<FloatOut,FloatIn,length>
175 (
FloatNOrder<FloatOut,length,2,13>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
177 copyGaugeEx<FloatOut,FloatIn,length>
178 (
FloatNOrder<FloatOut,length,2,9>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
185 copyGaugeEx<FloatOut,FloatIn,length>
186 (
FloatNOrder<FloatOut,length,4,12>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
188 copyGaugeEx<FloatOut,FloatIn,length>
189 (
FloatNOrder<FloatOut,length,4,8>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
190 #ifdef GPU_STAGGERED_DIRAC
192 copyGaugeEx<FloatOut,FloatIn,length>
193 (
FloatNOrder<FloatOut,length,4,13>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
195 copyGaugeEx<FloatOut,FloatIn,length>
196 (
FloatNOrder<FloatOut,length,4,9>(
out, Out), inOrder, out.X(),
X, faceVolumeCB,
out,
location);
204 #ifdef BUILD_QDP_INTERFACE
205 copyGaugeEx<FloatOut,FloatIn,length>
208 errorQuda(
"QDP interface has not been built\n");
213 #ifdef BUILD_MILC_INTERFACE
214 copyGaugeEx<FloatOut,FloatIn,length>
217 errorQuda(
"MILC interface has not been built\n");
222 #ifdef BUILD_TIFR_INTERFACE
223 copyGaugeEx<FloatOut,FloatIn,length>
226 errorQuda(
"TIFR interface has not been built\n");
235 template <
typename FloatOut,
typename FloatIn,
int length>
237 FloatOut *Out, FloatIn *In) {
254 #ifdef GPU_STAGGERED_DIRAC
272 #ifdef GPU_STAGGERED_DIRAC
286 #ifdef BUILD_QDP_INTERFACE
290 errorQuda(
"QDP interface has not been built\n");
295 #ifdef BUILD_MILC_INTERFACE
299 errorQuda(
"MILC interface has not been built\n");
304 #ifdef BUILD_TIFR_INTERFACE
308 errorQuda(
"TIFR interface has not been built\n");
317 template <
typename FloatOut,
typename FloatIn>
319 FloatOut *Out, FloatIn *In) {
331 copyGaugeEx<FloatOut,FloatIn,18>(
out,
in,
location, Out, In);
340 for (
int d=0; d<in.
Ndim(); d++) {
341 if ( (out.
X()[d] - in.
X()[d]) % 2 != 0)
342 errorQuda(
"Cannot copy into an asymmetrically extended gauge field");
347 copyGaugeEx(out, in, location, (
double*)Out, (
double*)In);
349 copyGaugeEx(out, in, location, (
double*)Out, (
float*)In);
353 copyGaugeEx(out, in, location, (
float*)Out, (
double*)In);
355 copyGaugeEx(out, in, location, (
float*)Out, (
float*)In);
int faceVolumeCB[QUDA_MAX_DIM]
QudaVerbosity getVerbosity()
void apply(const cudaStream_t &stream)
QudaGaugeFieldOrder Order() const
const int * SurfaceCB() const
QudaPrecision Precision() const
void writeAuxString(const char *format,...)
const QudaFieldLocation location
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
const char * VolString() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
QudaLinkType LinkType() const
__device__ __host__ void copyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, int X, int parity)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaFieldGeometry Geometry() const
CopyGaugeEx(CopyGaugeExArg< OutOrder, InOrder > &arg, const GaugeField &meta, QudaFieldLocation location)
std::string paramString(const TuneParam &param) const
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
CopyGaugeExArg(const OutOrder &out, const InOrder &in, const int *E, const int *X, const int *faceVolumeCB, int nDim, int geometry)
__global__ void copyGaugeExKernel(CopyGaugeExArg< OutOrder, InOrder > arg)