8 template <
typename OutOrder,
typename InOrder>
18 : out(out), in(in), volume(volume), nDim(nDim), geometry(geometry) {
19 for (
int d=0; d<
nDim; d++) this->faceVolumeCB[d] = faceVolumeCB[d];
26 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
38 for (
int i=0; i<
length; i++) out[i] = in[i];
50 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
58 int x = blockIdx.x * blockDim.x + threadIdx.x;
59 if (x >= arg.
volume/2)
return;
64 for (
int i=0; i<
length; i++) out[i] = in[i];
73 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
80 for (
int d=0; d<arg.
nDim; d++) {
85 for (
int i=0; i<
length; i++) out[i] = in[i];
97 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
102 int x = blockIdx.x * blockDim.x + threadIdx.x;
105 for (
int d=0; d<arg.
nDim; d++) {
110 for (
int i=0; i<
length; i++) out[i] = in[i];
118 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder,
bool isGhost>
125 unsigned int sharedBytesPerThread()
const {
return 0; }
126 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
128 bool tuneGridDim()
const {
return false; }
129 unsigned int minThreads()
const {
return size; }
134 for (
int d=0; d<arg.
nDim; d++) {
137 size = isGhost ? faceMax : arg.
volume/2;
145 #if (__COMPUTE_CAPABILITY__ >= 200)
147 copyGaugeKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
150 copyGhostKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
154 errorQuda(
"Gauge copy not supported on pre-Fermi architecture");
161 std::stringstream ps;
162 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
167 long long flops()
const {
return 0; }
169 int sites = 4*arg.volume/2;
172 for (
int d=0; d<4; d++) sites += arg.faceVolumeCB[d];
174 #if __COMPUTE_CAPABILITY__ >= 200
175 return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*
sizeof(FloatIn)
176 + arg.out.Bytes() + arg.out.hasPhase*
sizeof(FloatOut) );
178 return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
184 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
185 void copyGauge(OutOrder outOrder,
const InOrder inOrder,
int volume,
const int *faceVolumeCB,
191 if (type == 0 || type == 2) {
192 copyGauge<FloatOut, FloatIn, length>(
arg);
194 #ifdef MULTI_GPU // only copy the ghost zone if doing multi-gpu
195 if (type == 0 || type == 1) {
202 if (type == 0 || type == 2) {
204 gaugeCopier.
apply(0);
207 if (type == 0 || type == 1) {
211 ghostCopier.
apply(0);
218 errorQuda(
"Undefined field location %d for copyGauge", location);
223 template <
typename FloatOut,
typename FloatIn,
int length,
typename InOrder>
225 FloatOut *Out, FloatOut **outGhost,
int type) {
227 for (
int i=0; i<4; i++) faceVolumeCB[i] = out.
SurfaceCB(i) * out.
Nface();
231 copyGauge<FloatOut,FloatIn,length>
233 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
235 copyGauge<FloatOut,FloatIn,length>
237 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
240 copyGauge<FloatOut,FloatIn,length>
242 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
244 copyGauge<FloatOut,FloatIn,length>
246 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
247 #if defined(GPU_STAGGERED_DIRAC) && __COMPUTE_CAPABILITY__ >= 200
249 copyGauge<FloatOut,FloatIn,length>
251 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
253 copyGauge<FloatOut,FloatIn,length>
255 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
262 copyGauge<FloatOut,FloatIn,length>
264 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
266 copyGauge<FloatOut,FloatIn,length>
268 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
269 #if defined(GPU_STAGGERED_DIRAC) && __COMPUTE_CAPABILITY__ >= 200
271 copyGauge<FloatOut,FloatIn,length>
273 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
275 copyGauge<FloatOut,FloatIn,length>
277 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
284 #ifdef BUILD_QDP_INTERFACE
285 copyGauge<FloatOut,FloatIn,length>
287 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
289 errorQuda(
"QDP interface has not been built\n");
294 #ifdef BUILD_QDPJIT_INTERFACE
295 copyGauge<FloatOut,FloatIn,length>
297 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
299 errorQuda(
"QDPJIT interface has not been built\n");
304 #ifdef BUILD_CPS_INTERFACE
305 copyGauge<FloatOut,FloatIn,length>
307 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
309 errorQuda(
"CPS interface has not been built\n");
314 #ifdef BUILD_MILC_INTERFACE
315 copyGauge<FloatOut,FloatIn,length>
317 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
319 errorQuda(
"MILC interface has not been built\n");
324 #ifdef BUILD_BQCD_INTERFACE
325 copyGauge<FloatOut,FloatIn,length>
327 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
329 errorQuda(
"BQCD interface has not been built\n");
334 #ifdef BUILD_TIFR_INTERFACE
335 copyGauge<FloatOut,FloatIn,length>
337 faceVolumeCB, out.Ndim(), out.Geometry(),
out,
location, type);
339 errorQuda(
"TIFR interface has not been built\n");
348 template <
typename FloatOut,
typename FloatIn,
int length>
350 FloatOut *Out, FloatIn *In, FloatOut **outGhost, FloatIn **inGhost,
int type) {
357 out, location, Out, outGhost, type);
360 out, location, Out, outGhost, type);
364 out, location, Out, outGhost, type);
367 out, location, Out, outGhost, type);
368 #if defined(GPU_STAGGERED_DIRAC) && __COMPUTE_CAPABILITY__ >= 200
371 out, location, Out, outGhost, type);
374 out, location, Out, outGhost, type);
382 out, location, Out, outGhost, type);
385 out, location, Out, outGhost, type);
386 #if defined(GPU_STAGGERED_DIRAC) && __COMPUTE_CAPABILITY__ >= 200
389 out, location, Out, outGhost, type);
392 out, location, Out, outGhost, type);
399 #ifdef BUILD_QDP_INTERFACE
401 out, location, Out, outGhost, type);
403 errorQuda(
"QDP interface has not been built\n");
408 #ifdef BUILD_QDPJIT_INTERFACE
410 out, location, Out, outGhost, type);
412 errorQuda(
"QDPJIT interface has not been built\n");
417 #ifdef BUILD_CPS_INTERFACE
419 out, location, Out, outGhost, type);
421 errorQuda(
"CPS interface has not been built\n");
426 #ifdef BUILD_MILC_INTERFACE
428 out, location, Out, outGhost, type);
430 errorQuda(
"MILC interface has not been built\n");
435 #ifdef BUILD_BQCD_INTERFACE
437 out, location, Out, outGhost, type);
439 errorQuda(
"BQCD interface has not been built\n");
444 #ifdef BUILD_TIFR_INTERFACE
446 out, location, Out, outGhost, type);
448 errorQuda(
"TIFR interface has not been built\n");
459 template <
typename FloatOut,
typename FloatIn>
461 FloatIn *In, FloatOut **outGhost, FloatIn **inGhost,
int type) {
471 #if __COMPUTE_CAPABILITY__ < 200
474 errorQuda(
"Reconstruct 9/13 not supported on pre-Fermi architecture");
479 copyGauge<FloatOut,FloatIn,18>(
out,
in,
location, Out, In, outGhost, inGhost, type);
496 copyGauge<FloatOut,FloatIn,10>(
arg);
498 #ifdef BUILD_MILC_INTERFACE
502 copyGauge<FloatOut,FloatIn,10>(
arg);
504 errorQuda(
"MILC interface has not been built\n");
508 #ifdef BUILD_TIFR_INTERFACE
512 copyGauge<FloatOut,FloatIn,18>(
arg);
514 errorQuda(
"TIFR interface has not been built\n");
521 #ifdef BUILD_MILC_INTERFACE
526 copyGauge<FloatOut,FloatIn,10>(
arg);
531 copyGauge<FloatOut,FloatIn,10>(
arg);
536 errorQuda(
"MILC interface has not been built\n");
540 #ifdef BUILD_TIFR_INTERFACE
546 copyGauge<FloatOut,FloatIn,18>(
arg);
551 copyGauge<FloatOut,FloatIn,10>(
arg);
556 errorQuda(
"TIFR interface has not been built\n");
std::string paramString(const TuneParam ¶m) const
void apply(const cudaStream_t &stream)
QudaVerbosity getVerbosity()
void copyGauge(CopyGaugeArg< OutOrder, InOrder > arg)
CopyGauge(CopyGaugeArg< OutOrder, InOrder > &arg, const GaugeField &meta)
QudaGaugeFieldOrder Order() const
int faceVolumeCB[QUDA_MAX_DIM]
const int * SurfaceCB() const
__global__ void copyGhostKernel(CopyGaugeArg< OutOrder, InOrder > arg)
void writeAuxString(const char *format,...)
const QudaFieldLocation location
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
const char * VolString() const
__global__ void copyGaugeKernel(CopyGaugeArg< OutOrder, InOrder > arg)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
CopyGaugeArg(const OutOrder &out, const InOrder &in, int volume, const int *faceVolumeCB, int nDim, int geometry)
QudaLinkType LinkType() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaFieldGeometry Geometry() const
void checkMomOrder(const GaugeField &u)
void copyGhost(CopyGaugeArg< OutOrder, InOrder > arg)