10 template <
typename OutOrder,
typename InOrder>
21 const int *faceVolumeCB,
int nDim,
int geometry)
22 :
out(
out),
in(
in), volume(volume), nDim(nDim), geometry(geometry),
23 out_offset(0), in_offset(0) {
24 for (
int d=0;
d<nDim;
d++) this->faceVolumeCB[
d] = faceVolumeCB[
d];
31 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
38 for (
int d=0;
d<
arg.geometry;
d++) {
39 for (
int x=0;
x<
arg.volume/2;
x++) {
40 #ifdef FINE_GRAINED_ACCESS 61 template <
typename Float,
int length,
typename Arg>
67 for (
int d=0;
d<
arg.geometry;
d++) {
68 for (
int x=0;
x<
arg.volume/2;
x++) {
69 #ifdef FINE_GRAINED_ACCESS 95 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
101 int x = blockIdx.x *
blockDim.x + threadIdx.x;
102 int d = blockIdx.y *
blockDim.y + threadIdx.y;
103 if (
x >=
arg.volume/2)
return;
104 if (
d >=
arg.geometry)
return;
106 #ifdef FINE_GRAINED_ACCESS 123 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
130 for (
int d=0;
d<
arg.nDim;
d++) {
131 for (
int x=0;
x<
arg.faceVolumeCB[
d];
x++) {
132 #ifdef FINE_GRAINED_ACCESS 153 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
158 int x = blockIdx.x *
blockDim.x + threadIdx.x;
161 for (
int d=0;
d<
arg.nDim;
d++) {
162 if (
x <
arg.faceVolumeCB[
d]) {
163 #ifdef FINE_GRAINED_ACCESS 180 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder,
bool isGhost>
197 for (
int d=0;
d<
arg.nDim;
d++) {
198 faceMax = (
arg.faceVolumeCB[
d] > faceMax ) ?
arg.faceVolumeCB[
d] : faceMax;
200 size = isGhost ? faceMax :
arg.volume/2;
201 if (
size == 0 && isGhost) {
202 errorQuda(
"Cannot copy zero-sized ghost zone. Check nFace parameter is non-zero for both input and output gauge fields");
205 #ifndef FINE_GRAINED_ACCESS 206 int n = writeAuxString(
"out_stride=%d,in_stride=%d,geometry=%d",
arg.out.stride,
arg.in.stride,
arg.in.geometry);
216 writeAuxString(
"fine-grained,geometry=%d",
arg.in.geometry);
225 copyGaugeKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
228 copyGhostKernel<FloatOut, FloatIn, length, OutOrder, InOrder>
// Returns 0 as the flop count reported by this object.
235 long long flops()
const {
return 0; }
237 int sites = 4*
arg.volume/2;
240 for (
int d=0;
d<4;
d++) sites +=
arg.faceVolumeCB[
d];
242 #ifndef FINE_GRAINED_ACCESS 243 return 2 * sites * (
arg.in.Bytes() +
arg.in.hasPhase*
sizeof(FloatIn)
244 +
arg.out.Bytes() +
arg.out.hasPhase*
sizeof(FloatOut) );
246 return 2 * sites * (
arg.in.Bytes() +
arg.out.Bytes() );
252 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
253 void copyGauge(OutOrder &&outOrder,
const InOrder &inOrder,
int volume,
const int *faceVolumeCB,
261 checkNan<FloatIn, length>(
arg);
264 if (type == 0 || type == 2) {
265 copyGauge<FloatOut, FloatIn, length>(
arg);
267 #ifdef MULTI_GPU // only copy the ghost zone if doing multi-gpu 268 if (type == 0 || type == 1) {
276 arg.out_offset = nDim;
277 copyGhost<FloatOut, FloatIn, length>(
arg);
282 if (type == 0 || type == 2) {
284 gaugeCopier.
apply(0);
287 if (type == 0 || type == 1) {
291 ghostCopier.
apply(0);
300 arg.out_offset = nDim;
302 ghostCopier.
apply(0);
306 errorQuda(
"Undefined field location %d for copyGauge", location);
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
void apply(const cudaStream_t &stream)
int snprintf(char *__str, size_t __size, const char *__format, ...) __attribute__((__format__(__printf__, 3, 4)))
CopyGauge(CopyGaugeArg< OutOrder, InOrder > &arg, const GaugeField &out, const GaugeField &in)
QudaVerbosity getVerbosity()
void copyGauge(CopyGaugeArg< OutOrder, InOrder > arg)
const char * VolString() const
__global__ void copyGhostKernel(CopyGaugeArg< OutOrder, InOrder > arg)
unsigned int minThreads() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
unsigned int sharedBytesPerThread() const
Main header file for host and device accessors to GaugeFields.
__global__ void copyGaugeKernel(CopyGaugeArg< OutOrder, InOrder > arg)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
CopyGaugeArg(const OutOrder &out, const InOrder &in, int volume, const int *faceVolumeCB, int nDim, int geometry)
unsigned int sharedBytesPerBlock(const TuneParam &param) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
void copyGhost(CopyGaugeArg< OutOrder, InOrder > arg)
CopyGaugeArg< OutOrder, InOrder > arg