10 template <
typename FloatOut,
typename FloatIn,
int length,
typename Arg>
32 #ifndef FINE_GRAINED_ACCESS 37 arg(arg), meta(in), location(location), is_ghost(
false) {
47 #ifdef FINE_GRAINED_ACCESS 48 strcat(aux,
",fine-grained");
52 #ifdef FINE_GRAINED_ACCESS 53 std::vector<std::string> macro = {
"-DFINE_GRAINED_ACCESS" };
54 create_jitify_program(
"kernels/copy_gauge.cuh", macro);
56 create_jitify_program(
"kernels/copy_gauge.cuh");
63 if (is_ghost_ == 2) arg.out_offset = meta.
Ndim();
66 for (
int d=0; d<arg.nDim; d++) {
67 faceMax = (arg.faceVolumeCB[d] > faceMax ) ? arg.faceVolumeCB[d] : faceMax;
69 size = is_ghost ? faceMax : arg.volume/2;
70 if (size == 0 && is_ghost) {
71 errorQuda(
"Cannot copy zero-sized ghost zone. Check nFace parameter is non-zero for both input and output gauge fields");
73 #ifndef FINE_GRAINED_ACCESS 74 resizeVector(1, (is_ghost ? arg.nDim : meta.
Geometry()) * 2);
86 copyGauge<FloatOut, FloatIn, length>(
arg);
88 copyGhost<FloatOut, FloatIn, length>(
arg);
92 using namespace jitify::reflection;
93 jitify_error = program->kernel(!is_ghost ?
"quda::copyGaugeKernel" :
"quda::copyGhostKernel")
94 .instantiate(Type<FloatOut>(),Type<FloatIn>(),
length,Type<Arg>())
98 copyGaugeKernel<FloatOut, FloatIn, length, Arg>
101 copyGhostKernel<FloatOut, FloatIn, length, Arg>
106 errorQuda(
"Invalid field location %d\n", location);
113 if (is_ghost) strcat(aux_,
",ghost");
117 long long flops()
const {
return 0; }
119 int sites = 4*arg.volume/2;
122 for (
int d=0; d<4; d++) sites += arg.faceVolumeCB[d];
124 #ifndef FINE_GRAINED_ACCESS 125 return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*
sizeof(FloatIn)
126 + arg.out.Bytes() + arg.out.hasPhase*
sizeof(FloatOut) );
128 return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
134 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
146 if (type == 0 || type == 2) {
148 gaugeCopier.
apply(0);
152 if (type == 0 || type == 1) {
156 gaugeCopier.
apply(0);
170 gaugeCopier.
apply(0);
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
void apply(const cudaStream_t &stream)
const char * AuxString() const
QudaVerbosity getVerbosity()
Helper file when using jitify run-time compilation. This file should be included in source code...
const char * VolString() const
QudaFieldGeometry Geometry() const
const char * compile_type_str(const LatticeField &meta, QudaFieldLocation location_=QUDA_INVALID_FIELD_LOCATION)
Helper function for setting auxiliary string.
bool advanceTuneParam(TuneParam &param) const
unsigned int sharedBytesPerBlock(const TuneParam &param) const
unsigned int sharedBytesPerThread() const
QudaFieldLocation location
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
unsigned int minThreads() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
CopyGauge(Arg &arg, const GaugeField &out, const GaugeField &in, QudaFieldLocation location)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void set_ghost(int is_ghost_)
virtual bool advanceTuneParam(TuneParam &param) const