QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
copy_gauge_helper.cuh
Go to the documentation of this file.
1 #include <tune_quda.h>
2 
3 #include <jitify_helper.cuh>
4 #include <kernels/copy_gauge.cuh>
5 
6 namespace quda {
7 
8  using namespace gauge;
9 
10  template <typename FloatOut, typename FloatIn, int length, typename Arg>
13  int size;
14  const GaugeField &meta;
16  bool is_ghost;
17 
18  private:
19  unsigned int sharedBytesPerThread() const { return 0; }
20  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
21 
22  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
23  unsigned int minThreads() const { return size; }
24 
26  // only do autotuning if we are doing the copy on the GPU
27  return location == QUDA_CUDA_FIELD_LOCATION ? TunableVectorYZ::advanceTuneParam(param) : false;
28  }
29 
30 public:
31  CopyGauge(Arg &arg, const GaugeField &out, const GaugeField &in, QudaFieldLocation location)
32 #ifndef FINE_GRAINED_ACCESS
33  : TunableVectorYZ(1, in.Geometry()* 2),
34 #else
35  : TunableVectorYZ(Ncolor(length), in.Geometry() * 2),
36 #endif
37  arg(arg), meta(in), location(location), is_ghost(false) {
38 
39  set_ghost(is_ghost); // initial state is not ghost
40 
41  strcpy(aux, compile_type_str(in,location));
42  strcat(aux, "out:");
43  strcat(aux, out.AuxString());
44  strcat(aux, ",in:");
45  strcat(aux, in.AuxString());
46 
47 #ifdef FINE_GRAINED_ACCESS
48  strcat(aux,",fine-grained");
49 #endif
50 
51 #ifdef JITIFY
52 #ifdef FINE_GRAINED_ACCESS
53  std::vector<std::string> macro = { "-DFINE_GRAINED_ACCESS" }; // need to pass macro to jitify
54  create_jitify_program("kernels/copy_gauge.cuh", macro);
55 #else
56  create_jitify_program("kernels/copy_gauge.cuh");
57 #endif
58 #endif
59  }
60 
61  void set_ghost(int is_ghost_) {
62  is_ghost = is_ghost_;
63  if (is_ghost_ == 2) arg.out_offset = meta.Ndim(); // forward links
64 
65  int faceMax = 0;
66  for (int d=0; d<arg.nDim; d++) {
67  faceMax = (arg.faceVolumeCB[d] > faceMax ) ? arg.faceVolumeCB[d] : faceMax;
68  }
69  size = is_ghost ? faceMax : arg.volume/2;
70  if (size == 0 && is_ghost) {
71  errorQuda("Cannot copy zero-sized ghost zone. Check nFace parameter is non-zero for both input and output gauge fields");
72  }
73 #ifndef FINE_GRAINED_ACCESS
74  resizeVector(1, (is_ghost ? arg.nDim : meta.Geometry()) * 2);
75 #else
76  resizeVector(Ncolor(length), (is_ghost ? arg.nDim : meta.Geometry()) * 2);
77 #endif
78  }
79 
80  virtual ~CopyGauge() { ; }
81 
82  void apply(const cudaStream_t &stream) {
83  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
84  if (location == QUDA_CPU_FIELD_LOCATION) {
85  if (!is_ghost) {
86  copyGauge<FloatOut, FloatIn, length>(arg);
87  } else {
88  copyGhost<FloatOut, FloatIn, length>(arg);
89  }
90  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
91 #ifdef JITIFY
92  using namespace jitify::reflection;
93  jitify_error = program->kernel(!is_ghost ? "quda::copyGaugeKernel" : "quda::copyGhostKernel")
94  .instantiate(Type<FloatOut>(),Type<FloatIn>(),length,Type<Arg>())
95  .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
96 #else
97  if (!is_ghost) {
98  copyGaugeKernel<FloatOut, FloatIn, length, Arg>
99  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
100  } else {
101  copyGhostKernel<FloatOut, FloatIn, length, Arg>
102  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
103  }
104 #endif
105  } else {
106  errorQuda("Invalid field location %d\n", location);
107  }
108  }
109 
110  TuneKey tuneKey() const {
111  char aux_[TuneKey::aux_n];
112  strcpy(aux_,aux);
113  if (is_ghost) strcat(aux_, ",ghost");
114  return TuneKey(meta.VolString(), typeid(*this).name(), aux_);
115  }
116 
117  long long flops() const { return 0; }
118  long long bytes() const {
119  int sites = 4*arg.volume/2;
120  if (is_ghost) {
121  sites = 0;
122  for (int d=0; d<4; d++) sites += arg.faceVolumeCB[d];
123  }
124 #ifndef FINE_GRAINED_ACCESS
125  return 2 * sites * ( arg.in.Bytes() + arg.in.hasPhase*sizeof(FloatIn)
126  + arg.out.Bytes() + arg.out.hasPhase*sizeof(FloatOut) );
127 #else
128  return 2 * sites * ( arg.in.Bytes() + arg.out.Bytes() );
129 #endif
130  }
131  };
132 
133 
134  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
135  void copyGauge(OutOrder &&outOrder, const InOrder &inOrder, const GaugeField &out, const GaugeField &in,
136  QudaFieldLocation location, int type) {
137 
138  CopyGaugeArg<OutOrder,InOrder> arg(outOrder, inOrder, in);
139  CopyGauge<FloatOut, FloatIn, length, CopyGaugeArg<OutOrder,InOrder> > gaugeCopier(arg, out, in, location);
140 
141 #ifdef HOST_DEBUG
142  if (location == QUDA_CPU_FIELD_LOCATION) checkNan<FloatIn, length>(arg);
143 #endif
144 
145  // first copy body
146  if (type == 0 || type == 2) {
147  gaugeCopier.set_ghost(0);
148  gaugeCopier.apply(0);
149  }
150 
151 #ifdef MULTI_GPU
152  if (type == 0 || type == 1) {
154  // now copy ghost
155  gaugeCopier.set_ghost(1);
156  gaugeCopier.apply(0);
157  } else {
158  warningQuda("Cannot copy for %d geometry gauge field", in.Geometry());
159  }
160  }
161 
162  // special copy that only copies the second set of links in the
163  // ghost zone for bi-directional link fields - at present this is
164  // only used in cudaGaugefield::exchangeGhost where we copy from
165  // the buffer into the field's ghost zone (padded
166  // region), so we only have the offset on the receiver
167  if (type == 3) {
168  if (in.Geometry() != QUDA_COARSE_GEOMETRY) errorQuda("Cannot request copy type %d on non-coarse link fields", in.Geometry());
169  gaugeCopier.set_ghost(2);
170  gaugeCopier.apply(0);
171  }
172 #endif
173 
174  }
175 
176 } // namespace quda
__host__ __device__ constexpr int Ncolor(int length)
Return the number of colors of the accessor based on the length of the field.
void apply(const cudaStream_t &stream)
const char * AuxString() const
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
bool tuneGridDim() const
#define errorQuda(...)
Definition: util_quda.h:121
Helper file when using jitify run-time compilation. This file should be included in source code...
cudaStream_t * stream
const char * VolString() const
QudaFieldGeometry Geometry() const
Definition: gauge_field.h:258
void copyGauge(Arg &arg)
Definition: copy_gauge.cuh:32
int length[]
const char * compile_type_str(const LatticeField &meta, QudaFieldLocation location_=QUDA_INVALID_FIELD_LOCATION)
Helper function for setting auxilary string.
bool advanceTuneParam(TuneParam &param) const
QudaGaugeParam param
Definition: pack_test.cpp:17
const GaugeField & meta
unsigned int sharedBytesPerBlock(const TuneParam &param) const
unsigned int sharedBytesPerThread() const
QudaFieldLocation location
cpuColorSpinorField * in
constexpr int size
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
#define warningQuda(...)
Definition: util_quda.h:133
long long bytes() const
unsigned int minThreads() const
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
static const int aux_n
Definition: tune_key.h:12
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
CopyGauge(Arg &arg, const GaugeField &out, const GaugeField &in, QudaFieldLocation location)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
void set_ghost(int is_ghost_)
long long flops() const
TuneKey tuneKey() const
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:335