QUDA  1.0.0
copy_clover.cu
Go to the documentation of this file.
1 #include <clover_field_order.h>
2 #include <tune_quda.h>
3 
4 namespace quda {
5 
6  using namespace clover;
7 
8 #ifdef GPU_CLOVER_DIRAC
9 
13  template <typename Out, typename In>
14  struct CopyCloverArg {
15  Out out;
16  const In in;
17  int volumeCB;
18  CopyCloverArg (const Out &out, const In in, int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
19  };
20 
24  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
25  void copyClover(CopyCloverArg<Out,In> arg) {
26  typedef typename mapper<FloatIn>::type RegTypeIn;
27  typedef typename mapper<FloatOut>::type RegTypeOut;
28 
29  for (int parity=0; parity<2; parity++) {
30  for (int x=0; x<arg.volumeCB; x++) {
31  RegTypeIn in[length];
32  RegTypeOut out[length];
33  arg.in.load(in, x, parity);
34  for (int i=0; i<length; i++) out[i] = in[i];
35  arg.out.save(out, x, parity);
36  }
37  }
38 
39  }
40 
44  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
45  __global__ void copyCloverKernel(CopyCloverArg<Out,In> arg) {
46  typedef typename mapper<FloatIn>::type RegTypeIn;
47  typedef typename mapper<FloatOut>::type RegTypeOut;
48 
49  int x = blockIdx.x * blockDim.x + threadIdx.x;
50  if (x >= arg.volumeCB) return;
51  int parity = blockIdx.y * blockDim.y + threadIdx.y;
52 
53  RegTypeIn in[length];
54  RegTypeOut out[length];
55  arg.in.load(in, x, parity);
56 #pragma unroll
57  for (int i=0; i<length; i++) out[i] = in[i];
58  arg.out.save(out, x, parity);
59 
60  }
61 
62  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
63  class CopyClover : TunableVectorY {
64  CopyCloverArg<Out,In> arg;
65  const CloverField &meta;
66 
67  private:
68  unsigned int sharedBytesPerThread() const { return 0; }
69  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
70 
71  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
72  unsigned int minThreads() const { return arg.volumeCB; }
73 
74  public:
75  CopyClover(CopyCloverArg<Out,In> &arg, const CloverField &meta)
76  : TunableVectorY(2), arg(arg), meta(meta) {
77  writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
78  }
79  virtual ~CopyClover() { ; }
80 
81  void apply(const cudaStream_t &stream) {
82  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
83  copyCloverKernel<FloatOut, FloatIn, length, Out, In>
84  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
85  }
86 
87  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
88 
89  long long flops() const { return 0; }
90  long long bytes() const { return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
91  };
92 
93  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
94  void copyClover(OutOrder outOrder, const InOrder inOrder, const CloverField &out, QudaFieldLocation location) {
95 
96  CopyCloverArg<OutOrder,InOrder> arg(outOrder, inOrder, out.Volume());
97 
98  if (location == QUDA_CPU_FIELD_LOCATION) {
99  copyClover<FloatOut, FloatIn, length, OutOrder, InOrder>(arg);
100  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
101  CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
102  cloverCopier.apply(0);
103  } else {
104  errorQuda("Undefined field location %d for copyClover", location);
105  }
106 
107  }
108 
109  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
110  void copyClover(const InOrder &inOrder, CloverField &out, bool inverse, QudaFieldLocation location, FloatOut *Out, float *outNorm) {
111 
112  if (out.isNative()) {
113  const bool override = true;
114  typedef typename clover_mapper<FloatOut>::type C;
115  copyClover<FloatOut,FloatIn,length>(C(out, inverse, Out, outNorm, override), inOrder, out, location);
116  } else if (out.Order() == QUDA_PACKED_CLOVER_ORDER) {
117  copyClover<FloatOut,FloatIn,length>
118  (QDPOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
119  } else if (out.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
120 
121 #ifdef BUILD_QDPJIT_INTERFACE
122  copyClover<FloatOut,FloatIn,length>
123  (QDPJITOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
124 #else
125  errorQuda("QDPJIT interface has not been built\n");
126 #endif
127 
128  } else if (out.Order() == QUDA_BQCD_CLOVER_ORDER) {
129  errorQuda("BQCD output not supported");
130  } else {
131  errorQuda("Clover field %d order not supported", out.Order());
132  }
133 
134  }
135 
136  template <typename FloatOut, typename FloatIn, int length>
137  void copyClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
138  FloatOut *Out, FloatIn *In, float *outNorm, float *inNorm) {
139 
140  // reconstruction only supported on FloatN fields currently
141  if (in.isNative()) {
142  const bool override = true;
143  typedef typename clover_mapper<FloatIn>::type C;
144  copyClover<FloatOut,FloatIn,length>(C(in, inverse, In, inNorm, override), out, inverse, location, Out, outNorm);
145  } else if (in.Order() == QUDA_PACKED_CLOVER_ORDER) {
146  copyClover<FloatOut,FloatIn,length>
147  (QDPOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
148  } else if (in.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
149 
150 #ifdef BUILD_QDPJIT_INTERFACE
151  copyClover<FloatOut,FloatIn,length>
152  (QDPJITOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
153 #else
154  errorQuda("QDPJIT interface has not been built\n");
155 #endif
156 
157  } else if (in.Order() == QUDA_BQCD_CLOVER_ORDER) {
158 
159 #ifdef BUILD_BQCD_INTERFACE
160  copyClover<FloatOut,FloatIn,length>
161  (BQCDOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
162 #else
163  errorQuda("BQCD interface has not been built\n");
164 #endif
165 
166  } else {
167  errorQuda("Clover field %d order not supported", in.Order());
168  }
169 
170  }
171 
172 #endif
173 
174  // this is the function that is actually called, from here on down we instantiate all required templates
175  void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
176  void *Out, void *In, void *outNorm, void *inNorm) {
177 #ifdef GPU_CLOVER_DIRAC
178  if (out.Precision() == QUDA_HALF_PRECISION && out.Order() > 4)
179  errorQuda("Half precision not supported for order %d", out.Order());
180  if (in.Precision() == QUDA_HALF_PRECISION && in.Order() > 4)
181  errorQuda("Half precision not supported for order %d", in.Order());
182 
183  if (out.Precision() == QUDA_DOUBLE_PRECISION) {
184  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
185  copyClover<double,double,72>(out, in, inverse, location, (double*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
186  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
187  copyClover<double,float,72>(out, in, inverse, location, (double*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
188  } else if (in.Precision() == QUDA_HALF_PRECISION) {
189  copyClover<double,short,72>(out, in, inverse, location, (double*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
190  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
191  copyClover<double, char, 72>(
192  out, in, inverse, location, (double *)Out, (char *)In, (float *)outNorm, (float *)inNorm);
193  } else {
194  errorQuda("Unknown precision %d", in.Precision());
195  }
196  } else if (out.Precision() == QUDA_SINGLE_PRECISION) {
197  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
198  copyClover<float,double,72>(out, in, inverse, location, (float*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
199  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
200  copyClover<float,float,72>(out, in, inverse, location, (float*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
201  } else if (in.Precision() == QUDA_HALF_PRECISION) {
202  copyClover<float,short,72>(out, in, inverse, location, (float*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
203  } else if (in.Precision() == QUDA_HALF_PRECISION) {
204  copyClover<float, char, 72>(
205  out, in, inverse, location, (float *)Out, (char *)In, (float *)outNorm, (float *)inNorm);
206  } else {
207  errorQuda("Unknown precision %d", in.Precision());
208  }
209  } else if (out.Precision() == QUDA_HALF_PRECISION) {
210  if (in.Precision() == QUDA_DOUBLE_PRECISION){
211  copyClover<short,double,72>(out, in, inverse, location, (short*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
212  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
213  copyClover<short,float,72>(out, in, inverse, location, (short*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
214  } else if (in.Precision() == QUDA_HALF_PRECISION) {
215  copyClover<short,short,72>(out, in, inverse, location, (short*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
216  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
217  copyClover<short, char, 72>(
218  out, in, inverse, location, (short *)Out, (char *)In, (float *)outNorm, (float *)inNorm);
219  } else {
220  errorQuda("Unknown precision %d", in.Precision());
221  }
222  } else if (out.Precision() == QUDA_QUARTER_PRECISION) {
223  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
224  copyClover<char, double, 72>(
225  out, in, inverse, location, (char *)Out, (double *)In, (float *)outNorm, (float *)inNorm);
226  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
227  copyClover<char, float, 72>(
228  out, in, inverse, location, (char *)Out, (float *)In, (float *)outNorm, (float *)inNorm);
229  } else if (in.Precision() == QUDA_HALF_PRECISION) {
230  copyClover<char, short, 72>(
231  out, in, inverse, location, (char *)Out, (short *)In, (float *)outNorm, (float *)inNorm);
232  } else if (in.Precision() == QUDA_QUARTER_PRECISION) {
233  copyClover<char, char, 72>(
234  out, in, inverse, location, (char *)Out, (char *)In, (float *)outNorm, (float *)inNorm);
235  } else {
236  errorQuda("Unknown precision %d", in.Precision());
237  }
238  } else {
239  errorQuda("Unknown precision %d", out.Precision());
240  }
241 #else
242  errorQuda("Clover has not been built");
243 #endif
244  }
245 
246 
247 } // namespace quda
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
QudaCloverFieldOrder Order() const
Definition: clover_field.h:93
Main header file for host and device accessors to CloverFields.
int length[]
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
enum QudaFieldLocation_s QudaFieldLocation
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
Definition: quda_matrix.h:611
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
static int volumeCB
Definition: face_gauge.cpp:43
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
QudaParity parity
Definition: covdev_test.cpp:54
unsigned long long bytes
Definition: blas_quda.cu:23
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
This generic function is used for copying the clover field where in the input and output can be in an...
Definition: copy_clover.cu:175