QUDA  0.9.0
copy_clover.cu
Go to the documentation of this file.
1 #include <clover_field_order.h>
2 #include <tune_quda.h>
3 
4 namespace quda {
5 
6  using namespace clover;
7 
8 #ifdef GPU_CLOVER_DIRAC
9 
13  template <typename Out, typename In>
14  struct CopyCloverArg {
15  Out out;
16  const In in;
17  int volumeCB;
18  CopyCloverArg (const Out &out, const In in, int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
19  };
20 
24  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
25  void copyClover(CopyCloverArg<Out,In> arg) {
26  typedef typename mapper<FloatIn>::type RegTypeIn;
27  typedef typename mapper<FloatOut>::type RegTypeOut;
28 
29  for (int parity=0; parity<2; parity++) {
30  for (int x=0; x<arg.volumeCB; x++) {
31  RegTypeIn in[length];
32  RegTypeOut out[length];
33  arg.in.load(in, x, parity);
34  for (int i=0; i<length; i++) out[i] = in[i];
35  arg.out.save(out, x, parity);
36  }
37  }
38 
39  }
40 
44  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
45  __global__ void copyCloverKernel(CopyCloverArg<Out,In> arg) {
46  typedef typename mapper<FloatIn>::type RegTypeIn;
47  typedef typename mapper<FloatOut>::type RegTypeOut;
48 
49  int x = blockIdx.x * blockDim.x + threadIdx.x;
50  if (x >= arg.volumeCB) return;
51  int parity = blockIdx.y * blockDim.y + threadIdx.y;
52 
53  RegTypeIn in[length];
54  RegTypeOut out[length];
55  arg.in.load(in, x, parity);
56 #pragma unroll
57  for (int i=0; i<length; i++) out[i] = in[i];
58  arg.out.save(out, x, parity);
59 
60  }
61 
62  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
63  class CopyClover : TunableVectorY {
64  CopyCloverArg<Out,In> arg;
65  const CloverField &meta;
66 
67  private:
68  unsigned int sharedBytesPerThread() const { return 0; }
69  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
70 
71  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
72  unsigned int minThreads() const { return arg.volumeCB; }
73 
74  public:
75  CopyClover(CopyCloverArg<Out,In> &arg, const CloverField &meta)
76  : TunableVectorY(2), arg(arg), meta(meta) {
77  writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
78  }
79  virtual ~CopyClover() { ; }
80 
81  void apply(const cudaStream_t &stream) {
82  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
83  copyCloverKernel<FloatOut, FloatIn, length, Out, In>
84  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
85  }
86 
87  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
88 
89  long long flops() const { return 0; }
90  long long bytes() const { return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
91  };
92 
93  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
94  void copyClover(OutOrder outOrder, const InOrder inOrder, const CloverField &out, QudaFieldLocation location) {
95 
96  CopyCloverArg<OutOrder,InOrder> arg(outOrder, inOrder, out.Volume());
97 
98  if (location == QUDA_CPU_FIELD_LOCATION) {
99  copyClover<FloatOut, FloatIn, length, OutOrder, InOrder>(arg);
100  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
101  CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
102  cloverCopier.apply(0);
103  } else {
104  errorQuda("Undefined field location %d for copyClover", location);
105  }
106 
107  }
108 
109  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
110  void copyClover(const InOrder &inOrder, CloverField &out, bool inverse, QudaFieldLocation location, FloatOut *Out, float *outNorm) {
111 
112  if (out.isNative()) {
113  const bool override = true;
114  typedef typename clover_mapper<FloatOut>::type C;
115  copyClover<FloatOut,FloatIn,length>(C(out, inverse, Out, outNorm, override), inOrder, out, location);
116  } else if (out.Order() == QUDA_PACKED_CLOVER_ORDER) {
117  copyClover<FloatOut,FloatIn,length>
118  (QDPOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
119  } else if (out.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
120 
121 #ifdef BUILD_QDPJIT_INTERFACE
122  copyClover<FloatOut,FloatIn,length>
123  (QDPJITOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
124 #else
125  errorQuda("QDPJIT interface has not been built\n");
126 #endif
127 
128  } else if (out.Order() == QUDA_BQCD_CLOVER_ORDER) {
129  errorQuda("BQCD output not supported");
130  } else {
131  errorQuda("Clover field %d order not supported", out.Order());
132  }
133 
134  }
135 
136  template <typename FloatOut, typename FloatIn, int length>
137  void copyClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
138  FloatOut *Out, FloatIn *In, float *outNorm, float *inNorm) {
139 
140  // reconstruction only supported on FloatN fields currently
141  if (in.isNative()) {
142  const bool override = true;
143  typedef typename clover_mapper<FloatIn>::type C;
144  copyClover<FloatOut,FloatIn,length>(C(in, inverse, In, inNorm, override), out, inverse, location, Out, outNorm);
145  } else if (in.Order() == QUDA_PACKED_CLOVER_ORDER) {
146  copyClover<FloatOut,FloatIn,length>
147  (QDPOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
148  } else if (in.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
149 
150 #ifdef BUILD_QDPJIT_INTERFACE
151  copyClover<FloatOut,FloatIn,length>
152  (QDPJITOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
153 #else
154  errorQuda("QDPJIT interface has not been built\n");
155 #endif
156 
157  } else if (in.Order() == QUDA_BQCD_CLOVER_ORDER) {
158 
159 #ifdef BUILD_BQCD_INTERFACE
160  copyClover<FloatOut,FloatIn,length>
161  (BQCDOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
162 #else
163  errorQuda("BQCD interface has not been built\n");
164 #endif
165 
166  } else {
167  errorQuda("Clover field %d order not supported", in.Order());
168  }
169 
170  }
171 
172 #endif
173 
174  // this is the function that is actually called, from here on down we instantiate all required templates
175  void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
176  void *Out, void *In, void *outNorm, void *inNorm) {
177 #ifdef GPU_CLOVER_DIRAC
178  if (out.Precision() == QUDA_HALF_PRECISION && out.Order() > 4)
179  errorQuda("Half precision not supported for order %d", out.Order());
180  if (in.Precision() == QUDA_HALF_PRECISION && in.Order() > 4)
181  errorQuda("Half precision not supported for order %d", in.Order());
182 
183  if (out.Precision() == QUDA_DOUBLE_PRECISION) {
184  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
185  copyClover<double,double,72>(out, in, inverse, location, (double*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
186  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
187  copyClover<double,float,72>(out, in, inverse, location, (double*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
188  } else if (in.Precision() == QUDA_HALF_PRECISION) {
189  copyClover<double,short,72>(out, in, inverse, location, (double*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
190  }
191  } else if (out.Precision() == QUDA_SINGLE_PRECISION) {
192  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
193  copyClover<float,double,72>(out, in, inverse, location, (float*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
194  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
195  copyClover<float,float,72>(out, in, inverse, location, (float*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
196  } else if (in.Precision() == QUDA_HALF_PRECISION) {
197  copyClover<float,short,72>(out, in, inverse, location, (float*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
198  }
199  } else if (out.Precision() == QUDA_HALF_PRECISION) {
200  if (in.Precision() == QUDA_DOUBLE_PRECISION){
201  copyClover<short,double,72>(out, in, inverse, location, (short*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
202  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
203  copyClover<short,float,72>(out, in, inverse, location, (short*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
204  } else if (in.Precision() == QUDA_HALF_PRECISION) {
205  copyClover<short,short,72>(out, in, inverse, location, (short*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
206  }
207  }
208 #else
209  errorQuda("Clover has not been built");
210 #endif
211  }
212 
213 
214 } // namespace quda
dim3 dim3 blockDim
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
Main header file for host and device accessors to CloverFields.
QudaGaugeParam param
Definition: pack_test.cpp:17
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
void size_t length
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
This generic function is used for copying the clover field where in the input and output can be in an...
Definition: copy_clover.cu:175