QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
copy_clover.cu
Go to the documentation of this file.
1 #include <clover_field_order.h>
2 #include <tune_quda.h>
3 
4 namespace quda {
5 
6 #ifdef GPU_CLOVER_DIRAC
7 
11  template <typename Out, typename In>
12  struct CopyCloverArg {
13  Out out;
14  const In in;
15  int volumeCB;
16  CopyCloverArg (const Out &out, const In in, int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
17  };
18 
22  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
23  void copyClover(CopyCloverArg<Out,In> arg) {
24  typedef typename mapper<FloatIn>::type RegTypeIn;
25  typedef typename mapper<FloatOut>::type RegTypeOut;
26 
27  for (int parity=0; parity<2; parity++) {
28  for (int x=0; x<arg.volumeCB; x++) {
29  RegTypeIn in[length];
30  RegTypeOut out[length];
31  arg.in.load(in, x, parity);
32  for (int i=0; i<length; i++) out[i] = in[i];
33  arg.out.save(out, x, parity);
34  }
35  }
36 
37  }
38 
42  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
43  __global__ void copyCloverKernel(CopyCloverArg<Out,In> arg) {
44  typedef typename mapper<FloatIn>::type RegTypeIn;
45  typedef typename mapper<FloatOut>::type RegTypeOut;
46 
47  for (int parity=0; parity<2; parity++) {
48  int x = blockIdx.x * blockDim.x + threadIdx.x;
49  if (x >= arg.volumeCB) return;
50 
51  RegTypeIn in[length];
52  RegTypeOut out[length];
53  arg.in.load(in, x, parity);
54  for (int i=0; i<length; i++) out[i] = in[i];
55  arg.out.save(out, x, parity);
56  }
57 
58  }
59 
60  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
61  class CopyClover : Tunable {
62  CopyCloverArg<Out,In> arg;
63  const CloverField &meta;
64 
65  private:
66  unsigned int sharedBytesPerThread() const { return 0; }
67  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
68 
69  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
70  unsigned int minThreads() const { return arg.volumeCB; }
71 
72  public:
73  CopyClover(CopyCloverArg<Out,In> &arg, const CloverField &meta) : arg(arg), meta(meta) {
74  writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
75  }
76  virtual ~CopyClover() { ; }
77 
78  void apply(const cudaStream_t &stream) {
79  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
80  copyCloverKernel<FloatOut, FloatIn, length, Out, In>
81  <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(arg);
82  }
83 
84  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
85 
86  std::string paramString(const TuneParam &param) const { // Don't bother printing the grid dim.
87  std::stringstream ps;
88  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
89  ps << "shared=" << param.shared_bytes;
90  return ps.str();
91  }
92 
93  long long flops() const { return 0; }
94  long long bytes() const { return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
95  };
96 
97  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
98  void copyClover(OutOrder outOrder, const InOrder inOrder, const CloverField &out, QudaFieldLocation location) {
99 
100  CopyCloverArg<OutOrder,InOrder> arg(outOrder, inOrder, out.Volume());
101 
102  if (location == QUDA_CPU_FIELD_LOCATION) {
103  copyClover<FloatOut, FloatIn, length, OutOrder, InOrder>(arg);
104  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
105  CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
106  cloverCopier.apply(0);
107  } else {
108  errorQuda("Undefined field location %d for copyClover", location);
109  }
110 
111  }
112 
113  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
114  void copyClover(const InOrder &inOrder, CloverField &out, bool inverse, QudaFieldLocation location, FloatOut *Out, float *outNorm) {
115  if (out.Order() == QUDA_FLOAT2_CLOVER_ORDER) {
116  copyClover<FloatOut,FloatIn,length>
117  (FloatNOrder<FloatOut,length,2>(out, inverse, Out, outNorm), inOrder, out, location);
118  } else if (out.Order() == QUDA_FLOAT4_CLOVER_ORDER) {
119  copyClover<FloatOut,FloatIn,length>
120  (FloatNOrder<FloatOut,length,4>(out, inverse, Out, outNorm), inOrder, out, location);
121  } else if (out.Order() == QUDA_PACKED_CLOVER_ORDER) {
122  copyClover<FloatOut,FloatIn,length>
123  (QDPOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
124  } else if (out.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
125 
126 #ifdef BUILD_QDPJIT_INTERFACE
127  copyClover<FloatOut,FloatIn,length>
128  (QDPJITOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
129 #else
130  errorQuda("QDPJIT interface has not been built\n");
131 #endif
132 
133  } else if (out.Order() == QUDA_BQCD_CLOVER_ORDER) {
134  errorQuda("BQCD output not supported");
135  } else {
136  errorQuda("Clover field %d order not supported", out.Order());
137  }
138 
139  }
140 
141  template <typename FloatOut, typename FloatIn, int length>
142  void copyClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
143  FloatOut *Out, FloatIn *In, float *outNorm, float *inNorm) {
144 
145  // reconstruction only supported on FloatN fields currently
146  if (in.Order() == QUDA_FLOAT2_CLOVER_ORDER) {
147  copyClover<FloatOut,FloatIn,length>
148  (FloatNOrder<FloatIn,length,2>(in, inverse, In, inNorm), out, inverse, location, Out, outNorm);
149  } else if (in.Order() == QUDA_FLOAT4_CLOVER_ORDER) {
150  copyClover<FloatOut,FloatIn,length>
151  (FloatNOrder<FloatIn,length,4>(in, inverse, In, inNorm), out, inverse, location, Out, outNorm);
152  } else if (in.Order() == QUDA_PACKED_CLOVER_ORDER) {
153  copyClover<FloatOut,FloatIn,length>
154  (QDPOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
155  } else if (in.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
156 
157 #ifdef BUILD_QDPJIT_INTERFACE
158  copyClover<FloatOut,FloatIn,length>
159  (QDPJITOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
160 #else
161  errorQuda("QDPJIT interface has not been built\n");
162 #endif
163 
164  } else if (in.Order() == QUDA_BQCD_CLOVER_ORDER) {
165 
166 #ifdef BUILD_BQCD_INTERFACE
167  copyClover<FloatOut,FloatIn,length>
168  (BQCDOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
169 #else
170  errorQuda("BQCD interface has not been built\n");
171 #endif
172 
173  } else {
174  errorQuda("Clover field %d order not supported", in.Order());
175  }
176 
177  }
178 
179 #endif
180 
181  // this is the function that is actually called, from here on down we instantiate all required templates
182  void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
183  void *Out, void *In, void *outNorm, void *inNorm) {
184 
185 #ifdef GPU_CLOVER_DIRAC
186  if (out.Precision() == QUDA_HALF_PRECISION && out.Order() > 4)
187  errorQuda("Half precision not supported for order %d", out.Order());
188  if (in.Precision() == QUDA_HALF_PRECISION && in.Order() > 4)
189  errorQuda("Half precision not supported for order %d", in.Order());
190 
191  if (out.Precision() == QUDA_DOUBLE_PRECISION) {
192  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
193  copyClover<double,double,72>(out, in, inverse, location, (double*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
194  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
195  copyClover<double,float,72>(out, in, inverse, location, (double*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
196  } else if (in.Precision() == QUDA_HALF_PRECISION) {
197  copyClover<double,short,72>(out, in, inverse, location, (double*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
198  }
199  } else if (out.Precision() == QUDA_SINGLE_PRECISION) {
200  if (in.Precision() == QUDA_DOUBLE_PRECISION) {
201  copyClover<float,double,72>(out, in, inverse, location, (float*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
202  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
203  copyClover<float,float,72>(out, in, inverse, location, (float*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
204  } else if (in.Precision() == QUDA_HALF_PRECISION) {
205  copyClover<float,short,72>(out, in, inverse, location, (float*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
206  }
207  } else if (out.Precision() == QUDA_HALF_PRECISION) {
208  if (in.Precision() == QUDA_DOUBLE_PRECISION){
209  copyClover<short,double,72>(out, in, inverse, location, (short*)Out, (double*)In, (float*)outNorm, (float*)inNorm);
210  } else if (in.Precision() == QUDA_SINGLE_PRECISION) {
211  copyClover<short,float,72>(out, in, inverse, location, (short*)Out, (float*)In, (float*)outNorm, (float*)inNorm);
212  } else if (in.Precision() == QUDA_HALF_PRECISION) {
213  copyClover<short,short,72>(out, in, inverse, location, (short*)Out, (short*)In, (float*)outNorm, (float*)inNorm);
214  }
215  }
216 #else
217  errorQuda("Clover has not been built");
218 #endif
219 
220  }
221 
222 
223 } // namespace quda
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:73
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
int length[]
QudaGaugeParam param
Definition: pack_test.cpp:17
QudaPrecision Precision() const
const QudaFieldLocation location
Definition: pack_test.cpp:46
QudaCloverFieldOrder Order() const
Definition: clover_field.h:66
cpuColorSpinorField * in
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:271
int x[4]
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
QudaTune getTuning()
Definition: util_quda.cpp:32
const QudaParity parity
Definition: dslash_test.cpp:29
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
Definition: copy_clover.cu:182