QUDA  v1.1.0
A library for QCD on GPUs
copy_clover.cu
Go to the documentation of this file.
1 #include <clover_field_order.h>
2 #include <tune_quda.h>
3 #include <instantiate.h>
4 
5 namespace quda {
6 
7  using namespace clover;
8 
9  /**
10  Kernel argument struct
11  */
12  template <typename Out, typename In>
13  struct CopyCloverArg {
14  Out out;
15  const In in;
16  int volumeCB;
17  CopyCloverArg (const Out &out, const In in, int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
18  };
19 
20  /**
21  Generic CPU clover reordering and packing
22  */
23  template <typename FloatOut, typename FloatIn, int length, typename Arg>
24  void copyClover(Arg &arg) {
25  typedef typename mapper<FloatIn>::type RegTypeIn;
26  typedef typename mapper<FloatOut>::type RegTypeOut;
27 
28  for (int parity=0; parity<2; parity++) {
29  for (int x=0; x<arg.volumeCB; x++) {
30  RegTypeIn in[length];
31  RegTypeOut out[length];
32  arg.in.load(in, x, parity);
33  for (int i=0; i<length; i++) out[i] = in[i];
34  arg.out.save(out, x, parity);
35  }
36  }
37 
38  }
39 
40  /**
41  Generic CUDA clover reordering and packing
42  */
43  template <typename FloatOut, typename FloatIn, int length, typename Arg>
44  __global__ void copyCloverKernel(Arg arg) {
45  typedef typename mapper<FloatIn>::type RegTypeIn;
46  typedef typename mapper<FloatOut>::type RegTypeOut;
47 
48  int x = blockIdx.x * blockDim.x + threadIdx.x;
49  if (x >= arg.volumeCB) return;
50  int parity = blockIdx.y * blockDim.y + threadIdx.y;
51 
52  RegTypeIn in[length];
53  RegTypeOut out[length];
54  arg.in.load(in, x, parity);
55 #pragma unroll
56  for (int i=0; i<length; i++) out[i] = in[i];
57  arg.out.save(out, x, parity);
58  }
59 
60  template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
61  class CopyClover : TunableVectorY {
62  CopyCloverArg<Out,In> arg;
63  const CloverField &meta;
64 
65  unsigned int sharedBytesPerThread() const { return 0; }
66  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0 ;}
67 
68  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
69  unsigned int minThreads() const { return arg.volumeCB; }
70 
71  public:
72  CopyClover(CopyCloverArg<Out,In> &arg, const CloverField &meta)
73  : TunableVectorY(2), arg(arg), meta(meta) {
74  writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
75  }
76 
77  void apply(const qudaStream_t &stream) {
78  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
79  qudaLaunchKernel(copyCloverKernel<FloatOut, FloatIn, length, decltype(arg)>, tp, stream, arg);
80  }
81 
82  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
83 
84  long long flops() const { return 0; }
85  long long bytes() const { return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
86  };
87 
88  template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
89  void copyClover(OutOrder outOrder, const InOrder inOrder, const CloverField &out, QudaFieldLocation location) {
90 
91  CopyCloverArg<OutOrder,InOrder> arg(outOrder, inOrder, out.Volume());
92 
93  if (location == QUDA_CPU_FIELD_LOCATION) {
94  copyClover<FloatOut, FloatIn, length>(arg);
95  } else if (location == QUDA_CUDA_FIELD_LOCATION) {
96  CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
97  cloverCopier.apply(0);
98  } else {
99  errorQuda("Undefined field location %d for copyClover", location);
100  }
101 
102  }
103 
104  template <typename FloatOut, typename FloatIn, int length, typename InOrder>
105  void copyClover(const InOrder &inOrder, CloverField &out, bool inverse, QudaFieldLocation location, FloatOut *Out, float *outNorm) {
106 
107  if (out.isNative()) {
108  const bool override = true;
109  typedef typename clover_mapper<FloatOut>::type C;
110  copyClover<FloatOut,FloatIn,length>(C(out, inverse, Out, outNorm, override), inOrder, out, location);
111  } else if (out.Order() == QUDA_PACKED_CLOVER_ORDER) {
112  copyClover<FloatOut,FloatIn,length>
113  (QDPOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
114  } else if (out.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
115 
116 #ifdef BUILD_QDPJIT_INTERFACE
117  copyClover<FloatOut,FloatIn,length>
118  (QDPJITOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
119 #else
120  errorQuda("QDPJIT interface has not been built\n");
121 #endif
122 
123  } else if (out.Order() == QUDA_BQCD_CLOVER_ORDER) {
124  errorQuda("BQCD output not supported");
125  } else {
126  errorQuda("Clover field %d order not supported", out.Order());
127  }
128 
129  }
130 
131  template <typename FloatOut, typename FloatIn> struct CloverCopy {
132  CloverCopy(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
133  void *Out_, void *In_, float *outNorm, float *inNorm)
134  {
135  constexpr int length = 72;
136  FloatOut *Out = reinterpret_cast<FloatOut*>(Out_);
137  FloatIn *In = reinterpret_cast<FloatIn*>(In_);
138 
139  if (in.isNative()) {
140  const bool override = true;
141  typedef typename clover_mapper<FloatIn>::type C;
142  copyClover<FloatOut,FloatIn,length>(C(in, inverse, In, inNorm, override), out, inverse, location, Out, outNorm);
143  } else if (in.Order() == QUDA_PACKED_CLOVER_ORDER) {
144  copyClover<FloatOut,FloatIn,length>
145  (QDPOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
146  } else if (in.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
147 
148 #ifdef BUILD_QDPJIT_INTERFACE
149  copyClover<FloatOut,FloatIn,length>
150  (QDPJITOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
151 #else
152  errorQuda("QDPJIT interface has not been built\n");
153 #endif
154 
155  } else if (in.Order() == QUDA_BQCD_CLOVER_ORDER) {
156 
157 #ifdef BUILD_BQCD_INTERFACE
158  copyClover<FloatOut,FloatIn,length>
159  (BQCDOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
160 #else
161  errorQuda("BQCD interface has not been built\n");
162 #endif
163 
164  } else {
165  errorQuda("Clover field %d order not supported", in.Order());
166  }
167 
168  }
169  };
170 
171  template <typename FloatIn> struct CloverCopyIn {
172  CloverCopyIn(const CloverField &in, CloverField &out, bool inverse, QudaFieldLocation location,
173  void *Out, void *In, float *outNorm, float *inNorm)
174  {
175  // swizzle in/out back to instantiate out precision
176  instantiatePrecision2<CloverCopy, FloatIn>(out, in, inverse, location, Out, In, outNorm, inNorm);
177  }
178  };
179 
180  void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
181  void *Out, void *In, void *outNorm, void *inNorm)
182  {
183 #ifdef GPU_CLOVER_DIRAC
184  if (out.Precision() < QUDA_SINGLE_PRECISION && out.Order() > 4)
185  errorQuda("Fixed-point precision not supported for order %d", out.Order());
186  if (in.Precision() < QUDA_SINGLE_PRECISION && in.Order() > 4)
187  errorQuda("Fixed-point precision not supported for order %d", in.Order());
188 
189  // swizzle in/out since we first want to instantiate precision
190  instantiatePrecision<CloverCopyIn>(in, out, inverse, location, Out, In,
191  reinterpret_cast<float*>(outNorm), reinterpret_cast<float*>(inNorm));
192 #else
193  errorQuda("Clover has not been built");
194 #endif
195  }
196 
197 
198 } // namespace quda