1 #include <clover_field_order.h>
3 #include <instantiate.h>
7 using namespace clover;
10 Kernel argument struct
12 template <typename Out, typename In>
13 struct CopyCloverArg {
17 CopyCloverArg (const Out &out, const In in, int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
21 Generic CPU clover reordering and packing
23 template <typename FloatOut, typename FloatIn, int length, typename Arg>
24 void copyClover(Arg &arg) {
25 typedef typename mapper<FloatIn>::type RegTypeIn;
26 typedef typename mapper<FloatOut>::type RegTypeOut;
28 for (int parity=0; parity<2; parity++) {
29 for (int x=0; x<arg.volumeCB; x++) {
31 RegTypeOut out[length];
32 arg.in.load(in, x, parity);
33 for (int i=0; i<length; i++) out[i] = in[i];
34 arg.out.save(out, x, parity);
41 Generic CUDA clover reordering and packing
43 template <typename FloatOut, typename FloatIn, int length, typename Arg>
44 __global__ void copyCloverKernel(Arg arg) {
45 typedef typename mapper<FloatIn>::type RegTypeIn;
46 typedef typename mapper<FloatOut>::type RegTypeOut;
48 int x = blockIdx.x * blockDim.x + threadIdx.x;
49 if (x >= arg.volumeCB) return;
50 int parity = blockIdx.y * blockDim.y + threadIdx.y;
53 RegTypeOut out[length];
54 arg.in.load(in, x, parity);
56 for (int i=0; i<length; i++) out[i] = in[i];
57 arg.out.save(out, x, parity);
60 template <typename FloatOut, typename FloatIn, int length, typename Out, typename In>
61 class CopyClover : TunableVectorY {
62 CopyCloverArg<Out,In> arg;
63 const CloverField &meta;
65 unsigned int sharedBytesPerThread() const { return 0; }
66 unsigned int sharedBytesPerBlock(const TuneParam ¶m) const { return 0 ;}
68 bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
69 unsigned int minThreads() const { return arg.volumeCB; }
72 CopyClover(CopyCloverArg<Out,In> &arg, const CloverField &meta)
73 : TunableVectorY(2), arg(arg), meta(meta) {
74 writeAuxString("out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
77 void apply(const qudaStream_t &stream) {
78 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
79 qudaLaunchKernel(copyCloverKernel<FloatOut, FloatIn, length, decltype(arg)>, tp, stream, arg);
82 TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
84 long long flops() const { return 0; }
85 long long bytes() const { return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
88 template <typename FloatOut, typename FloatIn, int length, typename OutOrder, typename InOrder>
89 void copyClover(OutOrder outOrder, const InOrder inOrder, const CloverField &out, QudaFieldLocation location) {
91 CopyCloverArg<OutOrder,InOrder> arg(outOrder, inOrder, out.Volume());
93 if (location == QUDA_CPU_FIELD_LOCATION) {
94 copyClover<FloatOut, FloatIn, length>(arg);
95 } else if (location == QUDA_CUDA_FIELD_LOCATION) {
96 CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
97 cloverCopier.apply(0);
99 errorQuda("Undefined field location %d for copyClover", location);
104 template <typename FloatOut, typename FloatIn, int length, typename InOrder>
105 void copyClover(const InOrder &inOrder, CloverField &out, bool inverse, QudaFieldLocation location, FloatOut *Out, float *outNorm) {
107 if (out.isNative()) {
108 const bool override = true;
109 typedef typename clover_mapper<FloatOut>::type C;
110 copyClover<FloatOut,FloatIn,length>(C(out, inverse, Out, outNorm, override), inOrder, out, location);
111 } else if (out.Order() == QUDA_PACKED_CLOVER_ORDER) {
112 copyClover<FloatOut,FloatIn,length>
113 (QDPOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
114 } else if (out.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
116 #ifdef BUILD_QDPJIT_INTERFACE
117 copyClover<FloatOut,FloatIn,length>
118 (QDPJITOrder<FloatOut,length>(out, inverse, Out), inOrder, out, location);
120 errorQuda("QDPJIT interface has not been built\n");
123 } else if (out.Order() == QUDA_BQCD_CLOVER_ORDER) {
124 errorQuda("BQCD output not supported");
126 errorQuda("Clover field %d order not supported", out.Order());
131 template <typename FloatOut, typename FloatIn> struct CloverCopy {
132 CloverCopy(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
133 void *Out_, void *In_, float *outNorm, float *inNorm)
135 constexpr int length = 72;
136 FloatOut *Out = reinterpret_cast<FloatOut*>(Out_);
137 FloatIn *In = reinterpret_cast<FloatIn*>(In_);
140 const bool override = true;
141 typedef typename clover_mapper<FloatIn>::type C;
142 copyClover<FloatOut,FloatIn,length>(C(in, inverse, In, inNorm, override), out, inverse, location, Out, outNorm);
143 } else if (in.Order() == QUDA_PACKED_CLOVER_ORDER) {
144 copyClover<FloatOut,FloatIn,length>
145 (QDPOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
146 } else if (in.Order() == QUDA_QDPJIT_CLOVER_ORDER) {
148 #ifdef BUILD_QDPJIT_INTERFACE
149 copyClover<FloatOut,FloatIn,length>
150 (QDPJITOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
152 errorQuda("QDPJIT interface has not been built\n");
155 } else if (in.Order() == QUDA_BQCD_CLOVER_ORDER) {
157 #ifdef BUILD_BQCD_INTERFACE
158 copyClover<FloatOut,FloatIn,length>
159 (BQCDOrder<FloatIn,length>(in, inverse, In), out, inverse, location, Out, outNorm);
161 errorQuda("BQCD interface has not been built\n");
165 errorQuda("Clover field %d order not supported", in.Order());
171 template <typename FloatIn> struct CloverCopyIn {
172 CloverCopyIn(const CloverField &in, CloverField &out, bool inverse, QudaFieldLocation location,
173 void *Out, void *In, float *outNorm, float *inNorm)
175 // swizzle in/out back to instantiate out precision
176 instantiatePrecision2<CloverCopy, FloatIn>(out, in, inverse, location, Out, In, outNorm, inNorm);
180 void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location,
181 void *Out, void *In, void *outNorm, void *inNorm)
183 #ifdef GPU_CLOVER_DIRAC
184 if (out.Precision() < QUDA_SINGLE_PRECISION && out.Order() > 4)
185 errorQuda("Fixed-point precision not supported for order %d", out.Order());
186 if (in.Precision() < QUDA_SINGLE_PRECISION && in.Order() > 4)
187 errorQuda("Fixed-point precision not supported for order %d", in.Order());
189 // swizzle in/out since we first want to instantiate precision
190 instantiatePrecision<CloverCopyIn>(in, out, inverse, location, Out, In,
191 reinterpret_cast<float*>(outNorm), reinterpret_cast<float*>(inNorm));
193 errorQuda("Clover has not been built");