6 using namespace clover;
8 #ifdef GPU_CLOVER_DIRAC 13 template <
typename Out,
typename In>
14 struct CopyCloverArg {
18 CopyCloverArg (
const Out &out,
const In in,
int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
24 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
25 void copyClover(CopyCloverArg<Out,In>
arg) {
26 typedef typename mapper<FloatIn>::type RegTypeIn;
27 typedef typename mapper<FloatOut>::type RegTypeOut;
30 for (
int x=0; x<arg.volumeCB; x++) {
33 arg.in.load(in, x,
parity);
34 for (
int i=0; i<
length; i++) out[i] = in[i];
35 arg.out.save(out, x,
parity);
44 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
45 __global__
void copyCloverKernel(CopyCloverArg<Out,In> arg) {
46 typedef typename mapper<FloatIn>::type RegTypeIn;
47 typedef typename mapper<FloatOut>::type RegTypeOut;
49 int x = blockIdx.x * blockDim.x + threadIdx.x;
50 if (x >= arg.volumeCB)
return;
51 int parity = blockIdx.y * blockDim.y + threadIdx.y;
55 arg.in.load(in, x, parity);
57 for (
int i=0; i<
length; i++) out[i] = in[i];
58 arg.out.save(out, x, parity);
62 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
63 class CopyClover : TunableVectorY {
64 CopyCloverArg<Out,In>
arg;
65 const CloverField &meta;
68 unsigned int sharedBytesPerThread()
const {
return 0; }
69 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
71 bool tuneGridDim()
const {
return false; }
72 unsigned int minThreads()
const {
return arg.volumeCB; }
75 CopyClover(CopyCloverArg<Out,In> &arg,
const CloverField &meta)
76 : TunableVectorY(2), arg(arg), meta(meta) {
77 writeAuxString(
"out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
79 virtual ~CopyClover() { ; }
81 void apply(
const cudaStream_t &
stream) {
83 copyCloverKernel<FloatOut, FloatIn, length, Out, In>
84 <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
87 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
89 long long flops()
const {
return 0; }
90 long long bytes()
const {
return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
93 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
94 void copyClover(OutOrder outOrder,
const InOrder inOrder,
const CloverField &out,
QudaFieldLocation location) {
96 CopyCloverArg<OutOrder,InOrder>
arg(outOrder, inOrder, out.Volume());
99 copyClover<FloatOut, FloatIn, length, OutOrder, InOrder>(
arg);
101 CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
102 cloverCopier.apply(0);
104 errorQuda(
"Undefined field location %d for copyClover", location);
109 template <
typename FloatOut,
typename FloatIn,
int length,
typename InOrder>
110 void copyClover(
const InOrder &inOrder, CloverField &out,
bool inverse,
QudaFieldLocation location, FloatOut *Out,
float *outNorm) {
112 if (out.isNative()) {
113 const bool override =
true;
114 typedef typename clover_mapper<FloatOut>::type C;
115 copyClover<FloatOut,FloatIn,length>(C(out, inverse, Out, outNorm,
override), inOrder,
out, location);
117 copyClover<FloatOut,FloatIn,length>
118 (QDPOrder<FloatOut,length>(
out,
inverse, Out), inOrder, out, location);
121 #ifdef BUILD_QDPJIT_INTERFACE 122 copyClover<FloatOut,FloatIn,length>
123 (QDPJITOrder<FloatOut,length>(
out,
inverse, Out), inOrder, out, location);
125 errorQuda(
"QDPJIT interface has not been built\n");
131 errorQuda(
"Clover field %d order not supported", out.Order());
136 template <
typename FloatOut,
typename FloatIn,
int length>
137 void copyClover(CloverField &out,
const CloverField &in,
bool inverse,
QudaFieldLocation location,
138 FloatOut *Out, FloatIn *In,
float *outNorm,
float *inNorm) {
142 const bool override =
true;
143 typedef typename clover_mapper<FloatIn>::type C;
144 copyClover<FloatOut,FloatIn,length>(C(in, inverse, In, inNorm,
override),
out,
inverse, location, Out, outNorm);
146 copyClover<FloatOut,FloatIn,length>
147 (QDPOrder<FloatIn,length>(
in,
inverse, In), out, inverse, location, Out, outNorm);
150 #ifdef BUILD_QDPJIT_INTERFACE 151 copyClover<FloatOut,FloatIn,length>
152 (QDPJITOrder<FloatIn,length>(
in,
inverse, In), out, inverse, location, Out, outNorm);
154 errorQuda(
"QDPJIT interface has not been built\n");
159 #ifdef BUILD_BQCD_INTERFACE 160 copyClover<FloatOut,FloatIn,length>
161 (BQCDOrder<FloatIn,length>(
in,
inverse, In), out, inverse, location, Out, outNorm);
163 errorQuda(
"BQCD interface has not been built\n");
167 errorQuda(
"Clover field %d order not supported", in.Order());
176 void *Out,
void *In,
void *outNorm,
void *inNorm) {
177 #ifdef GPU_CLOVER_DIRAC 179 errorQuda(
"Half precision not supported for order %d", out.
Order());
181 errorQuda(
"Half precision not supported for order %d", in.
Order());
185 copyClover<double,double,72>(
out,
in,
inverse, location, (
double*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
187 copyClover<double,float,72>(
out,
in,
inverse, location, (
double*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
189 copyClover<double,short,72>(
out,
in,
inverse, location, (
double*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
191 copyClover<double, char, 72>(
192 out,
in,
inverse, location, (
double *)Out, (
char *)In, (
float *)outNorm, (
float *)inNorm);
198 copyClover<float,double,72>(
out,
in,
inverse, location, (
float*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
200 copyClover<float,float,72>(
out,
in,
inverse, location, (
float*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
202 copyClover<float,short,72>(
out,
in,
inverse, location, (
float*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
204 copyClover<float, char, 72>(
205 out,
in,
inverse, location, (
float *)Out, (
char *)In, (
float *)outNorm, (
float *)inNorm);
211 copyClover<short,double,72>(
out,
in,
inverse, location, (
short*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
213 copyClover<short,float,72>(
out,
in,
inverse, location, (
short*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
215 copyClover<short,short,72>(
out,
in,
inverse, location, (
short*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
217 copyClover<short, char, 72>(
218 out,
in,
inverse, location, (
short *)Out, (
char *)In, (
float *)outNorm, (
float *)inNorm);
224 copyClover<char, double, 72>(
225 out,
in,
inverse, location, (
char *)Out, (
double *)In, (
float *)outNorm, (
float *)inNorm);
227 copyClover<char, float, 72>(
228 out,
in,
inverse, location, (
char *)Out, (
float *)In, (
float *)outNorm, (
float *)inNorm);
230 copyClover<char, short, 72>(
231 out,
in,
inverse, location, (
char *)Out, (
short *)In, (
float *)outNorm, (
float *)inNorm);
233 copyClover<char, char, 72>(
234 out,
in,
inverse, location, (
char *)Out, (
char *)In, (
float *)outNorm, (
float *)inNorm);
QudaVerbosity getVerbosity()
QudaCloverFieldOrder Order() const
Main header file for host and device accessors to CloverFields.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaFieldLocation_s QudaFieldLocation
__device__ __host__ Matrix< T, 3 > inverse(const Matrix< T, 3 > &u)
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
This generic function is used for copying the clover field where in the input and output can be in an...