6 #ifdef GPU_CLOVER_DIRAC
11 template <
typename Out,
typename In>
12 struct CopyCloverArg {
16 CopyCloverArg (
const Out &
out,
const In
in,
int volume) : out(out), in(in), volumeCB(in.volumeCB) { }
22 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
23 void copyClover(CopyCloverArg<Out,In>
arg) {
24 typedef typename mapper<FloatIn>::type RegTypeIn;
25 typedef typename mapper<FloatOut>::type RegTypeOut;
28 for (
int x=0;
x<arg.volumeCB;
x++) {
32 for (
int i=0; i<
length; i++) out[i] = in[i];
42 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
43 __global__
void copyCloverKernel(CopyCloverArg<Out,In> arg) {
44 typedef typename mapper<FloatIn>::type RegTypeIn;
45 typedef typename mapper<FloatOut>::type RegTypeOut;
48 int x = blockIdx.x * blockDim.x + threadIdx.x;
49 if (x >= arg.volumeCB)
return;
53 arg.in.load(in, x,
parity);
54 for (
int i=0; i<
length; i++) out[i] = in[i];
55 arg.out.save(out, x,
parity);
60 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
61 class CopyClover : Tunable {
62 CopyCloverArg<Out,In>
arg;
63 const CloverField &meta;
66 unsigned int sharedBytesPerThread()
const {
return 0; }
67 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
69 bool tuneGridDim()
const {
return false; }
70 unsigned int minThreads()
const {
return arg.volumeCB; }
73 CopyClover(CopyCloverArg<Out,In> &arg,
const CloverField &meta) : arg(arg), meta(meta) {
74 writeAuxString(
"out_stride=%d,in_stride=%d", arg.out.stride, arg.in.stride);
76 virtual ~CopyClover() { ; }
78 void apply(
const cudaStream_t &
stream) {
80 copyCloverKernel<FloatOut, FloatIn, length, Out, In>
81 <<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
84 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
88 ps <<
"block=(" << param.block.x <<
"," << param.block.y <<
"," << param.block.z <<
"), ";
89 ps <<
"shared=" << param.shared_bytes;
93 long long flops()
const {
return 0; }
94 long long bytes()
const {
return 2*arg.volumeCB*(arg.in.Bytes() + arg.out.Bytes()); }
97 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
100 CopyCloverArg<OutOrder,InOrder>
arg(outOrder, inOrder, out.Volume());
103 copyClover<FloatOut, FloatIn, length, OutOrder, InOrder>(
arg);
105 CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(arg, out);
106 cloverCopier.apply(0);
108 errorQuda(
"Undefined field location %d for copyClover", location);
113 template <
typename FloatOut,
typename FloatIn,
int length,
typename InOrder>
114 void copyClover(
const InOrder &inOrder, CloverField &out,
bool inverse,
QudaFieldLocation location, FloatOut *Out,
float *outNorm) {
116 copyClover<FloatOut,FloatIn,length>
117 (FloatNOrder<FloatOut,length,2>(
out, inverse, Out, outNorm), inOrder, out, location);
119 copyClover<FloatOut,FloatIn,length>
120 (FloatNOrder<FloatOut,length,4>(
out, inverse, Out, outNorm), inOrder, out, location);
122 copyClover<FloatOut,FloatIn,length>
123 (QDPOrder<FloatOut,length>(
out, inverse, Out), inOrder, out, location);
126 #ifdef BUILD_QDPJIT_INTERFACE
127 copyClover<FloatOut,FloatIn,length>
128 (QDPJITOrder<FloatOut,length>(
out, inverse, Out), inOrder, out, location);
130 errorQuda(
"QDPJIT interface has not been built\n");
136 errorQuda(
"Clover field %d order not supported", out.Order());
141 template <
typename FloatOut,
typename FloatIn,
int length>
142 void copyClover(CloverField &out,
const CloverField &in,
bool inverse,
QudaFieldLocation location,
143 FloatOut *Out, FloatIn *In,
float *outNorm,
float *inNorm) {
147 copyClover<FloatOut,FloatIn,length>
148 (FloatNOrder<FloatIn,length,2>(
in, inverse, In, inNorm), out, inverse, location, Out, outNorm);
150 copyClover<FloatOut,FloatIn,length>
151 (FloatNOrder<FloatIn,length,4>(
in, inverse, In, inNorm), out, inverse, location, Out, outNorm);
153 copyClover<FloatOut,FloatIn,length>
154 (QDPOrder<FloatIn,length>(
in, inverse, In), out, inverse, location, Out, outNorm);
157 #ifdef BUILD_QDPJIT_INTERFACE
158 copyClover<FloatOut,FloatIn,length>
159 (QDPJITOrder<FloatIn,length>(
in, inverse, In), out, inverse, location, Out, outNorm);
161 errorQuda(
"QDPJIT interface has not been built\n");
166 #ifdef BUILD_BQCD_INTERFACE
167 copyClover<FloatOut,FloatIn,length>
168 (BQCDOrder<FloatIn,length>(
in, inverse, In), out, inverse, location, Out, outNorm);
170 errorQuda(
"BQCD interface has not been built\n");
174 errorQuda(
"Clover field %d order not supported", in.Order());
183 void *Out,
void *In,
void *outNorm,
void *inNorm) {
185 #ifdef GPU_CLOVER_DIRAC
187 errorQuda(
"Half precision not supported for order %d", out.
Order());
189 errorQuda(
"Half precision not supported for order %d", in.
Order());
193 copyClover<double,double,72>(
out,
in, inverse,
location, (
double*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
195 copyClover<double,float,72>(
out,
in, inverse,
location, (
double*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
197 copyClover<double,short,72>(
out,
in, inverse,
location, (
double*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
201 copyClover<float,double,72>(
out,
in, inverse,
location, (
float*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
203 copyClover<float,float,72>(
out,
in, inverse,
location, (
float*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
205 copyClover<float,short,72>(
out,
in, inverse,
location, (
float*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
209 copyClover<short,double,72>(
out,
in, inverse,
location, (
short*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
211 copyClover<short,float,72>(
out,
in, inverse,
location, (
short*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
213 copyClover<short,short,72>(
out,
in, inverse,
location, (
short*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
QudaVerbosity getVerbosity()
QudaPrecision Precision() const
const QudaFieldLocation location
QudaCloverFieldOrder Order() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)