8 #ifdef GPU_CLOVER_DIRAC 13 template <
typename Out,
typename In>
14 struct CopyCloverArg {
18 CopyCloverArg (
const Out &
out,
const In
in,
int volume) :
out(
out),
in(
in), volumeCB(
in.volumeCB) { }
24 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
25 void copyClover(CopyCloverArg<Out,In>
arg) {
26 typedef typename mapper<FloatIn>::type RegTypeIn;
27 typedef typename mapper<FloatOut>::type RegTypeOut;
30 for (
int x=0;
x<
arg.volumeCB;
x++) {
44 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
45 __global__
void copyCloverKernel(CopyCloverArg<Out,In>
arg) {
46 typedef typename mapper<FloatIn>::type RegTypeIn;
47 typedef typename mapper<FloatOut>::type RegTypeOut;
49 int x = blockIdx.x *
blockDim.x + threadIdx.x;
50 if (
x >=
arg.volumeCB)
return;
62 template <
typename FloatOut,
typename FloatIn,
int length,
typename Out,
typename In>
63 class CopyClover : TunableVectorY {
64 CopyCloverArg<Out,In>
arg;
65 const CloverField &meta;
68 unsigned int sharedBytesPerThread()
const {
return 0; }
69 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
71 bool tuneGridDim()
const {
return false; }
72 unsigned int minThreads()
const {
return arg.volumeCB; }
75 CopyClover(CopyCloverArg<Out,In> &
arg,
const CloverField &meta)
76 : TunableVectorY(2),
arg(
arg), meta(meta) {
77 writeAuxString(
"out_stride=%d,in_stride=%d",
arg.out.stride,
arg.in.stride);
79 virtual ~CopyClover() { ; }
81 void apply(
const cudaStream_t &
stream) {
83 copyCloverKernel<FloatOut, FloatIn, length, Out, In>
84 <<<tp.grid, tp.block, tp.shared_bytes,
stream>>>(
arg);
87 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
typeid(*this).name(), aux); }
89 long long flops()
const {
return 0; }
90 long long bytes()
const {
return 2*
arg.volumeCB*(
arg.in.Bytes() +
arg.out.Bytes()); }
93 template <
typename FloatOut,
typename FloatIn,
int length,
typename OutOrder,
typename InOrder>
94 void copyClover(OutOrder outOrder,
const InOrder inOrder,
const CloverField &
out,
QudaFieldLocation location) {
96 CopyCloverArg<OutOrder,InOrder>
arg(outOrder, inOrder,
out.Volume());
99 copyClover<FloatOut, FloatIn, length, OutOrder, InOrder>(
arg);
101 CopyClover<FloatOut, FloatIn, length, OutOrder, InOrder> cloverCopier(
arg,
out);
102 cloverCopier.apply(0);
104 errorQuda(
"Undefined field location %d for copyClover", location);
109 template <
typename FloatOut,
typename FloatIn,
int length,
typename InOrder>
110 void copyClover(
const InOrder &inOrder, CloverField &
out,
bool inverse,
QudaFieldLocation location, FloatOut *Out,
float *outNorm) {
112 if (
out.isNative()) {
113 const bool override =
true;
114 typedef typename clover_mapper<FloatOut>::type C;
115 copyClover<FloatOut,FloatIn,length>(C(
out, inverse, Out, outNorm,
override), inOrder,
out, location);
117 copyClover<FloatOut,FloatIn,length>
118 (QDPOrder<FloatOut,length>(
out, inverse, Out), inOrder,
out, location);
121 #ifdef BUILD_QDPJIT_INTERFACE 122 copyClover<FloatOut,FloatIn,length>
123 (QDPJITOrder<FloatOut,length>(
out, inverse, Out), inOrder,
out, location);
125 errorQuda(
"QDPJIT interface has not been built\n");
131 errorQuda(
"Clover field %d order not supported",
out.Order());
136 template <
typename FloatOut,
typename FloatIn,
int length>
138 FloatOut *Out, FloatIn *In,
float *outNorm,
float *inNorm) {
142 const bool override =
true;
143 typedef typename clover_mapper<FloatIn>::type C;
144 copyClover<FloatOut,FloatIn,length>(C(
in, inverse, In, inNorm,
override),
out, inverse, location, Out, outNorm);
146 copyClover<FloatOut,FloatIn,length>
147 (QDPOrder<FloatIn,length>(
in, inverse, In),
out, inverse, location, Out, outNorm);
150 #ifdef BUILD_QDPJIT_INTERFACE 151 copyClover<FloatOut,FloatIn,length>
152 (QDPJITOrder<FloatIn,length>(
in, inverse, In),
out, inverse, location, Out, outNorm);
154 errorQuda(
"QDPJIT interface has not been built\n");
159 #ifdef BUILD_BQCD_INTERFACE 160 copyClover<FloatOut,FloatIn,length>
161 (BQCDOrder<FloatIn,length>(
in, inverse, In),
out, inverse, location, Out, outNorm);
163 errorQuda(
"BQCD interface has not been built\n");
167 errorQuda(
"Clover field %d order not supported",
in.Order());
176 void *Out,
void *In,
void *outNorm,
void *inNorm) {
177 #ifdef GPU_CLOVER_DIRAC 179 errorQuda(
"Half precision not supported for order %d",
out.Order());
181 errorQuda(
"Half precision not supported for order %d",
in.Order());
185 copyClover<double,double,72>(
out,
in, inverse, location, (
double*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
187 copyClover<double,float,72>(
out,
in, inverse, location, (
double*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
189 copyClover<double,short,72>(
out,
in, inverse, location, (
double*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
193 copyClover<float,double,72>(
out,
in, inverse, location, (
float*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
195 copyClover<float,float,72>(
out,
in, inverse, location, (
float*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
197 copyClover<float,short,72>(
out,
in, inverse, location, (
float*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
201 copyClover<short,double,72>(
out,
in, inverse, location, (
short*)Out, (
double*)In, (
float*)outNorm, (
float*)inNorm);
203 copyClover<short,float,72>(
out,
in, inverse, location, (
short*)Out, (
float*)In, (
float*)outNorm, (
float*)inNorm);
205 copyClover<short,short,72>(
out,
in, inverse, location, (
short*)Out, (
short*)In, (
float*)outNorm, (
float*)inNorm);
QudaVerbosity getVerbosity()
Main header file for host and device accessors to CloverFields.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
enum QudaFieldLocation_s QudaFieldLocation
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void copyGenericClover(CloverField &out, const CloverField &in, bool inverse, QudaFieldLocation location, void *Out=0, void *In=0, void *outNorm=0, void *inNorm=0)
This generic function is used for copying the clover field where in the input and output can be in an...