14 #ifdef GPU_CLOVER_DIRAC 16 template <
typename Float,
typename Arg>
class CloverSigmaOprod :
public TunableVectorYZ
21 const GaugeField &meta;
23 unsigned int sharedBytesPerThread()
const {
return 0; }
24 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
26 unsigned int minThreads()
const {
return arg.length; }
27 bool tuneGridDim()
const {
return false; }
30 CloverSigmaOprod(Arg &arg,
const GaugeField &meta) : TunableVectorYZ(2, 6), arg(arg), meta(meta)
32 writeAuxString(
"prec=%lu,stride=%d,nvector=%d",
sizeof(Float), arg.inA[0].Stride(), arg.nvector);
35 create_jitify_program(
"kernels/clover_sigma_outer_product.cuh");
39 virtual ~CloverSigmaOprod() {}
41 void apply(
const cudaStream_t &
stream)
46 using namespace jitify::reflection;
47 jitify_error = program->kernel(
"quda::sigmaOprodKernel")
48 .instantiate(arg.nvector, Type<Float>(), Type<Arg>())
49 .configure(tp.grid, tp.block, tp.shared_bytes, stream)
52 switch (arg.nvector) {
53 case 1: sigmaOprodKernel<1, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
54 case 2: sigmaOprodKernel<2, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
55 case 3: sigmaOprodKernel<3, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
56 case 4: sigmaOprodKernel<4, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
57 case 5: sigmaOprodKernel<5, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
58 case 6: sigmaOprodKernel<6, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
59 case 7: sigmaOprodKernel<7, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
60 case 8: sigmaOprodKernel<8, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
61 case 9: sigmaOprodKernel<9, Float><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(
arg);
break;
65 errorQuda(
"No CPU support for staggered outer-product calculation\n");
69 void preTune() { this->arg.oprod.save(); }
70 void postTune() { this->arg.oprod.load(); }
72 long long flops()
const 74 return (2 * (
long long)arg.length) * 6
75 * ((0 + 144 + 18) * arg.nvector + 18);
77 long long bytes()
const 79 return (2 * (
long long)arg.length) * 6
80 * ((arg.inA[0].Bytes() + arg.inB[0].Bytes()) * arg.nvector + 2 * arg.oprod.Bytes());
83 TuneKey tuneKey()
const {
return TuneKey(meta.VolString(),
"CloverSigmaOprod", aux); }
86 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
88 std::vector<std::vector<double> > &coeff,
int nvector) {
90 typedef CloverSigmaOprodArg<Float, Output, InputA, InputB> Arg;
91 Arg
arg(oprod, inA, inB, coeff, out, nvector);
92 CloverSigmaOprod<Float, Arg> sigma_oprod(arg, out);
96 #endif // GPU_CLOVER_FORCE 99 std::vector<ColorSpinorField*> &x,
100 std::vector<ColorSpinorField*> &p,
101 std::vector<std::vector<double> > &coeff)
104 #ifdef GPU_CLOVER_DIRAC 107 std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
108 std::vector<ColorSpinorField*> p0(p.begin(), p.begin()+p.size()/2);
109 std::vector<std::vector<double> > coeff0(coeff.begin(), coeff.begin()+coeff.size()/2);
110 for (
unsigned int i=0; i<coeff0.size(); i++) {
111 coeff0[i].reserve(2); coeff0[i][0] = coeff[i][0]; coeff0[i][1] = coeff[i][1];
115 std::vector<ColorSpinorField*> x1(x.begin()+x.size()/2, x.end());
116 std::vector<ColorSpinorField*> p1(p.begin()+p.size()/2, p.end());
117 std::vector<std::vector<double> > coeff1(coeff.begin()+coeff.size()/2, coeff.end());
118 for (
unsigned int i=0; i<coeff1.size(); i++) {
119 coeff1[i].reserve(2); coeff1[i][0] = coeff[coeff.size()/2 + i][0]; coeff1[i][1] = coeff[coeff.size()/2 + i][1];
128 if(x[0]->Precision() != oprod.
Precision())
129 errorQuda(
"Mixed precision not supported: %d %d\n", x[0]->Precision(), oprod.
Precision());
136 for (
unsigned int i=0; i<x.size(); i++) {
137 spinorA[i].
set(*dynamic_cast<cudaColorSpinorField*>(x[i]));
138 spinorB[i].
set(*dynamic_cast<cudaColorSpinorField*>(p[i]));
142 oprod, spinorA, spinorB, coeff, x.size());
147 #else // GPU_CLOVER_DIRAC not defined 148 errorQuda(
"Clover Dirac operator has not been built!");
QudaVerbosity getVerbosity()
Helper file when using jitify run-time compilation. This file should be included in source code...
void set(const cudaColorSpinorField &x)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaGaugeFieldOrder Order() const
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const