5 #include <gauge_field.h>
6 #include <color_spinor_field.h>
7 #include <dslash_quda.h>
9 #include <jitify_helper.cuh>
10 #include <kernels/clover_sigma_outer_product.cuh>
14 #ifdef GPU_CLOVER_DIRAC
16 template <typename Float, typename Arg> class CloverSigmaOprod : public TunableVectorYZ
19 const GaugeField &meta;
21 unsigned int sharedBytesPerThread() const { return 0; }
22 unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }
24 unsigned int minThreads() const { return arg.length; }
25 bool tuneGridDim() const { return false; }
28 CloverSigmaOprod(Arg &arg, const GaugeField &meta) : TunableVectorYZ(2, 6), arg(arg), meta(meta)
30 writeAuxString("%s,nvector=%d", meta.AuxString(), arg.nvector);
31 // this sets the communications pattern for the packing kernel
33 create_jitify_program("kernels/clover_sigma_outer_product.cuh");
37 virtual ~CloverSigmaOprod() {}
39 void apply(const qudaStream_t &stream)
41 if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) {
42 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
44 using namespace jitify::reflection;
45 jitify_error = program->kernel("quda::sigmaOprodKernel")
46 .instantiate(arg.nvector, Type<Float>(), Type<Arg>())
47 .configure(tp.grid, tp.block, tp.shared_bytes, stream)
50 switch (arg.nvector) {
51 case 1: qudaLaunchKernel(sigmaOprodKernel<1, Float, Arg>, tp, stream, arg); break;
52 default: errorQuda("Unsupported nvector = %d\n", arg.nvector);
55 } else { // run the CPU code
56 errorQuda("No CPU support for staggered outer-product calculation\n");
60 void preTune() { this->arg.oprod.save(); }
61 void postTune() { this->arg.oprod.load(); }
63 long long flops() const
65 return (2 * (long long)arg.length) * 6
66 * ((0 + 144 + 18) * arg.nvector + 18); // spin_mu_nu + spin trace + multiply-add
68 long long bytes() const
70 return (2 * (long long)arg.length) * 6
71 * ((arg.inA[0].Bytes() + arg.inB[0].Bytes()) * arg.nvector + 2 * arg.oprod.Bytes());
74 TuneKey tuneKey() const { return TuneKey(meta.VolString(), "CloverSigmaOprod", aux); }
75 }; // CloverSigmaOprod
77 template<typename Float>
78 void computeCloverSigmaOprod(GaugeField& oprod, const std::vector<ColorSpinorField*> &x,
79 const std::vector<ColorSpinorField*> &p, const std::vector<std::vector<double> > &coeff, int nvector)
81 // Create the arguments
82 CloverSigmaOprodArg<Float, 3> arg(oprod, x, p, coeff, nvector);
83 CloverSigmaOprod<Float, decltype(arg)> sigma_oprod(arg, oprod);
85 } // computeCloverSigmaOprod
87 #endif // GPU_CLOVER_FORCE
89 void computeCloverSigmaOprod(GaugeField& oprod,
90 std::vector<ColorSpinorField*> &x,
91 std::vector<ColorSpinorField*> &p,
92 std::vector<std::vector<double> > &coeff)
95 #ifdef GPU_CLOVER_DIRAC
96 if (x.size() > MAX_NVECTOR) {
98 std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
99 std::vector<ColorSpinorField*> p0(p.begin(), p.begin()+p.size()/2);
100 std::vector<std::vector<double> > coeff0(coeff.begin(), coeff.begin()+coeff.size()/2);
101 for (unsigned int i=0; i<coeff0.size(); i++) {
102 coeff0[i].reserve(2); coeff0[i][0] = coeff[i][0]; coeff0[i][1] = coeff[i][1];
104 computeCloverSigmaOprod(oprod, x0, p0, coeff0);
106 std::vector<ColorSpinorField*> x1(x.begin()+x.size()/2, x.end());
107 std::vector<ColorSpinorField*> p1(p.begin()+p.size()/2, p.end());
108 std::vector<std::vector<double> > coeff1(coeff.begin()+coeff.size()/2, coeff.end());
109 for (unsigned int i=0; i<coeff1.size(); i++) {
110 coeff1[i].reserve(2); coeff1[i][0] = coeff[coeff.size()/2 + i][0]; coeff1[i][1] = coeff[coeff.size()/2 + i][1];
112 computeCloverSigmaOprod(oprod, x1, p1, coeff1);
117 if (oprod.Order() != QUDA_FLOAT2_GAUGE_ORDER) errorQuda("Unsupported output ordering: %d\n", oprod.Order());
119 if (checkPrecision(*x[0], *p[0], oprod) == QUDA_DOUBLE_PRECISION) {
120 computeCloverSigmaOprod<double>(oprod, x, p, coeff, x.size());
122 errorQuda("Unsupported precision: %d\n", oprod.Precision());
124 #else // GPU_CLOVER_DIRAC not defined
125 errorQuda("Clover Dirac operator has not been built!");
128 } // computeCloverForce