2 #include <gauge_field.h>
3 #include <jitify_helper.cuh>
4 #include <kernels/gauge_plaq.cuh>
5 #include <instantiate.h>
// Host-side driver that launches the computePlaq kernel for a gauge field and
// post-processes the result held in a caller-supplied double2.
// Template parameters: Float, nColor and recon are forwarded unchanged to
// GaugePlaqArg<Float, nColor, recon>.
// The class derives (privately, by the class default) from
// TunableLocalParityReduction and uses tuneLaunch() to obtain its launch
// parameters.
// NOTE(review): this excerpt omits lines (member declarations, braces and any
// preprocessor guards are not visible); the comments below describe only what
// the visible lines show.
9 template<typename Float, int nColor, QudaReconstructType recon>
10 class GaugePlaq : TunableLocalParityReduction {
// Constructor.
//   u   - gauge field the plaquette is computed from
//   plq - output, taken by non-const reference; apply() writes into it
// The visible body registers the kernel header with jitify and copies
// compile_type_str(u) into aux, which tuneKey() later uses.
// NOTE(review): the initialiser list and the rest of the body are not visible here.
15 GaugePlaq(const GaugeField &u, double2 &plq) :
20 create_jitify_program("kernels/gauge_plaq.cuh");
22 strcpy(aux, compile_type_str(u));
// Launches the plaquette kernel on `stream` and normalises the result.
// Only fields whose Location() is QUDA_CUDA_FIELD_LOCATION are handled.
26 void apply(const qudaStream_t &stream){
27 if (u.Location() == QUDA_CUDA_FIELD_LOCATION) {
// Launch parameters (grid, block, shared_bytes) come from tuneLaunch().
28 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
29 GaugePlaqArg<Float, nColor, recon> arg(u);
// Launch form 1: look up "quda::computePlaq" in the jitify program, instantiate
// it with the block x-dimension and the argument type, then launch it with the
// tuned configuration. The jitify status is mapped onto arg.launch_error.
// NOTE(review): presumably only one of the two launch forms is compiled in,
// selected by a preprocessor conditional that is not visible in this excerpt.
31 using namespace jitify::reflection;
32 jitify_error = program->kernel("quda::computePlaq")
33 .instantiate((int)tp.block.x,type_of(arg))
34 .configure(tp.grid,tp.block,tp.shared_bytes,stream).launch(arg);
35 arg.launch_error = jitify_error == CUDA_SUCCESS ? QUDA_SUCCESS : QUDA_ERROR;
// Launch form 2: launch via the LAUNCH_KERNEL_LOCAL_PARITY macro.
37 LAUNCH_KERNEL_LOCAL_PARITY(computePlaq, (*this), tp, stream, arg, decltype(arg));
// The reduction and normalisation are skipped while activeTuning() is true.
// NOTE(review): the step that fills plq from the kernel result is not visible here.
40 if (!activeTuning()) {
// plq is treated as an array of two doubles for the cross-rank reduction.
41 comm_allreduce_array((double*)&plq, 2);
// Divide both components by 9 * 2 * arg.threads * comm_size().
// NOTE(review): presumably 9 = 3 colours x 3 plane orientations and
// 2 * arg.threads = sites over both parities; the 9 is hard-coded rather than
// derived from nColor — confirm before using with nColor != 3.
42 for (int i = 0; i < 2; i++) ((double*)&plq)[i] /= 9.*2*arg.threads*comm_size();
// NOTE(review): presumably the else-branch for fields that are not on the device.
45 errorQuda("CPU not supported yet\n");
// Tuning key built from the field's volume string, this type's typeid name and aux.
49 TuneKey tuneKey() const { return TuneKey(u.VolString(), typeid(*this).name(), aux); }
// Flop estimate: 6 * Volume * (3 * (8*Nc^3 - 2*Nc^2) + Nc).
// NOTE(review): Nc is defined on a line not visible in this excerpt; presumably
// the number of colours of u.
50 long long flops() const
53 return 6ll*u.Volume()*(3 * (8 * Nc * Nc * Nc - 2 * Nc * Nc) + Nc);
// Byte estimate: the value of u.Bytes().
55 long long bytes() const { return u.Bytes(); }
// Computes the plaquette of gauge field U by dispatching to GaugePlaq via
// instantiate<>, passing plq to receive the two-component result.
// The double3 assembled here is (0.5*(plq.x + plq.y), plq.x, plq.y): the mean of
// the two components followed by each component on its own.
// NOTE(review): presumably plq.x / plq.y are the spatial / temporal plaquettes —
// confirm against the kernel. The declaration of plq and the return statement
// are not visible in this excerpt.
58 double3 plaquette(const GaugeField &U)
61 instantiate<GaugePlaq>(U, plq);
62 double3 plaq = make_double3(0.5*(plq.x + plq.y), plq.x, plq.y);