10 using namespace gauge;
12 #ifdef GPU_GAUGE_TOOLS 14 template <
typename Mom>
15 struct MomActionArg :
public ReduceArg<double> {
20 MomActionArg(
const Mom &mom,
const GaugeField &meta)
23 for(
int dir=0; dir<4; ++dir)
X[dir] = meta.
X()[dir];
27 template<
int blockSize,
typename Float,
typename Mom>
29 int x = threadIdx.x + blockIdx.x*
blockDim.x;
35 for (
int mu=0;
mu<4;
mu++) {
39 double local_sum = 0.0;
40 for (
int j=0; j<6; j++) local_sum += v[j]*v[j];
41 for (
int j=6; j<9; j++) local_sum += 0.5*v[j]*v[j];
48 reduce2d<blockSize,2>(
arg, action);
51 template<
typename Float,
typename Mom>
53 MomActionArg<Mom> &
arg;
57 unsigned int minThreads()
const {
return arg.threads; }
61 virtual ~MomAction () { }
63 void apply(
const cudaStream_t &
stream){
65 arg.result_h[0] = 0.0;
74 std::stringstream aux;
75 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
79 long long flops()
const {
return 4*2*
arg.threads*23; }
80 long long bytes()
const {
return 4*2*
arg.threads*
arg.mom.Bytes(); }
83 template<
typename Float,
typename Mom>
84 void momAction(
const Mom mom,
const GaugeField& meta,
double &action) {
85 MomActionArg<Mom>
arg(mom, meta);
86 MomAction<Float,Mom> momAction(
arg, meta);
92 action =
arg.result_h[0];
95 template<
typename Float>
115 #ifdef GPU_GAUGE_TOOLS 117 action = momAction<double>(mom);
119 action = momAction<float>(mom);
130 #ifdef GPU_GAUGE_TOOLS 131 template<
typename Float,
typename Mom,
typename Force>
132 struct UpdateMomArg {
138 UpdateMomArg(Mom &mom,
const Float &
coeff, Force &force,
GaugeField &meta)
139 : threads(meta.VolumeCB()), mom(mom),
coeff(
coeff), force(force) {
140 for (
int dir=0; dir<4; ++dir)
X[dir] = meta.
X()[dir];
144 template<
typename Float,
typename Mom,
typename Force>
145 __global__
void UpdateMomKernel(UpdateMomArg<Float, Mom, Force>
arg) {
146 int x = blockIdx.x*
blockDim.x + threadIdx.x;
149 while(
x<
arg.threads){
150 for (
int d=0;
d<4;
d++) {
152 arg.force.load(reinterpret_cast<Float*>(
f.data),
x,
d,
parity);
154 m = m +
arg.coeff *
f;
166 template<
typename Float,
typename Mom,
typename Force>
168 UpdateMomArg<Float, Mom, Force> &
arg;
172 unsigned int minThreads()
const {
return arg.threads; }
175 UpdateMom(UpdateMomArg<Float,Mom,Force> &
arg,
const GaugeField &meta) :
arg(
arg), meta(meta) {}
176 virtual ~UpdateMom () { }
178 void apply(
const cudaStream_t &
stream){
188 std::stringstream aux;
189 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
193 void preTune() {
arg.mom.save();}
194 void postTune() {
arg.mom.load();}
195 long long flops()
const {
return 4*2*
arg.threads*(36+42); }
196 long long bytes()
const {
return 4*2*
arg.threads*(2*
arg.mom.Bytes()+
arg.force.Bytes()); }
199 template<
typename Float,
typename Mom,
typename Force>
201 UpdateMomArg<Float,Mom,Force>
arg(mom,
coeff, force, meta);
202 UpdateMom<Float,Mom,Force> update(
arg, meta);
206 template <
typename Float>
222 #endif // GPU_GAUGE_TOOLS 225 #ifdef GPU_GAUGE_TOOLS 233 updateMomentum<double>(mom,
coeff, force);
241 #endif // GPU_GAUGE_TOOLS 247 #ifdef GPU_GAUGE_TOOLS 249 template<
typename Float,
typename Force,
typename Gauge>
255 ApplyUArg(Force &force, Gauge &U,
GaugeField &meta)
256 : threads(meta.VolumeCB()), force(force), U(U) {
257 for (
int dir=0; dir<4; ++dir)
X[dir] = meta.
X()[dir];
261 template<
typename Float,
typename Force,
typename Gauge>
262 __global__
void ApplyUKernel(ApplyUArg<Float,Force,Gauge>
arg) {
263 int x = blockIdx.x*
blockDim.x + threadIdx.x;
267 while (
x<
arg.threads) {
268 for (
int d=0;
d<4;
d++) {
269 arg.force.load(reinterpret_cast<Float*>(
f.data),
x,
d,
parity);
274 arg.force.save(reinterpret_cast<Float*>(
f.data),
x,
d,
parity);
284 template<
typename Float,
typename Force,
typename Gauge>
286 ApplyUArg<Float, Force, Gauge> &
arg;
290 unsigned int minThreads()
const {
return arg.threads; }
293 ApplyU(ApplyUArg<Float,Force,Gauge> &
arg,
const GaugeField &meta) :
arg(
arg), meta(meta) {}
294 virtual ~ApplyU () { }
296 void apply(
const cudaStream_t &
stream){
306 std::stringstream aux;
307 aux <<
"threads=" <<
arg.threads <<
",prec=" <<
sizeof(Float);
311 void preTune() {
arg.force.save();}
312 void postTune() {
arg.force.load();}
313 long long flops()
const {
return 4*2*
arg.threads*198; }
314 long long bytes()
const {
return 4*2*
arg.threads*(2*
arg.force.Bytes()+
arg.U.Bytes()); }
317 template<
typename Float,
typename Force,
typename Gauge>
319 ApplyUArg<Float,Force,Gauge>
arg(force, U, meta);
320 ApplyU<Float,Force,Gauge>
applyU(
arg, meta);
324 template <
typename Float>
338 #endif // GPU_GAUGE_TOOLS 341 #ifdef GPU_GAUGE_TOOLS 349 applyU<double>(force, U);
357 #endif // GPU_GAUGE_TOOLS
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
const char * VolString() const
double computeMomAction(const GaugeField &mom)
Compute and return global the momentum action 1/2 mom^2.
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
int int int enum cudaChannelFormatKind f
Main header file for host and device accessors to GaugeFields.
void applyU(GaugeField &force, GaugeField &U)
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
QudaFieldLocation Location() const
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Accessor routine for CloverFields in native field order.
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
QudaReconstructType Reconstruct() const
QudaGaugeFieldOrder Order() const
void comm_allreduce(double *data)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
static __inline__ size_t size_t d
QudaPrecision Precision() const