#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field.h>
#include <gauge_field_order.h>
#include <index_helper.cuh>
#include <generics/ldg.h>

#ifdef GPU_GAUGE_FORCE

template <typename Mom, typename Gauge>
struct GaugeForceArg {
  Mom mom;
  const Gauge u;

  int threads;
  int X[4];      // the regular (local) volume dimensions
  int E[4];      // the extended volume dimensions
  int border[4]; // halo depth on each side

  int num_paths;
  int path_max_length;
  double coeff;

  const int *input_path_d[4];
  const int *length_d;
  const double *path_coeff_d;
  int count; // sum of all path lengths, used for flop/byte accounting

  GaugeForceArg(Mom &mom, const Gauge &u, int num_paths, int path_max_length, double coeff,
                int **input_path_d, const int *length_d, const double *path_coeff_d,
                int count, const GaugeField &meta_mom, const GaugeField &meta_u)
    : mom(mom), u(u), threads(meta_mom.VolumeCB()), num_paths(num_paths),
      path_max_length(path_max_length), coeff(coeff),
      input_path_d{ input_path_d[0], input_path_d[1], input_path_d[2], input_path_d[3] },
      length_d(length_d), path_coeff_d(path_coeff_d), count(count)
  {
    for (int i=0; i<4; i++) {
      X[i] = meta_mom.X()[i];
      E[i] = meta_u.X()[i]; // the gauge field may be extended; the momentum field is not
      border[i] = (E[i] - X[i])/2;
    }
  }
  virtual ~GaugeForceArg() { }
};
// path steps are encoded as directions in [0,7]: 0-3 are forward hops,
// 4-7 are backward hops, related by flipDir(dir) = 7-dir
__device__ __host__ inline static int flipDir(int dir) { return (7-dir); }
__device__ __host__ inline static bool isForwards(int dir) { return (dir <= 3); }
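// Illustration of the encoding (an assumption for exposition, not taken from
// this file): the step sequence {1, flipDir(0), flipDir(1)} = {1, 7, 6}
// walks +y, -x, -y, which together with the initial +x link traces out the
// x-y plaquette.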
// read through the read-only cache on the device; plain load on the host
template <typename T>
__device__ __host__ inline static T cache(const T *ptr, int idx) {
#ifdef __CUDA_ARCH__
  return __ldg(ptr+idx);
#else
  return ptr[idx];
#endif
}
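// __ldg loads through the read-only data cache (available from sm_35),
// which suits the path tables here: they are small, never written by the
// kernel, and read by every thread, so caching them avoids repeated
// global-memory traffic.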
template <typename Float, typename Arg, int dir>
__device__ __host__ inline void GaugeForceKernel(Arg &arg, int idx, int parity)
{
  typedef Matrix<complex<Float>,3> Link;

  int x[4] = {0, 0, 0, 0};
  getCoords(x, idx, arg.X, parity);
  for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // convert to extended-grid coordinates

  // linkA accumulates the product along the current path,
  // linkB holds the link loaded at the current step
  Link linkA, linkB, staple;

#ifdef __CUDA_ARCH__
  // on the device keep the offset array in shared memory to avoid
  // local-memory spills from dynamic indexing (4 bytes per thread)
  extern __shared__ int s[];
  int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
  s[tid] = 0;
  signed char *dx = (signed char*)&s[tid];
#else
  int dx[4] = {0, 0, 0, 0};
#endif

  // dx is initialized only once: each input path, together with the initial
  // hop in direction dir, is relied on to form a closed loop, so dx returns
  // to zero at the end of every path iteration
  for (int i=0; i<arg.num_paths; i++) {
    Float coeff = cache(arg.path_coeff_d, i);
    if (coeff == 0) continue;

    const int* path = arg.input_path_d[dir] + i*arg.path_max_length;

    // the path starts at the far end of the link in direction dir,
    // so shift the starting site and flip the parity
    dx[dir]++;
    int nbr_oddbit = (parity^1);

    // the first link of the path (j=0)
    int path0 = cache(path,0);
    int lnkdir = isForwards(path0) ? path0 : flipDir(path0);

    if (isForwards(path0)) {
      linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
      linkA = linkB;
      dx[lnkdir]++; // step to the new site
      nbr_oddbit = nbr_oddbit^1;
    } else {
      dx[lnkdir]--; // going backwards, the link lives on the adjacent site
      nbr_oddbit = nbr_oddbit^1;
      linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
      linkA = conj(linkB);
    }

    // the remaining links of the path
    for (int j=1; j<cache(arg.length_d,i); j++) {
      int pathj = cache(path,j);
      int lnkdir = isForwards(pathj) ? pathj : flipDir(pathj);

      if (isForwards(pathj)) {
        linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
        linkA = linkA * linkB;
        dx[lnkdir]++; // step to the new site
        nbr_oddbit = nbr_oddbit^1;
      } else {
        dx[lnkdir]--; // going backwards, the link lives on the adjacent site
        nbr_oddbit = nbr_oddbit^1;
        linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
        linkA = linkA * conj(linkB);
      }
    } // j

    staple = staple + coeff*linkA;
  } // i

  // multiply the accumulated staple by the local link U_dir(x)
  linkA = arg.u(dir, linkIndex(x,arg.E), parity);
  linkA = linkA * staple;

  // update the momentum field
  Link mom = arg.mom(dir, idx, parity);
  mom = mom - arg.coeff * linkA;
  makeAntiHerm(mom); // project onto the Lie algebra (traceless anti-Hermitian)
  arg.mom(dir, idx, parity) = mom;
}
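// Schematically, per direction dir the kernel above computes
//   mom_dir(x) <- TA( mom_dir(x) - coeff * U_dir(x) * sum_i c_i * P_i(x+dir) ),
// where P_i is the ordered product of links along input path i, c_i its
// per-path coefficient, and TA() the traceless anti-Hermitian projection
// applied by makeAntiHerm.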
template <typename Float, typename Arg>
void GaugeForceCPU(Arg &arg) {
  for (int dir=0; dir<4; dir++) {
    for (int parity=0; parity<2; parity++) {
      for (int idx=0; idx<arg.threads; idx++) {
        switch (dir) {
        case 0: GaugeForceKernel<Float,Arg,0>(arg, idx, parity); break;
        case 1: GaugeForceKernel<Float,Arg,1>(arg, idx, parity); break;
        case 2: GaugeForceKernel<Float,Arg,2>(arg, idx, parity); break;
        case 3: GaugeForceKernel<Float,Arg,3>(arg, idx, parity); break;
        }
      }
    }
  }
}
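// dir is a template parameter rather than a runtime argument, the likely
// motivation being that the compiler then sees a compile-time constant when
// indexing arg.input_path_d[dir] and the field accessors, at the cost of
// instantiating the kernel body four times behind the switch.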
template <typename Float, typename Arg>
__global__ void GaugeForceGPU(Arg arg) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= arg.threads) return;
  int parity = blockIdx.y * blockDim.y + threadIdx.y;
  int dir = blockIdx.z * blockDim.z + threadIdx.z;
  switch (dir) {
  case 0: GaugeForceKernel<Float,Arg,0>(arg, idx, parity); break;
  case 1: GaugeForceKernel<Float,Arg,1>(arg, idx, parity); break;
  case 2: GaugeForceKernel<Float,Arg,2>(arg, idx, parity); break;
  case 3: GaugeForceKernel<Float,Arg,3>(arg, idx, parity); break;
  }
}
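// launch geometry: the x dimension runs over checkerboard sites, y over the
// two parities (hence TunableVectorY(2) below), and z over the four
// directions, so each thread handles one (site, parity, direction) triple.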
template <typename Float, typename Arg>
class GaugeForce : public TunableVectorY {

  Arg &arg;
  const QudaFieldLocation location;
  const char *vol_str;

  unsigned int sharedBytesPerThread() const { return 4; } // for the per-thread signed char dx[4]
  unsigned int minThreads() const { return arg.threads; }
  bool tuneGridDim() const { return false; } // don't tune the grid dimension

public:
  GaugeForce(Arg &arg, const GaugeField &meta_mom, const GaugeField &meta_u)
    : TunableVectorY(2), arg(arg), location(meta_mom.Location()), vol_str(meta_mom.VolString()) { }
  virtual ~GaugeForce() { }

  void apply(const cudaStream_t &stream) {
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      GaugeForceGPU<Float,Arg><<<tp.grid,tp.block,tp.shared_bytes>>>(arg);
    } else {
      GaugeForceCPU<Float,Arg>(arg);
    }
  }

  // the momentum field is overwritten, so back it up around autotuning
  void preTune() { arg.mom.save(); }
  void postTune() { arg.mom.load(); }

  long long flops() const { return (arg.count - arg.num_paths + 1) * 198ll * 2 * arg.mom.volumeCB * 4; }
  long long bytes() const { return ((arg.count + 1ll) * arg.u.Bytes() + 2ll*arg.mom.Bytes()) * 2 * arg.mom.volumeCB * 4; }
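  // accounting sketch: a 3x3 complex matrix multiply costs 198 flops; the
  // path loops perform (count - num_paths) multiplies (the first link of
  // each path is only loaded) plus one final U*staple multiply, all over
  // 2 parities x volumeCB sites x 4 directions. The byte count likewise
  // charges one gauge-link load per path step (plus the local link) and a
  // momentum read-modify-write per site, parity, and direction.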
  TuneKey tuneKey() const {
    std::stringstream aux;
    char comm[5];
    for (int i=0; i<4; i++) comm[i] = (commDimPartitioned(i) ? '1' : '0');
    comm[4] = '\0';
    aux << "comm=" << comm << ",threads=" << arg.threads << ",num_paths=" << arg.num_paths;
    return TuneKey(vol_str, typeid(*this).name(), aux.str().c_str());
  }

  // tune block.x and block.y as usual, but sweep block.z by hand over the
  // powers of two that divide the four directions
  bool advanceBlockDim(TuneParam &param) const {
    dim3 block = param.block;
    dim3 grid = param.grid;
    bool rtn = TunableVectorY::advanceBlockDim(param);
    param.block.z = block.z;
    param.grid.z = grid.z;

    if (!rtn) { // x/y block dimensions are exhausted, so advance in z
      if (param.block.z < 4) {
        param.block.z *= 2;
        param.grid.z = 4 / param.block.z;
        rtn = true;
      } else {
        param.block.z = 1;
        param.grid.z = 4;
        rtn = false;
      }
    }
    return rtn;
  }

  void initTuneParam(TuneParam &param) const {
    TunableVectorY::initTuneParam(param);
    param.block.z = 1;
    param.grid.z = 4;
  }

  void defaultTuneParam(TuneParam &param) const {
    TunableVectorY::defaultTuneParam(param);
    param.block.z = 1;
    param.grid.z = 4;
  }
};
template <typename Float, typename Mom, typename Gauge>
void gaugeForce(Mom mom, const Gauge &u, GaugeField& meta_mom, const GaugeField& meta_u,
                const double coeff, int ***input_path, const int* length_h,
                const double* path_coeff_h, const int num_paths, const int path_max_length)
{
  size_t bytes = num_paths*path_max_length*sizeof(int);
  int *input_path_d[4];
  int count = 0; // total number of path steps, summed over all paths

  int *input_path_h = (int*)safe_malloc(bytes);
  for (int dir=0; dir<4; dir++) {
    input_path_d[dir] = (int*)pool_device_malloc(bytes);
    cudaMemset(input_path_d[dir], 0, bytes);

    memset(input_path_h, 0, bytes);

    // flatten the ragged input_path array into a dense
    // num_paths x path_max_length matrix for copying to the device
    for (int i=0; i < num_paths; i++) {
      for (int j=0; j < length_h[i]; j++) {
        input_path_h[i*path_max_length + j] = input_path[dir][i][j];
        if (dir==0) count++;
      }
    }

    qudaMemcpy(input_path_d[dir], input_path_h, bytes, cudaMemcpyHostToDevice);
  }
  host_free(input_path_h);

  // copy the path lengths and coefficients to the device
  int *length_d = (int*)pool_device_malloc(num_paths*sizeof(int));
  qudaMemcpy(length_d, length_h, num_paths*sizeof(int), cudaMemcpyHostToDevice);

  double *path_coeff_d = (double*)pool_device_malloc(num_paths*sizeof(double));
  qudaMemcpy(path_coeff_d, path_coeff_h, num_paths*sizeof(double), cudaMemcpyHostToDevice);

  GaugeForceArg<Mom,Gauge> arg(mom, u, num_paths, path_max_length, coeff, input_path_d,
                               length_d, path_coeff_d, count, meta_mom, meta_u);
  GaugeForce<Float,GaugeForceArg<Mom,Gauge> > gauge_force(arg, meta_mom, meta_u);
  gauge_force.apply(0);
  qudaDeviceSynchronize();

  pool_device_free(length_d);
  pool_device_free(path_coeff_d);
  for (int dir=0; dir<4; dir++) pool_device_free(input_path_d[dir]);
}
template <typename Float>
void gaugeForce(GaugeField& mom, const GaugeField& u, const double coeff, int ***input_path,
                const int* length, const double* path_coeff, const int num_paths,
                const int max_length)
{
  if (mom.Order() == QUDA_FLOAT2_GAUGE_ORDER) {
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
      errorQuda("Reconstruction type %d not supported", mom.Reconstruct());

    // momentum accessor (reconstruct-10 anti-Hermitian storage)
    typedef typename gauge::FloatNOrder<Float,18,2,11> M;

    if (u.Reconstruct() == QUDA_RECONSTRUCT_NO) {
      typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
      gaugeForce<Float,M,G>(M(mom), G(u), mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
    } else if (u.Reconstruct() == QUDA_RECONSTRUCT_12) {
      typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
      gaugeForce<Float,M,G>(M(mom), G(u), mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
    } else {
      errorQuda("Reconstruction type %d not supported", u.Reconstruct());
    }
  } else {
    errorQuda("Gauge Field order %d not supported", mom.Order());
  }
}
#endif // GPU_GAUGE_FORCE

void gaugeForce(GaugeField& mom, const GaugeField& u, double coeff, int ***input_path,
                int *length, double *path_coeff, int num_paths, int max_length)
{
#ifdef GPU_GAUGE_FORCE
  if (mom.Precision() == QUDA_DOUBLE_PRECISION) {
    gaugeForce<double>(mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
  } else if (mom.Precision() == QUDA_SINGLE_PRECISION) {
    gaugeForce<float>(mom, u, coeff, input_path, length, path_coeff, num_paths, max_length);
  } else {
    errorQuda("Unsupported precision %d", mom.Precision());
  }
#else
  errorQuda("Gauge force has not been built");
#endif // GPU_GAUGE_FORCE
}
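// A minimal usage sketch (illustrative only: the field setup, the path
// table, and the coefficient are assumptions, not part of this file).
// Each direction gets its own table of paths; a single x-y plaquette
// staple for dir=0 could be passed as:
//
//   int plaq_xy[3]      = {1, 7, 6};    // +y, -x, -y (see flipDir above)
//   int *paths_dir0[1]  = {plaq_xy};
//   int **input_path[4] = { paths_dir0, /* ... one table per direction */ };
//   int length[1]       = {3};
//   double path_coeff[1] = {1.0};
//   gaugeForce(mom, u, coeff, input_path, length, path_coeff, 1, 3);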