// Kernel argument struct for the gauge-force computation: packages the
// momentum and gauge-field accessors, the device-side path tables, and the
// lattice geometry consumed by the kernels below.
// NOTE(review): this extract is garbled/incomplete -- several members used
// here (mom, u, threads, num_paths, path_max_length, coeff, length_d, count,
// X, E, border) are not visible, and the stray integers are original line
// numbers fused in by extraction, not code.
4 #include <generics/ldg.h> 10 template <
typename Mom,
typename Gauge>
11 struct GaugeForceArg {
// Per-dimension device pointers to the flattened (num_paths x
// path_max_length) path tables.
25 const int *input_path_d[4];
// Device pointer to the per-path coefficients (copied from the host in
// gaugeForce() below).
27 const double *path_coeff_d;
// Constructor: captures the field accessors and copies the four
// per-dimension path-table pointers; the thread count comes from the
// momentum field's checkerboard volume.
31 GaugeForceArg(Mom &mom,
const Gauge &u,
int num_paths,
int path_max_length,
double coeff,
32 int **input_path_d,
const int *length_d,
const double* path_coeff_d,
int count,
33 const GaugeField &meta_mom,
const GaugeField &meta_u)
34 : mom(mom), u(u), threads(meta_mom.VolumeCB()), num_paths(num_paths),
35 path_max_length(path_max_length),
coeff(
coeff),
36 input_path_d{ input_path_d[0], input_path_d[1], input_path_d[2], input_path_d[3] },
37 length_d(length_d), path_coeff_d(path_coeff_d),
count(
count)
// Record the local dimensions and the halo depth per dimension.
39 for(
int i=0;
i<4;
i++) {
40 X[
i] = meta_mom.X()[
i];
// border = (extended - local)/2 -- presumably the halo width on each
// side; E is not assigned in this extract (TODO confirm against the
// full source).
42 border[
i] = (
E[
i] -
X[
i])/2;
46 virtual ~GaugeForceArg() { }
/**
   @brief Map a backward link-direction index (4..7) to its forward
   complement (3..0), and vice versa.  Directions are encoded so that
   dir and 7-dir refer to the same axis traversed in opposite senses.
   @param dir Encoded link direction in [0,7]
   @return The complementary direction, 7 - dir
*/
__device__ __host__ inline static int flipDir(int dir)
{
  const int flipped = 7 - dir;
  return flipped;
}
/**
   @brief Test whether an encoded link direction is a forward direction.
   Forward directions are encoded as 0..3, backward ones as 4..7.
   @param dir Encoded link direction
   @return true when dir refers to a forward (positive) direction
*/
__device__ __host__ inline static bool isForwards(int dir)
{
  // dir <= 3 in the original; dir < 4 is the same predicate for all ints
  return dir < 4;
}
// Read-only cached load helper used for the path-table lookups in the
// kernel below.
// NOTE(review): the body is missing from this extract, and the template<
// typename T> introducer appears to have been lost as well; given the
// <generics/ldg.h> include at the top of the file it presumably wraps
// __ldg() on the device and a plain load on the host -- confirm against
// the full source.
54 __device__ __host__
inline static T cache(
const T *
ptr,
int idx) {
// Per-site gauge-force evaluation, shared by the CPU and GPU drivers.
// The template parameter dir is the compile-time force direction (0-3)
// used to select the corresponding path table.
// NOTE(review): this extract is heavily garbled -- the stray integers are
// original line numbers fused in by extraction, and several statements
// (coordinate computation from idx, tid, the link loads into linkA/linkB,
// the per-path coefficient load into coeff, the epilogue storing mom, and
// closing braces) are missing.
62 template<
typename Float,
typename Arg,
int dir>
63 __device__ __host__
inline void GaugeForceKernel(Arg &
arg,
int idx,
int parity)
// Local coordinates, shifted into the interior by the halo border.
67 int x[4] = {0, 0, 0, 0};
69 for (
int dr=0; dr<4; ++dr)
x[dr] +=
arg.border[dr];
73 Link linkA, linkB, staple;
// Device path: per-thread 4-byte offset scratch carved out of dynamic
// shared memory (matches sharedBytesPerThread() == 4 in the tuning class
// below).
76 extern __shared__
int s[];
79 signed char *dx = (
signed char*)&
s[tid];
// Host (or non-shared-memory) alternative: plain local offset array.
// NOTE(review): the preprocessor conditional separating this from the
// shared-memory variant appears to have been lost in extraction.
81 int dx[4] = {0, 0, 0, 0};
// Accumulate the staple over every path contributing to this direction.
84 for (
int i=0;
i<
arg.num_paths;
i++) {
// Skip paths whose coefficient vanishes (the load of coeff for path i
// is elided in this extract).
86 if (
coeff == 0)
continue;
// Row of the flattened path table describing path i.
88 const int* path =
arg.input_path_d[dir] +
i*
arg.path_max_length;
// Checkerboard parity of the neighbouring site; toggled on every hop.
91 int nbr_oddbit = (
parity^1);
// First link of the path: 0-3 encode forward hops, 4-7 backward ones
// (flipDir maps a backward index to its forward complement).
94 int path0 = cache(path,0);
95 int lnkdir = isForwards(path0) ? path0 : flipDir(path0);
97 if (isForwards(path0)) {
101 nbr_oddbit = nbr_oddbit^1;
104 nbr_oddbit = nbr_oddbit^1;
// Walk the remaining links of path i, multiplying them into linkA.
109 for (
int j=1; j<cache(
arg.length_d,
i); j++) {
111 int pathj = cache(path,j);
112 int lnkdir = isForwards(pathj) ? pathj : flipDir(pathj);
114 if (isForwards(pathj)) {
// Forward hop: multiply by the link as loaded.
116 linkA = linkA * linkB;
118 nbr_oddbit = nbr_oddbit^1;
121 nbr_oddbit = nbr_oddbit^1;
// Backward hop: multiply by the conjugated link.
123 linkA = linkA *
conj(linkB);
// Weighted accumulation of this path's product into the staple.
126 staple = staple +
coeff*linkA;
// Close the loop against the staple and apply the overall force
// coefficient to update the momentum.
131 linkA = linkA * staple;
135 mom = mom -
arg.coeff * linkA;
// Host fallback: evaluates the gauge force for all four directions on the
// CPU.
// NOTE(review): the per-parity/per-site loops and the call into
// GaugeForceKernel inside the direction loop are missing from this
// extract.
141 template <
typename Float,
typename Arg>
142 void GaugeForceCPU(Arg &
arg) {
143 for (
int dir=0; dir<4; dir++) {
// GPU entry point: sites are indexed via idx with the force direction
// mapped onto the z block/grid dimension.
// NOTE(review): the computation of idx (and parity) from
// blockIdx/threadIdx, any dir bounds check, and the per-direction dispatch
// to GaugeForceKernel are missing from this extract.
166 template <
typename Float,
typename Arg>
167 __global__
void GaugeForceGPU(Arg
arg) {
// Guard against the padded tail of the launch grid.
169 if (
idx >=
arg.threads)
return;
171 int dir = blockIdx.z *
blockDim.z + threadIdx.z;
// Autotuned launcher for the gauge-force kernels, dispatching to the GPU
// kernel or the CPU fallback.
// NOTE(review): extract is incomplete -- the member declarations (arg,
// location, vol_str), the location check and tuneLaunch call in apply(),
// the comm string construction in tuneKey(), and the bodies of
// advanceBlockDim/initTuneParam/defaultTuneParam are partially missing.
189 template <
typename Float,
typename Arg>
190 class GaugeForce :
public TunableVectorY {
// 4 bytes of dynamic shared memory per thread -- backs the signed char
// dx[4] offset scratch used in the kernel.
196 unsigned int sharedBytesPerThread()
const {
return 4; }
197 unsigned int minThreads()
const {
return arg.threads; }
// Grid dimension is derived from the thread count, not tuned.
198 bool tuneGridDim()
const {
return false; }
// TunableVectorY(2): the y dimension covers two values -- presumably the
// two checkerboard parities (TODO confirm).
201 GaugeForce(Arg &
arg,
const GaugeField &meta_mom,
const GaugeField &meta_u)
202 : TunableVectorY(2),
arg(
arg), location(meta_mom.Location()),
vol_str(meta_mom.VolString()) { }
203 virtual ~GaugeForce() { }
// Launch the tuned GPU kernel, or fall back to the CPU implementation
// (the location branch is elided in this extract).
205 void apply(
const cudaStream_t &
stream) {
208 GaugeForceGPU<Float,Arg><<<tp.grid,tp.block,tp.shared_bytes>>>(
arg);
210 GaugeForceCPU<Float,Arg>(
arg);
// Back up / restore the momentum field so autotuning trial launches do
// not corrupt the result.
214 void preTune() {
arg.mom.save(); }
215 void postTune() {
arg.mom.load(); }
// Performance model used for tuning reports; 198 flops corresponds to a
// 3x3 complex matrix-matrix multiply (TODO confirm the count/num_paths
// bookkeeping against the full source).
217 long long flops()
const {
return (
arg.count -
arg.num_paths + 1) * 198ll * 2 *
arg.mom.volumeCB * 4; }
218 long long bytes()
const {
return ((
arg.count + 1ll) *
arg.u.Bytes() + 2ll*
arg.mom.Bytes()) * 2 *
arg.mom.volumeCB * 4; }
// Tuning key: volume string, type name, plus comm topology, thread count
// and number of paths (the construction of comm is elided here).
220 TuneKey tuneKey()
const {
221 std::stringstream aux;
228 aux <<
"comm=" << comm <<
",threads=" <<
arg.threads <<
",num_paths=" <<
arg.num_paths;
229 return TuneKey(
vol_str,
typeid(*this).name(), aux.str().c_str());
// Additionally tune the z block dimension (the direction axis, capped
// at 4) on top of the base-class block tuning.
232 bool advanceBlockDim(TuneParam &
param)
const {
234 dim3 grid =
param.grid;
237 param.grid.z = grid.z;
240 if (
param.block.z < 4) {
253 void initTuneParam(TuneParam &
param)
const {
259 void defaultTuneParam(TuneParam &
param)
const {
// Driver: stages the host-side path tables onto the device, builds the
// kernel argument struct, and runs the autotuned force computation.
// NOTE(review): the allocations of input_path_d[dir], input_path_h,
// length_d and path_coeff_d, the computation of count, and the matching
// frees are missing from this extract.
266 template <
typename Float,
typename Mom,
typename Gauge>
267 void gaugeForce(Mom mom,
const Gauge &u, GaugeField& meta_mom,
const GaugeField& meta_u,
const double coeff,
268 int ***input_path,
const int* length_h,
const double* path_coeff_h,
const int num_paths,
const int path_max_length)
// Size of one per-dimension padded path table.
270 size_t bytes = num_paths*path_max_length*
sizeof(
int);
271 int *input_path_d[4];
274 for (
int dir=0; dir<4; dir++) {
// Zero-fill so the padded tail of each row is well defined.
276 cudaMemset(input_path_d[dir], 0,
bytes);
// Flatten the ragged host paths into a padded row-major staging buffer.
282 for (
int i=0;
i < num_paths;
i++) {
283 for (
int j=0; j < length_h[
i]; j++) {
284 input_path_h[
i*path_max_length + j] = input_path[dir][
i][j];
288 qudaMemcpy(input_path_d[dir], input_path_h,
bytes, cudaMemcpyHostToDevice);
// Copy the per-path lengths and coefficients to the device.
295 qudaMemcpy(length_d, length_h, num_paths*
sizeof(
int), cudaMemcpyHostToDevice);
299 qudaMemcpy(path_coeff_d, path_coeff_h, num_paths*
sizeof(
double), cudaMemcpyHostToDevice);
// Build the argument struct and run the autotuned launcher on the
// default stream.
301 GaugeForceArg<Mom,Gauge>
arg(mom, u, num_paths, path_max_length,
coeff, input_path_d,
302 length_d, path_coeff_d,
count, meta_mom, meta_u);
303 GaugeForce<Float,GaugeForceArg<Mom,Gauge> > gauge_force(
arg, meta_mom, meta_u);
304 gauge_force.apply(0);
// Precision-level dispatch: selects the momentum and gauge accessor types
// from the fields' reconstruct/order settings, then calls the staging
// driver above.
// NOTE(review): the surrounding if/switch structure (momentum reconstruct
// and order checks, gauge reconstruct switch) is partially missing from
// this extract.
313 template <
typename Float>
314 void gaugeForce(GaugeField& mom,
const GaugeField& u,
const double coeff,
int ***input_path,
315 const int*
length,
const double* path_coeff,
const int num_paths,
const int max_length)
318 errorQuda(
"Reconstruction type %d not supported", mom.Reconstruct());
// Momentum accessor: 18 reals per site, 2 parities, order 11 --
// presumably the anti-Hermitian momentum ordering (TODO confirm).
321 typedef typename gauge::FloatNOrder<Float,18,2,11> M;
// Unreconstructed gauge links.
323 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type G;
324 gaugeForce<Float,M,G>(M(mom), G(u), mom, u,
coeff, input_path,
length, path_coeff, num_paths, max_length);
// 12-real reconstructed gauge links.
326 typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type G;
327 gaugeForce<Float,M,G>(M(mom), G(u), mom, u,
coeff, input_path,
length, path_coeff, num_paths, max_length);
329 errorQuda(
"Reconstruction type %d not supported", u.Reconstruct());
332 errorQuda(
"Gauge Field order %d not supported", mom.Order());
// Public entry point: dispatches on the momentum field's precision
// (double/float) when built with GPU_GAUGE_FORCE, and errors out
// otherwise.
// NOTE(review): the function signature's leading part (return type, name,
// first parameters) and the precision switch structure are garbled in this
// extract; the trailing "#define qudaMemcpy..." fragment fused onto the
// final line is extraction residue, not code.
336 #endif // GPU_GAUGE_FORCE 340 int *
length,
double *path_coeff,
int num_paths,
int max_length)
342 #ifdef GPU_GAUGE_FORCE 348 gaugeForce<double>(mom, u,
coeff, input_path,
length, path_coeff, num_paths, max_length);
351 gaugeForce<float>(mom, u,
coeff, input_path,
length, path_coeff, num_paths, max_length);
354 errorQuda(
"Unsupported precision %d", mom.Precision());
357 errorQuda(
"Gauge force has not been built");
358 #endif // GPU_GAUGE_FORCE #define qudaMemcpy(dst, src, count, kind)
int commDimPartitioned(int dir)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
static __device__ __host__ int linkIndex(const int x[], const I X[4])
QudaVerbosity getVerbosity()
void initTuneParam(TuneParam ¶m) const
bool advanceBlockDim(TuneParam ¶m) const
void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, int ***input_path, int *length, double *path_coeff, int num_paths, int max_length)
Compute the gauge-force contribution to the momentum.
#define pool_device_malloc(size)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
void defaultTuneParam(TuneParam ¶m) const
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
#define safe_malloc(size)
QudaFieldLocation Location() const
void * memset(void *__b, int __c, size_t __len)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ __host__ void makeAntiHerm(Matrix< Complex, N > &m)
#define pool_device_free(ptr)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)