#include <gauge_field_order.h>
#include <quda_matrix.h>
#include <index_helper.cuh>
#include <tune_quda.h>
#include <instantiate.h>

namespace quda {

  struct paths {
    const int num_paths;
    const int max_length;
    int *input_path[4];
    const int *length;
    const double *path_coeff;
    int count;
    paths(void *buffer, size_t bytes, int ***input_path, int *length_h, double *path_coeff_h, int num_paths, int max_length) :
      num_paths(num_paths),
      max_length(max_length),
      count(0)
    {
      // stage all host data in a single zero-initialized buffer
      void *path_h = safe_malloc(bytes);
      memset(path_h, 0, bytes);

      int *input_path_h = (int*)path_h;
      for (int dir=0; dir<4; dir++) {
        // flatten the input_path array for copying to the device
        for (int i=0; i < num_paths; i++) {
          for (int j=0; j < length_h[i]; j++) {
            input_path_h[dir*num_paths*max_length + i*max_length + j] = input_path[dir][i][j];
            if (dir==0) count++; // count the total number of links over all paths
          }
        }
      }

      // append the path-length array after the flattened paths
      memcpy((char*)path_h + 4 * num_paths * max_length * sizeof(int), length_h, num_paths*sizeof(int));

      // append the path coefficients after the length array
      memcpy((char*)path_h + 4 * num_paths * max_length * sizeof(int) + num_paths*sizeof(int), path_coeff_h, num_paths*sizeof(double));

      // single host-to-device copy of the staged buffer
      qudaMemcpy(buffer, path_h, bytes, cudaMemcpyHostToDevice);
      host_free(path_h);

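      // device buffer layout (single allocation):
      //   4*num_paths*max_length ints : flattened paths, one block per dimension
      //   num_paths ints              : path lengths
      //   num_paths doubles           : path coefficients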
      // finally set the pointers to the correct offsets in the buffer
      for (int d=0; d < 4; d++) this->input_path[d] = (int*)((char*)buffer + d*num_paths*max_length*sizeof(int));
      length = (int*)((char*)buffer + 4*num_paths*max_length*sizeof(int));
      path_coeff = (double*)((char*)buffer + 4 * num_paths * max_length * sizeof(int) + num_paths*sizeof(int));
    }
  };

  template <typename Float_, int nColor_, QudaReconstructType recon_u, QudaReconstructType recon_m>
  struct GaugeForceArg {
    using Float = Float_;
    static constexpr int nColor = nColor_;
    static_assert(nColor == 3, "Only nColor=3 enabled at this time");
    typedef typename gauge_mapper<Float,recon_u>::type Gauge;
    typedef typename gauge_mapper<Float,recon_m>::type Mom;

    Mom mom;
    const Gauge u;

    int threads; // checkerboarded volume
    int X[4]; // the regular volume parameters
    int E[4]; // the extended volume parameters
    int border[4]; // radius of border

    Float epsilon; // stepsize and any other overall scaling factor
    const paths p;

    GaugeForceArg(GaugeField &mom, const GaugeField &u, double epsilon, const paths &p) :
      mom(mom),
      u(u),
      threads(mom.VolumeCB()),
      epsilon(epsilon),
      p(p)
    {
      for (int i=0; i<4; i++) {
        X[i] = mom.X()[i];
        E[i] = u.X()[i]; // u is extended, so its local volume includes the halo
        border[i] = (E[i] - X[i])/2;
      }
    }
  };

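  // path entries encode directions in 0..7: values 0-3 are forward hops in
  // that dimension, values 4-7 are backward hops in dimension 7-value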
  constexpr int flipDir(int dir) { return (7-dir); }
  constexpr bool isForwards(int dir) { return (dir <= 3); }

  // this ensures that array elements are held in the read-only cache on the device
  template <typename T> constexpr T cache(const T *ptr, int idx) {
#ifdef __CUDA_ARCH__
    return __ldg(ptr+idx);
#else
    return ptr[idx];
#endif
  }

  template <typename Arg, int dir>
  __device__ __host__ inline void GaugeForceKernel(Arg &arg, int idx, int parity)
  {
    using real = typename Arg::Float;
    typedef Matrix<complex<real>,Arg::nColor> Link;

    int x[4] = {0, 0, 0, 0};
    getCoords(x, idx, arg.X, parity);
    for (int dr=0; dr<4; ++dr) x[dr] += arg.border[dr]; // extended grid coordinates

    // linkA: running product of links along the current path
    // linkB: the link loaded in the current step
    Link linkA, linkB, staple;

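    // dx accumulates the offset of the current site from x as the path is
    // walked; on the device it is packed into one shared-memory int per
    // thread (four signed chars) to keep the dynamically indexed array out
    // of local memory. Each path closes its loop, so dx returns to zero
    // after every iteration below.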
#ifdef __CUDA_ARCH__
    extern __shared__ int s[];
    int tid = (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
    s[tid] = 0;
    signed char *dx = (signed char*)&s[tid];
#else
    int dx[4] = {0, 0, 0, 0};
#endif

    for (int i=0; i<arg.p.num_paths; i++) {
      real coeff = cache(arg.p.path_coeff, i);
      if (coeff == 0) continue; // skip paths with vanishing coefficient

      const int* path = arg.p.input_path[dir] + i*arg.p.max_length;

      // start from the end of the link in direction dir
      int nbr_oddbit = (parity^1);
      dx[dir]++;

      int path0 = cache(path,0);
      int lnkdir = isForwards(path0) ? path0 : flipDir(path0);

      if (isForwards(path0)) {
        linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
        linkA = linkB;
        dx[lnkdir]++; // now have to update location
        nbr_oddbit = nbr_oddbit^1;
      } else {
        dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
        nbr_oddbit = nbr_oddbit^1;
        linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
        linkA = conj(linkB); // a backward link enters as the adjoint of the forward link
      }

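      // walk the remaining links of the path; dx tracks the offset of the
      // current site from x, and nbr_oddbit its checkerboard parity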
      for (int j=1; j<cache(arg.p.length,i); j++) {

        int pathj = cache(path,j);
        int lnkdir = isForwards(pathj) ? pathj : flipDir(pathj);

        if (isForwards(pathj)) {
          linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
          linkA = linkA * linkB;
          dx[lnkdir]++; // now have to update to the new location
          nbr_oddbit = nbr_oddbit^1;
        } else {
          dx[lnkdir]--; // if we are going backwards the link is on the adjacent site
          nbr_oddbit = nbr_oddbit^1;
          linkB = arg.u(lnkdir, linkIndexShift(x,dx,arg.E), nbr_oddbit);
          linkA = linkA * conj(linkB);
        }
      } // loop over path links
      staple = staple + coeff*linkA;
    } // loop over paths

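    // staple now holds the coefficient-weighted sum of all path products
    // hanging off the end of the link in direction dir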
    linkA = arg.u(dir, linkIndex(x,arg.E), parity);
    linkA = linkA * staple; // close each loop with the link U_dir(x) itself

    // update the momentum: mom -= epsilon * U * staple
    Link mom = arg.mom(dir, idx, parity);
    mom = mom - arg.epsilon * linkA;
    makeAntiHerm(mom); // project back onto the traceless anti-Hermitian algebra
    arg.mom(dir, idx, parity) = mom;
  }

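  // grid mapping for the launch below: x indexes the checkerboard site,
  // y the parity, z the direction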
  template <typename Arg>
  __global__ void GaugeForceKernel(Arg arg) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= arg.threads) return;
    int parity = blockIdx.y * blockDim.y + threadIdx.y;
    int dir = blockIdx.z * blockDim.z + threadIdx.z;
    if (dir >= 4) return;

    // dispatch the runtime direction to a compile-time template argument
    switch(dir) {
    case 0: GaugeForceKernel<Arg,0>(arg, idx, parity); break;
    case 1: GaugeForceKernel<Arg,1>(arg, idx, parity); break;
    case 2: GaugeForceKernel<Arg,2>(arg, idx, parity); break;
    case 3: GaugeForceKernel<Arg,3>(arg, idx, parity); break;
    }
  }

  template <typename Float, int nColor, QudaReconstructType recon_u> class GaugeForce : public TunableVectorYZ {

    GaugeForceArg<Float, nColor, recon_u, QUDA_RECONSTRUCT_10> arg;
    const GaugeField &meta;

    unsigned int sharedBytesPerThread() const { return 4; } // one int per thread for the dynamic-indexing dx array
    unsigned int minThreads() const { return arg.threads; }
    bool tuneGridDim() const { return false; } // don't tune the grid dimension

  public:
    GaugeForce(const GaugeField &u, GaugeField &mom, double epsilon, const paths &p) :
      TunableVectorYZ(2,4), // vectorize over parity (y) and direction (z)
      arg(mom, u, epsilon, p),
      meta(u)
    {
      apply(0);
      qudaDeviceSynchronize();
    }

    void apply(const qudaStream_t &stream) {
      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
      qudaLaunchKernel(GaugeForceKernel<decltype(arg)>, tp, stream, arg);
    }

    void preTune() { arg.mom.save(); }  // save the momentum field, since the kernel accumulates onto it
    void postTune() { arg.mom.load(); } // restore it between tuning launches

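    // p.count is the total number of links over all paths: each path of
    // length L costs L-1 SU(3) matrix multiplies plus one final multiply by
    // U itself, at 198 flops each, over 2 parities and 4 directions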
    long long flops() const { return (arg.p.count - arg.p.num_paths + 1) * 198ll * 2 * arg.mom.volumeCB * 4; }
    long long bytes() const { return ((arg.p.count + 1ll) * arg.u.Bytes() + 2ll*arg.mom.Bytes()) * 2 * arg.mom.volumeCB * 4; }

    TuneKey tuneKey() const {
      std::stringstream aux;
      aux << meta.AuxString() << ",num_paths=" << arg.p.num_paths << comm_dim_partitioned_string();
      return TuneKey(meta.VolString(), typeid(*this).name(), aux.str().c_str());
    }
  };

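  // Compute the gauge force and accumulate it onto mom. input_path[dir][i]
  // gives the i-th path contributing to the force on links in direction dir,
  // length_h[i] its length, and path_coeff_h[i] its weight. Illustrative
  // example (not taken from this file): with the 0..7 encoding above, a
  // plaquette staple in the (mu,nu) plane starting from the end of a mu-link
  // would be { nu, 7-mu, 7-nu }.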
  void gaugeForce(GaugeField& mom, const GaugeField& u, double epsilon, int ***input_path,
                  int *length_h, double *path_coeff_h, int num_paths, int path_max_length)
  {
    checkPrecision(mom, u);
    checkLocation(mom, u);
    if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct());

    // stage the path data into a single device allocation
    size_t bytes = 4 * num_paths * path_max_length * sizeof(int) + num_paths*sizeof(int) + num_paths*sizeof(double);
    void *buffer = pool_device_malloc(bytes);
    paths p(buffer, bytes, input_path, length_h, path_coeff_h, num_paths, path_max_length);

#ifdef GPU_GAUGE_FORCE
    // the gauge field must be passed as the first argument so we peel off its reconstruct type
    instantiate<GaugeForce,ReconstructNo12>(u, mom, epsilon, p);
#else
    errorQuda("Gauge force has not been built");
#endif // GPU_GAUGE_FORCE

    pool_device_free(buffer);
  }

} // namespace quda