QUDA 1.0.0
ks_force_quda.cu
#include <quda_internal.h>
#include <quda_matrix.h>
#include <tune_quda.h>
#include <gauge_field.h>
#include <gauge_field_order.h>
#include <ks_force_quda.h>
#include <index_helper.cuh>

namespace quda {

  using namespace gauge;

  template<typename Oprod, typename Gauge, typename Mom>
  struct KSForceArg {
    int threads;
    int X[4]; // grid dimensions
#ifndef BUILD_TIFR_INTERFACE
#ifdef MULTI_GPU
    int border[4];
#endif
#endif
    Oprod oprod;
    Gauge gauge;
    Mom mom;

    KSForceArg(Oprod& oprod, Gauge &gauge, Mom& mom, int dim[4])
      : oprod(oprod), gauge(gauge), mom(mom){

      threads = 1;
      for(int dir=0; dir<4; ++dir) threads *= dim[dir];

      for(int dir=0; dir<4; ++dir) X[dir] = dim[dir];
#ifndef BUILD_TIFR_INTERFACE
#ifdef MULTI_GPU
      for(int dir=0; dir<4; ++dir) border[dir] = 2;
#endif
#endif
    }

  };

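  // For each site and direction, multiply the gauge link by the outer-product
  // field, flip the sign on odd-parity sites, make the result anti-Hermitian
  // (makeAntiHerm), and store it in the momentum field.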
  template<typename Float, typename Oprod, typename Gauge, typename Mom>
  __host__ __device__ void completeKSForceCore(KSForceArg<Oprod,Gauge,Mom>& arg, int idx){

    int parity = 0;
    if(idx >= arg.threads/2){
      parity = 1;
      idx -= arg.threads/2;
    }

    int X[4];
    for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];

    int x[4];
    getCoords(x, idx, X, parity);
#ifndef BUILD_TIFR_INTERFACE
#ifdef MULTI_GPU
    for(int dir=0; dir<4; ++dir){
      x[dir] += arg.border[dir];
      X[dir] += 2*arg.border[dir];
    }
#endif
#endif

    Matrix<complex<Float>,3> O, G, M;

    int dx[4] = {0,0,0,0};
    for(int dir=0; dir<4; ++dir){
      G = arg.gauge(dir, linkIndexShift(x,dx,X), parity);
      O = arg.oprod(dir, linkIndexShift(x,dx,X), parity);
      if(parity==0){
        M = G*O;
      }else{
        M = -G*O;
      }

      makeAntiHerm(M);

      arg.mom(dir, idx, parity) = M;
    }
  }

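  // GPU and CPU drivers: one GPU thread (or one CPU loop iteration) per
  // lattice site, with both parities folded into the linear index.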
  template<typename Float, typename Oprod, typename Gauge, typename Mom>
  __global__ void completeKSForceKernel(KSForceArg<Oprod,Gauge,Mom> arg)
  {
    int idx = threadIdx.x + blockIdx.x*blockDim.x;

    if(idx >= arg.threads) return;
    completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
  }

  template<typename Float, typename Oprod, typename Gauge, typename Mom>
  void completeKSForceCPU(KSForceArg<Oprod,Gauge,Mom>& arg)
  {
    for(int idx=0; idx<arg.threads; idx++){
      completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
    }
  }

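  // Tunable wrapper that dispatches to completeKSForceKernel on the device or
  // completeKSForceCPU on the host.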
  template<typename Float, typename Oprod, typename Gauge, typename Mom>
  class KSForceComplete : Tunable {

    KSForceArg<Oprod,Gauge,Mom> arg;
    const GaugeField &meta;
    const QudaFieldLocation location;

    private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.threads; }

    public:
    KSForceComplete(KSForceArg<Oprod,Gauge,Mom> &arg, const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("prec=%lu,stride=%d",sizeof(Float),arg.mom.stride);
    }

    virtual ~KSForceComplete() {}

    void apply(const cudaStream_t &stream) {
      if(location == QUDA_CUDA_FIELD_LOCATION){
        // Fix this: launch configuration is hard coded rather than autotuned
        dim3 blockDim(128, 1, 1);
        dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
        completeKSForceKernel<Float><<<gridDim,blockDim>>>(arg);
      }else{
        completeKSForceCPU<Float>(arg);
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 792*arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3]; }
    long long bytes() const { return 0; } // Fix this: bytes moved are not yet accounted for
  };

  template<typename Float, typename Oprod, typename Gauge, typename Mom>
  void completeKSForce(Oprod oprod, Gauge gauge, Mom mom, int dim[4], const GaugeField &meta, QudaFieldLocation location, long long *flops)
  {
    KSForceArg<Oprod,Gauge,Mom> arg(oprod, gauge, mom, dim);
    KSForceComplete<Float,Oprod,Gauge,Mom> completeForce(arg,meta,location);
    completeForce.apply(0);
    if(flops) *flops = completeForce.flops();
    qudaDeviceSynchronize();
  }

  template<typename Float>
  void completeKSForce(GaugeField& mom, const GaugeField& oprod, const GaugeField& gauge, QudaFieldLocation location, long long *flops)
  {
    if(location != QUDA_CUDA_FIELD_LOCATION){
      errorQuda("Only QUDA_CUDA_FIELD_LOCATION currently supported");
    }else{
      if((oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) || (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) || (mom.Reconstruct() != QUDA_RECONSTRUCT_10)){
        errorQuda("Reconstruct type not supported");
      }else{
        completeKSForce<Float>(FloatNOrder<Float, 18, 2, 18>(oprod),
            FloatNOrder<Float, 18, 2, 18>(gauge),
            FloatNOrder<Float, 10, 2, 10>(mom),
            const_cast<int*>(mom.X()),
            gauge, location, flops);
      }
    }
    return;
  }

  void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops)
  {
    if(mom.Precision() == QUDA_HALF_PRECISION){
      errorQuda("Half precision not supported");
    }

    if(mom.Precision() == QUDA_SINGLE_PRECISION){
      completeKSForce<float>(mom, oprod, gauge, location, flops);
    }else if(mom.Precision() == QUDA_DOUBLE_PRECISION){
      completeKSForce<double>(mom, oprod, gauge, location, flops);
    }else{
      errorQuda("Precision %d not supported", mom.Precision());
    }
    return;
  }

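  // Long-link contribution to the staggered fermion force: argument struct,
  // per-site worker, kernels, and Tunable launch wrapper follow.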
  template<typename Result, typename Oprod, typename Gauge>
  struct KSLongLinkArg {
    int threads;
    int X[4]; // grid dimensions
#ifdef MULTI_GPU
    int border[4];
#endif
    double coeff;
    Result res;
    Oprod oprod;
    Gauge gauge;

    KSLongLinkArg(Result& res, Oprod& oprod, Gauge &gauge, int dim[4])
      : coeff(1.0), res(res), oprod(oprod), gauge(gauge){

      threads = 1;
#ifdef MULTI_GPU
      for(int dir=0; dir<4; ++dir) threads *= (dim[dir]-2);
      for(int dir=0; dir<4; ++dir) X[dir] = dim[dir]-2;
      for(int dir=0; dir<4; ++dir) border[dir] = 2;
#else
      for(int dir=0; dir<4; ++dir) threads *= dim[dir];
      for(int dir=0; dir<4; ++dir) X[dir] = dim[dir];
#endif
    }

  };

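  // Per-site worker for the long-link force. The entire body is commented out
  // in this version of the file, so this path currently computes nothing.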
  template<typename Float, typename Result, typename Oprod, typename Gauge>
  __host__ __device__ void computeKSLongLinkForceCore(KSLongLinkArg<Result,Oprod,Gauge>& arg, int idx){

    /*
    int parity = 0;
    if(idx >= arg.threads/2){
      parity = 1;
      idx -= arg.threads/2;
    }

    int X[4];
    for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];

    int x[4];
    getCoords(x, idx, X, parity);
#ifndef BUILD_TIFR_INTERFACE
#ifdef MULTI_GPU
    for(int dir=0; dir<4; ++dir){
      x[dir] += arg.border[dir];
      X[dir] += 2*arg.border[dir];
    }
#endif
#endif

    typedef complex<Float> Cmplx;

    Matrix<Cmplx,3> O;
    Matrix<Cmplx,3> G;
    Matrix<Cmplx,3> M;

    int dx[4] = {0,0,0,0};
    for(int dir=0; dir<4; ++dir){
      arg.gauge.load((Float*)(G.data), linkIndexShift(x,dx,X), dir, parity);
      arg.oprod.load((Float*)(O.data), linkIndexShift(x,dx,X), dir, parity);
      if(parity==0){
        M = G*O;
      }else{
        M = -G*O;
      }

      Float sub = getTrace(M).y/(static_cast<Float>(3));
      Float temp[10];

      temp[0] = (M.data[1].x - M.data[3].x)*0.5;
      temp[1] = (M.data[1].y + M.data[3].y)*0.5;

      temp[2] = (M.data[2].x - M.data[6].x)*0.5;
      temp[3] = (M.data[2].y + M.data[6].y)*0.5;

      temp[4] = (M.data[5].x - M.data[7].x)*0.5;
      temp[5] = (M.data[5].y + M.data[7].y)*0.5;

      temp[6] = (M.data[0].y-sub);
      temp[7] = (M.data[4].y-sub);
      temp[8] = (M.data[8].y-sub);
      temp[9] = 0.0;

      arg.mom.save(temp, idx, dir, parity);
    }
    */
  }

  template<typename Float, typename Result, typename Oprod, typename Gauge>
  __global__ void computeKSLongLinkForceKernel(KSLongLinkArg<Result,Oprod,Gauge> arg)
  {
    int idx = threadIdx.x + blockIdx.x*blockDim.x;

    if(idx >= arg.threads) return;
    computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(arg,idx);
  }

  template<typename Float, typename Result, typename Oprod, typename Gauge>
  void computeKSLongLinkForceCPU(KSLongLinkArg<Result,Oprod,Gauge>& arg)
  {
    for(int idx=0; idx<arg.threads; idx++){
      computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(arg,idx);
    }
  }

  // should be tunable
  template<typename Float, typename Result, typename Oprod, typename Gauge>
  class KSLongLinkForce : Tunable {

    KSLongLinkArg<Result,Oprod,Gauge> arg;
    const GaugeField &meta;
    const QudaFieldLocation location;

    private:
    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

    bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
    bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
    unsigned int minThreads() const { return arg.threads; }

    public:
    KSLongLinkForce(KSLongLinkArg<Result,Oprod,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
      : arg(arg), meta(meta), location(location) {
      writeAuxString("prec=%lu,stride=%d",sizeof(Float),arg.res.stride);
    }

    virtual ~KSLongLinkForce() {}

    void apply(const cudaStream_t &stream) {
      if(location == QUDA_CUDA_FIELD_LOCATION){
        // Fix this: launch configuration is hard coded rather than autotuned
        dim3 blockDim(128, 1, 1);
        dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
        computeKSLongLinkForceKernel<Float><<<gridDim,blockDim>>>(arg);
      }else{
        computeKSLongLinkForceCPU<Float>(arg);
      }
    }

    TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

    long long flops() const { return 0; } // Fix this
    long long bytes() const { return 0; } // Fix this
  };

  template<typename Float, typename Result, typename Oprod, typename Gauge>
  void computeKSLongLinkForce(Result res, Oprod oprod, Gauge gauge, int dim[4], const GaugeField &meta, QudaFieldLocation location)
  {
    KSLongLinkArg<Result,Oprod,Gauge> arg(res, oprod, gauge, dim);
    KSLongLinkForce<Float,Result,Oprod,Gauge> computeLongLink(arg,meta,location);
    computeLongLink.apply(0);
    qudaDeviceSynchronize();
  }

  template<typename Float>
  void computeKSLongLinkForce(GaugeField& result, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location)
  {
    if(location != QUDA_CUDA_FIELD_LOCATION){
      errorQuda("Only QUDA_CUDA_FIELD_LOCATION currently supported");
    }else{
      if((oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) || (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) ||
         (result.Reconstruct() != QUDA_RECONSTRUCT_10)){

        errorQuda("Reconstruct type not supported");
      }else{
        computeKSLongLinkForce<Float>(FloatNOrder<Float, 18, 2, 18>(result),
            FloatNOrder<Float, 18, 2, 18>(oprod),
            FloatNOrder<Float, 18, 2, 18>(gauge),
            const_cast<int*>(result.X()),
            gauge, location);
      }
    }
    return;
  }

  void computeKSLongLinkForce(GaugeField &result, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location)
  {
    if(result.Precision() == QUDA_HALF_PRECISION){
      errorQuda("Half precision not supported");
    }

    if(result.Precision() == QUDA_SINGLE_PRECISION){
      computeKSLongLinkForce<float>(result, oprod, gauge, location);
    }else if(result.Precision() == QUDA_DOUBLE_PRECISION){
      computeKSLongLinkForce<double>(result, oprod, gauge, location);
    }else{
      errorQuda("Precision %d not supported", result.Precision());
    }
    return;
  }

} // namespace quda
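
For reference, a minimal host-side usage sketch of the public completeKSForce entry point defined above. This is an illustration, not part of the file: the wrapper name applyCompleteKSForce is hypothetical, and it assumes the caller already owns device-resident gauge fields that satisfy the reconstruct and precision checks in the dispatcher.

#include <ks_force_quda.h>
#include <gauge_field.h>
#include <util_quda.h>

// Hypothetical caller-side helper: mom must use QUDA_RECONSTRUCT_10, oprod and
// gauge must use QUDA_RECONSTRUCT_NO, and all fields must be single or double
// precision (half precision is rejected by the dispatcher above).
void applyCompleteKSForce(quda::GaugeField &mom, const quda::GaugeField &oprod,
                          const quda::GaugeField &gauge)
{
  long long flops = 0;
  quda::completeKSForce(mom, oprod, gauge, QUDA_CUDA_FIELD_LOCATION, &flops);
  printfQuda("completeKSForce performed %lld flops\n", flops);
}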