// QUDA 0.9.0 — ks_force_quda.cu
// (This file was recovered from a generated documentation page; extraction
//  artifacts have been repaired and are flagged with NOTE(review) comments.)
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <ks_force_quda.h>
7 #include <index_helper.cuh>
8 
9 namespace quda {
10 
11  using namespace gauge;
12 
13  template<typename Oprod, typename Gauge, typename Mom>
14  struct KSForceArg {
15  int threads;
16  int X[4]; // grid dimensions
17 #ifndef BUILD_TIFR_INTERFACE
18 #ifdef MULTI_GPU
19  int border[4];
20 #endif
21 #endif
22  Oprod oprod;
23  Gauge gauge;
24  Mom mom;
25 
26  KSForceArg(Oprod& oprod, Gauge &gauge, Mom& mom, int dim[4])
27  : oprod(oprod), gauge(gauge), mom(mom){
28 
29  threads = 1;
30  for(int dir=0; dir<4; ++dir) threads *= dim[dir];
31 
32  for(int dir=0; dir<4; ++dir) X[dir] = dim[dir];
33 #ifndef BUILD_TIFR_INTERFACE
34 #ifdef MULTI_GPU
35  for(int dir=0; dir<4; ++dir) border[dir] = 2;
36 #endif
37 #endif
38  }
39 
40  };
41 
42 
43  template<typename Float, typename Oprod, typename Gauge, typename Mom>
44  __host__ __device__ void completeKSForceCore(KSForceArg<Oprod,Gauge,Mom>& arg, int idx){
45 
46  int parity = 0;
47  if(idx >= arg.threads/2){
48  parity = 1;
49  idx -= arg.threads/2;
50  }
51 
52  int X[4];
53  for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
54 
55  int x[4];
56  getCoords(x, idx, X, parity);
57 #ifndef BUILD_TIFR_INTERFACE
58 #ifdef MULTI_GPU
59  for(int dir=0; dir<4; ++dir){
60  x[dir] += arg.border[dir];
61  X[dir] += 2*arg.border[dir];
62  }
63 #endif
64 #endif
65 
69 
70 
71  int dx[4] = {0,0,0,0};
72  for(int dir=0; dir<4; ++dir){
73  arg.gauge.load((Float*)(G.data), linkIndexShift(x,dx,X), dir, parity);
74  arg.oprod.load((Float*)(O.data), linkIndexShift(x,dx,X), dir, parity);
75  if(parity==0){
76  M = G*O;
77  }else{
78  M = -G*O;
79  }
80 
81  Float sub = getTrace(M).y/(static_cast<Float>(3));
82  Float temp[10];
83 
84 
85  temp[0] = (M.data[1].x - M.data[3].x)*0.5;
86  temp[1] = (M.data[1].y + M.data[3].y)*0.5;
87 
88  temp[2] = (M.data[2].x - M.data[6].x)*0.5;
89  temp[3] = (M.data[2].y + M.data[6].y)*0.5;
90 
91  temp[4] = (M.data[5].x - M.data[7].x)*0.5;
92  temp[5] = (M.data[5].y + M.data[7].y)*0.5;
93 
94  temp[6] = (M.data[0].y-sub);
95  temp[7] = (M.data[4].y-sub);
96  temp[8] = (M.data[8].y-sub);
97  temp[9] = 0.0;
98 
99  arg.mom.save(temp, idx, dir, parity);
100  }
101  }
102 
103  template<typename Float, typename Oprod, typename Gauge, typename Mom>
105  {
106  int idx = threadIdx.x + blockIdx.x*blockDim.x;
107 
108  if(idx >= arg.threads) return;
109  completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
110  }
111 
112 
113 
114 
115  template<typename Float, typename Oprod, typename Gauge, typename Mom>
117  {
118  for(int idx=0; idx<arg.threads; idx++){
119  completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
120  }
121  }
122 
123 
124 
125  template<typename Float, typename Oprod, typename Gauge, typename Mom>
127 
129  const GaugeField &meta;
131 
132  private:
133  unsigned int sharedBytesPerThread() const { return 0; }
134  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
135 
136  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
137  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
138  unsigned int minThreads() const { return arg.threads; }
139 
140  public:
142  : arg(arg), meta(meta), location(location) {
143  writeAuxString("prec=%lu,stride=%d",sizeof(Float),arg.mom.stride);
144  }
145 
146  virtual ~KSForceComplete() {}
147 
148  void apply(const cudaStream_t &stream) {
149  if(location == QUDA_CUDA_FIELD_LOCATION){
150  // Fix this
151  dim3 blockDim(128, 1, 1);
152  dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
153  completeKSForceKernel<Float><<<gridDim,blockDim>>>(arg);
154  }else{
155  completeKSForceCPU<Float>(arg);
156  }
157  }
158 
159  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
160 
161  long long flops() const { return 792*arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3]; }
162  long long bytes() const { return 0; } // Fix this
163  };
164 
165  template<typename Float, typename Oprod, typename Gauge, typename Mom>
166  void completeKSForce(Oprod oprod, Gauge gauge, Mom mom, int dim[4], const GaugeField &meta, QudaFieldLocation location, long long *flops)
167  {
168  KSForceArg<Oprod,Gauge,Mom> arg(oprod, gauge, mom, dim);
169  KSForceComplete<Float,Oprod,Gauge,Mom> completeForce(arg,meta,location);
170  completeForce.apply(0);
171  if(flops) *flops = completeForce.flops();
173  }
174 
175 
176  template<typename Float>
177  void completeKSForce(GaugeField& mom, const GaugeField& oprod, const GaugeField& gauge, QudaFieldLocation location, long long *flops)
178  {
179 
180  if(location != QUDA_CUDA_FIELD_LOCATION){
181  errorQuda("Only QUDA_CUDA_FIELD_LOCATION currently supported");
182  }else{
183  if((oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) || (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) || (mom.Reconstruct() != QUDA_RECONSTRUCT_10)){
184  errorQuda("Reconstruct type not supported");
185  }else{
186  completeKSForce<Float>(FloatNOrder<Float, 18, 2, 18>(oprod),
189  const_cast<int*>(mom.X()),
190  gauge, location, flops);
191  }
192  }
193  return;
194  }
195 
196 
197  void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops)
198  {
199  if(mom.Precision() == QUDA_HALF_PRECISION){
200  errorQuda("Half precision not supported");
201  }
202 
203  if(mom.Precision() == QUDA_SINGLE_PRECISION){
204  completeKSForce<float>(mom, oprod, gauge, location, flops);
205  }else if(mom.Precision() == QUDA_DOUBLE_PRECISION){
206  completeKSForce<double>(mom, oprod, gauge, location, flops);
207  }else{
208  errorQuda("Precision %d not supported", mom.Precision());
209  }
210  return;
211  }
212 
213 
214 
215 
216  template<typename Result, typename Oprod, typename Gauge>
217  struct KSLongLinkArg {
218  int threads;
219  int X[4]; // grid dimensions
220 #ifdef MULTI_GPU
221  int border[4];
222 #endif
223  double coeff;
224  Result res;
225  Oprod oprod;
226  Gauge gauge;
227 
228  KSLongLinkArg(Result& res, Oprod& oprod, Gauge &gauge, int dim[4])
229  : coeff(1.0), res(res), oprod(oprod), gauge(gauge){
230 
231  threads = 1;
232 #ifdef MULTI_GPU
233  for(int dir=0; dir<4; ++dir) threads *= (dim[dir]-2);
234  for(int dir=0; dir<4; ++dir) X[dir] = dim[dir]-2;
235  for(int dir=0; dir<4; ++dir) border[dir] = 2;
236 #else
237  for(int dir=0; dir<4; ++dir) threads *= dim[dir];
238  for(int dir=0; dir<4; ++dir) X[dir] = dim[dir];
239 #endif
240  }
241 
242  };
243 
244 
245 
246  template<typename Float, typename Result, typename Oprod, typename Gauge>
248 
249  /*
250  int parity = 0;
251  if(idx >= arg.threads/2){
252  parity = 1;
253  idx -= arg.threads/2;
254  }
255 
256  int X[4];
257  for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
258 
259  int x[4];
260  getCoords(x, idx, X, parity);
261 #ifndef BUILD_TIFR_INTERFACE
262 #ifdef MULTI_GPU
263 for(int dir=0; dir<4; ++dir){
264 x[dir] += arg.border[dir];
265 X[dir] += 2*arg.border[dir];
266 }
267 #endif
268 #endif
269 
270 typedef complex<Float> Cmplx;
271 
272 Matrix<Cmplx,3> O;
273 Matrix<Cmplx,3> G;
274 Matrix<Cmplx,3> M;
275 
276 
277 int dx[4] = {0,0,0,0};
278 for(int dir=0; dir<4; ++dir){
279 arg.gauge.load((Float*)(G.data), linkIndexShift(x,dx,X), dir, parity);
280 arg.oprod.load((Float*)(O.data), linkIndexShift(x,dx,X), dir, parity);
281 if(parity==0){
282 M = G*O;
283 }else{
284 M = -G*O;
285 }
286 
287 Float sub = getTrace(M).y/(static_cast<Float>(3));
288 Float temp[10];
289 
290 
291 temp[0] = (M.data[1].x - M.data[3].x)*0.5;
292 temp[1] = (M.data[1].y + M.data[3].y)*0.5;
293 
294 temp[2] = (M.data[2].x - M.data[6].x)*0.5;
295 temp[3] = (M.data[2].y + M.data[6].y)*0.5;
296 
297 temp[4] = (M.data[5].x - M.data[7].x)*0.5;
298 temp[5] = (M.data[5].y + M.data[7].y)*0.5;
299 
300 temp[6] = (M.data[0].y-sub);
301 temp[7] = (M.data[4].y-sub);
302 temp[8] = (M.data[8].y-sub);
303 temp[9] = 0.0;
304 
305 arg.mom.save(temp, idx, dir, parity);
306 }
307  */
308  }
309 
310  template<typename Float, typename Result, typename Oprod, typename Gauge>
312 {
313  int idx = threadIdx.x + blockIdx.x*blockDim.x;
314 
315  if(idx >= arg.threads) return;
316  computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(arg,idx);
317 }
318 
319 
320 
321 
322  template<typename Float, typename Result, typename Oprod, typename Gauge>
324 {
325  for(int idx=0; idx<arg.threads; idx++){
326  computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(arg,idx);
327  }
328 }
329 
330 
331 
// should be tunable
// Tunable wrapper that dispatches the long-link force to GPU or CPU.
// NOTE(review): the class-declaration line, the arg/location members, and the
// constructor signature were lost in the extraction; reconstructed in parallel
// with KSForceComplete above (the aux/writeAuxString/TuneParam usage implies a
// Tunable base class) — confirm against the original source.
template<typename Float, typename Result, typename Oprod, typename Gauge>
class KSLongLinkForce : public Tunable {

  KSLongLinkArg<Result,Oprod,Gauge> arg;
  const GaugeField &meta;
  const QudaFieldLocation location;

  private:
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
  unsigned int minThreads() const { return arg.threads; }

  public:
  KSLongLinkForce(KSLongLinkArg<Result,Oprod,Gauge> &arg, const GaugeField &meta, QudaFieldLocation location)
    : arg(arg), meta(meta), location(location) {
    writeAuxString("prec=%lu,stride=%d",sizeof(Float),arg.res.stride);
  }

  virtual ~KSLongLinkForce() {}

  void apply(const cudaStream_t &stream) {
    if (location == QUDA_CUDA_FIELD_LOCATION) {
      // Fix this: launch configuration is hard-coded rather than tuned
      dim3 blockDim(128, 1, 1);
      dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
      computeKSLongLinkForceKernel<Float><<<gridDim,blockDim>>>(arg);
    } else {
      computeKSLongLinkForceCPU<Float>(arg);
    }
  }

  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }

  long long flops() const { return 0; } // Fix this
  long long bytes() const { return 0; } // Fix this
};
373 
374 
375 
376 
// Accessor-level driver for the long-link force: builds the argument struct
// and runs the (currently stubbed) computation.
template<typename Float, typename Result, typename Oprod, typename Gauge>
void computeKSLongLinkForce(Result res, Oprod oprod, Gauge gauge, int dim[4], const GaugeField &meta, QudaFieldLocation location)
{
  KSLongLinkArg<Result,Oprod,Gauge> arg(res, oprod, gauge, dim);
  KSLongLinkForce<Float,Result,Oprod,Gauge> computeLongLink(arg,meta,location);
  computeLongLink.apply(0);
  // NOTE(review): one line was lost here in the extraction; restored as a
  // device synchronize to mirror the completeKSForce driver above — confirm
  // against the original source.
  qudaDeviceSynchronize();
}
385 
// Field-level driver for the long-link force: validates location and
// reconstruct types, then wraps the fields in native-order accessors.
template<typename Float>
void computeKSLongLinkForce(GaugeField& result, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location)
{
  if (location != QUDA_CUDA_FIELD_LOCATION) {
    errorQuda("Only QUDA_CUDA_FIELD_LOCATION currently supported");
  } else {
    if ((oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) || (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) ||
        (result.Reconstruct() != QUDA_RECONSTRUCT_10)) {

      errorQuda("Reconstruct type not supported");
    } else {
      // NOTE(review): the two middle accessor arguments were lost in the
      // extraction; reconstructed to match the (Result, Oprod, Gauge)
      // parameter order, with oprod/gauge as RECONSTRUCT_NO accessors per the
      // checks above — confirm against the original source.
      computeKSLongLinkForce<Float>(FloatNOrder<Float, 18, 2, 18>(result),
                                    FloatNOrder<Float, 18, 2, 18>(oprod),
                                    FloatNOrder<Float, 18, 2, 18>(gauge),
                                    const_cast<int*>(result.X()),
                                    gauge, location);
    }
  }
  return;
}
406 
407 
// Public entry point: dispatch on the result field's precision.
void computeKSLongLinkForce(GaugeField &result, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location)
{
  if (result.Precision() == QUDA_HALF_PRECISION) {
    errorQuda("Half precision not supported");
  }

  if (result.Precision() == QUDA_SINGLE_PRECISION) {
    computeKSLongLinkForce<float>(result, oprod, gauge, location);
  } else if (result.Precision() == QUDA_DOUBLE_PRECISION) {
    computeKSLongLinkForce<double>(result, oprod, gauge, location);
  } else {
    // BUG FIX: this errorQuda was previously unconditional (outside the
    // if/else chain), so it fired even after a supported precision had been
    // dispatched successfully. Compare the correctly-guarded else in the
    // completeKSForce dispatch above.
    errorQuda("Precision %d not supported", result.Precision());
  }
  return;
}
422 
423 } // namespace quda
// (End of ks_force_quda.cu. The text that followed here was hover-tooltip
//  index residue from the generated Doxygen HTML page — stray signatures and
//  "Definition:" cross-references — not part of the source file, and has been
//  removed so the translation unit stays well-formed.)