QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ks_force_quda.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <ks_force_quda.h>
7 
8 
9 namespace quda {
10 
11  template<typename Oprod, typename Gauge, typename Mom>
12  struct KSForceArg {
13  int threads;
14  int X[4]; // grid dimensions
15 #ifndef BUILD_TIFR_INTERFACE
16 #ifdef MULTI_GPU
17  int border[4];
18 #endif
19 #endif
20  Oprod oprod;
21  Gauge gauge;
22  Mom mom;
23 
24  KSForceArg(Oprod& oprod, Gauge &gauge, Mom& mom, int dim[4])
25  : oprod(oprod), gauge(gauge), mom(mom){
26 
27  threads = 1;
28  for(int dir=0; dir<4; ++dir) threads *= dim[dir];
29 
30  for(int dir=0; dir<4; ++dir) X[dir] = dim[dir];
31 #ifndef BUILD_TIFR_INTERFACE
32 #ifdef MULTI_GPU
33  for(int dir=0; dir<4; ++dir) border[dir] = 2;
34 #endif
35 #endif
36  }
37 
38  };
39 
40  __device__ __host__ inline int linkIndex(int x[], int dx[], const int X[4]) {
41  int y[4];
42  for (int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
43  int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
44  return idx;
45  }
46 
47 
48  __device__ __host__ inline void getCoords(int x[4], int cb_index, const int X[4], int parity)
49  {
50  x[3] = cb_index/(X[2]*X[1]*X[0]/2);
51  x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
52  x[1] = (cb_index/(X[0]/2)) % X[1];
53  x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+parity)&1);
54 
55  return;
56  }
57 
58  template<typename Float, typename Oprod, typename Gauge, typename Mom>
59  __host__ __device__ void completeKSForceCore(KSForceArg<Oprod,Gauge,Mom>& arg, int idx){
60 
61  int parity = 0;
62  if(idx >= arg.threads/2){
63  parity = 1;
64  idx -= arg.threads/2;
65  }
66 
67  int X[4];
68  for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
69 
70  int x[4];
71  getCoords(x, idx, X, parity);
72 #ifndef BUILD_TIFR_INTERFACE
73 #ifdef MULTI_GPU
74  for(int dir=0; dir<4; ++dir){
75  x[dir] += arg.border[dir];
76  X[dir] += 2*arg.border[dir];
77  }
78 #endif
79 #endif
80 
81  typedef typename ComplexTypeId<Float>::Type Cmplx;
82 
86 
87 
88  int dx[4] = {0,0,0,0};
89  for(int dir=0; dir<4; ++dir){
90  arg.gauge.load((Float*)(G.data), linkIndex(x,dx,X), dir, parity);
91  arg.oprod.load((Float*)(O.data), linkIndex(x,dx,X), dir, parity);
92  if(parity==0){
93  M = G*O;
94  }else{
95  M = -G*O;
96  }
97 
98  Float sub = getTrace(M).y/(static_cast<Float>(3));
99  Float temp[10];
100 
101 
102  temp[0] = (M.data[1].x - M.data[3].x)*0.5;
103  temp[1] = (M.data[1].y + M.data[3].y)*0.5;
104 
105  temp[2] = (M.data[2].x - M.data[6].x)*0.5;
106  temp[3] = (M.data[2].y + M.data[6].y)*0.5;
107 
108  temp[4] = (M.data[5].x - M.data[7].x)*0.5;
109  temp[5] = (M.data[5].y + M.data[7].y)*0.5;
110 
111  temp[6] = (M.data[0].y-sub);
112  temp[7] = (M.data[4].y-sub);
113  temp[8] = (M.data[8].y-sub);
114  temp[9] = 0.0;
115 
116  arg.mom.save(temp, idx, dir, parity);
117  }
118  }
119 
120  template<typename Float, typename Oprod, typename Gauge, typename Mom>
122  {
123  int idx = threadIdx.x + blockIdx.x*blockDim.x;
124 
125  if(idx >= arg.threads) return;
126  completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
127  }
128 
129 
130 
131 
132  template<typename Float, typename Oprod, typename Gauge, typename Mom>
134  {
135  for(int idx=0; idx<arg.threads; idx++){
136  completeKSForceCore<Float,Oprod,Gauge,Mom>(arg,idx);
137  }
138  }
139 
140 
141 
142  template<typename Float, typename Oprod, typename Gauge, typename Mom>
144 
146  const GaugeField &meta;
147  const QudaFieldLocation location;
148 
149  private:
150  unsigned int sharedBytesPerThread() const { return 0; }
151  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
152 
153  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
154  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
155  unsigned int minThreads() const { return arg.threads; }
156 
157  public:
159  : arg(arg), meta(meta), location(location) {
160  writeAuxString("prec=%lu,stride=%d",sizeof(Float),arg.mom.stride);
161  }
162 
163  virtual ~KSForceComplete() {}
164 
165  void apply(const cudaStream_t &stream) {
166  if(location == QUDA_CUDA_FIELD_LOCATION){
167 #if (__COMPUTE_CAPABILITY__ >= 200)
168  // Fix this
169  dim3 blockDim(128, 1, 1);
170  dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
171  completeKSForceKernel<Float><<<gridDim,blockDim>>>(arg);
172 #else
173  errorQuda("completeKSForce not supported on pre-Fermi architecture");
174 #endif
175  }else{
176  completeKSForceCPU<Float>(arg);
177  }
178  }
179 
180  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
181 
182  std::string paramString(const TuneParam &param) const { // Don't print the grid dim.
183  std::stringstream ps;
184  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
185  ps << "shared=" << param.shared_bytes;
186  return ps.str();
187  }
188 
189 
190  long long flops() const { return 792*arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3]; }
191  long long bytes() const { return 0; } // Fix this
192  };
193 
194  template<typename Float, typename Oprod, typename Gauge, typename Mom>
195  void completeKSForce(Oprod oprod, Gauge gauge, Mom mom, int dim[4], const GaugeField &meta, QudaFieldLocation location, long long *flops)
196  {
197  KSForceArg<Oprod,Gauge,Mom> arg(oprod, gauge, mom, dim);
198  KSForceComplete<Float,Oprod,Gauge,Mom> completeForce(arg,meta,location);
199  completeForce.apply(0);
200  if(flops) *flops = completeForce.flops();
201  cudaDeviceSynchronize();
202  }
203 
204 
205  template<typename Float>
206  void completeKSForce(GaugeField& mom, const GaugeField& oprod, const GaugeField& gauge, QudaFieldLocation location, long long *flops)
207  {
208 
209  if(location != QUDA_CUDA_FIELD_LOCATION){
210  errorQuda("Only QUDA_CUDA_FIELD_LOCATION currently supported");
211  }else{
212  if((oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) || (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) || (mom.Reconstruct() != QUDA_RECONSTRUCT_10)){
213  errorQuda("Reconstruct type not supported");
214  }else{
215  completeKSForce<Float>(FloatNOrder<Float, 18, 2, 18>(oprod),
218  const_cast<int*>(mom.X()),
219  gauge, location, flops);
220  }
221  }
222  return;
223  }
224 
225 
226  void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops)
227  {
228  if(mom.Precision() == QUDA_HALF_PRECISION){
229  errorQuda("Half precision not supported");
230  }
231 
232  if(mom.Precision() == QUDA_SINGLE_PRECISION){
233  completeKSForce<float>(mom, oprod, gauge, location, flops);
234  }else if(mom.Precision() == QUDA_DOUBLE_PRECISION){
235  completeKSForce<double>(mom, oprod, gauge, location, flops);
236  }else{
237  errorQuda("Precision %d not supported", mom.Precision());
238  }
239  return;
240  }
241 
242 
243 
244 
245  template<typename Result, typename Oprod, typename Gauge>
246  struct KSLongLinkArg {
247  int threads;
248  int X[4]; // grid dimensions
249 #ifdef MULTI_GPU
250  int border[4];
251 #endif
252  double coeff;
253  Result res;
254  Oprod oprod;
255  Gauge gauge;
256 
257  KSLongLinkArg(Result& res, Oprod& oprod, Gauge &gauge, int dim[4])
258  : coeff(1.0), res(res), oprod(oprod), gauge(gauge){
259 
260  threads = 1;
261 #ifdef MULTI_GPU
262  for(int dir=0; dir<4; ++dir) threads *= (dim[dir]-2);
263  for(int dir=0; dir<4; ++dir) X[dir] = dim[dir]-2;
264  for(int dir=0; dir<4; ++dir) border[dir] = 2;
265 #else
266  for(int dir=0; dir<4; ++dir) threads *= dim[dir];
267  for(int dir=0; dir<4; ++dir) X[dir] = dim[dir];
268 #endif
269  }
270 
271  };
272 
273 
274 
275  template<typename Float, typename Result, typename Oprod, typename Gauge>
277 
278  /*
279  int parity = 0;
280  if(idx >= arg.threads/2){
281  parity = 1;
282  idx -= arg.threads/2;
283  }
284 
285  int X[4];
286  for(int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
287 
288  int x[4];
289  getCoords(x, idx, X, parity);
290 #ifndef BUILD_TIFR_INTERFACE
291 #ifdef MULTI_GPU
292 for(int dir=0; dir<4; ++dir){
293 x[dir] += arg.border[dir];
294 X[dir] += 2*arg.border[dir];
295 }
296 #endif
297 #endif
298 
299 typedef typename ComplexTypeId<Float>::Type Cmplx;
300 
301 Matrix<Cmplx,3> O;
302 Matrix<Cmplx,3> G;
303 Matrix<Cmplx,3> M;
304 
305 
306 int dx[4] = {0,0,0,0};
307 for(int dir=0; dir<4; ++dir){
308 arg.gauge.load((Float*)(G.data), linkIndex(x,dx,X), dir, parity);
309 arg.oprod.load((Float*)(O.data), linkIndex(x,dx,X), dir, parity);
310 if(parity==0){
311 M = G*O;
312 }else{
313 M = -G*O;
314 }
315 
316 Float sub = getTrace(M).y/(static_cast<Float>(3));
317 Float temp[10];
318 
319 
320 temp[0] = (M.data[1].x - M.data[3].x)*0.5;
321 temp[1] = (M.data[1].y + M.data[3].y)*0.5;
322 
323 temp[2] = (M.data[2].x - M.data[6].x)*0.5;
324 temp[3] = (M.data[2].y + M.data[6].y)*0.5;
325 
326 temp[4] = (M.data[5].x - M.data[7].x)*0.5;
327 temp[5] = (M.data[5].y + M.data[7].y)*0.5;
328 
329 temp[6] = (M.data[0].y-sub);
330 temp[7] = (M.data[4].y-sub);
331 temp[8] = (M.data[8].y-sub);
332 temp[9] = 0.0;
333 
334 arg.mom.save(temp, idx, dir, parity);
335 }
336  */
337  }
338 
339  template<typename Float, typename Result, typename Oprod, typename Gauge>
341 {
342  int idx = threadIdx.x + blockIdx.x*blockDim.x;
343 
344  if(idx >= arg.threads) return;
345  computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(arg,idx);
346 }
347 
348 
349 
350 
351  template<typename Float, typename Result, typename Oprod, typename Gauge>
353 {
354  for(int idx=0; idx<arg.threads; idx++){
355  computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(arg,idx);
356  }
357 }
358 
359 
360 
361 // should be tunable
362 template<typename Float, typename Result, typename Oprod, typename Gauge>
364 
365 
367  const GaugeField &meta;
368  const QudaFieldLocation location;
369 
370  private:
371  unsigned int sharedBytesPerThread() const { return 0; }
372  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
373 
374  bool tuneSharedBytes() const { return false; } // Don't tune the shared memory.
375  bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
376  unsigned int minThreads() const { return arg.threads; }
377 
378  public:
380  : arg(arg), meta(meta), location(location) {
381  writeAuxString("prec=%lu,stride=%d",sizeof(Float),arg.res.stride);
382  }
383 
384  virtual ~KSLongLinkForce() {}
385 
386  void apply(const cudaStream_t &stream) {
387  if(location == QUDA_CUDA_FIELD_LOCATION){
388 #if (__COMPUTE_CAPABILITY__ >= 200)
389  // Fix this
390  dim3 blockDim(128, 1, 1);
391  dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
392  computeKSLongLinkForceKernel<Float><<<gridDim,blockDim>>>(arg);
393 #else
394  errorQuda("computeKSLongLinkForce not supported on pre-Fermi architecture");
395 #endif
396  }else{
397  computeKSLongLinkForceCPU<Float>(arg);
398  }
399  }
400 
401  TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
402 
403  std::string paramString(const TuneParam &param) const { // Don't print the grid dim.
404  std::stringstream ps;
405  ps << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
406  ps << "shared=" << param.shared_bytes;
407  return ps.str();
408  }
409 
410 
411  void preTune(){}
412  void postTune(){}
413  long long flops() const { return 0; } // Fix this
414  long long bytes() const { return 0; } // Fix this
415 };
416 
417 
418 
419 
420 template<typename Float, typename Result, typename Oprod, typename Gauge>
421 void computeKSLongLinkForce(Result res, Oprod oprod, Gauge gauge, int dim[4], const GaugeField &meta, QudaFieldLocation location)
422 {
423  KSLongLinkArg<Result,Oprod,Gauge> arg(res, oprod, gauge, dim);
424  KSLongLinkForce<Float,Result,Oprod,Gauge> computeLongLink(arg,meta,location);
425  computeLongLink.apply(0);
426  cudaDeviceSynchronize();
427 }
428 
429  template<typename Float>
430 void computeKSLongLinkForce(GaugeField& result, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location)
431 {
432  if(location != QUDA_CUDA_FIELD_LOCATION){
433  errorQuda("Only QUDA_CUDA_FIELD_LOCATION currently supported");
434  }else{
435  if((oprod.Reconstruct() != QUDA_RECONSTRUCT_NO) || (gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) ||
436  (result.Reconstruct() != QUDA_RECONSTRUCT_10)){
437 
438  errorQuda("Reconstruct type not supported");
439  }else{
440  computeKSLongLinkForce<Float>(FloatNOrder<Float, 18, 2, 18>(result),
443  const_cast<int*>(result.X()),
444  gauge, location);
445  }
446  }
447  return;
448 }
449 
450 
451 void computeKSLongLinkForce(GaugeField &result, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location)
452 {
453  if(result.Precision() == QUDA_HALF_PRECISION){
454  errorQuda("Half precision not supported");
455  }
456 
457  if(result.Precision() == QUDA_SINGLE_PRECISION){
458  computeKSLongLinkForce<float>(result, oprod, gauge, location);
459  }else if(result.Precision() == QUDA_DOUBLE_PRECISION){
460  computeKSLongLinkForce<double>(result, oprod, gauge, location);
461  }
462  errorQuda("Precision %d not supported", result.Precision());
463  return;
464 }
465 
466 } // namespace quda
KSForceArg(Oprod &oprod, Gauge &gauge, Mom &mom, int dim[4])
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
__global__ void completeKSForceKernel(KSForceArg< Oprod, Gauge, Mom > arg)
TuneKey tuneKey() const
int y[4]
__global__ void computeKSLongLinkForceKernel(KSLongLinkArg< Result, Oprod, Gauge > arg)
KSForceComplete(KSForceArg< Oprod, Gauge, Mom > &arg, const GaugeField &meta, QudaFieldLocation location)
#define errorQuda(...)
Definition: util_quda.h:73
const int * X() const
void completeKSForceCPU(KSForceArg< Oprod, Gauge, Mom > &arg)
cudaStream_t * stream
::std::string string
Definition: gtest.h:1979
long long flops() const
void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops=NULL)
void apply(const cudaStream_t &stream)
QudaGaugeParam param
Definition: pack_test.cpp:17
void writeAuxString(const char *format,...)
Definition: tune_quda.h:138
const QudaFieldLocation location
Definition: pack_test.cpp:46
FloatingPoint< float > Float
Definition: gtest.h:7350
T data[N *N]
Definition: quda_matrix.h:351
__host__ __device__ void completeKSForceCore(KSForceArg< Oprod, Gauge, Mom > &arg, int idx)
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:168
std::string paramString(const TuneParam &param) const
const char * VolString() const
int x[4]
int dx[4]
long long bytes() const
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:378
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ void computeKSLongLinkForceCore(KSLongLinkArg< Result, Oprod, Gauge > &arg, int idx)
void computeKSLongLinkForce(Result res, Oprod oprod, Gauge gauge, int dim[4], const GaugeField &meta, QudaFieldLocation location)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:843
void computeKSLongLinkForceCPU(KSLongLinkArg< Result, Oprod, Gauge > &arg)
const QudaParity parity
Definition: dslash_test.cpp:29
void * gauge[4]
Definition: su3_test.cpp:15
char aux[TuneKey::aux_n]
Definition: tune_quda.h:136
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)