QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
pgauge_det_trace.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <comm_quda.h>
8 #include <pgauge_monte.h>
9 #include <atomic.cuh>
10 #include <cub_helper.cuh>
11 #include <index_helper.cuh>
12 
13 namespace quda {
14 
15 #ifdef GPU_GAUGE_ALG
16 
17 template <typename Gauge>
18 struct KernelArg : public ReduceArg<double2> {
19  int threads; // number of active threads required
20  int X[4]; // grid dimensions
21 #ifdef MULTI_GPU
22  int border[4];
23 #endif
24  Gauge dataOr;
25 
26  KernelArg(const Gauge &dataOr, const cudaGaugeField &data)
27  : ReduceArg<double2>(), dataOr(dataOr) {
28 #ifdef MULTI_GPU
29  for(int dir=0; dir<4; ++dir){
30  border[dir] = data.R()[dir];
31  X[dir] = data.X()[dir] - border[dir]*2;
32  }
33 #else
34  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
35 #endif
36  threads = X[0]*X[1]*X[2]*X[3]/2;
37  }
38  double2 getValue(){return result_h[0];}
39 };
40 
41 
42 
43 template<int blockSize, typename Float, typename Gauge, int NCOLORS, int functiontype>
44 __global__ void compute_Value(KernelArg<Gauge> arg){
45  int idx = threadIdx.x + blockIdx.x*blockDim.x;
46  int parity = threadIdx.y;
47 
48  complex<double> val(0.0, 0.0);
49  while (idx < arg.threads) {
50  int X[4];
51  #pragma unroll
52  for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
53 
54  int x[4];
55  getCoords(x, idx, X, parity);
56  #ifdef MULTI_GPU
57  #pragma unroll
58  for(int dr=0; dr<4; ++dr) {
59  x[dr] += arg.border[dr];
60  X[dr] += 2*arg.border[dr];
61  }
62  idx = linkIndex(x,X);
63  #endif
64 #pragma unroll
65  for (int mu = 0; mu < 4; mu++) {
66  Matrix<complex<Float>,NCOLORS> U = arg.dataOr(mu, idx, parity);
67  if(functiontype == 0) val += getDeterminant(U);
68  if(functiontype == 1) val += getTrace(U);
69  }
70 
71  idx += blockDim.x*gridDim.x;
72  }
73 
74  double2 sum = make_double2(val.real(), val.imag());
75  reduce2d<blockSize,2>(arg, sum);
76 }
77 
78 
79 
80 template<typename Float, typename Gauge, int NCOLORS, int functiontype>
81 class CalcFunc : TunableLocalParity {
82  KernelArg<Gauge> arg;
83  TuneParam tp;
84  mutable char aux_string[128]; // used as a label in the autotuner
85  private:
86  bool tuneGridDim() const { return true; }
87 
88  public:
89  CalcFunc(KernelArg<Gauge> &arg) : arg(arg) {}
90  ~CalcFunc () { }
91 
92  void apply(const cudaStream_t &stream){
93  tp = tuneLaunch(*this, getTuning(), getVerbosity());
94  arg.result_h[0] = make_double2(0.0, 0.0);
95  LAUNCH_KERNEL_LOCAL_PARITY(compute_Value, tp, stream, arg, Float, Gauge, NCOLORS, functiontype);
97 
98  comm_allreduce_array((double*)arg.result_h, 2);
99  arg.result_h[0].x /= (double)(4*2*arg.threads*comm_size());
100  arg.result_h[0].y /= (double)(4*2*arg.threads*comm_size());
101  }
102 
103  TuneKey tuneKey() const {
104  std::stringstream vol;
105  vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
106  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
107  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
108 
109  }
110 
111  long long flops() const {
112  if(NCOLORS==3 && functiontype == 0) return 264LL*2*arg.threads+2LL*tp.block.x;
113  else if(NCOLORS==3 && functiontype == 1) return 24LL*2*arg.threads+2LL*tp.block.x;
114  else return 0;
115  }// Only correct if there is no link reconstruction
116  long long bytes() const { return 4LL*NCOLORS * NCOLORS * sizeof(Float)*2*2*arg.threads + tp.block.x * sizeof(double2); }
117 
118 };
119 
120 
121 
122 
123 
124 template<typename Float, int NCOLORS, int functiontype, typename Gauge>
125 double2 computeValue( Gauge dataOr, cudaGaugeField& data) {
126  TimeProfile profileGenericFunc("GenericFunc", false);
127  if (getVerbosity() >= QUDA_SUMMARIZE) profileGenericFunc.TPSTART(QUDA_PROFILE_COMPUTE);
128  KernelArg<Gauge> arg(dataOr, data);
129  CalcFunc<Float, Gauge, NCOLORS, functiontype> func(arg);
130  func.apply(0);
131  if(getVerbosity() >= QUDA_SUMMARIZE && functiontype == 0) printfQuda("Determinant: %.16e, %.16e\n", arg.getValue().x, arg.getValue().y);
132  if(getVerbosity() >= QUDA_SUMMARIZE && functiontype == 1) printfQuda("Trace: %.16e, %.16e\n", arg.getValue().x, arg.getValue().y);
133  checkCudaError();
135  if (getVerbosity() >= QUDA_SUMMARIZE){
136  profileGenericFunc.TPSTOP(QUDA_PROFILE_COMPUTE);
137  double secs = profileGenericFunc.Last(QUDA_PROFILE_COMPUTE);
138  double gflops = (func.flops()*1e-9)/(secs);
139  double gbytes = func.bytes()/(secs*1e9);
140  if(functiontype == 0){
141  #ifdef MULTI_GPU
142  printfQuda("Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops*comm_size(), gbytes*comm_size());
143  #else
144  printfQuda("Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
145  #endif
146  }
147  if(functiontype == 1){
148  #ifdef MULTI_GPU
149  printfQuda("Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops*comm_size(), gbytes*comm_size());
150  #else
151  printfQuda("Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
152  #endif
153  }
154  }
155  return arg.getValue();
156 }
157 
158 
159 
160 template<typename Float, int functiontype>
161 double2 computeValue(cudaGaugeField& data) {
162 
163  double2 rtn = make_double2(0.0,0.0);
164 
165  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
166  if(data.isNative()) {
167  if(data.Reconstruct() == QUDA_RECONSTRUCT_NO) {
168  //printfQuda("QUDA_RECONSTRUCT_NO\n");
169  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
170  rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
171  } else if(data.Reconstruct() == QUDA_RECONSTRUCT_12){
172  //printfQuda("QUDA_RECONSTRUCT_12\n");
173  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
174  rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
175  } else if(data.Reconstruct() == QUDA_RECONSTRUCT_8){
176  //printfQuda("QUDA_RECONSTRUCT_8\n");
177  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
178  rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
179  } else {
180  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
181  }
182  } else {
183  errorQuda("Invalid Gauge Order\n");
184  }
185  return rtn;
186 }
187 #endif // GPU_GAUGE_ALG
188 
195  double2 det = make_double2(0.0,0.0);
196 #ifdef GPU_GAUGE_ALG
197  if (data.Precision() == QUDA_SINGLE_PRECISION) {
198  det = computeValue<float, 0> (data);
199  } else if(data.Precision() == QUDA_DOUBLE_PRECISION) {
200  det = computeValue<double, 0>(data);
201  } else {
202  errorQuda("Precision %d not supported", data.Precision());
203  }
204 #else
205  errorQuda("Pure gauge code has not been built");
206 #endif // GPU_GAUGE_ALG
207  return det;
208 }
209 
215 double2 getLinkTrace( cudaGaugeField& data) {
216  double2 det = make_double2(0.0,0.0);
217 #ifdef GPU_GAUGE_ALG
218  if (data.Precision() == QUDA_SINGLE_PRECISION) {
219  det = computeValue<float, 1> (data);
220  } else if(data.Precision() == QUDA_DOUBLE_PRECISION) {
221  det = computeValue<double, 1>(data);
222  } else {
223  errorQuda("Precision %d not supported", data.Precision());
224  }
225 #else
226  errorQuda("Pure gauge code has not been built");
227 #endif // GPU_GAUGE_ALG
228  return det;
229 }
230 
231 
232 } // namespace quda
double mu
Definition: test_util.cpp:1648
static __device__ __host__ int linkIndex(const int x[], const I X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
double2 getLinkDeterminant(cudaGaugeField &data)
Calculate the Determinant.
#define errorQuda(...)
Definition: util_quda.h:121
cudaStream_t * stream
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:272
__host__ __device__ void sum(double &a, double &b)
Definition: blas_helper.cuh:62
const int * R() const
int comm_size(void)
Definition: comm_mpi.cpp:88
#define qudaDeviceSynchronize()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:643
double2 getLinkTrace(cudaGaugeField &data)
Calculate the Trace.
Main header file for host and device accessors to GaugeFields.
int X[4]
Definition: covdev_test.cpp:70
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:415
#define printfQuda(...)
Definition: util_quda.h:115
unsigned long long flops
Definition: blas_quda.cu:22
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:250
#define checkCudaError()
Definition: util_quda.h:161
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:422
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:52
QudaPrecision Precision() const
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:54
unsigned long long bytes
Definition: blas_quda.cu:23
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
const int * X() const