QUDA  0.9.0
pgauge_det_trace.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <comm_quda.h>
8 #include <pgauge_monte.h>
9 #include <atomic.cuh>
10 #include <cub_helper.cuh>
11 #include <index_helper.cuh>
12 
13 namespace quda {
14 
15 #ifdef GPU_GAUGE_ALG
16 
17 template <typename Gauge>
18 struct KernelArg : public ReduceArg<double2> {
19  int threads; // number of active threads required
20  int X[4]; // grid dimensions
21 #ifdef MULTI_GPU
22  int border[4];
23 #endif
24  Gauge dataOr;
25 
26  KernelArg(const Gauge &dataOr, const cudaGaugeField &data)
27  : ReduceArg<double2>(), dataOr(dataOr) {
28 #ifdef MULTI_GPU
29  for(int dir=0; dir<4; ++dir){
30  border[dir] = data.R()[dir];
31  X[dir] = data.X()[dir] - border[dir]*2;
32  }
33 #else
34  for(int dir=0; dir<4; ++dir) X[dir] = data.X()[dir];
35 #endif
36  threads = X[0]*X[1]*X[2]*X[3]/2;
37  }
38  double2 getValue(){return result_h[0];}
39 };
40 
41 
42 
43 template<int blockSize, typename Float, typename Gauge, int NCOLORS, int functiontype>
44 __global__ void compute_Value(KernelArg<Gauge> arg){
45  int idx = threadIdx.x + blockIdx.x*blockDim.x;
46  int parity = threadIdx.y;
47 
48  complex<double> val(0.0, 0.0);
49  if(idx < arg.threads) {
50  int X[4];
51  #pragma unroll
52  for(int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
53 
54  int x[4];
55  getCoords(x, idx, X, parity);
56  #ifdef MULTI_GPU
57  #pragma unroll
58  for(int dr=0; dr<4; ++dr) {
59  x[dr] += arg.border[dr];
60  X[dr] += 2*arg.border[dr];
61  }
62  idx = linkIndex(x,X);
63  #endif
64 #pragma unroll
65  for (int mu = 0; mu < 4; mu++) {
66  Matrix<complex<Float>,NCOLORS> U;
67  arg.dataOr.load((Float*)(U.data), idx, mu, parity);
68  if(functiontype == 0) val += getDeterminant(U);
69  if(functiontype == 1) val += getTrace(U);
70  }
71  }
72 
73  double2 sum = make_double2(val.real(), val.imag());
74  reduce2d<blockSize,2>(arg, sum);
75 }
76 
77 
78 
79 template<typename Float, typename Gauge, int NCOLORS, int functiontype>
80 class CalcFunc : TunableLocalParity {
81  KernelArg<Gauge> arg;
82  TuneParam tp;
83  mutable char aux_string[128]; // used as a label in the autotuner
84  private:
85  unsigned int minThreads() const { return arg.threads; }
86 
87  public:
88  CalcFunc(KernelArg<Gauge> &arg) : arg(arg) {}
89  ~CalcFunc () { }
90 
91  void apply(const cudaStream_t &stream){
92  tp = tuneLaunch(*this, getTuning(), getVerbosity());
93  arg.result_h[0] = make_double2(0.0, 0.0);
94  LAUNCH_KERNEL_LOCAL_PARITY(compute_Value, tp, stream, arg, Float, Gauge, NCOLORS, functiontype);
96 
97  comm_allreduce_array((double*)arg.result_h, 2);
98  arg.result_h[0].x /= (double)(4*2*arg.threads*comm_size());
99  arg.result_h[0].y /= (double)(4*2*arg.threads*comm_size());
100  }
101 
102  TuneKey tuneKey() const {
103  std::stringstream vol;
104  vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
105  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
106  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
107 
108  }
109 
110  long long flops() const {
111  if(NCOLORS==3 && functiontype == 0) return 264LL*2*arg.threads+2LL*tp.block.x;
112  else if(NCOLORS==3 && functiontype == 1) return 24LL*2*arg.threads+2LL*tp.block.x;
113  else return 0;
114  }// Only correct if there is no link reconstruction
115  long long bytes() const { return 4LL*NCOLORS * NCOLORS * sizeof(Float)*2*2*arg.threads + tp.block.x * sizeof(double2); }
116 
117 };
118 
119 
120 
121 
122 
123 template<typename Float, int NCOLORS, int functiontype, typename Gauge>
124 double2 computeValue( Gauge dataOr, cudaGaugeField& data) {
125  TimeProfile profileGenericFunc("GenericFunc", false);
126  if (getVerbosity() >= QUDA_SUMMARIZE) profileGenericFunc.TPSTART(QUDA_PROFILE_COMPUTE);
127  KernelArg<Gauge> arg(dataOr, data);
128  CalcFunc<Float, Gauge, NCOLORS, functiontype> func(arg);
129  func.apply(0);
130  if(getVerbosity() >= QUDA_SUMMARIZE && functiontype == 0) printfQuda("Determinant: %.16e, %.16e\n", arg.getValue().x, arg.getValue().y);
131  if(getVerbosity() >= QUDA_SUMMARIZE && functiontype == 1) printfQuda("Trace: %.16e, %.16e\n", arg.getValue().x, arg.getValue().y);
132  checkCudaError();
134  if (getVerbosity() >= QUDA_SUMMARIZE){
135  profileGenericFunc.TPSTOP(QUDA_PROFILE_COMPUTE);
136  double secs = profileGenericFunc.Last(QUDA_PROFILE_COMPUTE);
137  double gflops = (func.flops()*1e-9)/(secs);
138  double gbytes = func.bytes()/(secs*1e9);
139  if(functiontype == 0){
140  #ifdef MULTI_GPU
141  printfQuda("Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops*comm_size(), gbytes*comm_size());
142  #else
143  printfQuda("Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
144  #endif
145  }
146  if(functiontype == 1){
147  #ifdef MULTI_GPU
148  printfQuda("Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops*comm_size(), gbytes*comm_size());
149  #else
150  printfQuda("Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
151  #endif
152  }
153  }
154  return arg.getValue();
155 }
156 
157 
158 
159 template<typename Float, int functiontype>
160 double2 computeValue(cudaGaugeField& data) {
161 
162  double2 rtn = make_double2(0.0,0.0);
163 
164  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
165  if(data.isNative()) {
166  if(data.Reconstruct() == QUDA_RECONSTRUCT_NO) {
167  //printfQuda("QUDA_RECONSTRUCT_NO\n");
168  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
169  rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
170  } else if(data.Reconstruct() == QUDA_RECONSTRUCT_12){
171  //printfQuda("QUDA_RECONSTRUCT_12\n");
172  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
173  rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
174  } else if(data.Reconstruct() == QUDA_RECONSTRUCT_8){
175  //printfQuda("QUDA_RECONSTRUCT_8\n");
176  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
177  rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
178  } else {
179  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
180  }
181  } else {
182  errorQuda("Invalid Gauge Order\n");
183  }
184  return rtn;
185 }
186 #endif // GPU_GAUGE_ALG
187 
194  double2 det = make_double2(0.0,0.0);
195 #ifdef GPU_GAUGE_ALG
196  if (data.Precision() == QUDA_SINGLE_PRECISION) {
197  det = computeValue<float, 0> (data);
198  } else if(data.Precision() == QUDA_DOUBLE_PRECISION) {
199  det = computeValue<double, 0>(data);
200  } else {
201  errorQuda("Precision %d not supported", data.Precision());
202  }
203 #else
204  errorQuda("Pure gauge code has not been built");
205 #endif // GPU_GAUGE_ALG
206  return det;
207 }
208 
214 double2 getLinkTrace( cudaGaugeField& data) {
215  double2 det = make_double2(0.0,0.0);
216 #ifdef GPU_GAUGE_ALG
217  if (data.Precision() == QUDA_SINGLE_PRECISION) {
218  det = computeValue<float, 1> (data);
219  } else if(data.Precision() == QUDA_DOUBLE_PRECISION) {
220  det = computeValue<double, 1>(data);
221  } else {
222  errorQuda("Precision %d not supported", data.Precision());
223  }
224 #else
225  errorQuda("Pure gauge code has not been built");
226 #endif // GPU_GAUGE_ALG
227  return det;
228 }
229 
230 
231 } // namespace quda
dim3 dim3 blockDim
double mu
Definition: test_util.cpp:1643
static __device__ __host__ int linkIndex(const int x[], const I X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
double2 getLinkDeterminant(cudaGaugeField &data)
Calculate the Determinant.
const void * func
#define errorQuda(...)
Definition: util_quda.h:90
cudaStream_t * stream
void comm_allreduce_array(double *data, size_t size)
Definition: comm_mpi.cpp:296
const int * R() const
int comm_size(void)
Definition: comm_mpi.cpp:126
__host__ __device__ void sum(double &a, double &b)
T data[N *N]
Definition: quda_matrix.h:74
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
double2 getLinkTrace(cudaGaugeField &data)
Calculate the Trace.
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
Definition: quda_matrix.h:305
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
#define printfQuda(...)
Definition: util_quda.h:84
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
#define checkCudaError()
Definition: util_quda.h:129
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
Definition: quda_matrix.h:312
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
bool isNative() const
static __inline__ enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode int val
QudaParity parity
Definition: covdev_test.cpp:53
unsigned long long bytes
Definition: blas_quda.cu:43
const int * X() const
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)