// Argument bundle passed by value to the compute_Value kernel. It derives
// from ReduceArg<double2>; getValue() returns result_h[0].
// NOTE(review): this listing is an extract -- the member declarations, the
// constructor signature and the closing braces are not visible here.
17 template <
typename Gauge>
18 struct KernelArg :
public ReduceArg<double2> {
// First visible loop: per direction, store data.R() as the border width and
// set X to data.X() minus twice that border.
29 for(
int dir=0; dir<4; ++dir){
30 border[dir] = data.
R()[dir];
31 X[dir] = data.
X()[dir] - border[dir]*2;
// Second visible loop: X is copied straight from data.X().
// NOTE(review): presumably this is the alternative branch to the loop above;
// the selecting condition is not visible in this extract -- confirm.
34 for(
int dir=0; dir<4; ++dir) X[dir] = data.
X()[dir];
// Thread count is half of the product of the four X extents
// (presumably one thread per site of a single parity -- TODO confirm).
36 threads = X[0]*X[1]*X[2]*X[3]/2;
// Returns element 0 of result_h (presumably the host-visible reduction
// result provided by ReduceArg -- confirm).
38 double2 getValue(){
return result_h[0];}
// Reduction kernel: each thread accumulates a complex value `val` over the
// indices it visits, converts it to a double2 and hands it to
// reduce2d<blockSize,2>(arg, sum).
// Template parameter functiontype selects the accumulated quantity; only the
// functiontype == 1 path (getTrace) is visible in this extract.
// NOTE(review): the declarations of val, x, X and U, the coordinate
// computation, the link load and the closing braces are not shown here.
43 template<
int blockSize,
typename Float,
typename Gauge,
int NCOLORS,
int functiontype>
44 __global__
void compute_Value(KernelArg<Gauge>
arg){
// Flat global thread index from a 1-d block/grid layout.
45 int idx = threadIdx.x + blockIdx.x*blockDim.x;
// Loop until idx reaches arg.threads; idx advances by the total number of
// launched threads (blockDim.x*gridDim.x) at the bottom of the loop.
49 while (idx < arg.threads) {
// Local copy of the extents held in arg.X.
52 for(
int dr=0; dr<4; ++dr) X[dr] = arg.X[dr];
// Shift the coordinate by the border and widen each extent by twice the
// border (presumably to index into the border-padded field -- confirm).
58 for(
int dr=0; dr<4; ++dr) {
59 x[dr] += arg.border[dr];
60 X[dr] += 2*arg.border[dr];
// Visit the four directions mu = 0..3.
65 for (
int mu = 0;
mu < 4;
mu++) {
// functiontype 1: add the trace of U to the running value.
68 if(functiontype == 1) val +=
getTrace(U);
71 idx += blockDim.x*gridDim.x;
// Pack real and imaginary parts of val into a double2 for the reduction.
74 double2
sum = make_double2(val.real(), val.imag());
75 reduce2d<blockSize,2>(
arg,
sum);
// Host-side wrapper around the compute_Value launch (constructor name:
// CalcFunc). It holds a KernelArg<Gauge>, normalises the reduced result in
// apply(), and reports a tuning key plus flop/byte estimates.
// NOTE(review): the class header, member declarations, the kernel launch and
// several method signatures are missing from this extract.
80 template<
typename Float,
typename Gauge,
int NCOLORS,
int functiontype>
// Scratch buffer that the sprintf below fills with the aux part of the tune key.
84 mutable char aux_string[128];
// Always returns true.
86 bool tuneGridDim()
const {
return true; }
// Stores the supplied kernel argument in the member `arg`.
89 CalcFunc(KernelArg<Gauge> &arg) : arg(arg) {}
92 void apply(
const cudaStream_t &
stream){
// Zero the result slot (the launch that fills it is not visible here).
94 arg.result_h[0] = make_double2(0.0, 0.0);
// Normalise both components by 4 * 2 * threads * comm_size().
// NOTE(review): the product is formed before the cast to double; if
// arg.threads and comm_size() are int (threads is printed with %d below)
// this could overflow for large values -- confirm the operand types.
99 arg.result_h[0].x /= (double)(4*2*arg.threads*
comm_size());
100 arg.result_h[0].y /= (double)(4*2*arg.threads*
comm_size());
// Tune key: volume string "X0xX1xX2xX3", typeid name of this object, and an
// aux string with the thread count and sizeof(Float).
104 std::stringstream vol;
105 vol << arg.X[0] <<
"x" << arg.X[1] <<
"x" << arg.X[2] <<
"x" << arg.X[3];
// NOTE(review): sizeof(Float) is a size_t but is formatted with %lu; %zu is
// the matching specifier -- consider changing after checking the file's
// other sprintf calls.
106 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
107 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
// Flop estimate; only the NCOLORS == 3 cases are visible in this extract:
// functiontype 0 -> 264 per unit, functiontype 1 -> 24 per unit, each times
// 2*arg.threads, plus 2*tp.block.x.
111 long long flops()
const {
112 if(NCOLORS==3 && functiontype == 0)
return 264LL*2*arg.threads+2LL*tp.
block.x;
113 else if(NCOLORS==3 && functiontype == 1)
return 24LL*2*arg.threads+2LL*tp.
block.x;
// Byte estimate: 4 * NCOLORS^2 * sizeof(Float) * 2 * 2 * arg.threads, plus
// one double2 per tp.block.x.
116 long long bytes()
const {
return 4LL*NCOLORS * NCOLORS *
sizeof(Float)*2*2*arg.threads + tp.
block.x *
sizeof(double2); }
// Host driver: builds KernelArg<Gauge> from (dataOr, data), wraps it in a
// CalcFunc, prints timing/throughput figures and returns arg.getValue().
// NOTE(review): the function signature, the call that runs `func` and the
// code that produces `secs` are not visible in this extract.
124 template<
typename Float,
int NCOLORS,
int functiontype,
typename Gauge>
126 TimeProfile profileGenericFunc(
"GenericFunc",
false);
128 KernelArg<Gauge>
arg(dataOr, data);
129 CalcFunc<Float, Gauge, NCOLORS, functiontype> func(arg);
// Throughput derived from the functor's own flop/byte estimates and `secs`.
138 double gflops = (func.flops()*1e-9)/(secs);
139 double gbytes = func.bytes()/(secs*1e9);
// functiontype 0 is reported as "Determinant".
140 if(functiontype == 0){
144 printfQuda(
"Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
// functiontype 1 is reported as "Trace".
147 if(functiontype == 1){
151 printfQuda(
"Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
// Hand back the reduced value stored in the kernel argument.
155 return arg.getValue();
// Dispatcher overload: forwards to computeValue<Float, 3, functiontype> with a
// Gauge accessor constructed from `data`.
// NOTE(review): the three visible calls are textually identical; presumably
// each sits in a branch that defines a different `Gauge` type (the branch
// conditions and typedefs are not visible in this extract) -- confirm.
160 template<
typename Float,
int functiontype>
// Default result is (0, 0).
163 double2 rtn = make_double2(0.0,0.0);
170 rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
174 rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
178 rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
187 #endif // GPU_GAUGE_ALG 195 double2 det = make_double2(0.0,0.0);
// Determinant path: calls computeValue with functiontype 0 for float or
// double. NOTE(review): the conditions choosing between the two calls are
// not visible here (presumably based on the field's precision -- confirm).
198 det = computeValue<float, 0> (data);
200 det = computeValue<double, 0>(data);
// Reported when the pure gauge code is not compiled in (see the
// GPU_GAUGE_ALG guard closed below).
205 errorQuda(
"Pure gauge code has not been built");
206 #endif // GPU_GAUGE_ALG 216 double2 det = make_double2(0.0,0.0);
// Trace path: calls computeValue with functiontype 1 for float or double.
// NOTE(review): the conditions choosing between the two calls are not
// visible here (presumably based on the field's precision -- confirm).
219 det = computeValue<float, 1> (data);
221 det = computeValue<double, 1>(data);
// Reported when the pure gauge code is not compiled in (see the
// GPU_GAUGE_ALG guard closed below).
226 errorQuda(
"Pure gauge code has not been built");
227 #endif // GPU_GAUGE_ALG
static __device__ __host__ int linkIndex(const int x[], const I X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
double2 getLinkDeterminant(cudaGaugeField &data)
Calculate the Determinant.
void comm_allreduce_array(double *data, size_t size)
__host__ __device__ void sum(double &a, double &b)
#define qudaDeviceSynchronize()
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
double2 getLinkTrace(cudaGaugeField &data)
Calculate the Trace.
Main header file for host and device accessors to GaugeFields.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType Reconstruct() const
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.