17 template <
typename Gauge>
18 struct KernelArg :
public ReduceArg<double2> {
29 for(
int dir=0; dir<4; ++dir){
30 border[dir] = data.
R()[dir];
31 X[dir] = data.
X()[dir] - border[dir]*2;
34 for(
int dir=0; dir<4; ++dir)
X[dir] = data.
X()[dir];
36 threads =
X[0]*
X[1]*
X[2]*
X[3]/2;
38 double2 getValue(){
return result_h[0];}
43 template<
int blockSize,
typename Float,
typename Gauge,
int NCOLORS,
int functiontype>
44 __global__
void compute_Value(KernelArg<Gauge>
arg){
52 for(
int dr=0; dr<4; ++dr)
X[dr] =
arg.X[dr];
58 for(
int dr=0; dr<4; ++dr) {
59 x[dr] +=
arg.border[dr];
60 X[dr] += 2*
arg.border[dr];
65 for (
int mu = 0;
mu < 4;
mu++) {
73 double2
sum = make_double2(
val.real(),
val.imag());
74 reduce2d<blockSize,2>(
arg,
sum);
79 template<
typename Float,
typename Gauge,
int NCOLORS,
int functiontype>
83 mutable char aux_string[128];
85 unsigned int minThreads()
const {
return arg.threads; }
88 CalcFunc(KernelArg<Gauge> &
arg) :
arg(
arg) {}
91 void apply(
const cudaStream_t &
stream){
93 arg.result_h[0] = make_double2(0.0, 0.0);
103 std::stringstream vol;
104 vol <<
arg.X[0] <<
"x" <<
arg.X[1] <<
"x" <<
arg.X[2] <<
"x" <<
arg.X[3];
105 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
106 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
110 long long flops()
const {
111 if(NCOLORS==3 && functiontype == 0)
return 264LL*2*
arg.threads+2LL*tp.
block.x;
112 else if(NCOLORS==3 && functiontype == 1)
return 24LL*2*
arg.threads+2LL*tp.
block.x;
115 long long bytes()
const {
return 4LL*NCOLORS * NCOLORS *
sizeof(Float)*2*2*
arg.threads + tp.
block.x *
sizeof(double2); }
123 template<
typename Float,
int NCOLORS,
int functiontype,
typename Gauge>
125 TimeProfile profileGenericFunc(
"GenericFunc",
false);
127 KernelArg<Gauge>
arg(dataOr, data);
128 CalcFunc<Float, Gauge, NCOLORS, functiontype>
func(
arg);
137 double gflops = (
func.flops()*1e-9)/(secs);
138 double gbytes =
func.bytes()/(secs*1e9);
139 if(functiontype == 0){
143 printfQuda(
"Determinant: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
146 if(functiontype == 1){
150 printfQuda(
"Trace: Time = %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
154 return arg.getValue();
159 template<
typename Float,
int functiontype>
162 double2 rtn = make_double2(0.0,0.0);
169 rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
173 rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
177 rtn = computeValue<Float, 3, functiontype>(Gauge(data), data);
186 #endif // GPU_GAUGE_ALG
194 double2 det = make_double2(0.0,0.0);
197 det = computeValue<float, 0> (data);
199 det = computeValue<double, 0>(data);
204 errorQuda(
"Pure gauge code has not been built");
205 #endif // GPU_GAUGE_ALG
215 double2 det = make_double2(0.0,0.0);
218 det = computeValue<float, 1> (data);
220 det = computeValue<double, 1>(data);
225 errorQuda(
"Pure gauge code has not been built");
226 #endif // GPU_GAUGE_ALG
static __device__ __host__ int linkIndex(const int x[], const I X[4])
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
double2 getLinkDeterminant(cudaGaugeField &data)
Calculate the Determinant.
void comm_allreduce_array(double *data, size_t size)
__host__ __device__ void sum(double &a, double &b)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
double2 getLinkTrace(cudaGaugeField &data)
Calculate the Trace.
Main header file for host and device accessors to GaugeFields.
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaReconstructType Reconstruct() const
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
static __inline__ enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode enum cudaRoundMode mode int val
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)