14 #ifdef GPU_CLOVER_DIRAC 16 template <
typename Float,
typename Clover>
17 struct CloverInvertArg :
public ReduceArg<double2> {
31 template <
typename Float,
typename Arg,
bool computeTrLog,
bool twist>
32 __device__ __host__
inline double cloverInvertCompute(Arg &
arg,
int x_cb,
int parity) {
35 constexpr
int nSpin = 4;
36 constexpr
int N =
nColor*nSpin/2;
40 for (
int ch=0; ch<2; ch++) {
42 A *=
static_cast<Float
>(2.0);
53 if (computeTrLog)
for (
int j=0; j<N; j++) trlogA += 2.0*
log(cholesky.D(j));
55 Mat Ainv =
static_cast<Float
>(0.5)*cholesky.invert();
62 template <
typename Float,
typename Arg,
bool computeTrLog,
bool twist>
65 for (
int x=0;
x<
arg.clover.volumeCB;
x++) {
67 double trlogA = cloverInvertCompute<Float,Arg,computeTrLog,twist>(
arg,
x,
parity);
70 else arg.result_h[0].x += trlogA;
76 template <
int blockSize,
typename Float,
typename Arg,
bool computeTrLog,
bool twist>
77 __global__
void cloverInvertKernel(Arg
arg) {
80 double trlogA_parity = 0.0;
81 if (
idx <
arg.clover.volumeCB)
82 trlogA_parity = cloverInvertCompute<Float,Arg,computeTrLog,twist>(
arg,
idx,
parity);
83 double2 trlogA =
parity ? make_double2(0.0,trlogA_parity) : make_double2(trlogA_parity, 0.0);
85 if (computeTrLog) reduce2d<blockSize,2>(
arg, trlogA);
88 template <
typename Float,
typename Arg>
95 bool tuneSharedBytes()
const {
return false; }
96 unsigned int minThreads()
const {
return arg.clover.volumeCB; }
100 :
arg(
arg), meta(meta), location(location) {
101 writeAuxString(
"stride=%d,prec=%lu,trlog=%s,twist=%s",
arg.clover.stride,
sizeof(Float),
102 arg.computeTraceLog ?
"true" :
"false",
arg.twist ?
"true" :
"false");
104 virtual ~CloverInvert() { ; }
106 void apply(
const cudaStream_t &
stream) {
108 arg.result_h[0] = make_double2(0.,0.);
110 if (
arg.computeTraceLog) {
124 if (
arg.computeTraceLog) {
126 cloverInvert<Float, Arg, true, true>(
arg);
128 cloverInvert<Float, Arg, true, false>(
arg);
132 cloverInvert<Float, Arg, false, true>(
arg);
134 cloverInvert<Float, Arg, false, false>(
arg);
144 long long flops()
const {
return 0; }
145 long long bytes()
const {
return 2*
arg.clover.volumeCB*(
arg.inverse.Bytes() +
arg.clover.Bytes()); }
147 void preTune() {
if (
arg.clover.clover ==
arg.inverse.clover)
arg.inverse.save(); }
148 void postTune() {
if (
arg.clover.clover ==
arg.inverse.clover)
arg.inverse.load(); }
152 template <
typename Float,
typename Clover>
155 CloverInvertArg<Float,Clover>
arg(inverse,
clover, computeTraceLog);
156 CloverInvert<Float,CloverInvertArg<Float,Clover>> invert(
arg, meta, location);
159 if (
arg.computeTraceLog) {
162 trlog[0] =
arg.result_h[0].x;
163 trlog[1] =
arg.result_h[0].y;
167 template <
typename Float>
172 cloverInvert<Float>(C(
clover, 1), C(
clover, 0), computeTraceLog,
185 #ifdef GPU_CLOVER_DIRAC 187 errorQuda(
"Half precision not supported for order %d",
clover.Order());
190 cloverInvert<double>(
clover, computeTraceLog, location);
192 cloverInvert<float>(
clover, computeTraceLog, location);
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
QudaVerbosity getVerbosity()
Compute Cholesky decomposition of A. By default, we use a modified Cholesky which avoids the division...
void comm_allreduce_array(double *data, size_t size)
const char * VolString() const
void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
Main header file for host and device accessors to CloverFields.
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
This function compute the Cholesky decomposition of each clover matrix and stores the clover inverse ...
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Specialized container for Hermitian matrices (e.g., used for wrapping clover matrices) ...
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__host__ __device__ ValueType log(ValueType x)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...