5 #include <launch_kernel.cuh>
10 #ifdef GPU_CLOVER_DIRAC
12 template <
typename Clover>
13 struct CloverInvertArg {
17 double *
const trlogA_h;
22 CloverInvertArg(Clover &inverse,
const Clover &clover,
bool computeTraceLog=0,
double*
const trlogA=0) :
23 inverse(inverse), clover(clover), computeTraceLog(computeTraceLog), trlogA_h(trlogA), twist(clover.Twisted()), mu2(clover.Mu2()){
24 cudaHostGetDevicePointer(&trlogA_d, trlogA_h, 0);
28 static __inline__ __device__
double atomicAdd(
double *addr,
double val)
30 double old=*addr, assumed;
34 old = __longlong_as_double( atomicCAS((
unsigned long long int*)addr,
35 __double_as_longlong(assumed),
36 __double_as_longlong(val+assumed)));
37 }
while( __double_as_longlong(assumed)!=__double_as_longlong(old) );
46 template <
int blockSize,
typename Float,
typename Clover>
47 __device__ __host__
double cloverInvertCompute(CloverInvertArg<Clover>
arg,
int x,
int parity) {
53 arg.clover.load(A, x, parity);
55 for (
int ch=0; ch<2; ch++) {
64 for (
int i=0; i<6; i++) diag[i] = 2.0*A[ch*36+i];
66 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
67 for (
int i=0; i<15; i++) tri[idtab[i]] = complex<Float>(2.0*A[ch*36+6+2*i], 2.0*A[ch*36+6+2*i+1]);
78 aux[ 0] = tri[0]*diag[0]+diag[1]*tri[0]+
conj(tri[2])*tri[1]+
conj(tri[4])*tri[3]+
conj(tri[7])*tri[6]+
conj(tri[11])*tri[10];
80 aux[ 1] = tri[1]*diag[0]+diag[2]*tri[1]+tri[2]*tri[0]+
conj(tri[5])*tri[3]+
conj(tri[8])*tri[6]+
conj(tri[12])*tri[10];
82 aux[ 2] = tri[2]*diag[1]+diag[2]*tri[2]+tri[1]*
conj(tri[0])+
conj(tri[5])*tri[4]+
conj(tri[8])*tri[7]+
conj(tri[12])*tri[11];
84 aux[ 3] = tri[3]*diag[0]+diag[3]*tri[3]+tri[4]*tri[0]+tri[5]*tri[1]+
conj(tri[9])*tri[6]+
conj(tri[13])*tri[10];
86 aux[ 4] = tri[4]*diag[1]+diag[3]*tri[4]+tri[3]*
conj(tri[0])+tri[5]*tri[2]+
conj(tri[9])*tri[7]+
conj(tri[13])*tri[11];
88 aux[ 5] = tri[5]*diag[2]+diag[3]*tri[5]+tri[3]*
conj(tri[1])+tri[4]*
conj(tri[2])+
conj(tri[9])*tri[8]+
conj(tri[13])*tri[12];
90 aux[ 6] = tri[6]*diag[0]+diag[4]*tri[6]+tri[7]*tri[0]+tri[8]*tri[1]+tri[9]*tri[3]+
conj(tri[14])*tri[10];
92 aux[ 7] = tri[7]*diag[1]+diag[4]*tri[7]+tri[6]*
conj(tri[0])+tri[8]*tri[2]+tri[9]*tri[4]+
conj(tri[14])*tri[11];
94 aux[ 8] = tri[8]*diag[2]+diag[4]*tri[8]+tri[6]*
conj(tri[1])+tri[7]*
conj(tri[2])+tri[9]*tri[5]+
conj(tri[14])*tri[12];
96 aux[ 9] = tri[9]*diag[3]+diag[4]*tri[9]+tri[6]*
conj(tri[3])+tri[7]*
conj(tri[4])+tri[8]*
conj(tri[5])+
conj(tri[14])*tri[13];
98 aux[10] = tri[10]*diag[0]+diag[5]*tri[10]+tri[11]*tri[0]+tri[12]*tri[1]+tri[13]*tri[3]+tri[14]*tri[6];
100 aux[11] = tri[11]*diag[1]+diag[5]*tri[11]+tri[10]*
conj(tri[0])+tri[12]*tri[2]+tri[13]*tri[4]+tri[14]*tri[7];
102 aux[12] = tri[12]*diag[2]+diag[5]*tri[12]+tri[10]*
conj(tri[1])+tri[11]*
conj(tri[2])+tri[13]*tri[5]+tri[14]*tri[8];
104 aux[13] = tri[13]*diag[3]+diag[5]*tri[13]+tri[10]*
conj(tri[3])+tri[11]*
conj(tri[4])+tri[12]*
conj(tri[5])+tri[14]*tri[9];
106 aux[14] = tri[14]*diag[4]+diag[5]*tri[14]+tri[10]*
conj(tri[6])+tri[11]*
conj(tri[7])+tri[12]*
conj(tri[8])+tri[13]*
conj(tri[9]);
118 for(
int i = 0; i < 15; i++) tri[i] = aux[i];
121 for (
int j=0; j<6; j++) {
122 diag[j] =
sqrt(diag[j]);
123 tmp[j] = 1.0 / diag[j];
125 for (
int k=j+1; k<6; k++) {
126 int kj = k*(k-1)/2+j;
130 for(
int k=j+1;k<6;k++){
132 diag[k] -= (tri[kj] *
conj(tri[kj])).real();
133 for(
int l=k+1;l<6;l++){
136 tri[lk] -= tri[lj] *
conj(tri[kj]);
142 for (
int j=0;j<6;j++) trlogA += (
double)2.0*
log((
double)(diag[j]));
146 for (
int k=0;k<6;k++) {
147 for(
int l=0;l<k;l++) v1[l] = complex<Float>(0.0, 0.0);
151 for(
int l=k+1;l<6;l++){
153 for(
int j=k;j<l;j++){
155 sum -= tri[lj] * v1[j];
157 v1[l] = sum * tmp[l];
161 v1[5] = v1[5] * tmp[5];
162 for(
int l=4;l>=k;l--){
164 for(
int j=l+1;j<6;j++){
166 sum -=
conj(tri[jl]) * v1[j];
168 v1[l] = sum * tmp[l];
172 diag[k] = v1[k].real();
173 for(
int l=k+1;l<6;l++){
179 for (
int i=0; i<6; i++) A[ch*36+i] = 0.5 * diag[i];
180 for (
int i=0; i<15; i++) {
181 A[ch*36+6+2*i] = 0.5*tri[idtab[i]].real(); A[ch*36+6+2*i+1] = 0.5*tri[idtab[i]].imag();
186 arg.inverse.save(A, x, parity);
191 template <
int blockSize,
typename Float,
typename Clover>
193 for (
int parity=0; parity<2; parity++) {
194 for (
int x=0; x<arg.clover.volumeCB; x++) {
196 double trlogA = cloverInvertCompute<blockSize, Float>(
arg,
x,
parity);
197 if (arg.computeTraceLog) arg.trlogA_h[
parity] += trlogA;
202 template <
int blockSize,
typename Float,
typename Clover>
203 __global__
void cloverInvertKernel(CloverInvertArg<Clover> arg) {
204 int idx = blockIdx.x*blockDim.x + threadIdx.x;
206 int parity = blockIdx.y;
208 if (idx < arg.clover.volumeCB) trlogA = cloverInvertCompute<blockSize, Float>(
arg,
idx,
parity);
210 if (arg.computeTraceLog) {
211 typedef cub::BlockReduce<double, blockSize> BlockReduce;
212 __shared__
typename BlockReduce::TempStorage temp_storage;
213 double aggregate = BlockReduce(temp_storage).Sum(trlogA);
214 if (threadIdx.x == 0) atomicAdd(arg.trlogA_d+parity, aggregate);
219 template <
typename Float,
typename Clover>
220 class CloverInvert : Tunable {
221 CloverInvertArg<Clover>
arg;
222 const CloverField &meta;
226 unsigned int sharedBytesPerThread()
const {
return 0; }
227 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0 ;}
229 bool tuneSharedBytes()
const {
return false; }
230 bool tuneGridDim()
const {
return false; }
231 unsigned int minThreads()
const {
return arg.clover.volumeCB; }
235 : arg(arg), meta(meta), location(location) {
236 writeAuxString(
"stride=%d,prec=%lu",arg.clover.stride,
sizeof(
Float));
238 virtual ~CloverInvert() { ; }
240 void apply(
const cudaStream_t &
stream) {
242 arg.trlogA_h[0] = 0.0; arg.trlogA_h[1] = 0.0;
245 LAUNCH_KERNEL(cloverInvertKernel, tp, stream, arg,
Float, Clover);
247 cloverInvert<1, Float, Clover>(
arg);
249 if (arg.computeTraceLog) {
250 cudaDeviceSynchronize();
255 TuneKey tuneKey()
const {
256 return TuneKey(meta.VolString(),
typeid(*this).name(), aux);
260 std::stringstream ps;
261 ps <<
"block=(" << param.block.x <<
"," << param.block.y <<
"," << param.block.z <<
"), ";
262 ps <<
"shared=" << param.shared_bytes;
266 long long flops()
const {
return 0; }
267 long long bytes()
const {
return 2*arg.clover.volumeCB*(arg.inverse.Bytes() + arg.clover.Bytes()); }
270 template <
typename Float,
typename Clover>
271 void cloverInvert(Clover inverse,
const Clover clover,
bool computeTraceLog,
273 CloverInvertArg<Clover>
arg(inverse, clover, computeTraceLog, trlog);
274 CloverInvert<Float,Clover> invert(arg, meta, location);
276 cudaDeviceSynchronize();
279 template <
typename Float>
282 cloverInvert<Float>(FloatNOrder<Float,72,2>(clover, 1),
283 FloatNOrder<Float,72,2>(clover, 0),
284 computeTraceLog, clover.TrLog(), clover,
location);
286 cloverInvert<Float>(FloatNOrder<Float,72,4>(clover, 1),
287 FloatNOrder<Float,72,4>(clover, 0),
288 computeTraceLog, clover.TrLog(), clover,
location);
290 errorQuda(
"Clover field %d order not supported", clover.Order());
300 #ifdef GPU_CLOVER_DIRAC
302 errorQuda(
"Half precision not supported for order %d", clover.
Order());
305 cloverInvert<double>(clover, computeTraceLog,
location);
307 cloverInvert<float>(clover, computeTraceLog,
location);
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
__host__ __device__ ValueType sqrt(ValueType x)
QudaPrecision Precision() const
void cloverInvert(CloverField &clover, bool computeTraceLog, QudaFieldLocation location)
cudaColorSpinorField * tmp
const QudaFieldLocation location
FloatingPoint< float > Float
QudaCloverFieldOrder Order() const
void reduceDoubleArray(double *, const int len)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__host__ __device__ ValueType log(ValueType x)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__host__ __device__ ValueType conj(ValueType x)