8 namespace CloverOrder {
17 #ifdef GPU_CLOVER_DIRAC
19 template<
typename Float,
typename Clover,
typename Gauge>
35 CloverArg(Clover &clover, Gauge &
gauge,
GaugeField& Fmunu,
double cloverCoeff)
37 cloverCoeff(cloverCoeff),
38 FmunuStride(Fmunu.Stride()), FmunuOffset(Fmunu.Bytes()/(4*sizeof(
Float))),
40 gauge(gauge), clover(clover) {
41 for(
int dir=0; dir<4; ++dir)
X[dir] = Fmunu.
X()[dir];
44 for(
int dir=0; dir<4; ++dir){
51 __device__ __host__
inline int linkIndex(
int x[],
int dx[],
const int X[4]) {
53 for (
int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
54 int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
59 __device__ __host__
inline void getCoords(
int x[4],
int cb_index,
const int X[4],
int parity)
61 x[3] = cb_index/(X[2]*X[1]*X[0]/2);
62 x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
63 x[1] = (cb_index/(X[0]/2)) % X[1];
64 x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+
parity)&1);
73 template <
typename Float,
typename Clover,
typename GaugeOrder>
74 __host__ __device__
void computeFmunuCore(CloverArg<Float,Clover,GaugeOrder>&
arg,
int idx) {
78 if(idx >= arg.threads/2){
84 for(
int dir=0; dir<4; ++dir) X[dir] = arg.X[dir];
89 for(
int dir=0; dir<4; ++dir){
90 x[dir] += arg.border[dir];
91 X[dir] += 2*arg.border[dir];
99 for (
int mu=0;
mu<4;
mu++) {
100 for (
int nu=0; nu<
mu; nu++) {
107 int dx[4] = {0, 0, 0, 0};
124 Ftmp = Ftmp *
conj(U3) ;
139 int dx[4] = {0, 0, 0, 0};
158 Ftmp = Ftmp *
conj(U3);
178 int dx[4] = {0, 0, 0, 0};
205 Ftmp = Ftmp *
conj(U4);
216 int dx[4] = {0, 0, 0, 0};
272 Cmplx* thisFmunu = arg.Fmunu + parity*arg.FmunuOffset;
273 int munu_idx = (mu*(mu-1))/2 + nu;
284 template<
typename Float,
typename Clover,
typename Gauge>
285 __global__
void computeFmunuKernel(CloverArg<Float,Clover,Gauge> arg){
286 int idx = threadIdx.x + blockIdx.x*blockDim.x;
287 if(idx >= arg.threads)
return;
288 computeFmunuCore<Float,Clover,Gauge>(
arg,
idx);
291 template<
typename Float,
typename Clover,
typename Gauge>
292 void computeFmunuCPU(CloverArg<Float,Clover,Gauge>& arg){
293 errorQuda(
"computeFmunuCPU not yet supported\n");
294 for(
int idx=0; idx<arg.threads; idx++){
295 computeFmunuCore(arg,idx);
301 template<
typename Float,
typename Clover,
typename Gauge>
303 CloverArg<Float,Clover,Gauge>
arg;
308 unsigned int sharedBytesPerThread()
const {
return 0; }
309 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
311 bool tuneSharedBytes()
const {
return false; }
312 bool tuneGridDim()
const {
return false; }
313 unsigned int minThreads()
const {
return arg.threads; }
317 : arg(arg), meta(meta), location(location) {
318 writeAuxString(
"threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,
sizeof(
Float));
320 virtual ~FmunuCompute() {}
322 void apply(
const cudaStream_t &
stream){
324 #if (__COMPUTE_CAPABILITY__ >= 200)
328 errorQuda(
"computeFmunuKernel not supported on pre-Fermi architecture");
331 computeFmunuCPU(arg);
336 return TuneKey(meta.VolString(),
typeid(*this).name(), aux);
341 std::stringstream ps;
342 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
")";
349 long long flops()
const {
return (2430 + 36)*6*arg.threads; }
350 long long bytes()
const {
return (4*4*18 + 18)*6*arg.threads*
sizeof(
Float); }
380 template<
typename Float,
typename Clover,
typename Gauge>
382 void cloverComputeCore(CloverArg<Float,Clover,Gauge>& arg,
int idx){
385 if(idx >= arg.threads/2){
387 idx -= arg.threads/2;
394 for(
int i=0; i<6; ++i){
398 Cmplx I; I.x = 0; I.y = 1.0;
399 Cmplx
coeff; coeff.x = 0; coeff.y = arg.cloverCoeff;
402 block1[0] = coeff*(F[0]-F[5]);
403 block1[1] = coeff*(F[0]+F[5]);
404 block2[0] = arg.cloverCoeff*(F[1]+F[4] - I*(F[2]-F[3]));
405 block2[1] = arg.cloverCoeff*(F[1]-F[4] - I*(F[2]+F[3]));
408 const int idtab[15]={0,1,3,6,10,2,4,7,11,5,8,12,9,13,14};
414 for(
int ch=0; ch<2; ++ch){
417 for(
int i=0; i<3; ++i){
418 diag[i] = 1.0 - block1[ch](i,i).x;
419 diag[i+3] = 1.0 + block1[ch](i,i).x;
424 triangle[0] = - block1[ch](1,0);
426 triangle[1] = - block1[ch](2,0);
427 triangle[2] = - block1[ch](2,1);
429 triangle[3] = block2[ch](0,0);
430 triangle[4] = block2[ch](0,1);
431 triangle[5] = block2[ch](0,2);
433 triangle[6] = block2[ch](1,0);
434 triangle[7] = block2[ch](1,1);
435 triangle[8] = block2[ch](1,2);
436 triangle[9] = block1[ch](1,0);
438 triangle[10] = block2[ch](2,0);
439 triangle[11] = block2[ch](2,1);
440 triangle[12] = block2[ch](2,2);
441 triangle[13] = block1[ch](2,0);
442 triangle[14] = block1[ch](2,1);
445 for(
int i=0; i<6; ++i){
446 A[ch*36 + i] = 0.5*diag[i];
448 for(
int i=0; i<15; ++i){
449 A[ch*36+6+2*i] = 0.5*triangle[idtab[i]].x;
450 A[ch*36+6+2*i + 1] = 0.5*triangle[idtab[i]].y;
456 arg.clover.save(A, idx, parity);
461 template<
typename Float,
typename Clover,
typename Gauge>
463 void cloverComputeKernel(CloverArg<Float,Clover,Gauge> arg){
464 int idx = threadIdx.x + blockIdx.x*blockDim.x;
465 if(idx >= arg.threads)
return;
466 cloverComputeCore(arg, idx);
469 template<
typename Float,
typename Clover,
typename Gauge>
470 void cloverComputeCPU(CloverArg<Float,Clover,Gauge> arg){
471 for(
int idx=0; idx<arg.threads; ++
idx){
472 cloverComputeCore(arg, idx);
477 template<
typename Float,
typename Clover,
typename Gauge>
478 class CloverCompute :
Tunable {
479 CloverArg<Float, Clover, Gauge>
arg;
484 unsigned int sharedBytesPerThread()
const {
return 0; }
485 unsigned int sharedBytesPerBlock(
const TuneParam ¶m)
const {
return 0; }
487 bool tuneSharedBytes()
const {
return false; }
488 bool tuneGridDim()
const {
return false; }
489 unsigned int minThreads()
const {
return arg.threads; }
493 : arg(arg), meta(meta), location(location) {
494 writeAuxString(
"threads=%d,stride=%d,prec=%lu",arg.threads,arg.clover.stride,
sizeof(
Float));
497 virtual ~CloverCompute() {}
499 void apply(
const cudaStream_t &stream) {
501 #if (__COMPUTE_CAPABILITY__ >= 200)
505 errorQuda(
"cloverComputeKernel not supported on pre-Fermi architecture");
508 cloverComputeCPU(arg);
513 return TuneKey(meta.VolString(),
typeid(*this).name(), aux);
517 std::stringstream ps;
518 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
525 long long flops()
const {
return 480*arg.threads; }
526 long long bytes()
const {
return arg.threads*(6*18 + 72)*
sizeof(
Float); }
531 template<
typename Float,
typename Clover,
typename Gauge>
533 CloverArg<Float,Clover,Gauge>
arg(clover, gauge, Fmunu, cloverCoeff);
534 FmunuCompute<Float,Clover,Gauge> fmunuCompute(arg, Fmunu, location);
535 fmunuCompute.apply(0);
536 CloverCompute<Float,Clover,Gauge> cloverCompute(arg, Fmunu, location);
537 cloverCompute.apply(0);
538 cudaDeviceSynchronize();
542 template<
typename Float>
562 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
FloatNOrder<Float, 18, 2, 18>(gauge), *Fmunu, cloverCoeff, location);
564 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
FloatNOrder<Float, 18, 2, 12>(gauge), *Fmunu, cloverCoeff, location);
566 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
FloatNOrder<Float, 18, 2, 8>(gauge), *Fmunu, cloverCoeff, location);
573 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,2>(clover,0),
FloatNOrder<Float,18,4,12>(gauge), *Fmunu, cloverCoeff, location);
581 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0),
FloatNOrder<Float,18,2,18>(gauge), *Fmunu, cloverCoeff, location);
583 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0),
FloatNOrder<Float,18,2,12>(gauge), *Fmunu, cloverCoeff, location);
590 computeClover(
CloverOrder::quda::FloatNOrder<Float,72,4>(clover,0),
FloatNOrder<Float,18,4,12>(gauge), *Fmunu, cloverCoeff, location);
597 if(Fmunu)
delete Fmunu;
604 #ifdef GPU_CLOVER_DIRAC
606 errorQuda(
"Half precision not supported\n");
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
__device__ __host__ void setZero(Matrix< T, N > *m)
QudaVerbosity getVerbosity()
Matrix< N, std::complex< T > > conj(const Matrix< N, std::complex< T > > &mat)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
QudaGaugeFieldOrder Order() const
__global__ void const FloatN FloatM FloatM Float Float int threads
QudaSiteSubset siteSubset
QudaPrecision Precision() const
const QudaFieldLocation location
FloatingPoint< float > Float
QudaCloverFieldOrder Order() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
QudaReconstructType Reconstruct() const
__constant__ double coeff
enum QudaFieldLocation_s QudaFieldLocation
__device__ void loadLinkVariableFromArray(const T *const array, const int dir, const int idx, const int stride, Matrix< T, 3 > *link)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ void writeLinkVariableToArray(const Matrix< T, 3 > &link, const int dir, const int idx, const int stride, T *const array)
getCoords(x, sid, kparam.D, oddBit)
void computeClover(CloverField &clover, const GaugeField &gauge, double coeff, QudaFieldLocation location)