11 using namespace gauge;
13 template<
typename Oprod,
typename Gauge,
typename Mom>
17 #ifndef BUILD_TIFR_INTERFACE 27 : oprod(oprod), gauge(gauge), mom(mom){
30 for(
int dir=0; dir<4; ++dir) threads *=
dim[dir];
32 for(
int dir=0; dir<4; ++dir)
X[dir] =
dim[dir];
33 #ifndef BUILD_TIFR_INTERFACE 35 for(
int dir=0; dir<4; ++dir) border[dir] = 2;
43 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
53 for(
int dir=0; dir<4; ++dir)
X[dir] =
arg.X[dir];
57 #ifndef BUILD_TIFR_INTERFACE 59 for(
int dir=0; dir<4; ++dir){
60 x[dir] +=
arg.border[dir];
61 X[dir] += 2*
arg.border[dir];
71 int dx[4] = {0,0,0,0};
72 for(
int dir=0; dir<4; ++dir){
85 temp[0] = (M.
data[1].x - M.
data[3].x)*0.5;
86 temp[1] = (M.
data[1].y + M.
data[3].y)*0.5;
88 temp[2] = (M.
data[2].x - M.
data[6].x)*0.5;
89 temp[3] = (M.
data[2].y + M.
data[6].y)*0.5;
91 temp[4] = (M.
data[5].x - M.
data[7].x)*0.5;
92 temp[5] = (M.
data[5].y + M.
data[7].y)*0.5;
103 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
108 if(
idx >=
arg.threads)
return;
109 completeKSForceCore<Float,Oprod,Gauge,Mom>(
arg,
idx);
115 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
119 completeKSForceCore<Float,Oprod,Gauge,Mom>(
arg,
idx);
125 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
142 :
arg(
arg), meta(meta), location(location) {
143 writeAuxString(
"prec=%lu,stride=%d",
sizeof(Float),
arg.mom.stride);
155 completeKSForceCPU<Float>(
arg);
162 long long bytes()
const {
return 0; }
165 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
170 completeForce.
apply(0);
176 template<
typename Float>
181 errorQuda(
"Only QUDA_CUDA_FIELD_LOCATION currently supported");
184 errorQuda(
"Reconstruct type not supported");
189 const_cast<int*>(mom.
X()),
190 gauge, location,
flops);
200 errorQuda(
"Half precision not supported");
204 completeKSForce<float>(mom, oprod, gauge, location,
flops);
206 completeKSForce<double>(mom, oprod, gauge, location,
flops);
208 errorQuda(
"Precision %d not supported", mom.Precision());
216 template<
typename Result,
typename Oprod,
typename Gauge>
229 :
coeff(1.0), res(res), oprod(oprod), gauge(gauge){
233 for(
int dir=0; dir<4; ++dir) threads *= (
dim[dir]-2);
234 for(
int dir=0; dir<4; ++dir)
X[dir] =
dim[dir]-2;
235 for(
int dir=0; dir<4; ++dir) border[dir] = 2;
237 for(
int dir=0; dir<4; ++dir) threads *=
dim[dir];
238 for(
int dir=0; dir<4; ++dir)
X[dir] =
dim[dir];
246 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
310 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
315 if(
idx >=
arg.threads)
return;
316 computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(
arg,
idx);
322 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
326 computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(
arg,
idx);
333 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
351 :
arg(
arg), meta(meta), location(location) {
352 writeAuxString(
"prec=%lu,stride=%d",
sizeof(Float),
arg.res.stride);
364 computeKSLongLinkForceCPU<Float>(
arg);
370 long long flops()
const {
return 0; }
371 long long bytes()
const {
return 0; }
377 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
382 computeLongLink.
apply(0);
386 template<
typename Float>
390 errorQuda(
"Only QUDA_CUDA_FIELD_LOCATION currently supported");
395 errorQuda(
"Reconstruct type not supported");
400 const_cast<int*>(result.
X()),
411 errorQuda(
"Half precision not supported");
415 computeKSLongLinkForce<float>(result, oprod, gauge, location);
417 computeKSLongLinkForce<double>(result, oprod, gauge, location);
419 errorQuda(
"Precision %d not supported", result.Precision());
KSForceArg(Oprod &oprod, Gauge &gauge, Mom &mom, int dim[4])
__global__ void completeKSForceKernel(KSForceArg< Oprod, Gauge, Mom > arg)
static __device__ __host__ int linkIndexShift(const I x[], const J dx[], const K X[4])
unsigned int sharedBytesPerThread() const
__global__ void computeKSLongLinkForceKernel(KSLongLinkArg< Result, Oprod, Gauge > arg)
KSLongLinkArg< Result, Oprod, Gauge > arg
KSForceComplete(KSForceArg< Oprod, Gauge, Mom > &arg, const GaugeField &meta, QudaFieldLocation location)
static void sub(Float *dst, Float *a, Float *b, int cnt)
bool tuneSharedBytes() const
void completeKSForceCPU(KSForceArg< Oprod, Gauge, Mom > &arg)
const QudaFieldLocation location
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops=NULL)
const char * VolString() const
unsigned int sharedBytesPerBlock(const TuneParam &param) const
void apply(const cudaStream_t &stream)
unsigned int minThreads() const
unsigned int minThreads() const
__host__ __device__ void completeKSForceCore(KSForceArg< Oprod, Gauge, Mom > &arg, int idx)
virtual ~KSForceComplete()
KSLongLinkForce(KSLongLinkArg< Result, Oprod, Gauge > &arg, const GaugeField &meta, QudaFieldLocation location)
Main header file for host and device accessors to GaugeFields.
const QudaFieldLocation location
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ void computeKSLongLinkForceCore(KSLongLinkArg< Result, Oprod, Gauge > &arg, int idx)
virtual ~KSLongLinkForce()
void computeKSLongLinkForce(Result res, Oprod oprod, Gauge gauge, int dim[4], const GaugeField &meta, QudaFieldLocation location)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Accessor routine for CloverFields in native field order.
unsigned int sharedBytesPerBlock(const TuneParam &param) const
QudaReconstructType Reconstruct() const
void computeKSLongLinkForceCPU(KSLongLinkArg< Result, Oprod, Gauge > &arg)
unsigned int sharedBytesPerThread() const
void apply(const cudaStream_t &stream)
KSLongLinkArg(Result &res, Oprod &oprod, Gauge &gauge, int dim[4])
bool tuneSharedBytes() const
KSForceArg< Oprod, Gauge, Mom > arg
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)