11 template<
typename Oprod,
typename Gauge,
typename Mom>
15 #ifndef BUILD_TIFR_INTERFACE
25 : oprod(oprod), gauge(gauge), mom(mom){
28 for(
int dir=0; dir<4; ++dir)
threads *= dim[dir];
30 for(
int dir=0; dir<4; ++dir)
X[dir] = dim[dir];
31 #ifndef BUILD_TIFR_INTERFACE
33 for(
int dir=0; dir<4; ++dir) border[dir] = 2;
40 __device__ __host__
inline int linkIndex(
int x[],
int dx[],
const int X[4]) {
42 for (
int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
43 int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
48 __device__ __host__
inline void getCoords(
int x[4],
int cb_index,
const int X[4],
int parity)
50 x[3] = cb_index/(X[2]*X[1]*X[0]/2);
51 x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
52 x[1] = (cb_index/(X[0]/2)) % X[1];
53 x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+
parity)&1);
58 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
68 for(
int dir=0; dir<4; ++dir) X[dir] = arg.
X[dir];
72 #ifndef BUILD_TIFR_INTERFACE
74 for(
int dir=0; dir<4; ++dir){
75 x[dir] += arg.border[dir];
76 X[dir] += 2*arg.border[dir];
88 int dx[4] = {0,0,0,0};
89 for(
int dir=0; dir<4; ++dir){
102 temp[0] = (M.
data[1].x - M.
data[3].x)*0.5;
103 temp[1] = (M.
data[1].y + M.
data[3].y)*0.5;
105 temp[2] = (M.
data[2].x - M.
data[6].x)*0.5;
106 temp[3] = (M.
data[2].y + M.
data[6].y)*0.5;
108 temp[4] = (M.
data[5].x - M.
data[7].x)*0.5;
109 temp[5] = (M.
data[5].y + M.
data[7].y)*0.5;
111 temp[6] = (M.
data[0].y-sub);
112 temp[7] = (M.
data[4].y-sub);
113 temp[8] = (M.
data[8].y-sub);
116 arg.
mom.save(temp, idx, dir, parity);
120 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
123 int idx = threadIdx.x + blockIdx.x*blockDim.x;
126 completeKSForceCore<Float,Oprod,Gauge,Mom>(
arg,
idx);
132 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
136 completeKSForceCore<Float,Oprod,Gauge,Mom>(
arg,
idx);
142 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
150 unsigned int sharedBytesPerThread()
const {
return 0; }
151 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
153 bool tuneSharedBytes()
const {
return false; }
154 bool tuneGridDim()
const {
return false; }
155 unsigned int minThreads()
const {
return arg.threads; }
159 : arg(arg), meta(meta), location(location) {
167 #if (__COMPUTE_CAPABILITY__ >= 200)
169 dim3 blockDim(128, 1, 1);
170 dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
171 completeKSForceKernel<Float><<<gridDim,blockDim>>>(arg);
173 errorQuda(
"completeKSForce not supported on pre-Fermi architecture");
176 completeKSForceCPU<Float>(arg);
183 std::stringstream ps;
184 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
190 long long flops()
const {
return 792*arg.X[0]*arg.X[1]*arg.X[2]*arg.X[3]; }
191 long long bytes()
const {
return 0; }
194 template<
typename Float,
typename Oprod,
typename Gauge,
typename Mom>
199 completeForce.
apply(0);
200 if(flops) *flops = completeForce.
flops();
201 cudaDeviceSynchronize();
205 template<
typename Float>
210 errorQuda(
"Only QUDA_CUDA_FIELD_LOCATION currently supported");
213 errorQuda(
"Reconstruct type not supported");
218 const_cast<int*>(mom.
X()),
219 gauge, location, flops);
229 errorQuda(
"Half precision not supported");
237 errorQuda(
"Precision %d not supported", mom.Precision());
245 template<
typename Result,
typename Oprod,
typename Gauge>
258 :
coeff(1.0), res(res), oprod(oprod), gauge(gauge){
262 for(
int dir=0; dir<4; ++dir)
threads *= (dim[dir]-2);
263 for(
int dir=0; dir<4; ++dir)
X[dir] = dim[dir]-2;
264 for(
int dir=0; dir<4; ++dir) border[dir] = 2;
266 for(
int dir=0; dir<4; ++dir)
threads *= dim[dir];
267 for(
int dir=0; dir<4; ++dir)
X[dir] = dim[dir];
275 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
339 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
342 int idx = threadIdx.x + blockIdx.x*blockDim.x;
345 computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(
arg,
idx);
351 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
355 computeKSLongLinkForceCore<Float,Result,Oprod,Gauge>(
arg,
idx);
362 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
371 unsigned int sharedBytesPerThread()
const {
return 0; }
372 unsigned int sharedBytesPerBlock(
const TuneParam &
param)
const {
return 0; }
374 bool tuneSharedBytes()
const {
return false; }
375 bool tuneGridDim()
const {
return false; }
376 unsigned int minThreads()
const {
return arg.threads; }
380 : arg(arg), meta(meta), location(location) {
388 #if (__COMPUTE_CAPABILITY__ >= 200)
390 dim3 blockDim(128, 1, 1);
391 dim3 gridDim((arg.threads + blockDim.x - 1) / blockDim.x, 1, 1);
392 computeKSLongLinkForceKernel<Float><<<gridDim,blockDim>>>(arg);
394 errorQuda(
"computeKSLongLinkForce not supported on pre-Fermi architecture");
397 computeKSLongLinkForceCPU<Float>(arg);
404 std::stringstream ps;
405 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
413 long long flops()
const {
return 0; }
414 long long bytes()
const {
return 0; }
420 template<
typename Float,
typename Result,
typename Oprod,
typename Gauge>
425 computeLongLink.
apply(0);
426 cudaDeviceSynchronize();
429 template<
typename Float>
433 errorQuda(
"Only QUDA_CUDA_FIELD_LOCATION currently supported");
438 errorQuda(
"Reconstruct type not supported");
443 const_cast<int*>(result.
X()),
454 errorQuda(
"Half precision not supported");
458 computeKSLongLinkForce<float>(result, oprod,
gauge,
location);
460 computeKSLongLinkForce<double>(result, oprod,
gauge,
location);
462 errorQuda(
"Precision %d not supported", result.Precision());
KSForceArg(Oprod &oprod, Gauge &gauge, Mom &mom, int dim[4])
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
__global__ void completeKSForceKernel(KSForceArg< Oprod, Gauge, Mom > arg)
__global__ void computeKSLongLinkForceKernel(KSLongLinkArg< Result, Oprod, Gauge > arg)
KSForceComplete(KSForceArg< Oprod, Gauge, Mom > &arg, const GaugeField &meta, QudaFieldLocation location)
void completeKSForceCPU(KSForceArg< Oprod, Gauge, Mom > &arg)
void completeKSForce(GaugeField &mom, const GaugeField &oprod, const GaugeField &gauge, QudaFieldLocation location, long long *flops=NULL)
void apply(const cudaStream_t &stream)
void writeAuxString(const char *format,...)
const QudaFieldLocation location
std::string paramString(const TuneParam ¶m) const
FloatingPoint< float > Float
__host__ __device__ void completeKSForceCore(KSForceArg< Oprod, Gauge, Mom > &arg, int idx)
virtual ~KSForceComplete()
QudaReconstructType Reconstruct() const
KSLongLinkForce(KSLongLinkArg< Result, Oprod, Gauge > &arg, const GaugeField &meta, QudaFieldLocation location)
std::string paramString(const TuneParam ¶m) const
const char * VolString() const
__device__ __host__ T getTrace(const Matrix< T, 3 > &a)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ void computeKSLongLinkForceCore(KSLongLinkArg< Result, Oprod, Gauge > &arg, int idx)
virtual ~KSLongLinkForce()
void computeKSLongLinkForce(Result res, Oprod oprod, Gauge gauge, int dim[4], const GaugeField &meta, QudaFieldLocation location)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void computeKSLongLinkForceCPU(KSLongLinkArg< Result, Oprod, Gauge > &arg)
void apply(const cudaStream_t &stream)
KSLongLinkArg(Result &res, Oprod &oprod, Gauge &gauge, int dim[4])
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)