#ifndef __CUDACC_RTC__
#include <type_traits>
/* ... */
#include <type_traits>
#endif

  // gauge_wrapper<Float, T>: wraps a gauge-field accessor together with a site location
  // (dim, x_cb, parity, phase) so that Matrix expressions can read and write the field directly.
  template <typename Float, typename T> struct gauge_wrapper {
    /* ... */
    // Assignment operator with Matrix instance as input: stores the Matrix through the wrapped accessor
    __device__ __host__ inline void operator=(const M &a) { /* ... */ }
  };

  // Matrix<T,N> assignment from a gauge_wrapper loads the link from the wrapped accessor
  // (full member-function signature elided in this excerpt)
  template <typename T, int N>
  {
    a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
  }

  // Matrix<T,N> construction from a gauge_wrapper likewise loads the link
  template <typename T, int N>
  {
    a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);
  }
  // gauge_ghost_wrapper<Float, T>: same idea as gauge_wrapper, but curries a ghost-zone location
  template <typename Float, typename T> struct gauge_ghost_wrapper {
    /* ... */
    __device__ __host__ inline gauge_ghost_wrapper(T &gauge, int dim, int ghost_idx, int parity, Float phase = 1.0) :
      /* ... */ ghost_idx(ghost_idx) /* ... */
    { }
    /* ... */
  };

  // Matrix<T,N> assignment and construction from a gauge_ghost_wrapper load from the ghost buffer
  // (outer/inner member-function signatures elided in this excerpt)
  template <typename T, int N> template <typename S>
  {
    a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
  }

  template <typename T, int N> template <typename S>
  {
    a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
  }
  // square_ maps an element to |x|^2, promoting to ReduceType; the fixed-point specializations
  // first rescale the stored integer value
  template <typename ReduceType, typename Float> struct square_ {
    square_(ReduceType scale) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<Float> &x)
    { return static_cast<ReduceType>(norm(x)); }
  };

  template <typename ReduceType> struct square_<ReduceType, char> {
    const ReduceType scale;
    square_(const ReduceType scale) : scale(scale) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<char> &x)
    { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
  };

  template <typename ReduceType> struct square_<ReduceType, short> {
    const ReduceType scale;
    square_(const ReduceType scale) : scale(scale) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<short> &x)
    { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
  };

  template <typename ReduceType> struct square_<ReduceType, int> {
    const ReduceType scale;
    square_(const ReduceType scale) : scale(scale) { }
    __host__ __device__ inline ReduceType operator()(const quda::complex<int> &x)
    { return norm(scale * complex<ReduceType>(x.real(), x.imag())); }
  };
  // abs_ maps an element to |x|; fixed-point specializations first rescale the stored value
  template <typename Float, typename storeFloat> struct abs_ {
    abs_(Float scale) { }
    __host__ __device__ Float operator()(const quda::complex<storeFloat> &x) { return abs(x); }
  };

  template <typename Float> struct abs_<Float, char> {
    Float scale;
    abs_(const Float scale) : scale(scale) { }
    __host__ __device__ Float operator()(const quda::complex<char> &x)
    { return abs(scale * complex<Float>(x.real(), x.imag())); }
  };

  template <typename Float> struct abs_<Float, short> {
    Float scale;
    abs_(const Float scale) : scale(scale) { }
    __host__ __device__ Float operator()(const quda::complex<short> &x)
    { return abs(scale * complex<Float>(x.real(), x.imag())); }
  };

  template <typename Float> struct abs_<Float, int> {
    Float scale;
    abs_(const Float scale) : scale(scale) { }
    __host__ __device__ Float operator()(const quda::complex<int> &x)
    { return abs(scale * complex<Float>(x.real(), x.imag())); }
  };
  // fixed_point<Float,storeFloat>() is true when storeFloat is an integer type holding
  // fixed-point data that must be rescaled to Float (specializations for char/short/int are
  // elided in this excerpt)
  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }

  // match<a,b>() is true when the two types are identical, so no rescaling is needed on assignment
  template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
  template <> __host__ __device__ inline constexpr bool match<int, int>() { return true; }
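  // --- Added illustration (not part of the original header) --------------------------------
  // A minimal standalone sketch, under assumed values, of the fixed-point convention the
  // accessors below rely on: scale = numeric_limits<storeFloat>::max() / max maps a bounded
  // real value into the integer storage type and scale_inv recovers it on load. The bound
  // `site_max` and the function name are hypothetical, chosen only for this example.
#if 0
#include <cmath>
#include <cstdio>
#include <limits>

static short encode_fixed_point(float u, float site_max, float *recovered)
{
  const float scale = std::numeric_limits<short>::max() / site_max;     // used when storing
  const float scale_inv = site_max / std::numeric_limits<short>::max(); // used when loading
  const short stored = static_cast<short>(std::round(scale * u));       // what the field holds
  *recovered = scale_inv * static_cast<float>(stored);                  // what real()/imag() return
  return stored;
}

int main()
{
  float recovered;
  short stored = encode_fixed_point(-0.8125f, 1.7f, &recovered);
  std::printf("stored = %d, recovered = %f\n", stored, recovered);
  return 0;
}
#endif
  // ------------------------------------------------------------------------------------------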
  // fieldorder_wrapper: wraps a single complex element of a field-order accessor, applying the
  // fixed-point scale factors transparently on read and write
  template <typename Float, typename storeFloat> struct fieldorder_wrapper {
    complex<storeFloat> *v;
    const int idx;
    const Float scale;
    const Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    /** fieldorder_wrapper constructor */
    __device__ __host__ inline fieldorder_wrapper(complex<storeFloat> *v, int idx, Float scale, Float scale_inv)
      : v(v), idx(idx), scale(scale), scale_inv(scale_inv) {}

    __device__ __host__ inline Float real() const {
      if (!fixed)
        return v[idx].real();
      else
        return scale_inv * static_cast<Float>(v[idx].real());
    }

    __device__ __host__ inline Float imag() const {
      if (!fixed)
        return v[idx].imag();
      else
        return scale_inv * static_cast<Float>(v[idx].imag());
    }

    /** negation operator */
    __device__ __host__ inline complex<Float> operator-() const {
      return fixed ? -scale_inv * static_cast<complex<Float>>(v[idx]) : -static_cast<complex<Float>>(v[idx]);
    }

    // assignment from another wrapper: rescale when storing fixed point, otherwise copy the raw element
    __device__ __host__ inline void operator=(const fieldorder_wrapper<Float, storeFloat> &a) {
      v[idx] = fixed ? complex<storeFloat>(round(scale * a.real()), round(scale * a.imag())) : a.v[a.idx];
    }

    // assignment / accumulation from complex<theirFloat>, rescaling when the storage is fixed point
    template <typename theirFloat>
    __device__ __host__ inline void operator=(const complex<theirFloat> &a) {
      if (match<storeFloat, theirFloat>()) {
        v[idx] = complex<storeFloat>(a.x, a.y);
      } else {
        v[idx] = fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
      }
    }

    template <typename theirFloat>
    __device__ __host__ inline void operator+=(const complex<theirFloat> &a) {
      if (match<storeFloat, theirFloat>()) {
        v[idx] += complex<storeFloat>(a.x, a.y);
      } else {
        v[idx] += fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
      }
    }

    template <typename theirFloat>
    __device__ __host__ inline void operator-=(const complex<theirFloat> &a) {
      if (match<storeFloat, theirFloat>()) {
        v[idx] -= complex<storeFloat>(a.x, a.y);
      } else {
        v[idx] -= fixed ? complex<storeFloat>(round(scale * a.x), round(scale * a.y)) : complex<storeFloat>(a.x, a.y);
      }
    }
  };
  // Mixed arithmetic between complex<Float> and fieldorder_wrapper: the wrapper's real()/imag()
  // already apply scale_inv in the fixed-point case, otherwise the raw element is used directly
  template <typename Float, typename storeFloat>
  __device__ __host__ inline complex<Float> operator*(const Float &a, const fieldorder_wrapper<Float, storeFloat> &b) {
    if (fixed_point<Float, storeFloat>())
      return a * complex<Float>(b.real(), b.imag());
    else
      return a * complex<Float>(b.v[b.idx].real(), b.v[b.idx].imag());
  }

  template <typename Float, typename storeFloat>
  __device__ __host__ inline complex<Float> operator+(const fieldorder_wrapper<Float, storeFloat> &a, const complex<Float> &b) {
    if (fixed_point<Float, storeFloat>())
      return complex<Float>(a.real(), a.imag()) + b;
    else
      return complex<Float>(a.v[a.idx].real(), a.v[a.idx].imag()) + b;
  }

  template <typename Float, typename storeFloat>
  __device__ __host__ inline complex<Float> operator+(const complex<Float> &a, const fieldorder_wrapper<Float, storeFloat> &b) {
    if (fixed_point<Float, storeFloat>())
      return a + complex<Float>(b.real(), b.imag());
    else
      return a + complex<Float>(b.v[b.idx].real(), b.v[b.idx].imag());
  }
  // Generic Accessor / GhostAccessor templates: only the specializations below are implemented,
  // so the generic constructors abort with errorQuda
  template <typename Float, int nColor, QudaGaugeFieldOrder order, typename storeFloat, bool use_tex>
  struct Accessor {
    /* constructor */ { errorQuda("Not implemented for order=%d", order); }
    __device__ __host__ complex<Float> &operator()(int d, int parity, int x, int row, int col) const { /* ... */ }
  };

  template <typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost, typename storeFloat, bool use_tex>
  struct GhostAccessor {
    /* constructor */ { errorQuda("Not implemented for order=%d", order); }
    __device__ __host__ complex<Float> &operator()(int d, int parity, int x, int row, int col) const { /* ... */ }
  };
  // Accessor specialization for QUDA_QDP_GAUGE_ORDER: one pointer per dimension with a
  // checkerboard (parity) offset, plus fixed-point scale handling
  template <typename Float, int nColor, typename storeFloat, bool use_tex>
  struct Accessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex> {
    complex<storeFloat> *u[QUDA_MAX_GEOMETRY];
    const int volumeCB;
    const int geometry;
    const int cb_offset;
    Float scale;
    Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    Accessor(const GaugeField &U, void *gauge_ = 0, void **ghost_ = 0)
      : volumeCB(U.VolumeCB()), geometry(U.Geometry()), cb_offset((U.Bytes() >> 1) / (sizeof(complex<storeFloat>) * U.Geometry())),
        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    {
      for (int d = 0; d < U.Geometry(); d++)
        u[d] = gauge_ ? static_cast<complex<storeFloat> **>(gauge_)[d] :
                        static_cast<complex<storeFloat> **>(const_cast<void *>(U.Gauge_p()))[d];
      resetScale(U.Scale());
    }

    Accessor(const Accessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex> &a)
      : volumeCB(a.volumeCB), geometry(a.geometry), cb_offset(a.cb_offset), scale(a.scale), scale_inv(a.scale_inv)
    { for (int d = 0; d < QUDA_MAX_GEOMETRY; d++) u[d] = a.u[d]; }

    void resetScale(Float max) {
      if (fixed) {
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
        scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
      }
    }

    __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
    {
      complex<storeFloat> tmp = u[d][parity * cb_offset + (x * nColor + row) * nColor + col];
      if (fixed)
        return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
      else
        return complex<Float>(tmp.x, tmp.y);
    }

    // atomic accumulation into the field: vectorized atomics on the device, omp atomics on the
    // host, rescaling the addend first when the storage type is fixed point
    template <typename theirFloat>
    __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
#ifdef __CUDA_ARCH__
      typedef typename vector<storeFloat, 2>::type vec2;
      vec2 *u2 = reinterpret_cast<vec2 *>(u[dim] + parity * cb_offset + (x_cb * nColor + row) * nColor + col);
      if (fixed && !match<storeFloat, theirFloat>()) {
        complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
        /* ... atomicAdd on u2 ... */
      } else {
        /* ... atomicAdd on u2 ... */
      }
#else
      if (fixed && !match<storeFloat, theirFloat>()) {
        complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
#pragma omp atomic update
        u[dim][parity * cb_offset + (x_cb * nColor + row) * nColor + col].x += val_.x;
#pragma omp atomic update
        u[dim][parity * cb_offset + (x_cb * nColor + row) * nColor + col].y += val_.y;
      } else {
#pragma omp atomic update
        u[dim][parity * cb_offset + (x_cb * nColor + row) * nColor + col].x += static_cast<storeFloat>(val.x);
#pragma omp atomic update
        u[dim][parity * cb_offset + (x_cb * nColor + row) * nColor + col].y += static_cast<storeFloat>(val.y);
      }
#endif
    }

    // reduction over the links (optionally restricted to one dimension) via thrust::transform_reduce
    template <typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
      if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
      int lower = (dim == -1) ? 0 : dim;
      int upper = (dim == -1) ? geometry : dim + 1;
      double result = init;
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        for (int d = lower; d < upper; d++) {
          thrust::device_ptr<complex<storeFloat>> ptr(u[d]);
          result = thrust::transform_reduce(thrust::cuda::par(alloc), ptr, ptr + 2 * volumeCB * nColor * nColor, h, result, r);
        }
      } else {
        for (int d = lower; d < upper; d++) {
          result = thrust::transform_reduce(thrust::seq, u[d], u[d] + 2 * volumeCB * nColor * nColor, h, result, r);
        }
      }
      return result;
    }
  };
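  // --- Added illustration (not part of the original header) --------------------------------
  // The norm1/norm2/abs_max entry points further below funnel into transform_reduce with a
  // square_ or abs_ helper and a plus/maximum reducer. A standalone host-only sketch of the same
  // map-then-reduce pattern over a plain array of complex values (all names hypothetical):
#if 0
#include <complex>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  std::vector<std::complex<float>> links = {{0.5f, -0.25f}, {1.0f, 0.0f}, {-0.75f, 0.5f}};
  // the "helper" maps each element to |z|^2 and the "reducer" accumulates with +, as in norm2()
  double nrm2 = std::accumulate(links.begin(), links.end(), 0.0,
                                [](double sum, const std::complex<float> &z) { return sum + std::norm(z); });
  std::printf("norm2 = %f\n", nrm2);
  return 0;
}
#endif
  // ------------------------------------------------------------------------------------------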
  // GhostAccessor specialization for QUDA_QDP_GAUGE_ORDER
  template <typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
  struct GhostAccessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat, use_tex> {
    complex<storeFloat> *ghost[8];
    int ghostOffset[8];
    Float scale;
    Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    GhostAccessor(const GaugeField &U, void *gauge_ = 0, void **ghost_ = 0)
      : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
      for (int d = 0; d < 4; d++) {
        ghost[d] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d]) :
                            static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d]));
        /* ... ghostOffset[d] ... */
        ghost[d + 4] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
                                static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d + 4]));
        /* ... ghostOffset[d + 4] ... */
      }
      resetScale(U.Scale());
    }

    GhostAccessor(const GhostAccessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat, use_tex> &a)
      : scale(a.scale), scale_inv(a.scale_inv) {
      for (int d = 0; d < 8; d++) {
        ghost[d] = a.ghost[d];
        /* ... */
      }
    }

    void resetScale(Float max) {
      if (fixed) {
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
        scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
      }
    }

    __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
    {
      complex<storeFloat> tmp = ghost[d][parity * ghostOffset[d] + (x * nColor + row) * nColor + col];
      if (fixed)
        return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
      else
        return complex<Float>(tmp.x, tmp.y);
    }
  };
  // Accessor specialization for QUDA_MILC_GAUGE_ORDER: a single pointer indexed as
  // [parity][volumecb][dim][row][col]
  template <typename Float, int nColor, typename storeFloat, bool use_tex>
  struct Accessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat, use_tex> {
    complex<storeFloat> *u;
    const int volumeCB;
    const int geometry;
    Float scale;
    Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    Accessor(const GaugeField &U, void *gauge_ = 0, void **ghost_ = 0)
      : u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
                   static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
        volumeCB(U.VolumeCB()), geometry(U.Geometry()),
        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
      resetScale(U.Scale());
    }

    Accessor(const Accessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, storeFloat, use_tex> &a)
      : u(a.u), volumeCB(a.volumeCB), geometry(a.geometry), scale(a.scale), scale_inv(a.scale_inv)
    { }

    void resetScale(Float max) {
      if (fixed) {
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
        scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
      }
    }

    __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
    {
      complex<storeFloat> tmp = u[(((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col];
      if (fixed)
        return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
      else
        return complex<Float>(tmp.x, tmp.y);
    }

    // mutable element access returns a fieldorder_wrapper so writes are rescaled transparently
    __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x, int row, int col)
    { return fieldorder_wrapper<Float, storeFloat>(u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv); }

    template <typename theirFloat>
    __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const {
#ifdef __CUDA_ARCH__
      typedef typename vector<storeFloat, 2>::type vec2;
      vec2 *u2 = reinterpret_cast<vec2 *>(u + (((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col);
      if (fixed && !match<storeFloat, theirFloat>()) {
        complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
        /* ... atomicAdd on u2 ... */
      } else {
        /* ... atomicAdd on u2 ... */
      }
#else
      if (fixed && !match<storeFloat, theirFloat>()) {
        complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
#pragma omp atomic update
        u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].x += val_.x;
#pragma omp atomic update
        u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].y += val_.y;
      } else {
#pragma omp atomic update
        u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].x += static_cast<storeFloat>(val.x);
#pragma omp atomic update
        u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].y += static_cast<storeFloat>(val.y);
      }
#endif
    }

    template <typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const {
      if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
      int lower = (dim == -1) ? 0 : dim;
      int upper = (dim == -1) ? geometry : dim + 1;
      double result = init;
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        thrust::device_ptr<complex<storeFloat>> ptr(u);
        result = thrust::transform_reduce(thrust::cuda::par(alloc),
                                          ptr + (0 * geometry + lower) * volumeCB * nColor * nColor,
                                          ptr + (0 * geometry + upper) * volumeCB * nColor * nColor, h, result, r);
        result = thrust::transform_reduce(thrust::cuda::par(alloc),
                                          ptr + (1 * geometry + lower) * volumeCB * nColor * nColor,
                                          ptr + (1 * geometry + upper) * volumeCB * nColor * nColor, h, result, r);
      } else {
        result = thrust::transform_reduce(thrust::seq,
                                          u + (0 * geometry + lower) * volumeCB * nColor * nColor,
                                          u + (0 * geometry + upper) * volumeCB * nColor * nColor, h, result, r);
        result = thrust::transform_reduce(thrust::seq,
                                          u + (1 * geometry + lower) * volumeCB * nColor * nColor,
                                          u + (1 * geometry + upper) * volumeCB * nColor * nColor, h, result, r);
      }
      return result;
    }
  };
  // GhostAccessor specialization for QUDA_MILC_GAUGE_ORDER (mirrors the QDP ghost accessor)
  template <typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
  struct GhostAccessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat, use_tex> {
    complex<storeFloat> *ghost[8];
    int ghostOffset[8];
    Float scale;
    Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    GhostAccessor(const GaugeField &U, void *gauge_ = 0, void **ghost_ = 0)
      : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
      for (int d = 0; d < 4; d++) {
        ghost[d] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d]) :
                            static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d]));
        /* ... ghostOffset[d] ... */
        ghost[d + 4] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
                                static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d + 4]));
        /* ... ghostOffset[d + 4] ... */
      }
      resetScale(U.Scale());
    }

    GhostAccessor(const GhostAccessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat, use_tex> &a)
      : scale(a.scale), scale_inv(a.scale_inv) {
      for (int d = 0; d < 8; d++) {
        ghost[d] = a.ghost[d];
        /* ... */
      }
    }

    void resetScale(Float max) {
      if (fixed) {
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
        scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
      }
    }

    __device__ __host__ inline complex<Float> operator()(int d, int parity, int x, int row, int col) const
    {
      complex<storeFloat> tmp = ghost[d][parity * ghostOffset[d] + (x * nColor + row) * nColor + col];
      if (fixed)
        return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
      else
        return complex<Float>(tmp.x, tmp.y);
    }

    __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x, int row, int col)
    { return fieldorder_wrapper<Float, storeFloat>(ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv); }
  };
  // Helper computing the element offset within a FloatN-ordered gauge field for matrix element
  // (row, col) at checkerboard site x_cb in dimension dim (signature elided in this excerpt)
  template <int nColor, int N>
  {
    // j, i: which N-component vector within the link and which component within that vector
    int j = ((row * nColor + col) * 2) / N;
    int i = ((row * nColor + col) * 2) % N;
    // M (defined earlier in the original) is the number of N-component vectors per link
    int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2;
    index += parity * offset_cb;
    return index;
  }
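  // --- Added illustration (not part of the original header) --------------------------------
  // Worked example of the FloatN index arithmetic above for N = 2 storage and nColor = 3: a 3x3
  // complex link is 18 reals, i.e. M = 9 two-component vectors per link. All concrete numbers
  // below (stride, offsets, site) are made up for the illustration.
#if 0
#include <cstdio>

int main()
{
  const int nColor = 3, N = 2;
  const int M = (nColor * nColor * 2) / N;              // 9 two-component vectors per link
  const int stride = 1024, offset_cb = 4 * M * stride;  // hypothetical field sizes
  const int x_cb = 7, dim = 2, row = 1, col = 2, parity = 1;

  int j = ((row * nColor + col) * 2) / N;               // which vector within the link
  int i = ((row * nColor + col) * 2) % N;               // which component within that vector
  int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2 + parity * offset_cb;
  std::printf("M=%d j=%d i=%d index=%d\n", M, j, i, index);
  return 0;
}
#endif
  // ------------------------------------------------------------------------------------------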
  // Accessor specialization for QUDA_FLOAT2_GAUGE_ORDER (the native GPU ordering): elements are
  // strided over the volume, with optional texture reads on the device
  template <typename Float, int nColor, typename storeFloat, bool use_tex>
  struct Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex> {
    complex<storeFloat> *u;
    const int offset_cb;
#ifdef USE_TEXTURE_OBJECTS
    cudaTextureObject_t tex;
#endif
    const int volumeCB;
    const int stride;
    const int geometry;
    Float max;
    Float scale;
    Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();

    Accessor(const GaugeField &U, void *gauge_ = 0, void **ghost_ = 0, bool override = false)
      : u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
                   static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
        offset_cb((U.Bytes() >> 1) / sizeof(complex<storeFloat>)),
#ifdef USE_TEXTURE_OBJECTS
        tex(0),
#endif
        volumeCB(U.VolumeCB()), stride(U.Stride()), geometry(U.Geometry()),
        max(static_cast<Float>(1.0)), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    {
      resetScale(U.Scale());
#ifdef USE_TEXTURE_OBJECTS
      /* ... bind texture ... */
      if (use_tex && this->u != U.Gauge_p() && !override) {
        errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with use_tex=false instead");
      }
#endif
    }

    Accessor(const Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex> &a)
      : u(a.u), offset_cb(a.offset_cb),
#ifdef USE_TEXTURE_OBJECTS
        tex(a.tex),
#endif
        volumeCB(a.volumeCB), stride(a.stride), geometry(a.geometry),
        scale(a.scale), scale_inv(a.scale_inv) { }

    void resetScale(Float max_) {
      if (fixed) {
        max = max_;
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
        scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
      }
    }

    __device__ __host__ inline const complex<Float> operator()(int dim, int parity, int x_cb, int row, int col) const
    {
#if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
      if (use_tex) {
        TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb);
        if (fixed)
          return max * complex<Float>(vecTmp.x, vecTmp.y); // texture loads normalize fixed point to [-1,1]
        else
          return complex<Float>(vecTmp.x, vecTmp.y);
      } else
#endif
      {
        complex<storeFloat> tmp = u[parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb];
        if (fixed)
          return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
        else
          return complex<Float>(tmp.x, tmp.y);
      }
    }

    template <typename theirFloat>
    __device__ __host__ inline void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex<theirFloat> &val) const
    {
#ifdef __CUDA_ARCH__
      typedef typename vector<storeFloat, 2>::type vec2;
      vec2 *u2 = reinterpret_cast<vec2 *>(u + parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb);
      if (fixed && !match<storeFloat, theirFloat>()) {
        complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
        /* ... atomicAdd on u2 ... */
      } else {
        /* ... atomicAdd on u2 ... */
      }
#else
      if (fixed && !match<storeFloat, theirFloat>()) {
        complex<storeFloat> val_(round(scale * val.real()), round(scale * val.imag()));
#pragma omp atomic update
        u[parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb].x += val_.x;
#pragma omp atomic update
        u[parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb].y += val_.y;
      } else {
#pragma omp atomic update
        u[parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb].x += static_cast<storeFloat>(val.x);
#pragma omp atomic update
        u[parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb].y += static_cast<storeFloat>(val.y);
      }
#endif
    }

    template <typename helper, typename reducer>
    __host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
    {
      if (dim >= geometry) errorQuda("Request dimension %d exceeds dimensionality of the field %d", dim, geometry);
      int lower = (dim == -1) ? 0 : dim;
      int upper = (dim == -1) ? geometry : dim + 1;
      double result = init;
      if (location == QUDA_CUDA_FIELD_LOCATION) {
        thrust::device_ptr<complex<storeFloat>> ptr(u);
        result = thrust::transform_reduce(thrust::cuda::par(alloc),
                                          ptr + 0 * offset_cb + lower * stride * nColor * nColor,
                                          ptr + 0 * offset_cb + upper * stride * nColor * nColor, h, result, r);
        result = thrust::transform_reduce(thrust::cuda::par(alloc),
                                          ptr + 1 * offset_cb + lower * stride * nColor * nColor,
                                          ptr + 1 * offset_cb + upper * stride * nColor * nColor, h, result, r);
      } else {
        result = thrust::transform_reduce(thrust::seq,
                                          u + 0 * offset_cb + lower * stride * nColor * nColor,
                                          u + 0 * offset_cb + upper * stride * nColor * nColor, h, result, r);
        result = thrust::transform_reduce(thrust::seq,
                                          u + 1 * offset_cb + lower * stride * nColor * nColor,
                                          u + 1 * offset_cb + upper * stride * nColor * nColor, h, result, r);
      }
      return result;
    }
  };
  // GhostAccessor specialization for QUDA_FLOAT2_GAUGE_ORDER: either reads the native ghost zone
  // appended to the field through the parent accessor, or separately supplied ghost buffers
  template <typename Float, int nColor, bool native_ghost, typename storeFloat, bool use_tex>
  struct GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat, use_tex> {
    complex<storeFloat> *ghost[8];
    const int volumeCB;
    int ghostVolumeCB[8];
    Float scale;
    Float scale_inv;
    static constexpr bool fixed = fixed_point<Float, storeFloat>();
    Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex> accessor;

    GhostAccessor(const GaugeField &U, void *gauge_ = 0, void **ghost_ = 0)
      : volumeCB(U.VolumeCB()), accessor(U, gauge_, ghost_),
        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    {
      if (!native_ghost) assert(ghost_ != nullptr);
      for (int d = 0; d < 4; d++) {
        ghost[d] = !native_ghost ? static_cast<complex<storeFloat> *>(ghost_[d]) : nullptr;
        /* ... ghost[d + 4], ghostVolumeCB[d], ghostVolumeCB[d + 4] ... */
      }
      resetScale(U.Scale());
    }

    GhostAccessor(const GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat, use_tex> &a)
      : volumeCB(a.volumeCB), scale(a.scale), scale_inv(a.scale_inv), accessor(a.accessor)
    {
      for (int d = 0; d < 8; d++) {
        ghost[d] = a.ghost[d];
        /* ... */
      }
    }

    void resetScale(Float max) {
      if (fixed) {
        scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
        scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
      }
    }

    __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x_cb, int row, int col) const
    {
      if (native_ghost) {
        // native ghost zone: stored after the body of the field, read through the parent accessor
        return accessor(d % 4, parity, x_cb + (d / 4) * ghostVolumeCB[d] + volumeCB, row, col);
      } else {
        complex<storeFloat> tmp = ghost[d][((parity * nColor + row) * nColor + col) * ghostVolumeCB[d] + x_cb];
        if (fixed)
          return scale_inv * complex<Float>(static_cast<Float>(tmp.x), static_cast<Float>(tmp.y));
        else
          return complex<Float>(tmp.x, tmp.y);
      }
    }

    __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x_cb, int row, int col)
    {
      if (native_ghost)
        return accessor(d % 4, parity, x_cb + (d / 4) * ghostVolumeCB[d] + volumeCB, row, col);
      else
        return fieldorder_wrapper<Float, storeFloat>(ghost[d], ((parity * nColor + row) * nColor + col) * ghostVolumeCB[d] + x_cb, scale, scale_inv);
    }
  };
  /**
     FieldOrder: generic accessor used by the coarse-grid (multigrid) code; wraps an Accessor and
     a GhostAccessor and exposes both fine (row, col) and coarse (spin, color) indexing
  */
  template <typename Float, int nColor, int nSpinCoarse, QudaGaugeFieldOrder order,
            bool native_ghost = true, typename storeFloat = Float, bool use_tex = false>
  struct FieldOrder {
    /* ... */
    const int volumeCB;
    const int nDim;
    const int geometry;
    const QudaFieldLocation location;
    static constexpr int nColorCoarse = nColor / nSpinCoarse;
    Accessor<Float, nColor, order, storeFloat, use_tex> accessor;
    GhostAccessor<Float, nColor, order, native_ghost, storeFloat, use_tex> ghostAccessor;

    FieldOrder(GaugeField &U, void *gauge_ = 0, void **ghost_ = 0)
      : volumeCB(U.VolumeCB()), nDim(U.Ndim()), geometry(U.Geometry()),
        location(U.Location()),
        accessor(U, gauge_, ghost_), ghostAccessor(U, gauge_, ghost_)
    {
      if (U.Reconstruct() != QUDA_RECONSTRUCT_NO)
        errorQuda("GaugeField ordering not supported with reconstruction");
    }

    FieldOrder(const FieldOrder &o)
      : volumeCB(o.volumeCB),
        nDim(o.nDim), geometry(o.geometry), location(o.location),
        accessor(o.accessor), ghostAccessor(o.ghostAccessor)
    { }

    static constexpr bool fixedPoint() { return fixed_point<Float, storeFloat>(); }

    // fine-grid element access (const and mutable) delegates to the accessor
    __device__ __host__ complex<Float> operator()(int d, int parity, int x, int row, int col) const
    { return accessor(d, parity, x, row, col); }

    __device__ __host__ fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x, int row, int col)
    { return accessor(d, parity, x, row, col); }

    // ghost-zone element access delegates to the ghost accessor
    __device__ __host__ complex<Float> Ghost(int d, int parity, int x, int row, int col) const
    { return ghostAccessor(d, parity, x, row, col); }

    __device__ __host__ fieldorder_wrapper<Float, storeFloat> Ghost(int d, int parity, int x, int row, int col)
    { return ghostAccessor(d, parity, x, row, col); }

    // coarse-link indexing: (spin row, spin col, color row, color col) is flattened onto the fine indices
    __device__ __host__ inline const complex<Float> operator()(int d, int parity, int x, int s_row,
                                                               int s_col, int c_row, int c_col) const {
      return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
    }

    __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> operator()(int d, int parity, int x, int s_row,
                                                                                int s_col, int c_row, int c_col) {
      return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
    }

    __device__ __host__ inline complex<Float> Ghost(int d, int parity, int x, int s_row,
                                                    int s_col, int c_row, int c_col) const {
      return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
    }

    __device__ __host__ inline fieldorder_wrapper<Float, storeFloat> Ghost(int d, int parity, int x, int s_row,
                                                                           int s_col, int c_row, int c_col) {
      return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col);
    }

    template <typename theirFloat>
    __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col,
                                              int c_row, int c_col, const complex<theirFloat> &val) {
      accessor.atomic_add(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col, val);
    }

    __device__ __host__ inline int Ndim() const { return nDim; }
    __device__ __host__ inline int Geometry() const { return geometry; }
    __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
    __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; }

    /** Returns the L1 norm of the field in a given dimension */
    __host__ double norm1(int dim = -1, bool global = true) const {
      double nrm1 = accessor.transform_reduce(location, dim, abs_<double, storeFloat>(accessor.scale_inv),
                                              thrust::plus<double>(), 0.0);
      /* ... sum over ranks when global == true ... */
      return nrm1;
    }

    /** Returns the L2 norm squared of the field in a given dimension */
    __host__ double norm2(int dim = -1, bool global = true) const {
      double nrm2 = accessor.transform_reduce(location, dim, square_<double, storeFloat>(accessor.scale_inv),
                                              thrust::plus<double>(), 0.0);
      /* ... */
      return nrm2;
    }

    /** Returns the Linfinity norm of the field in a given dimension */
    __host__ double abs_max(int dim = -1, bool global = true) const {
      double absmax = accessor.transform_reduce(location, dim, abs_<Float, storeFloat>(accessor.scale_inv),
                                                thrust::maximum<Float>(), 0.0);
      /* ... */
      return absmax;
    }

    /** Returns the minimum absolute value of the field */
    __host__ double abs_min(int dim = -1, bool global = true) const {
      double absmin = accessor.transform_reduce(location, dim, abs_<Float, storeFloat>(accessor.scale_inv),
                                                thrust::minimum<Float>(), std::numeric_limits<double>::max());
      /* ... */
      return absmin;
    }

    size_t Bytes() const { return static_cast<size_t>(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); }
  };
  // Generic Reconstruct<N>: no reconstruction, the link is stored in full; fixed-point storage
  // is rescaled by scale / scale_inv on pack / unpack
  template <int N, typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
  struct Reconstruct {
    /* ... scale, scale_inv members and constructors elided in this excerpt ... */

    __device__ __host__ inline void Pack(real out[N], const complex in[N / 2], int idx) const
    {
      if (/* fixed-point storage */) {
        for (int i = 0; i < N / 2; i++) {
          out[2 * i + 0] = scale_inv * in[i].real();
          out[2 * i + 1] = scale_inv * in[i].imag();
        }
      } else {
        for (int i = 0; i < N / 2; i++) {
          out[2 * i + 0] = in[i].real();
          out[2 * i + 1] = in[i].imag();
        }
      }
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int idx, int dir, real phase,
                                           const I *X, const int *R) const
    {
      if (/* fixed-point storage */) {
        for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
      } else {
        for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
      }
    }
  };
  /**
     timeBoundary: compute the temporal boundary-condition correction for link idx; returns
     tBoundary for links on the boundary time slice of the global lattice and the plain scale
     factor elsewhere
  */
  template <QudaGhostExchange ghostExchange_, typename T, typename I>
  __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
                                            T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound,
                                            bool isFirstTimeSlice, bool isLastTimeSlice,
                                            QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO)
  {
    if (/* non-extended field */) {
      if (idx >= firstTimeSliceBound) {
        return isFirstTimeSlice ? tBoundary : scale;
      } else if (idx >= lastTimeSliceBound) {
        return isLastTimeSlice ? tBoundary : scale;
      }
    } else {
      // extended field: the boundary time slices sit R[3] slices inside the halo
      if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
        return isFirstTimeSlice ? tBoundary : scale;
      } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
        return isLastTimeSlice ? tBoundary : scale;
      }
    }
    return scale;
  }
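  // --- Added illustration (not part of the original header) --------------------------------
  // The boundary test above works on checkerboard indices: a half time slice holds
  // X[0]*X[1]*X[2]/2 sites, so the last slice starts at (X[3]-1)*X[0]*X[1]*X[2]/2. A standalone
  // sketch applying an antiperiodic sign to links on that slice (lattice sizes are made up):
#if 0
#include <cstdio>

int main()
{
  const int X[4] = {8, 8, 8, 16};                      // hypothetical local lattice
  const int sliceCB = X[0] * X[1] * X[2] / 2;          // checkerboarded sites per time slice
  const int lastTimeSliceBound = (X[3] - 1) * sliceCB; // first index of the last slice
  const double tBoundary = -1.0;                       // antiperiodic temporal BC
  for (int idx : {0, lastTimeSliceBound - 1, lastTimeSliceBound, X[3] * sliceCB - 1}) {
    double factor = (idx >= lastTimeSliceBound) ? tBoundary : 1.0;
    std::printf("idx=%d factor=%+.0f\n", idx, factor);
  }
  return 0;
}
#endif
  // ------------------------------------------------------------------------------------------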
  // Compute the MILC staggered phase factor for direction dim at site x of an extended field
  // (the border R is subtracted so the phase is that of the interior lattice coordinate)
  template <typename Float, typename I>
  __device__ __host__ inline Float milcStaggeredPhase(int dim, const I x[], const int R[])
  {
    Float sign = static_cast<Float>(1.0);
    switch (dim) {
    case 0: if (((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    case 1: if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    case 2: if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    /* ... remaining case elided in this excerpt ... */
    }
    return sign;
  }
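  // --- Added illustration (not part of the original header) --------------------------------
  // Staggered phases alternate sign with the parity of the preceding coordinates; e.g. the
  // dim = 1 phase above is (-1)^(x+t). A standalone check for two hypothetical sites with zero
  // border (helper name invented for this example):
#if 0
#include <cstdio>

static int milc_phase_dir1(const int x[4]) { return ((x[0] + x[3]) & 1) ? -1 : +1; }

int main()
{
  const int a[4] = {0, 0, 0, 0}; // x + t even -> +1
  const int b[4] = {1, 0, 0, 2}; // x + t odd  -> -1
  std::printf("phase(a)=%+d phase(b)=%+d\n", milc_phase_dir1(a), milc_phase_dir1(b));
  return 0;
}
#endif
  // ------------------------------------------------------------------------------------------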
  /**
     Gauge reconstruct 12: store the first two rows, reconstruct the third row as the complex
     conjugate of the cross product of the first two (times the anisotropy / time-boundary factor)
  */
  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<12, Float, ghostExchange_> {
    typedef typename mapper<Float>::type real;
    /* ... */
    const real anisotropy;
    const real tBoundary;
    const int firstTimeSliceBound;
    const int lastTimeSliceBound;
    const bool isFirstTimeSlice;
    const bool isLastTimeSlice;
    QudaGhostExchange ghostExchange;

    Reconstruct(const GaugeField &u) :
      anisotropy(u.Anisotropy()),
      tBoundary(static_cast<real>(u.TBoundary())),
      firstTimeSliceBound(u.VolumeCB()),
      lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
      isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
      /* ... isLastTimeSlice ... */
      ghostExchange(u.GhostExchange())
    { }

    Reconstruct(const Reconstruct<12, Float, ghostExchange_> &recon) :
      anisotropy(recon.anisotropy),
      tBoundary(recon.tBoundary),
      firstTimeSliceBound(recon.firstTimeSliceBound),
      lastTimeSliceBound(recon.lastTimeSliceBound),
      isFirstTimeSlice(recon.isFirstTimeSlice),
      isLastTimeSlice(recon.isLastTimeSlice),
      ghostExchange(recon.ghostExchange)
    { }

    __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
    {
      for (int i = 0; i < 6; i++) {
        out[2 * i + 0] = in[i].real();
        out[2 * i + 1] = in[i].imag();
      }
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
                                           const I *X, const int *R) const
    {
      for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

      const real u0 = dir < 3 ?
        anisotropy :
        timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
                                     lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);

      // third row = u0 * conj(row0 x row1)
      out[6] = cmul(out[2], out[4]);
      out[6] = cmac(out[1], out[5], -out[6]);
      out[6] = u0 * conj(out[6]);

      out[7] = cmul(out[0], out[5]);
      out[7] = cmac(out[2], out[3], -out[7]);
      out[7] = u0 * conj(out[7]);

      out[8] = cmul(out[1], out[3]);
      out[8] = cmac(out[0], out[4], -out[8]);
      out[8] = u0 * conj(out[8]);
    }
  };
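  // --- Added illustration (not part of the original header) --------------------------------
  // Standalone sketch of the 12-parameter reconstruction: given the first two rows of an SU(3)
  // matrix, the third row is the complex conjugate of their cross product (here with u0 = 1,
  // i.e. a spatial link at unit anisotropy). Uses std::complex rather than QUDA types.
#if 0
#include <complex>
#include <cstdio>

using cplx = std::complex<double>;

int main()
{
  // first two rows of a simple special-unitary matrix (a rotation mixing colors 0 and 1)
  const double c = 0.6, s = 0.8; // c^2 + s^2 = 1
  cplx a[9] = {{c, 0}, {s, 0}, {0, 0},
               {-s, 0}, {c, 0}, {0, 0},
               {0, 0}, {0, 0}, {0, 0}}; // third row to be reconstructed
  a[6] = std::conj(a[1] * a[5] - a[2] * a[4]);
  a[7] = std::conj(a[2] * a[3] - a[0] * a[5]);
  a[8] = std::conj(a[0] * a[4] - a[1] * a[3]);
  std::printf("row2 = (%g,%g) (%g,%g) (%g,%g)\n", a[6].real(), a[6].imag(), a[7].real(), a[7].imag(),
              a[8].real(), a[8].imag());
  return 0; // expected third row: (0,0) (0,0) (1,0)
}
#endif
  // ------------------------------------------------------------------------------------------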
  /**
     Gauge reconstruct 11: anti-Hermitian (momentum) field; store the strictly upper triangle
     plus the three imaginary diagonal entries
  */
  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<11, Float, ghostExchange_> {
    /* ... */
    __device__ __host__ inline void Pack(real out[10], const complex in[9], int idx) const
    {
      for (int i = 0; i < 2; i++) {
        out[2 * i + 0] = in[i + 1].real();
        out[2 * i + 1] = in[i + 1].imag();
      }
      out[4] = in[5].real();
      out[5] = in[5].imag();
      out[6] = in[0].imag();
      out[7] = in[4].imag();
      out[8] = in[8].imag();
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[9], const real in[10], int idx, int dir, real phase,
                                           const I *X, const int *R) const
    {
      out[1] = complex(in[0], in[1]);
      out[2] = complex(in[2], in[3]);
      out[5] = complex(in[4], in[5]);
      /* ... the remaining elements follow from anti-Hermiticity: purely imaginary diagonal and
             lower triangle equal to minus the conjugate of the upper triangle ... */
    }
  };
  /**
     Gauge reconstruct 13: like reconstruct 12 but for U(3) (staggered long) links; the overall
     U(1) phase is stored separately and reapplied after the third row is reconstructed
  */
  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
  struct Reconstruct<13, Float, ghostExchange_, stag_phase> {
    /* ... */
    Reconstruct(const Reconstruct<13, Float, ghostExchange_, stag_phase> &recon) :
      reconstruct_12(recon.reconstruct_12),
      /* ... scale ... */
      scale_inv(recon.scale_inv)
    { }

    __device__ __host__ inline void Pack(real out[12], const complex in[9], int idx) const
    {
      reconstruct_12.Pack(out, in, idx);
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real phase,
                                           const I *X, const int *R) const
    {
      for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

      out[6] = cmul(out[2], out[4]);
      out[6] = cmac(out[1], out[5], -out[6]);
      out[6] = scale_inv * conj(out[6]);

      out[7] = cmul(out[0], out[5]);
      out[7] = cmac(out[2], out[3], -out[7]);
      out[7] = scale_inv * conj(out[7]);

      out[8] = cmul(out[1], out[3]);
      out[8] = cmac(out[0], out[4], -out[8]);
      out[8] = scale_inv * conj(out[8]);

      if (/* dynamic staggered phase */) {
        // rotate the reconstructed row by exp(i*phase)
        /* ... sincos(phase, ...) ... */
        complex A(cos_sin[0], cos_sin[1]);
        out[6] = cmul(A, out[6]);
        out[7] = cmul(A, out[7]);
        out[8] = cmul(A, out[8]);
      }
    }

    __device__ __host__ inline real getPhase(const complex in[9]) const
    {
#if 1 // phase from cross product
      // dividing the stored (phase-carrying) element by the phase-free reconstructed element
      // isolates exp(3*i*phase)
      complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
      complex expI3Phase = in[8] / denom;
      if (/* dynamic staggered phase */)
        return arg(expI3Phase) / static_cast<real>(3.0);
      else
        return expI3Phase.real() > 0 ? 1 : -1;
#else // phase from determinant
      /* ... build a 3x3 matrix a with entries scale_inv * in[i] and use arg(det(a)) / 3 ... */
      for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
      /* ... */
#endif
    }
  };
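  // --- Added illustration (not part of the original header) --------------------------------
  // Standalone sketch of the cross-product phase extraction used by getPhase above: for
  // U = exp(i*phi) * V with V in SU(3), the ratio U22 / conj(U00*U11 - U01*U10) equals
  // exp(3*i*phi), so arg(...)/3 recovers phi. Uses std::complex rather than QUDA types.
#if 0
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  const double phi = 0.3;
  const std::complex<double> z = std::polar(1.0, phi); // overall U(1) phase
  // U = z * identity (the identity is a perfectly good SU(3) matrix for this check)
  std::complex<double> u[9] = {z, 0, 0, 0, z, 0, 0, 0, z};
  std::complex<double> denom = std::conj(u[0] * u[4] - u[1] * u[3]);
  std::complex<double> expI3Phase = u[8] / denom;
  std::printf("recovered phase = %f (expected %f)\n", std::arg(expI3Phase) / 3.0, phi);
  return 0;
}
#endif
  // ------------------------------------------------------------------------------------------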
  /**
     Gauge reconstruct 8: store two phases and two complex elements of the link; the remaining
     elements are rebuilt from the unitarity constraints
  */
  template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<8, Float, ghostExchange_> {
    /* ... */
    Reconstruct(const GaugeField &u, real scale = 1.0) :
      anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)),
      tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
      firstTimeSliceBound(u.VolumeCB()),
      lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
      isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
      /* ... isLastTimeSlice ... */
      ghostExchange(u.GhostExchange())
    { }

    Reconstruct(const Reconstruct<8, Float, ghostExchange_> &recon) :
      anisotropy(recon.anisotropy),
      tBoundary(recon.tBoundary),
      firstTimeSliceBound(recon.firstTimeSliceBound),
      lastTimeSliceBound(recon.lastTimeSliceBound),
      isFirstTimeSlice(recon.isFirstTimeSlice),
      isLastTimeSlice(recon.isLastTimeSlice),
      ghostExchange(recon.ghostExchange)
    { }

    __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
    {
      /* ... out[0], out[1] hold the two stored phases ... */
      for (int i = 1; i < 4; i++) {
        out[2 * i + 0] = in[i].real();
        out[2 * i + 1] = in[i].imag();
      }
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
                                           const I *X, const int *R, const complex scale, const complex u) const
    {
      real u0 = u.real();
      real u0_inv = u.imag();

      for (int i = 1; i <= 3; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

      // rebuild the two stored phases into unit-modulus elements (their magnitudes are fixed below)
      /* ... sincos(in[0], ...) ... */ out[0] = complex(tmp[0], tmp[1]);
      /* ... sincos(in[1], ...) ... */ out[6] = complex(tmp[0], tmp[1]);

      // |U00| from the first-row normalization
      real row_sum = out[1].real() * out[1].real();
      row_sum += out[1].imag() * out[1].imag();
      row_sum += out[2].real() * out[2].real();
      row_sum += out[2].imag() * out[2].imag();
      real row_sum_inv = static_cast<real>(1.0) / row_sum;
      real diff = u0_inv * u0_inv - row_sum;
      real U00_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);

      // |U20| from the first-column normalization
      real column_sum = out[0].real() * out[0].real();
      column_sum += out[0].imag() * out[0].imag();
      column_sum += out[3].real() * out[3].real();
      column_sum += out[3].imag() * out[3].imag();
      diff = u0_inv * u0_inv - column_sum;
      real U20_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);

      /* ... */
      real r_inv2 = u0_inv * row_sum_inv;

      // second and third rows from unitarity (intermediate factor A elided in this excerpt)
      out[4] = cmac(u0 * A, out[1], out[4]);
      out[4] = -r_inv2 * out[4];

      out[5] = cmac(-u0 * A, out[2], out[5]);
      out[5] = r_inv2 * out[5];

      out[7] = cmac(-u0 * A, out[1], out[7]);
      out[7] = r_inv2 * out[7];

      out[8] = cmac(u0 * A, out[2], out[8]);
      out[8] = -r_inv2 * out[8];
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
                                           const I *X, const int *R,
                                           const complex scale = complex(static_cast<real>(1.0), static_cast<real>(1.0))) const
    {
      // select the anisotropy / time-boundary factor for this link, then dispatch to the full Unpack
      complex u = dir < 3 ?
        anisotropy :
        timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
                                     isFirstTimeSlice, isLastTimeSlice, ghostExchange);
      Unpack(out, in, idx, dir, phase, X, R, scale, u);
    }
  };
  /**
     Gauge reconstruct 9: reconstruct-8 applied to the phase-stripped U(3) link (staggered long
     links); the U(1) phase is removed before packing and restored after unpacking
  */
  template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
  struct Reconstruct<9, Float, ghostExchange_, stag_phase> {
    /* ... */
    Reconstruct(const Reconstruct<9, Float, ghostExchange_, stag_phase> &recon) :
      reconstruct_8(recon.reconstruct_8),
      /* ... scale ... */
      scale_inv(recon.scale_inv)
    { }

    __device__ __host__ inline real getPhase(const complex in[9]) const
    {
#if 1 // phase from cross product
      complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv;
      complex expI3Phase = in[8] / denom;
      if (/* dynamic staggered phase */)
        return arg(expI3Phase) / static_cast<real>(3.0);
      else
        return expI3Phase.real() > 0 ? 1 : -1;
#else // phase from determinant
      /* ... build a 3x3 matrix with entries scale_inv * in[i] and use arg(det)/3 ... */
      for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
      /* ... */
#endif
    }

    __device__ __host__ inline void Pack(real out[8], const complex in[9], int idx) const
    {
      real phase = getPhase(in);
      complex su3[9];
      if (/* dynamic staggered phase */) {
        /* ... sincos(-phase, ...) ... */
        complex z(cos_sin[0], cos_sin[1]);
        for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]);
      } else {
        for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; }
      }
      reconstruct_8.Pack(out, su3, idx);
    }

    template <typename I>
    __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase,
                                           const I *X, const int *R) const
    {
      reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast<real>(1.0), static_cast<real>(1.0)),
                           complex(static_cast<real>(1.0), static_cast<real>(1.0)));
      if (/* dynamic staggered phase */) {
        /* ... sincos(phase, ...) ... */
        complex z(cos_sin[0], cos_sin[1]);
        for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
      } else {
        for (int i = 0; i < 18; i++) { out[i] *= phase; }
      }
    }
  };
  // ct_sqrt: compile-time integer square-root helper (smallest i with i*i >= n), used to recover
  // the number of colors from the per-link field length
  __host__ __device__ constexpr int ct_sqrt(int n, int i = 1)
  {
    return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);
  }

  // static_phase<phase>(): true when the staggered phase can be applied at compile time
  template <QudaStaggeredPhase phase> __host__ __device__ inline bool static_phase()
  {
    switch (phase) {
    /* ... */
    default: return false;
    }
  }
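  // --- Added illustration (not part of the original header) --------------------------------
  // ct_sqrt is usable in constant expressions, e.g. recovering nColor = 3 from a field with
  // length = 18 reals per link (9 complex numbers):
#if 0
static_assert(ct_sqrt(18 / 2) == 3, "ct_sqrt(9) gives the number of colors for an 18-real link");
static_assert(ct_sqrt(32 / 2) == 4, "ct_sqrt(16) == 4");
#endif
  // ------------------------------------------------------------------------------------------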
  /**
     FloatNOrder: native QUDA ordering, storing reconLen reals per link as N-component vectors,
     with an optional per-link phase appended for reconstruct 9/13
  */
  template <typename Float, int length, int N, int reconLenParam,
            QudaStaggeredPhase stag_phase, bool huge_alloc, QudaGhostExchange ghostExchange_, bool use_inphase>
  struct FloatNOrder {
    typedef typename mapper<Float>::type real;
    /* ... Vector / TexVector typedefs, reconstruct, gauge, offset, ghost, X, R, etc. ... */
    static const int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
    static const int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
#ifdef USE_TEXTURE_OBJECTS
    cudaTextureObject_t tex;
    const int tex_offset;
#endif
    int faceVolumeCB[4];

    FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0, bool override = false)
      : reconstruct(u), gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
        offset(u.Bytes() / (2 * sizeof(Float))),
#ifdef USE_TEXTURE_OBJECTS
        tex(0), tex_offset(offset / N),
#endif
        ghostExchange(u.GhostExchange()),
        volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()),
        phaseOffset(u.PhaseOffset()), backup_h(nullptr), bytes(u.Bytes())
    {
      if (/* coarse-link field */)
        errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
      for (int i = 0; i < 4; i++) {
        /* ... X[i], R[i], faceVolumeCB[i] ... */
        ghost[i] = ghost_ ? ghost_[i] : 0;
      }
#ifdef USE_TEXTURE_OBJECTS
      if (!huge_alloc && this->gauge != u.Gauge_p() && !override) {
        errorQuda("Cannot use texture read since data pointer does not equal field pointer - use with huge_alloc=true instead");
      }
      /* ... bind texture ... */
#endif
    }

    FloatNOrder(const FloatNOrder &order)
      : reconstruct(order.reconstruct), gauge(order.gauge), offset(order.offset),
#ifdef USE_TEXTURE_OBJECTS
        tex(order.tex), tex_offset(order.tex_offset),
#endif
        ghostExchange(order.ghostExchange),
        volumeCB(order.volumeCB), stride(order.stride), geometry(order.geometry),
        phaseOffset(order.phaseOffset), backup_h(nullptr), bytes(order.bytes)
    {
      for (int i = 0; i < 4; i++) {
        /* ... */
        ghost[i] = order.ghost[i];
      }
    }

    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
    {
      const int M = reconLen / N;
      real tmp[reconLen];
      for (int i = 0; i < M; i++) {
#if defined(USE_TEXTURE_OBJECTS) && defined(__CUDA_ARCH__)
        // texture path
        TexVector vecTmp = tex1Dfetch_<TexVector>(tex, parity * tex_offset + (dir * M + i) * stride + x);
        for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<real *>(&vecTmp)[j]);
#else
        // direct vector load from global memory
        Vector vecTmp = vector_load<Vector>(gauge + parity * offset, (dir * M + i) * stride + x);
        for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
#endif
      }

      real phase = 0.;
      if (hasPhase) {
        if (static_phase<stag_phase>() && (reconLen == 13 || use_inphase)) {
          phase = inphase;
        } else {
          copy(phase, (gauge + parity * offset)[phaseOffset / sizeof(Float) + stride * dir + x]);
          phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
        }
      }
      reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
    {
      const int M = reconLen / N;
      real tmp[reconLen];
      reconstruct.Pack(tmp, v, x);
      for (int i = 0; i < M; i++) {
        Vector vecTmp;
        for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
        vector_store(gauge + parity * offset, x + (dir * M + i) * stride, vecTmp);
      }
      if (hasPhase) {
        /* ... compute the phase from v ... */
        copy((gauge + parity * offset)[phaseOffset / sizeof(Float) + dir * stride + x],
             static_cast<real>(phase / (2. * M_PI)));
      }
    }
    /* ... accessor routines returning gauge_wrapper / gauge_ghost_wrapper objects, e.g.
           Ghost(int dim, int ghost_idx, int parity, real phase = 1.0), elided in this excerpt ... */

    __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
    {
      if (!ghost[dir]) {
        // no separate ghost buffer: the ghost zone is packed after the body of the field
        load(v, volumeCB + x, dir, parity, inphase);
      } else {
        const int M = reconLen / N;
        real tmp[reconLen];
        for (int i = 0; i < M; i++) {
          Vector vecTmp = vector_load<Vector>(
            ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x);
          for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
        }
        real phase = 0.;
        if (hasPhase) {
          copy(phase, ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x]);
          phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
        }
        reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
      }
    }

    __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
    {
      if (!ghost[dir]) {
        save(v, volumeCB + x, dir, parity);
      } else {
        const int M = reconLen / N;
        real tmp[reconLen];
        reconstruct.Pack(tmp, v, x);
        for (int i = 0; i < M; i++) {
          Vector vecTmp;
          for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
          vector_store(ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x, vecTmp);
        }
        if (hasPhase) {
          /* ... compute the phase from v ... */
          copy(ghost[dir][parity * faceVolumeCB[dir] * (M * N + 1) + faceVolumeCB[dir] * M * N + x],
               static_cast<real>(phase / (2. * M_PI)));
        }
      }
    }

    __device__ __host__ inline void loadGhostEx(complex v[length / 2], int buff_idx, int extended_idx, int dir,
                                                int dim, int g, int parity, const int R[]) const
    {
      const int M = reconLen / N;
      real tmp[reconLen];
      for (int i = 0; i < M; i++) {
        Vector vecTmp = vector_load<Vector>(ghost[dim] + ((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + hasPhase),
                                            +i * R[dim] * faceVolumeCB[dim] + buff_idx);
        for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
      }
      real phase = 0.;
      if (hasPhase)
        copy(phase,
             ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
                        + R[dim] * faceVolumeCB[dim] * M * N + buff_idx]);
      // the extended (unwrapped) index is used for the reconstruction
      reconstruct.Unpack(v, tmp, extended_idx, g, 2. * M_PI * phase, X, R);
    }

    __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int buff_idx, int extended_idx, int dir,
                                                int dim, int g, int parity, const int R[])
    {
      const int M = reconLen / N;
      real tmp[reconLen];
      reconstruct.Pack(tmp, v, extended_idx);
      for (int i = 0; i < M; i++) {
        Vector vecTmp;
        for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
        vector_store(ghost[dim] + ((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + hasPhase),
                     i * R[dim] * faceVolumeCB[dim] + buff_idx, vecTmp);
      }
      if (hasPhase) {
        /* ... compute the phase from v ... */
        copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1)
                        + R[dim] * faceVolumeCB[dim] * M * N + buff_idx],
             static_cast<real>(phase / (2. * M_PI)));
      }
    }

    /** Back up the field to the host before tuning */
    void save() {
      if (backup_h) errorQuda("Already allocated host backup");
      /* ... allocate backup_h ... */
      cudaMemcpy(backup_h, gauge, bytes, cudaMemcpyDeviceToHost);
      /* ... */
    }

    /** Restore the field from the host after tuning */
    void load() {
      cudaMemcpy(gauge, backup_h, bytes, cudaMemcpyHostToDevice);
      /* ... free backup_h ... */
    }

    size_t Bytes() const { return reconLen * sizeof(Float); }
  };
  // trove-friendly plain-old-data structure holding one link of `length` reals
  template <typename real, int length> struct S {
    real v[length];
    __host__ __device__ const real &operator[](int i) const { return v[i]; }
    /* ... */
  };

  /**
     LegacyOrder: common ghost-zone machinery shared by the CPU gauge orders below; ghost links
     are stored as `length` consecutive reals per site
  */
  template <typename Float, int length> struct LegacyOrder {
    /* ... */
    Float *ghost[QUDA_MAX_DIM];
    int faceVolumeCB[4];

    LegacyOrder(const GaugeField &u, Float **ghost_) :
      volumeCB(u.VolumeCB()),
      /* ... */
      geometry(u.Geometry())
      /* ... */
    {
      if (/* coarse-link field */)
        errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
      for (int i = 0; i < 4; i++) {
        ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]);
        /* ... faceVolumeCB[i] ... */
      }
    }

    LegacyOrder(const LegacyOrder &order) :
      volumeCB(order.volumeCB),
      stride(order.stride),
      geometry(order.geometry)
      /* ... */
    {
      for (int i = 0; i < 4; i++) {
        ghost[i] = order.ghost[i];
        /* ... */
      }
    }

    __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
      structure v_ = ghost_[parity * faceVolumeCB[dir] + x];
#else
      auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
#endif
      for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    }

    __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> ghost_((structure *)ghost[dir]);
      structure v_;
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = (Float)v[i].real();
        v_[2 * i + 1] = (Float)v[i].imag();
      }
      ghost_[parity * faceVolumeCB[dir] + x] = v_;
#else
      auto v_ = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length];
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = (Float)v[i].real();
        v_[2 * i + 1] = (Float)v[i].imag();
      }
#endif
    }

    __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int dir, int dim, int g,
                                                int parity, const int R[]) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> ghost_((structure *)ghost[dim]);
      structure v_ = ghost_[((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g];
#else
      auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
#endif
      for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    }

    __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int dir, int dim, int g,
                                                int parity, const int R[])
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> ghost_((structure *)ghost[dim]);
      structure v_;
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = (Float)v[i].real();
        v_[2 * i + 1] = (Float)v[i].imag();
      }
      ghost_[((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g] = v_;
#else
      auto v_ = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length];
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = (Float)v[i].real();
        v_[2 * i + 1] = (Float)v[i].imag();
      }
#endif
    }
  };
  /**
     QDPOrder: QDP gauge ordering, one pointer per direction, [dir][parity][volumecb][row][col]
  */
  template <typename Float, int length> struct QDPOrder : public LegacyOrder<Float, length> {
    /* ... */
    Float *gauge[QUDA_MAX_DIM];
    const int volumeCB;

    QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_), volumeCB(u.VolumeCB())
    {
      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : ((Float **)u.Gauge_p())[i];
    }

    QDPOrder(const QDPOrder &order) : LegacyOrder<Float, length>(order), volumeCB(order.volumeCB)
    {
      for (int i = 0; i < 4; i++) gauge[i] = order.gauge[i];
    }

    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge[dir]);
      structure v_ = gauge_[parity * volumeCB + x];
#else
      auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
#endif
      for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge[dir]);
      structure v_;
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = (Float)v[i].real();
        v_[2 * i + 1] = (Float)v[i].imag();
      }
      gauge_[parity * volumeCB + x] = v_;
#else
      auto v_ = &gauge[dir][(parity * volumeCB + x) * length];
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = (Float)v[i].real();
        v_[2 * i + 1] = (Float)v[i].imag();
      }
#endif
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };
  /**
     QDPJITOrder: QDPJIT ordering, with the real and imaginary parts stored in separate blocks
  */
  template <typename Float, int length> struct QDPJITOrder : public LegacyOrder<Float, length> {
    /* ... */
    Float *gauge[QUDA_MAX_DIM];
    const int volumeCB;

    QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_), volumeCB(u.VolumeCB())
    {
      for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : ((Float **)u.Gauge_p())[i];
    }

    QDPJITOrder(const QDPJITOrder &order) : LegacyOrder<Float, length>(order), volumeCB(order.volumeCB)
    {
      for (int i = 0; i < 4; i++) gauge[i] = order.gauge[i];
    }

    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
    {
      for (int i = 0; i < length / 2; i++) {
        v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
        v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]);
      }
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
    {
      for (int i = 0; i < length / 2; i++) {
        gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].real();
        gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x] = v[i].imag();
      }
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };
  /**
     MILCOrder: MILC gauge ordering, [parity][volumecb][dim][row][col]
  */
  template <typename Float, int length> struct MILCOrder : public LegacyOrder<Float, length> {
    /* ... */
    Float *gauge;
    const int volumeCB;
    const int geometry;

    MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }

    MILCOrder(const MILCOrder &order) : LegacyOrder<Float, length>(order),
      gauge(order.gauge), volumeCB(order.volumeCB), geometry(order.geometry)
    { ; }

    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_ = gauge_[(parity * volumeCB + x) * geometry + dir];
#else
      auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
#endif
      for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_;
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = v[i].real();
        v_[2 * i + 1] = v[i].imag();
      }
      gauge_[(parity * volumeCB + x) * geometry + dir] = v_;
#else
      auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = v[i].real();
        v_[2 * i + 1] = v[i].imag();
      }
#endif
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };
  /**
     MILCSiteOrder: MILC site-struct ordering, where the gauge links live at a fixed byte offset
     inside each site structure of byte size `size`
  */
  template <typename Float, int length> struct MILCSiteOrder : public LegacyOrder<Float, length> {
    /* ... */
    Float *gauge;
    const int volumeCB;
    const int geometry;
    const size_t offset;
    const size_t size;

    MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_),
      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      volumeCB(u.VolumeCB()),
      geometry(u.Geometry()),
      offset(u.SiteOffset()),
      size(u.SiteSize())
    {
      if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); }
    }

    MILCSiteOrder(const MILCSiteOrder &order) :
      LegacyOrder<Float, length>(order),
      gauge(order.gauge),
      volumeCB(order.volumeCB),
      geometry(order.geometry),
      offset(order.offset),
      size(order.size)
    { }

    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const
    {
      // base pointer to the gauge block of this site's structure
      const Float *gauge0 = reinterpret_cast<const Float *>(reinterpret_cast<const char *>(gauge) + (parity * volumeCB + x) * size + offset);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge0);
      structure v_ = gauge_[dir];
#else
      auto v_ = &gauge0[dir * length];
#endif
      for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    }

    __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity)
    {
      Float *gauge0 = reinterpret_cast<Float *>(reinterpret_cast<char *>(gauge) + (parity * volumeCB + x) * size + offset);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge0);
      structure v_;
      for (int i = 0; i < length / 2; i++) {
        v_[2 * i + 0] = v[i].real();
        v_[2 * i + 1] = v[i].imag();
      }
      gauge_[dir] = v_;
#else
      for (int i = 0; i < length / 2; i++) {
        gauge0[dir * length + 2 * i + 0] = v[i].real();
        gauge0[dir * length + 2 * i + 1] = v[i].imag();
      }
#endif
    }

    size_t Bytes() const { return length * sizeof(Float); }
  };
  /**
     CPSOrder: CPS gauge ordering; the 3x3 matrix is stored column-major (transposed) and scaled
     by the anisotropy, so load/save transpose and rescale
  */
  template <typename Float, int length> struct CPSOrder : public LegacyOrder<Float, length> {
    /* ... */
    static constexpr int Nc = 3;
    Float *gauge;
    const int volumeCB;
    const Float anisotropy;
    const Float anisotropy_inv;
    const int geometry;

    CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_),
      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      volumeCB(u.VolumeCB()),
      anisotropy(u.Anisotropy()),
      anisotropy_inv(1.0 / anisotropy),
      geometry(u.Geometry())
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    CPSOrder(const CPSOrder &order) :
      LegacyOrder<Float, length>(order),
      gauge(order.gauge),
      volumeCB(order.volumeCB),
      anisotropy(order.anisotropy),
      anisotropy_inv(order.anisotropy_inv),
      geometry(order.geometry)
    { }

    __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float inphase = 1.0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_ = gauge_[((parity * volumeCB + x) * geometry + dir)];
#else
      auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
#endif
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * anisotropy_inv;
        }
      }
    }

    __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_;
      for (int i = 0; i < Nc; i++)
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
          v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
        }
      gauge_[((parity * volumeCB + x) * geometry + dir)] = v_;
#else
      auto v_ = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = anisotropy * v[i * Nc + j].real();
          v_[(j * Nc + i) * 2 + 1] = anisotropy * v[i * Nc + j].imag();
        }
      }
#endif
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };
  /**
     BQCDOrder: BQCD gauge ordering; links live on a locally extended (halo-padded) lattice and
     the 3x3 matrix is stored column-major
  */
  template <typename Float, int length> struct BQCDOrder : public LegacyOrder<Float, length> {
    /* ... */
    static constexpr int Nc = 3;
    Float *gauge;
    const int volumeCB;
    int exVolumeCB; // extended checkerboard volume including the halo

    BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_),
      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      volumeCB(u.VolumeCB())
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
      // compute the extended checkerboard volume (one-site halo in each direction)
      exVolumeCB = u.X()[0] / 2 + 2;
      for (int i = 1; i < 4; i++) exVolumeCB *= u.X()[i] + 2;
    }

    BQCDOrder(const BQCDOrder &order) :
      LegacyOrder<Float, length>(order),
      gauge(order.gauge),
      volumeCB(order.volumeCB),
      exVolumeCB(order.exVolumeCB)
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_ = gauge_[(dir * 2 + parity) * exVolumeCB + x];
#else
      auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
#endif
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) { v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]); }
      }
    }

    __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_;
      for (int i = 0; i < Nc; i++)
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
          v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
        }
      gauge_[(dir * 2 + parity) * exVolumeCB + x] = v_;
#else
      auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + x) * length];
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
          v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
        }
      }
#endif
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };
  /**
     TIFROrder: TIFR ordered gauge fields, [mu][parity][volumecb][col][row], stored with a global
     scale factor
  */
  template <typename Float, int length> struct TIFROrder : public LegacyOrder<Float, length> {
    /* ... */
    static constexpr int Nc = 3;
    Float *gauge;
    const int volumeCB;
    const Float scale;
    const Float scale_inv;

    TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_),
      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      volumeCB(u.VolumeCB()),
      /* ... scale ... */
      scale_inv(1.0 / scale)
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    TIFROrder(const TIFROrder &order) :
      LegacyOrder<Float, length>(order),
      gauge(order.gauge),
      volumeCB(order.volumeCB),
      /* ... scale ... */
      scale_inv(1.0 / scale)
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_ = gauge_[(dir * 2 + parity) * volumeCB + x];
#else
      auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
#endif
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
        }
      }
    }

    __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
    {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_;
      for (int i = 0; i < Nc; i++)
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
          v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
        }
      gauge_[(dir * 2 + parity) * volumeCB + x] = v_;
#else
      auto v_ = &gauge[((dir * 2 + parity) * volumeCB + x) * length];
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
          v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
        }
      }
#endif
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };
  /**
     TIFRPaddedOrder: as TIFROrder, but the z dimension is padded (exDim[2] = X[2] + 4), so site
     indices are first remapped onto the padded volume via getPaddedIndex
  */
  template <typename Float, int length> struct TIFRPaddedOrder : public LegacyOrder<Float, length> {
    /* ... */
    static constexpr int Nc = 3;
    Float *gauge;
    const int volumeCB;
    int exVolumeCB;
    const Float scale;
    const Float scale_inv;
    const int dim[4];
    const int exDim[4];

    TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
      LegacyOrder<Float, length>(u, ghost_),
      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      volumeCB(u.VolumeCB()),
      /* ... exVolumeCB(1), scale ... */
      scale_inv(1.0 / scale),
      dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},
      exDim {u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3]}
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
      // compute the padded volume
      for (int i = 0; i < 4; i++) exVolumeCB *= exDim[i];
      /* ... checkerboard the padded volume ... */
    }

    TIFRPaddedOrder(const TIFRPaddedOrder &order) :
      LegacyOrder<Float, length>(order),
      gauge(order.gauge),
      volumeCB(order.volumeCB),
      exVolumeCB(order.exVolumeCB),
      /* ... scale ... */
      scale_inv(order.scale_inv),
      dim {order.dim[0], order.dim[1], order.dim[2], order.dim[3]},
      exDim {order.exDim[0], order.exDim[1], order.exDim[2], order.exDim[3]}
    {
      if (length != 18) errorQuda("Gauge length %d not supported", length);
    }

    /* ... getPaddedIndex(x, parity): map a site on the unpadded lattice to its location on the
           padded lattice (definition elided in this excerpt) ... */

    __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, real inphase = 1.0) const
    {
      int y = getPaddedIndex(x, parity);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_ = gauge_[(dir * 2 + parity) * exVolumeCB + y];
#else
      auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
#endif
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]) * scale_inv;
        }
      }
    }

    __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity)
    {
      int y = getPaddedIndex(x, parity);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
      typedef S<Float, length> structure;
      trove::coalesced_ptr<structure> gauge_((structure *)gauge);
      structure v_;
      for (int i = 0; i < Nc; i++)
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
          v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
        }
      gauge_[(dir * 2 + parity) * exVolumeCB + y] = v_;
#else
      auto v_ = &gauge[((dir * 2 + parity) * exVolumeCB + y) * length];
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real() * scale;
          v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
        }
      }
#endif
    }

    size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); }
  };
  // Four specializations relating quda::complex to gauge::fieldorder_wrapper (assignment and
  // copy-construction for the supported precisions); bodies elided in this excerpt
  template <typename otherFloat, typename storeFloat>
  /* ... */
  template <typename otherFloat, typename storeFloat>
  /* ... */
  template <typename otherFloat, typename storeFloat>
  /* ... */
  template <typename otherFloat, typename storeFloat>
  /* ... */

  // gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase>: maps a precision and
  // reconstruction type to the concrete FloatNOrder accessor. The primary template declaration
  // ends with the defaulted parameter
  //   ..., bool use_inphase = false>
  // and is followed by one partial specialization per (precision, reconstruction) pair. Each has
  // the same shape; the double-precision reconstruct-12 case is shown as the representative, and
  // the remaining specializations for double/float/short/char and the other reconstruction types
  // repeat this pattern with the corresponding FloatNOrder parameters:
  template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
  struct gauge_mapper<double, QUDA_RECONSTRUCT_12, N, stag, huge_alloc, ghostExchange, use_inphase> {
    typedef gauge::FloatNOrder<double, N, 2, 12, stag, huge_alloc, ghostExchange, use_inphase> type;
  };
#define INSTANTIATE_RECONSTRUCT(func, g, ...)                                                     \
  {                                                                                               \
    if (!data.isNative())                                                                         \
      errorQuda("Field order %d and precision %d is not native", g.Order(), g.Precision());       \
    if (g.Reconstruct() == QUDA_RECONSTRUCT_NO) {                                                 \
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_NO>::type Gauge;                      \
      func(Gauge(g), g, __VA_ARGS__);                                                             \
    } else if (g.Reconstruct() == QUDA_RECONSTRUCT_13) {                                          \
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_13>::type Gauge;                      \
      func(Gauge(g), g, __VA_ARGS__);                                                             \
    } else if (g.Reconstruct() == QUDA_RECONSTRUCT_12) {                                          \
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_12>::type Gauge;                      \
      func(Gauge(g), g, __VA_ARGS__);                                                             \
    } else if (g.Reconstruct() == QUDA_RECONSTRUCT_9) {                                           \
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_9>::type Gauge;                       \
      func(Gauge(g), g, __VA_ARGS__);                                                             \
    } else if (g.Reconstruct() == QUDA_RECONSTRUCT_8) {                                           \
      typedef typename gauge_mapper<Float, QUDA_RECONSTRUCT_8>::type Gauge;                       \
      func(Gauge(g), g, __VA_ARGS__);                                                             \
    } else {                                                                                      \
      errorQuda("Reconstruction type %d of gauge field not supported", g.Reconstruct());          \
    }                                                                                             \
  }

#define INSTANTIATE_PRECISION(func, lat, ...)                                                     \
  {                                                                                               \
    if (lat.Precision() == QUDA_DOUBLE_PRECISION) {                                               \
      func<double>(lat, __VA_ARGS__);                                                             \
    } else if (lat.Precision() == QUDA_SINGLE_PRECISION) {                                        \
      func<float>(lat, __VA_ARGS__);                                                              \
    } else {                                                                                      \
      errorQuda("Precision %d not supported", lat.Precision());                                   \
    }                                                                                             \
  }

#endif // _GAUGE_ORDER_H
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__host__ __device__ char real() const volatile
gauge::FloatNOrder< float, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
Gauge reconstruct helper for Momentum field with 10 packed elements (really 9 from the Lie algebra...
gauge::FloatNOrder< float, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
square_(const ReduceType scale)
__host__ __device__ complex()
Reconstruct(const GaugeField &u)
typename mapper< Float >::type real
int_fastdiv X[QUDA_MAX_DIM]
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
__host__ __device__ void copy(T1 &a, const T2 &b)
LegacyOrder(const LegacyOrder &order)
Reconstruct< reconLenParam, Float, ghostExchange_, stag_phase > reconstruct
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
const int * SurfaceCB() const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
QudaFieldGeometry Geometry() const
__device__ __host__ void operator-=(const complex< theirFloat > &a)
Operator-= with complex number instance as input.
Accessor< Float, nColor, order, storeFloat, use_tex > accessor
gauge::BQCDOrder< T, 2 *Nc *Nc > type
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex > accessor
__device__ __host__ void operator=(const fieldorder_wrapper< Float, storeFloat > &a)
Assignment operator with fieldorder_wrapper instance as input.
gauge::FloatNOrder< double, N, 2, 13, stag, huge_alloc, ghostExchange, use_inphase > type
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real phase=1.0) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale=complex(static_cast< real >(1.0), static_cast< real >(1.0))) const
QDPOrder(const QDPOrder &order)
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
const bool isLastTimeSlice
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
Accessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
complex< storeFloat > * ghost[8]
gauge::FloatNOrder< float, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
GhostAccessor(const GhostAccessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat, use_tex > &a)
__device__ __host__ void operator=(const Matrix< U, N > &b)
This is just a dummy structure we use for trove to define the required structure size.
__host__ __device__ const real & operator[](int i) const
complex< storeFloat > * u
const Reconstruct< 8, Float, ghostExchange_ > reconstruct_8
void resetScale(Float max)
Reconstruct(const Reconstruct< 12, Float, ghostExchange_ > &recon)
__device__ __host__ void operator+=(const complex< theirFloat > &a)
Operator+= with complex number instance as input.
__host__ __device__ constexpr bool match< short, short >()
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x_cb, int row, int col)
struct to define gauge fields packed into an opaque MILC site struct:
QudaGhostExchange ghostExchange
GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_=0)
static __device__ double2 atomicAdd(double2 *addr, double2 val)
Implementation of double2 atomic addition using two double-precision additions.
FieldOrder(GaugeField &U, void *gauge_=0, void **ghost_=0)
BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
MILCOrder(const MILCOrder &order)
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
gauge::FloatNOrder< double, N, 2, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
gauge::FloatNOrder< char, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ int Ndim() const
void comm_allreduce_min(double *data)
void resetScale(Float dummy)
const int firstTimeSliceBound
__host__ double abs_min(int dim=-1, bool global=true) const
Returns the minimum absolute value of the field.
QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ int NcolorCoarse() const
gauge::QDPOrder< T, 2 *Nc *Nc > type
__device__ __host__ Float real() const
enum QudaStaggeredPhase_s QudaStaggeredPhase
typename mapper< Float >::type real
gauge::FloatNOrder< short, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
void resetScale(Float dummy)
MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
gauge::FloatNOrder< char, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void Pack(real out[10], const complex in[9], int idx) const
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ void Pack(real out[12], const complex in[9], int idx) const
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R) const
__host__ __device__ int imag() const volatile
struct to define BQCD ordered gauge fields:
__host__ __device__ Float operator()(const quda::complex< char > &x)
__host__ __device__ int real() const volatile
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
enum QudaGhostExchange_s QudaGhostExchange
__device__ __host__ void vector_store(void *ptr, int idx, const VectorType &value)
Generic reconstruction helper with no reconstruction.
__host__ __device__ bool static_phase()
MILCSiteOrder(const MILCSiteOrder &order)
__host__ __device__ constexpr bool fixed_point< float, char >()
__device__ __host__ void loadGhostEx(complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[]) const
gauge::FloatNOrder< double, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ void saveGhostEx(const complex v[length/2], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[])
void save()
Backup the field to the host when tuning.
__host__ __device__ short imag() const volatile
gauge_ghost_wrapper is an internal class that is used to wrap instances of gauge ghost accessors...
gauge::FloatNOrder< short, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ real getPhase(const complex in[9])
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
typename mapper< Float >::type real
__host__ __device__ constexpr bool match< int, int >()
__host__ __device__ real & operator[](int i)
gauge::FloatNOrder< char, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int row, int col)
__device__ __host__ Float imag() const
CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
Gauge reconstruct 13 helper where we reconstruct the third row from the cross product of the first tw...
__device__ __host__ real getPhase(const complex in[N/2]) const
Accessor(const Accessor< Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat, use_tex > &a)
Reconstruct(const GaugeField &u)
const void ** Ghost() const
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
__device__ __host__ real getPhase(const complex in[9]) const
Float * gauge[QUDA_MAX_DIM]
Provides precision abstractions and defines the register precision given the storage precision using ...
__device__ __host__ real getPhase(const complex in[9])
GhostAccessor< Float, nColor, order, native_ghost, storeFloat, use_tex > ghostAccessor
__device__ __host__ void loadGhostEx(complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[]) const
__host__ __device__ constexpr bool fixed_point< float, int >()
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
TIFRPaddedOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
const bool isFirstTimeSlice
__device__ __host__ void atomic_add(int dim, int parity, int x_cb, int row, int col, const complex< theirFloat > &val) const
__device__ __host__ real getPhase(const complex in[9]) const
void init()
Create the CUBLAS context.
__device__ __host__ const gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0) const
This accessor routine returns a const gauge_ghost_wrapper to this object, allowing us to overload var...
void resetScale(Float max)
__host__ __device__ short real() const volatile
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int dim, int parity, int x_cb, int row, int col)
gauge::FloatNOrder< short, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
#define safe_malloc(size)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
const bool isFirstTimeSlice
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__host__ __device__ Float operator()(const quda::complex< storeFloat > &x)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
const AllocInt phaseOffset
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__host__ __device__ constexpr bool match()
void resetScale(Float max)
__device__ __host__ void save(const complex v[9], int x, int dir, int parity)
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__host__ double norm1(int dim=-1, bool global=true) const
Returns the L1 norm of the field in a given dimension.
gauge::FloatNOrder< float, N, 4, 9, stag, huge_alloc, ghostExchange, use_inphase > type
__host__ __device__ Float operator()(const quda::complex< int > &x)
__host__ __device__ complex< real > cmul(const complex< real > &x, const complex< real > &y)
QudaFieldLocation Location() const
LegacyOrder(const GaugeField &u, Float **ghost_)
The LegacyOrder defines the ghost zone storage and ordering for all cpuGaugeFields, which use the same ghost zone storage.
gauge::FloatNOrder< short, N, 4, 12, stag, huge_alloc, ghostExchange, use_inphase > type
FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0, bool override=false)
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
__host__ __device__ ReduceType operator()(const quda::complex< int > &x)
static int index(int ndim, const int *dims, const int *x)
gauge::MILCOrder< T, 2 *Nc *Nc > type
__device__ __host__ void Pack(real out[N], const complex in[N/2], int idx) const
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ const complex< Float > operator()(int dim, int parity, int x_cb, int row, int col) const
Float * gauge[QUDA_MAX_DIM]
__device__ __host__ void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, const complex scale, const complex u) const
QudaGhostExchange ghostExchange
size_t bytes
host memory for backing up the field when tuning
Reconstruct(const Reconstruct< 8, Float, ghostExchange_ > &recon)
enum QudaFieldLocation_s QudaFieldLocation
__host__ __device__ volatile complex< float > & operator=(const complex< T > z) volatile
int faceVolumeCB[QUDA_MAX_DIM]
cpuColorSpinorField * out
BQCDOrder(const BQCDOrder &order)
const Reconstruct< 12, Float, ghostExchange_ > reconstruct_12
Reconstruct(const GaugeField &u)
const int lastTimeSliceBound
VectorType< Float, N >::type Vector
typename mapper< Float >::type real
__host__ __device__ constexpr bool fixed_point()
__host__ __device__ char imag() const volatile
typename mapper< Float >::type real
const int firstTimeSliceBound
enum QudaReconstructType_s QudaReconstructType
Reconstruct(const Reconstruct< 11, Float, ghostExchange_ > &recon)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
__device__ __host__ void Unpack(complex out[N/2], const real in[N], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity, real phase=1.0)
This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operato...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__device__ __host__ gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0)
This accessor routine returns a gauge_ghost_wrapper to this object, allowing us to overload various o...
__device__ __host__ fieldorder_wrapper< Float, storeFloat > Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col)
Gauge reconstruct 9 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
const int_fastdiv geometry
#define QUDA_MAX_GEOMETRY
Maximum geometry supported by a field. This essentially is the maximum number of dimensions supported...
typename mapper< Float >::type real
__device__ __host__ const complex< Float > operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const
TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
gauge::QDPJITOrder< T, 2 *Nc *Nc > type
typename mapper< Float >::type real
Accessor(const GaugeField &U, void *gauge_=0, void **ghost_=0, bool override=false)
__device__ __host__ int Volume() const
QudaGhostExchange ghostExchange
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
typename mapper< Float >::type real
__device__ __host__ complex< Float > Ghost(int d, int parity, int x, int row, int col) const
__device__ __host__ Matrix()
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
const real anisotropy_inv
__device__ __host__ int Geometry() const
void resetScale(double max)
__device__ __host__ void Unpack(complex out[9], const real in[12], int idx, int dir, real phase, const I *X, const int *R) const
__device__ __host__ fieldorder_wrapper< Float, storeFloat > operator()(int d, int parity, int x, int row, int col)
__host__ double norm2(int dim=-1, bool global=true) const
Returns the L2 norm squared of the field in a given dimension.
TIFRPaddedOrder(const TIFRPaddedOrder &order)
QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
__device__ __host__ int VolumeCB() const
QudaReconstructType Reconstruct() const
Accessor(const Accessor< Float, nColor, QUDA_QDP_GAUGE_ORDER, storeFloat, use_tex > &a)
const bool isLastTimeSlice
__host__ __device__ ValueType abs(ValueType x)
square_(const ReduceType scale)
Reconstruct(const GaugeField &u)
square_(const ReduceType scale)
__device__ __host__ const gauge_ghost_wrapper< real, Accessor > Ghost(int dim, int ghost_idx, int parity, real phase=1.0) const
This accessor routine returns a const gauge_ghost_wrapper to this object, allowing us to overload var...
__host__ __device__ ReduceType operator()(const quda::complex< char > &x)
__device__ __host__ void saveGhost(const complex v[length/2], int x, int dir, int parity)
__device__ __host__ void load(complex v[9], int x, int dir, int parity, real inphase=1.0) const
GhostAccessor(const GaugeField &, void *gauge_=0, void **ghost_=0)
__device__ __host__ void operator=(const complex< theirFloat > &a)
Assignment operator with complex number instance as input.
__device__ __host__ Float milcStaggeredPhase(int dim, const int x[], const I R[])
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator*(const S &a, const ColorSpinor< Float, Nc, Ns > &x)
Compute the scalar-vector product y = a * x.
__device__ __host__ void load(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
__device__ __host__ int Ncolor() const
CPSOrder(const CPSOrder &order)
__device__ __host__ complex< Float > & operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
void comm_allreduce(double *data)
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ complex< Float > operator()(int d, int parity, int x, int row, int col) const
__device__ __host__ T getDeterminant(const Mat< T, 3 > &a)
__host__ __device__ ValueType conj(ValueType x)
gauge::FloatNOrder< char, N, 2, N, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb)
void comm_allreduce_max(double *data)
gauge::FloatNOrder< T, 2 *Nc *Nc, 2, 2 *Nc *Nc > type
GhostAccessor(const GaugeField &U, void *gauge_=0, void **ghost_=0)
static constexpr bool fixedPoint()
__device__ __host__ const complex< Float > operator()(int d, int parity, int x_cb, int row, int col) const
__host__ double transform_reduce(QudaFieldLocation location, int dim, helper h, reducer r, double init) const
__host__ __device__ Float operator()(const quda::complex< short > &x)
__device__ __host__ int getPaddedIndex(int x_cb, int parity) const
Compute the index into the padded field. Assumes that parity doesn't change from unpadded to padded...
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void saveGhostEx(const complex v[length/2], int x, int dummy, int dir, int dim, int g, int parity, const int R[])
__device__ __host__ void Pack(real out[8], const complex in[9], int idx) const
FieldOrder(const FieldOrder &o)
__host__ __device__ volatile complex< double > & operator=(const complex< T > z) volatile
Reconstruct(const Reconstruct< 13, Float, ghostExchange_, stag_phase > &recon)
void resetScale(Float max)
complex< storeFloat > * ghost[8]
__host__ __device__ ReduceType operator()(const quda::complex< Float > &x)
__device__ __host__ int NspinCoarse() const
gauge::FloatNOrder< char, N, 4, 13, stag, huge_alloc, ghostExchange, use_inphase > type
FloatNOrder(const FloatNOrder &order)
__host__ __device__ complex()
complex< storeFloat > * ghost[8]
const QudaFieldLocation location
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.
gauge::TIFRPaddedOrder< T, 2 *Nc *Nc > type
This is just a dummy structure we use for trove to define the required structure size.
__device__ __host__ const gauge_wrapper< real, Accessor > operator()(int dim, int x_cb, int parity) const
This accessor routine returns a const gauge_wrapper to this object, allowing us to overload various o...
__device__ __host__ void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, const complex< theirFloat > &val)
__device__ __host__ void save(const complex v[length/2], int x, int dir, int parity)
Gauge reconstruct 8 helper where we reconstruct the gauge matrix from 8 packed elements (maximal comp...
gauge::FloatNOrder< float, N, 4, 8, stag, huge_alloc, ghostExchange, use_inphase > type
__device__ __host__ void loadGhost(complex v[length/2], int x, int dir, int parity, real inphase=1.0) const